@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@/*******************************************************************************
@* @file
@*  ihevc_deblk_luma_vert.s
@*
@* @brief
@*  contains the function definition for deblocking a vertical luma edge.
@*  the function is hand-written arm + neon assembly in gnu assembler syntax
@*  (the earlier text here described inter prediction interpolation and neon
@*  intrinsics, which does not match the code in this file).
@*
@* @author
@*  anand s
@*
@* @par list of functions:
@*  - ihevc_deblk_luma_vert_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************/

.text
.align 4





.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table

.globl ihevc_deblk_luma_vert_a9q

@ pc-relative offsets of the two lookup tables. each word holds
@ (table - (ulblN + 8)); adding pc at label ulblN (where pc reads as
@ ulblN + 8) therefore gives the run-time address of the table without an
@ absolute relocation in the code.
gai4_ihevc_tc_table_addr:
.long gai4_ihevc_tc_table - ulbl1 - 8

gai4_ihevc_beta_table_addr:
.long gai4_ihevc_beta_table - ulbl2 - 8

.type ihevc_deblk_luma_vert_a9q, %function

@/**
@*******************************************************************************
@*
@* @brief
@*  decides between no filtering, the weak filter and the strong filter for a
@*  vertical luma edge that is 4 rows tall, and writes the filtered pixels back
@*  in place.
@*
@* @par arguments (names taken from the comments inside the function; the
@*  mapping to registers follows from how each value is used below):
@*  r0          : pu1_src, points at column 0 of row 0 (first pixel on the
@*                q side of the edge); columns -4..-1 are the p side
@*  r1          : src_strd, byte distance between rows
@*  r2          : bs
@*  r3          : quant_param_p
@*  [sp,#0x2c]  : quant_param_q       (offsets are after the push of 11 regs)
@*  [sp,#0x30]  : beta_offset_div2
@*  [sp,#0x34]  : tc_offset_div2
@*  [sp,#0x38]  : filter_flag_p
@*  [sp,#0x3c]  : filter_flag_q
@*
@* @par returns
@*  nothing is returned in a register; the result is the modified pixels.
@*
@* @remarks
@*  - r3-r12 are saved on entry and restored on every exit path.
@*  - neon registers d0-d7 and d16-d31 are overwritten; d8-d15 are not
@*    referenced.
@*  - in this listing, src[k] means "the byte at column offset k from pu1_src",
@*    and a d register described as holding src[k] holds that column for rows
@*    0..3 in lanes 0..3 (the same four bytes are repeated in lanes 4..7).
@*  - the flag tests assume filter_flag_p / filter_flag_q are 0 or 1
@*    (see the "and ; cmp #1" test before both_flags_set).
@*
@*******************************************************************************
@*/

ihevc_deblk_luma_vert_a9q:

    push        {r3-r12,lr}                 @ 11 registers = 0x2c bytes, so the first stack argument is at [sp,#0x2c]
    ldr         r4,[sp,#0x2c]               @ r4 = quant_param_q
    ldr         r5,[sp,#0x30]               @ r5 = beta_offset_div2

    add         r3,r3,r4
    add         r3,r3,#1
    ldr         r6, [sp,#0x34]              @ r6 = tc_offset_div2
    asr         r3,r3,#1                    @ r3 = qp_luma = (quant_param_p + quant_param_q + 1) >> 1
    add         r7,r3,r5,lsl #1             @ r7 = qp_luma + (beta_offset_div2 << 1)
    add         r3,r3,r6,lsl #1             @ r3 = qp_luma + (tc_offset_div2 << 1)
    cmp         r7,#0x33                    @ clip r7 to [0, 51]
    movgt       r7,#0x33
    bgt         l1.56
    cmp         r7,#0x0
    movlt       r7,#0x0                     @ r7 has the beta_index value
l1.56:

    @ bic      r2,r2,#1                     (disabled; the asr/lsl pair below clears bit 0 instead)
    asr         r2,r2,#1

    add         r3,r3,r2,lsl #1             @ r3 += 2 * (bs >> 1)
    cmp         r3,#0x35                    @ clip r3 to [0, 53]
    movgt       r3,#0x35
    bgt         l1.88
    cmp         r3,#0x0
    movlt       r3,#0x0                     @ r3 has the tc_index value

    @ qp_luma = (quant_param_p + quant_param_q + 1) >> 1@
    @ beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)@
    @ tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)@

l1.88:
    ldr         r2,gai4_ihevc_beta_table_addr
ulbl2:
    add         r2,r2,pc                    @ r2 = address of gai4_ihevc_beta_table
    vmov.i8     d18,#0x2                    @ d18 = 2 in every byte (multiplier for the strong filter)
    ldr         r4,gai4_ihevc_tc_table_addr
ulbl1:
    add         r4,r4,pc                    @ r4 = address of gai4_ihevc_tc_table

    ldr         r5,[r2,r7,lsl #2]           @ beta (tables are indexed as 32-bit words)
    vmov.i16    q8,#0x2                     @ q8 = 2 in every halfword
    ldr         r6,[r4,r3,lsl #2]           @ tc
    lsl         r8,r6,#1
    cmp         r6,#0
    vdup.8      d19,r8                      @ d19 = 2 * tc in every byte (strong filter clip range)
    sub         r7,r0,#4                    @ r7 = &pu1_src[-4], start of the 8 bytes that straddle the edge
    vmov.i8     d23,#0x3                    @ d23 = 3 in every byte
    beq         l1.964                      @ tc == 0 : nothing to filter (flags are from cmp r6,#0; neon ops do not change them)


    @ load 8 bytes (columns -4..3) from each of the 4 rows and transpose them so
    @ that each column ends up in its own group of 4 lanes. the scalar loads that
    @ are interleaved fetch row 0 again for the dp0 / dq0 activity measures.
    vld1.8      {d24},[r7],r1               @ row 0
    ldrb        r8,[r0,#-3]                 @ -3 value
    vld1.8      {d1},[r7],r1                @ row 1
    ldrb        r10,[r0,#-2]                @-2 value
    vld1.8      {d2},[r7],r1                @ row 2
    ldrb        r11,[r0,#-1]                @-1 value
    vld1.8      {d0},[r7]                   @ row 3
    ldrb        r12,[r0,#0]                 @ 0 value
    ldrb        r9,[r0,#1]                  @ 1 value
    vtrn.8      d24,d1
    ldrb        r2,[r0,#2]                  @ 2 value
    vtrn.8      d2,d0
    add         r12,r12,r2
    subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
    rsbmi       r9,r9,#0
    @dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )@
    vtrn.16     d24,d2
    add         r8,r8,r11
    vtrn.16     d1,d0
    subs        r8,r8,r10,lsl #1
    rsbmi       r8,r8,#0                    @ dp0 value is stored in r8
    @ dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )@

    @ after the transposes (low 4 lanes | high 4 lanes, one lane per row):
    @   d24 = src[-4] | src[0]      d1 = src[-3] | src[1]
    @   d2  = src[-2] | src[2]      d0 = src[-1] | src[3]


    add         r14,r1,r1,lsl #1
    add         r14,r0,r14                  @ r14 = pu1_src + 3 * src_strd (row 3)

    vdup.32     d4,d24[1]                   @ d4 = src[0]
    ldrb        r2,[r14,#-3]                @ -3 value
    vdup.32     d7,d2[1]                    @ d7 = src[2]
    ldrb        r10,[r14,#-2]               @ -2 value
    vdup.32     d3,d2[0]                    @ d3 = src[-2]
    ldrb        r11,[r14,#-1]               @ -1 value
    vdup.32     d5,d1[1]                    @ d5 = src[1]
    ldrb        r12,[r14,#0]                @ 0 value
    vdup.32     d6,d1[0]                    @ d6 = src[-3]
    ldrb        r3,[r14,#1]                 @ 1 value
    vdup.32     d2,d0[0]                    @ d2 = src[-1]
    ldrb        r4,[r14,#2]                 @ 2 value


    add         r12,r12,r4
    subs        r12,r12,r3,lsl #1           @ dq3 value is stored in r12
    rsbmi       r12,r12,#0
    @ dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )@


    add         r2,r2,r11
    subs        r11,r2,r10,lsl #1
    rsbmi       r11,r11,#0                  @ dp3 value is stored in r11
    @ dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )@



    add         r3,r8,r9                    @ r3 has the d0 value
    add         r4,r11,r12                  @ r4 has the d3 value


    @ d0 = dp0 + dq0@
    @ d3 = dp3 + dq3@

    add         r14,r8,r11                  @ r14 has the value dp
    add         r12,r12,r9                  @ r12 has the value dq
    @ dp = dp0 + dp3@
    @ dq = dq0 + dq3@

    add         r11, r3, r4                 @ r11 has the value d

    @ d = d0 + d3@


    cmp         r11,r5
    vdup.32     d22,d0[1]                   @ d22 = src[3]
    bge         l1.964                      @ d >= beta : leave the edge untouched

    @ if(d < beta)


    @ registers still live from here on: r0,r1 (source, stride), r3,r4 (d0,d3),
    @ r5 (beta), r6 (tc), r12 (dq), r14 (dp)

    @ registers for use: r2,r7,r8,r9,r10,

    @ strong-filter decision. the neon instructions interleaved with the scalar
    @ tests compute the strong-filter outputs speculatively; they only write
    @ d0,d1,d16,d19-d22,d25-d28,d30,d31, so the source columns in d2-d7 are
    @ still intact if control ends up on the weak path at l1.336.
    vqsub.u8    d30,d7,d19                  @ d30 = src[2] - 2*tc (saturating)
    asr         r10,r5,#2
    vqadd.u8    d31,d7,d19                  @ d31 = src[2] + 2*tc (saturating)
    cmp         r10,r3,lsl #1
    vaddl.u8    q0,d5,d4                    @ q0 = src[1] + src[0]
    ble         l1.336                      @ (beta >> 2) <= 2 * d0 : not strong

    ldrb        r2,[r0,#-4]                 @ row 0, -4 value
    vaddw.u8    q0,q0,d2                    @ q0 = src[-1] + src[0] + src[1]
    ldrb        r7,[r0,#-1]                 @ row 0, -1 value
    vmull.u8    q10,d7,d23                  @ q10 = 3 * src[2]
    ldrb        r3,[r0,#0]                  @ row 0, 0 value
    vmlal.u8    q10,d22,d18                 @ q10 += 2 * src[3]
    ldrb        r8,[r0,#3]                  @ row 0, 3 value
    @ ubfx   r7,r2,#24,#8           @ has the -1 value
    @ and    r2,#0xff               @ has the -4 value
    @ ubfx   r8,r3,#24,#8           @ has the 3 value
    @ and    r3,#0xff               @ r4 has the 0 value

    vadd.i16    q10,q10,q0
    subs        r8,r8,r3
    vrshrn.i16  d22,q10,#3                  @ d22 = (2*src[3] + 3*src[2] + src[1] + src[0] + src[-1] + 4) >> 3
    rsbmi       r8,r8,#0                    @ r8 = abs(pu1_src[3] - pu1_src[0])
    subs        r2,r2,r7
    vmin.u8     d21,d22,d31
    rsbmi       r2,r2,#0                    @ r2 = abs(pu1_src[-4] - pu1_src[-1])
    vmax.u8     d22,d21,d30                 @ d22 = new src[2], clipped to src[2] +/- 2*tc
    add         r8,r8,r2
    vaddl.u8    q10,d7,d3                   @ q10 = src[2] + src[-2]
    cmp         r8,r5,asr #3
    vmla.i16    q10,q0,q8                   @ q10 += 2 * (src[-1] + src[0] + src[1])
    bge         l1.336                      @ sum of the two differences >= (beta >> 3) : not strong
    vaddw.u8    q0,q0,d7                    @ q0 = src[-1] + src[0] + src[1] + src[2]
    subs        r7,r3,r7
    vrshrn.i16  d20,q10,#3                  @ d20 = unclipped new src[0]
    rsbmi       r7,r7,#0                    @ r7 = abs(pu1_src[0] - pu1_src[-1])
    vrshrn.i16  d0,q0,#2                    @ d0 = unclipped new src[1]
    mov         r10,#5
    vqadd.u8    d30,d5,d19                  @ d30 = src[1] + 2*tc
    mul         r10,r10,r6
    vqsub.u8    d31,d5,d19                  @ d31 = src[1] - 2*tc
    add         r10,#1
    cmp         r7,r10,asr #1
    bge         l1.336                      @ abs(...) >= (5*tc + 1) >> 1 : not strong


    @ the same three tests are now repeated for row 3, using d3:
    @ if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4])  < (beta >> 3) )
    @ && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )


    asr         r10,r5,#2
    vqsub.u8    d25,d4,d19                  @ d25 = src[0] - 2*tc
    cmp         r10,r4,lsl #1
    vqadd.u8    d21,d4,d19                  @ d21 = src[0] + 2*tc
    ble         l1.336                      @ (beta >> 2) <= 2 * d3 : not strong
    vmin.u8     d26,d20,d21
    add         r4,r1,r1,lsl #1
    add         r4,r4,r0                    @ r4 = pu1_src + 3 * src_strd (d3 is no longer needed)
    vmax.u8     d20,d26,d25                 @ d20 = new src[0], clipped
    ldrb        r2,[r4,#-4]                 @ row 3, -4 value
    vmin.u8     d19,d0,d30                  @ d19 is used as a temporary here and rebuilt below
    ldrb        r7,[r4,#-1]                 @ row 3, -1 value
    vmax.u8     d21,d19,d31                 @ d21 = new src[1], clipped
    ldrb        r3,[r4,#0]                  @ row 3, 0 value
    lsl         r10,r6,#1
    ldrb        r8,[r4,#3]                  @ row 3, 3 value
    @ ubfx   r7,r2,#24,#8           @ has the -1 value
    @ and    r2,#0xff               @ has the -4 value
    @ ubfx   r8,r3,#24,#8           @ has the 3 value
    @ and    r3,#0xff               @ r4 has the 0 value
    vaddl.u8    q0,d2,d3                    @ q0 = src[-1] + src[-2]
    vdup.8      d19,r10                     @ d19 = 2 * tc again
    subs        r8,r8,r3
    vaddw.u8    q0,q0,d4                    @ q0 = src[-2] + src[-1] + src[0]
    rsbmi       r8,r8,#0
    vqadd.u8    d30,d2,d19                  @ d30 = src[-1] + 2*tc
    subs        r2,r2,r7
    vqsub.u8    d31,d2,d19                  @ d31 = src[-1] - 2*tc
    rsbmi       r2,r2,#0
    vaddl.u8    q13,d5,d6                   @ q13 = src[1] + src[-3]
    add         r8,r8,r2
    vmla.i16    q13,q0,q8                   @ q13 += 2 * (src[-2] + src[-1] + src[0])
    cmp         r8,r5,asr #3
    bge         l1.336
    vrshrn.i16  d26,q13,#3                  @ d26 = unclipped new src[-1]
    subs        r7,r3,r7
    vqadd.u8    d27,d3,d19                  @ d27 = src[-2] + 2*tc
    rsbmi       r7,r7,#0
    vqsub.u8    d28,d3,d19                  @ d28 = src[-2] - 2*tc
    mov         r10,#5
    vmin.u8     d16,d26,d30
    mul         r10,r10,r6
    add         r10,#1
    cmp         r7,r10,asr #1
    vmax.u8     d26,d16,d31                 @ d26 = new src[-1], clipped
    bge         l1.336
    vqadd.u8    d30,d6,d19                  @ d30 = src[-3] + 2*tc

    mov         r2,#2                       @ de = 2 : strong filter selected
    ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
    vqsub.u8    d31,d6,d19                  @ d31 = src[-3] - 2*tc
    ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q
    b           end_dep_deq_decision
    @ r2 has the value of de
    @ r6 has the value of tc
    @ r5 has the value of beta
    @ r14 has the value of dp
    @ r12 has the value of dq
    @ r0 has the value of source address
    @ r1 has the src stride

l1.336:
    mov         r2,#1                       @ de = 1 : weak filter
l1.424:                                     @ no branch to this label is visible in this file
    mov         r11,r5                      @ r11 = beta (r5 is about to be reused for filter_flag_q)
    ldr         r4,[sp,#0x38]               @ loading the filter_flag_p
    ldr         r5,[sp,#0x3c]               @ loading the filter_flag_q

    cmp         r6,#1                       @ tc == 1 : dep = deq = 0
    moveq       r9,#0
    moveq       r10,#0
    beq         end_dep_deq_decision

    and         r7,r4,r5

    cmp         r7,#1
    beq         both_flags_set
    cmp         r4,#0
    beq         set_flag_dep_zero


    @ only the p side can be written : deq = 0, dep = (dp < (beta + (beta >> 1)) >> 3)
    add         r8,r11,r11,asr #1
    mov         r10,#0
    asr         r8,#3
    cmp         r8,r14
    movgt       r9,#1
    movle       r9,#0
    b           end_dep_deq_decision
set_flag_dep_zero:

    @ filter_flag_p == 0 : dep = 0, deq = (dq < (beta + (beta >> 1)) >> 3)
    add         r8,r11,r11,asr #1
    mov         r9,#0
    asr         r8,#3
    cmp         r8,r12
    movgt       r10,#1
    movle       r10,#0
    b           end_dep_deq_decision

both_flags_set:
    add         r8,r11,r11,asr #1
    asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    cmp         r8,r14
    movgt       r9,#1                       @ dep = (dp < r8)
    movle       r9,#0
    cmp         r8,r12
    movgt       r10,#1                      @ deq = (dq < r8)
    movle       r10,#0
end_dep_deq_decision:

    @r0=source address
    @r1=stride
    @ r2 =de
    @ r4=flag p
    @r5= flag q
    @r6 =tc
    @ r9 =dep
    @ r10=deq
    @ b     l1.964


    cmp         r2,#2
    @ r2 has the value of de
    bne         l1.968                      @ de != 2 : take the weak filter path

    @ ---- strong filter : write back the values computed above ----
    cmp         r5,#0
    beq         l1.780
    @ r5 has the flag of q

    add         r3,r0,#2
    vst1.8      {d22[0]},[r3],r1            @ new src[2], one byte per row

    vst1.8      {d22[1]},[r3],r1

    vst1.8      {d22[2]},[r3],r1

    vst1.8      {d22[3]},[r3]
    add         r3,r0,r1
    vtrn.8      d20,d21                     @ interleave new src[0] / src[1] into per-row byte pairs

    vst1.16     {d20[0]},[r0]               @ row 0 : src[0], src[1]
    vst1.16     {d21[0]},[r3],r1            @ row 1
    vst1.16     {d20[1]},[r3],r1            @ row 2
    vst1.16     {d21[1]},[r3]               @ row 3


l1.780:
    cmp         r4,#0
    beq         l1.964
    @ r4 has the flag p


    vdup.32     d7,d24[0]                   @ d7 = src[-4]
    sub         r3,r0,#1
    vaddw.u8    q8,q0,d6                    @ q8 = src[-3] + src[-2] + src[-1] + src[0]
    add         r7,r3,r1
    vrshrn.i16  d2,q8,#2                    @ d2 = unclipped new src[-2]
    vst1.8      {d26[0]},[r3]               @ new src[-1], one byte per row
    sub         r0,r0,#3
    vmin.u8     d16,d2,d27
    vst1.8      {d26[1]},[r7],r1
    vmull.u8    q1,d6,d23                   @ q1 = 3 * src[-3]
    vmlal.u8    q1,d7,d18                   @ q1 += 2 * src[-4]
    vst1.8      {d26[2]},[r7],r1
    vmax.u8     d5,d16,d28                  @ d5 = new src[-2], clipped
    vst1.8      {d26[3]},[r7]
    vadd.i16    q0,q1,q0                    @ q0 = 2*src[-4] + 3*src[-3] + src[-2] + src[-1] + src[0]
    vrshrn.i16  d0,q0,#3


    vmin.u8     d1,d0,d30
    vmax.u8     d0,d1,d31                   @ d0 = new src[-3], clipped

    vtrn.8      d0,d5                       @ interleave new src[-3] / src[-2] into per-row byte pairs
    vst1.16     {d0[0]},[r0],r1             @ r0 = &pu1_src[-3]; row 0
    vst1.16     {d5[0]},[r0],r1             @ row 1
    vst1.16     {d0[1]},[r0],r1             @ row 2
    vst1.16     {d5[1]},[r0]                @ row 3
l1.964:
    pop         {r3-r12,pc}
l1.968:

    @ ---- weak filter ----
    @ inputs still held from the load/transpose stage:
    @   d2 = src[-1], d3 = src[-2], d6 = src[-3], d4 = src[0], d5 = src[1], d7 = src[2]

    vmov.i16    q0,#0x9
    rsb         r11,r6,#0                   @ r11 = -tc
    cmp         r4,#0
    @ checks for the flag p (the flags set here are consumed by "beq l1.1272" below;
    @ only neon instructions and flag-preserving and/rsb sit in between)
    vmov.i16    q8,#0x3
    vmov.i8     d24,#0x1                    @ d24 = 1 in every byte (rounding term for the averages below)


    vdup.8      d30,r11                     @ d30 = -tc
    and         r11,r6,#0xff
    vdup.8      d31,r11                     @ d31 = tc

    vsubl.u8    q9,d4,d2                    @ q9 = src[0] - src[-1]
    vmul.i16    q9,q9,q0                    @ q9 *= 9
    vsubl.u8    q0,d5,d3                    @ q0 = src[1] - src[-2]



    vmul.i16    q8,q0,q8                    @ q8 = 3 * (src[1] - src[-2])
    vsub.i16    q8,q9,q8
    vrshr.s16   q8,q8,#4
    @ delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4@

    vabs.s16    q0,q8
    vmovn.i16   d0,q0
    @ storing the absolute values of delta in d0

    vqmovn.s16  d16,q8
    @ storing the clipped values of delta in d16

    vmov.i8     d1,#0xa
    vdup.8      d21,r11
    vmul.i8     d1,d1,d21
    @ d1 stores the value (10 * tc)

    @if(abs(delta) < 10 * tc)

    vmin.s8     d18,d16,d31
    vmax.s8     d20,d18,d30                 @ d20 = clipped delta, kept for both sides

    @ delta = clip3(delta, -tc, tc)@
    vmovl.s8    q8,d20
    vmovl.u8    q9,d2
    vadd.i16    q9,q9,q8

    vqmovun.s16 d22,q9                      @ d22 = tmp_p0
    vmovl.u8    q9,d4
    vsub.i16    q8,q9,q8
    vqmovun.s16 d23,q8                      @ d23 = tmp_q0
    @ tmp_p0 = clip_u8(pu1_src[-1] + delta)@
    @ tmp_q0 = clip_u8(pu1_src[0] - delta)@
    beq         l1.1272                     @ filter_flag_p == 0 : skip the p side



    cmp         r9,#1
    bne         l1.1212
    @ checks for the flag dep

    @ dep set : also modify src[-2]
    @   delta_p = clip3( ((((src[-3] + src[-1] + 1) >> 1) - src[-2] + delta) >> 1), -(tc >> 1), tc >> 1 )
    asr         r3,r6,#1


    vaddl.u8    q8,d6,d2
    vaddw.u8    q8,q8,d24
    vdup.8      d18,r3                      @ d18 = tc >> 1
    rsb         r3,r3,#0
    vdup.8      d19,r3                      @ d19 = -(tc >> 1)
    vshr.u16    q8,q8,#1
    vmovn.i16   d16,q8                      @ d16 = (src[-3] + src[-1] + 1) >> 1

    vsubl.u8    q8,d16,d3
    vaddw.s8    q8,q8,d20
    vshr.s16    q8,q8,#1
    vqmovn.s16  d16,q8

    vmin.s8     d17,d16,d18
    vmax.s8     d16,d19,d17                 @ d16 = delta_p




    vmovl.u8    q9,d3
    vmovl.s8    q8,d16
    vadd.i16    q8,q9,q8

    vqmovun.s16 d16,q8                      @ d16 = clip_u8(src[-2] + delta_p)
    vmov        d30,d3                      @ keep the original src[-2]
    vcge.u8     d3,d0,d1                    @ mask : abs(delta) >= 10 * tc


    vbsl        d3,d30,d16                  @ d3 = original where the mask is set, filtered value elsewhere
l1.1212:
    vdup.8      d16,r11                     @ overwritten by the vcge below before it is read
    sub         r12,r0,#3
    sub         r3,r0,#1
    @ vmul.i8  d16,d16,d1
    vtrn.8      d6,d3                       @ interleave src[-3] / src[-2] into per-row byte pairs
    vst1.16     {d6[0]},[r12],r1            @ row 0 : src[-3] (unchanged), src[-2]
    vcge.u8     d16,d0,d1                   @ mask : abs(delta) >= 10 * tc
    vst1.16     {d3[0]},[r12],r1            @ row 1
    vbsl        d16,d2,d22                  @ d16 = original src[-1] where the mask is set, tmp_p0 elsewhere
    vst1.8      {d16[0]},[r3],r1            @ src[-1], one byte per row
    vst1.8      {d16[1]},[r3],r1
    vst1.16     {d6[1]},[r12],r1            @ row 2
    vst1.8      {d16[2]},[r3],r1
    vst1.16     {d3[1]},[r12]               @ row 3
    vst1.8      {d16[3]},[r3]
l1.1272:
    @ ldr      r3,[sp,#0x38]
    cmp         r5,#0
    beq         l1.964
    @ checks for the flag q
    cmp         r10,#1
    bne         l1.1412
    @ checks for the flag deq

    @ deq set : also modify src[1]
    @   delta_q = clip3( ((((src[2] + src[0] + 1) >> 1) - src[1] - delta) >> 1), -(tc >> 1), tc >> 1 )
    vmov        d2,d7                       @ d2 = src[2] (src[-1] is no longer needed)
    asr         r3,r6,#1

    vdup.8      d6,r3                       @ d6 = tc >> 1
    rsb         r3,r3,#0
    vdup.8      d16,r3                      @ d16 = -(tc >> 1)
    vaddl.u8    q1,d2,d4
    vaddw.u8    q1,q1,d24
    vshr.u16    q1,q1,#1
    vmovn.i16   d2,q1                       @ d2 = (src[2] + src[0] + 1) >> 1

    vsubl.u8    q1,d2,d5
    vsubw.s8    q1,q1,d20
    vshr.s16    q1,q1,#1
    vqmovn.s16  d3,q1

    vmin.s8     d2,d3,d6
    vmax.s8     d3,d16,d2                   @ d3 = delta_q
    @ vdup.8   d6,r2
    @ vmul.i8  d6,d6,d1



    vmovl.u8    q8,d5
    vmovl.s8    q1,d3
    vadd.i16    q1,q8,q1
    vqmovun.s16 d3,q1                       @ d3 = clip_u8(src[1] + delta_q)
    vmov        d30,d5                      @ keep the original src[1]
    vcge.u8     d5,d0,d1                    @ mask : abs(delta) >= 10 * tc


    vbsl        d5,d30,d3                   @ d5 = original where the mask is set, filtered value elsewhere
l1.1412:
    @ vdup.8   d2,r2
    add         r3,r0,#2
    add         r11,r3,r1
    @ vmul.i8  d1,d2,d1
    vst1.8      {d7[0]},[r3]                @ src[2] is written back unchanged, one byte per row
    vst1.8      {d7[1]},[r11],r1
    vst1.8      {d7[2]},[r11],r1
    vcge.u8     d0,d0,d1                    @ mask : abs(delta) >= 10 * tc
    vst1.8      {d7[3]},[r11]
    vbsl        d0,d4,d23                   @ d0 = original src[0] where the mask is set, tmp_q0 elsewhere
    vtrn.8      d0,d5                       @ interleave src[0] / src[1] into per-row byte pairs
    vst1.16     {d0[0]},[r0],r1             @ row 0
    vst1.16     {d5[0]},[r0],r1             @ row 1
    vst1.16     {d0[1]},[r0],r1             @ row 2
    vst1.16     {d5[1]},[r0]                @ row 3
    pop         {r3-r12,pc}


