;//
;//
;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Overview (derived from the code in this file):
;//
;//   Entry point: omxVCM4P10_FilterDeblockingLuma_VerEdge_I
;//
;//   Register arguments (see the RN declarations below):
;//     r0 = pSrcDst     pixel pointer; filtered in place
;//     r1 = srcdstStep  byte distance between consecutive rows
;//     r2 = pAlpha      two bytes are read: [0] and [1]
;//     r3 = pBeta       two bytes are read: [0] and [1]
;//   Stack arguments (see M_ARG below):
;//     pThresholds, pBS
;//   Return value:
;//     r0 = OMX_Sts_NoErr (always)
;//
;//   Processing order:
;//     Four rows are handled at a time, one pixel row per byte lane of a
;//     32-bit register. For each group of four rows, four edge positions
;//     spaced 4 bytes apart are visited. Each edge uses the 4 pixels to
;//     its left (p3..p0) and the 4 pixels to its right (q0..q3).
;//     alpha/beta index [0] is used for the first edge of a row group and
;//     index [1] for the other three.
;//
;//   Notation used in the lane comments:
;//     [a b c d] lists the four bytes of a register, most significant
;//     byte first. "rNpM" / "rNqM" means row N, pixel pM / qM.
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

        ;// Filter kernels implemented elsewhere. Their register contract is
        ;// not visible in this file; the P*a/Q*a and P*b/Q*b aliases below
        ;// name the registers this file reads after each call.
        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe


    IF ARM1136JS

MASK_0      EQU 0x00000000      ;// all byte lanes 0x00
MASK_1      EQU 0x01010101      ;// 0x01 in every byte lane; also the multiplier
                                ;// that replicates a byte into all four lanes
MASK_2      EQU 0xff00ff00      ;// selects byte lanes 3 and 1 for the transposes
LOOP_COUNT  EQU 0x11110000      ;// doubled (ADDS XY, XY, XY) once per edge:
                                ;// carry is set on every 4th doubling and the
                                ;// value becomes zero after the 16th

;// Declare input registers

pSrcDst     RN 0
srcdstStep  RN 1
pAlphaArg   RN 2
pBetaArg    RN 3

pThresholds RN 14
pBS         RN 9
pQ0         RN 0
bS          RN 2

alpha       RN 6
alpha0      RN 6
alpha1      RN 8

beta        RN 7
beta0       RN 7
beta1       RN 9

;// Declare Local/Temporary variables
;// NOTE: many of the names below alias the same physical register. The
;// instruction order in the body depends on these overlaps.

;// Pixels
p_0         RN 3
p_1         RN 5
p_2         RN 4
p_3         RN 2
q_0         RN 8
q_1         RN 9
q_2         RN 10
q_3         RN 12

;// Unpacking
mask        RN 11

row0        RN 2
row1        RN 4
row2        RN 5
row3        RN 3

row4        RN 8
row5        RN 9
row6        RN 10
row7        RN 12
row8        RN 14
row9        RN 7

tunpk0      RN 8
tunpk1      RN 9
tunpk2      RN 10
tunpk3      RN 12
tunpk4      RN 0

tunpk5      RN 1
tunpk6      RN 14
tunpk7      RN 2
tunpk8      RN 5
tunpk9      RN 6


;// Filtering

dp0q0       RN 12
dp1p0       RN 12
dq1q0       RN 12
dp2p0       RN 12
dq2q0       RN 12

ap0q0       RN 1
filt        RN 2

m00         RN 14
m01         RN 11

apflg       RN 0
aqflg       RN 6
apqflg      RN 0


;//Declarations for bSLT4 kernel

tC0         RN 7
ptC0        RN 1

pQ0a        RN 0
Stepa       RN 1
maska       RN 14

P0a         RN 1
P1a         RN 8
Q0a         RN 7
Q1a         RN 11

;//Declarations for bSGE4 kernel

pQ0b        RN 0
Stepb       RN 1
maskb       RN 14

P0b         RN 6
P1b         RN 7
P2b         RN 1
P3b         RN 3

Q0b         RN 9
Q1b         RN 0
Q2b         RN 2
Q3b         RN 3

;// Miscellaneous
XY          RN 8
t0          RN 3
t1          RN 12
t2          RN 14
t7          RN 7
t4          RN 4
t5          RN 1
t8          RN 6
a           RN 0



        ;// Allocate stack memory
        ;// NOTE(review): pXYBS and ppQ0Step are allocated with size 4 but
        ;// are accessed below with M_STRD / M_LDRD (two registers). This
        ;// appears to rely on the next allocation (ppBS, resp. pStep)
        ;// being the adjacent word, so that
        ;//   M_LDRD XY, pBS, pXYBS            reads {pXYBS, ppBS}
        ;//   M_LDRD pQ0, srcdstStep, ppQ0Step reads {ppQ0Step, pStep}
        ;// Confirm against M_ALLOC4 / M_ALLOC8 in armCOMM_s.h before
        ;// reordering or resizing any of these.
        M_ALLOC4 ppThresholds,4
        M_ALLOC4 pQ_3,4
        M_ALLOC4 pP_3,4
        M_ALLOC8 pAlphaBeta0,8
        M_ALLOC8 pAlphaBeta1,8
        M_ALLOC8 pXYBS,4
        M_ALLOC4 ppBS,4
        M_ALLOC8 ppQ0Step,4
        M_ALLOC4 pStep,4

        ;// Function header
        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11

        ;//Input arguments on the stack
        M_ARG   ppThresholdsArg, 4
        M_ARG   ppBSArg, 4

        LDR     t4,=MASK_1

        ;// Read alpha[0..1] and beta[0..1] as bytes ...
        LDRB    alpha0, [pAlphaArg]
        LDRB    beta0,  [pBetaArg]
        LDRB    alpha1, [pAlphaArg,#1]
        LDRB    beta1,  [pBetaArg,#1]

        ;// ... and replicate each into all four byte lanes (x * 0x01010101)
        ;// so they can be compared against four rows at once with USUB8.
        MUL     alpha0, alpha0, t4
        MUL     beta0, beta0, t4
        MUL     alpha1, alpha1, t4
        MUL     beta1, beta1, t4

        ;// alpha0/beta0 stay live in r6/r7 (= alpha/beta) for the first edge.
        M_STRD  alpha0, beta0, pAlphaBeta0
        M_STRD  alpha1, beta1, pAlphaBeta1

        LDR     XY,=LOOP_COUNT
        M_LDR   pBS, ppBSArg
        M_LDR   pThresholds, ppThresholdsArg
        M_STR   srcdstStep, pStep
        M_STRD  XY, pBS, pXYBS
        M_STR   pThresholds, ppThresholds

        ;// Point at the word holding p3..p0, i.e. the 4 bytes before pSrcDst.
        SUB     pQ0, pQ0, #4

        ;// LoopY is entered (a) for every new group of four rows and (b) from
        ;// the no-filter path below, which does not leave the next edge's
        ;// p-side pixels in registers and therefore reloads them here.
LoopY
;//---------------Load Pixels-------------------

;//----------------Pack p0-p3-----------------------
        LDR     mask, =MASK_2

        ;// Load one word from each of four consecutive rows, then rewind
        ;// pQ0 by the two post-increments so it is unchanged on exit.
        M_LDR   row0, [pQ0], srcdstStep
        M_LDR   row1, [pQ0], srcdstStep
        LDR     row2, [pQ0]
        LDR     row3, [pQ0, srcdstStep]
        SUB     pQ0, pQ0, srcdstStep, LSL #1

        ;// row0 = [r0p0 r0p1 r0p2 r0p3]
        ;// row1 = [r1p0 r1p1 r1p2 r1p3]
        ;// row2 = [r2p0 r2p1 r2p2 r2p3]
        ;// row3 = [r3p0 r3p1 r3p2 r3p3]

        ;// 4x4 byte transpose, step 1: interleave bytes of row pairs.
        ;// AND keeps lanes 3 and 1 of the first row; UXTAB16 adds lanes
        ;// 2 and 0 of the (optionally rotated) second row into lanes 2 and 0.
        AND     tunpk0, mask, row0
        AND     tunpk6, mask, row0, LSL#8
        UXTAB16 tunpk0, tunpk0, row1, ROR#8
        UXTAB16 tunpk6, tunpk6, row1
        AND     tunpk2, mask, row2
        AND     tunpk3, mask, row2, LSL#8
        UXTAB16 tunpk2, tunpk2, row3, ROR#8
        UXTAB16 tunpk3, tunpk3, row3

        ;// tunpk0 = [r0p0 r1p0 r0p2 r1p2]
        ;// tunpk6 = [r0p1 r1p1 r0p3 r1p3]
        ;// tunpk2 = [r2p0 r3p0 r2p2 r3p2]
        ;// tunpk3 = [r2p1 r3p1 r2p3 r3p3]

        ;// Step 2: combine halfwords so each register holds one pixel
        ;// column across the four rows.
        PKHTB   p_0, tunpk0, tunpk2, ASR#16
        PKHTB   p_1, tunpk6, tunpk3, ASR#16
        PKHBT   p_2, tunpk2, tunpk0, LSL#16
        PKHBT   p_3, tunpk3, tunpk6, LSL#16


        ;// p_0 = [r0p0 r1p0 r2p0 r3p0]
        ;// p_1 = [r0p1 r1p1 r2p1 r3p1]
        ;// p_2 = [r0p2 r1p2 r2p2 r3p2]
        ;// p_3 = [r0p3 r1p3 r2p3 r3p3]

        ;// p_3 shares r2 with bS/row0/filt, so it is parked on the stack.
        M_STR   p_3, pP_3

;//----------------Pack q0-q3-----------------------
LoopX
        ;// One bS byte is read per edge; pBS advances by 4 each time.
        LDRB    bS, [pBS], #4
        ;// Save the edge's base pointer. pQ0 (r0) and srcdstStep (r1) are
        ;// reused as scratch below and restored with M_LDRD from ppQ0Step.
        M_STR   pQ0, ppQ0Step
        LDR     mask, =MASK_2
        CMP     bS, #0
        M_STR   pBS, ppBS

        ;// Pre-indexed with writeback: pQ0 is advanced to the q-side word
        ;// before the branch, so NoFilterBS0 sees pQ0 already moved by +4.
        LDR     row4, [pQ0, #4]!
        BEQ.W   NoFilterBS0             ;// bS == 0 (flags from CMP bS, #0)
        M_LDR   row5, [pQ0, srcdstStep]!
        M_LDR   row6, [pQ0, srcdstStep]!
        M_LDR   row7, [pQ0, srcdstStep]

        ;// row4 = [r0q3 r0q2 r0q1 r0q0]
        ;// row5 = [r1q3 r1q2 r1q1 r1q0]
        ;// row6 = [r2q3 r2q2 r2q1 r2q0]
        ;// row7 = [r3q3 r3q2 r3q1 r3q0]

        AND     tunpk4, mask, row4
        ;// The flags from this compare are consumed by "BLT bSLT4" at the
        ;// end of the filtering decision. No instruction in between carries
        ;// an S suffix; USUB8 updates only the GE flags. bS (r2) itself is
        ;// overwritten shortly after (tunpk7 / filt share r2).
        CMP     bS, #4
        AND     tunpk5, mask, row4, LSL#8
        UXTAB16 tunpk4, tunpk4, row5, ROR#8
        UXTAB16 tunpk5, tunpk5, row5
        AND     tunpk6, mask, row6
        AND     tunpk7, mask, row6, LSL#8
        UXTAB16 tunpk6, tunpk6, row7, ROR#8
        UXTAB16 tunpk7, tunpk7, row7

        ;// tunpk4 = [r0q3 r1q3 r0q1 r1q1]
        ;// tunpk5 = [r0q2 r1q2 r0q0 r1q0]
        ;// tunpk6 = [r2q3 r3q3 r2q1 r3q1]
        ;// tunpk7 = [r2q2 r3q2 r2q0 r3q0]

        PKHTB   q_3, tunpk4, tunpk6, ASR#16
        PKHTB   q_2, tunpk5, tunpk7, ASR#16
        PKHBT   q_1, tunpk6, tunpk4, LSL#16
        ;// q_3 (r12) is about to be reused as the dp*/dq* scratch register.
        M_STR   q_3, pQ_3
        PKHBT   q_0, tunpk7, tunpk5, LSL#16


        ;// q_0 = [r0q0 r1q0 r2q0 r3q0]
        ;// q_1 = [r0q1 r1q1 r2q1 r3q1]
        ;// q_2 = [r0q2 r1q2 r2q2 r3q2]
        ;// q_3 = [r0q3 r1q3 r2q3 r3q3]


;//--------------Filtering Decision -------------------
        ;// Per-lane idiom used for every check below:
        ;//   USUB8 d1, x, y ; USUB8 d2, y, x ; SEL r, d2, d1   -> r = |x - y|
        ;//   (SEL picks d2 in lanes where y >= x, d1 otherwise)
        ;//   USUB8 tmp, r, limit ; SEL f, m00, <prev>
        ;//   -> lane of f is 0 where |x - y| >= limit, else <prev>
        LDR     m01, =MASK_1            ;//  01010101 mask
        MOV     m00, #MASK_0            ;//  00000000 mask

        ;// Check |p0-q0|<Alpha
        USUB8   dp0q0, p_0, q_0
        USUB8   a, q_0, p_0
        SEL     ap0q0, a, dp0q0
        USUB8   a, ap0q0, alpha
        SEL     filt, m00, m01          ;// filt lane = 0x01 where |p0-q0| < alpha

        ;// Check |p1-p0|<Beta
        USUB8   dp1p0, p_1, p_0
        USUB8   a, p_0, p_1
        SEL     a, a, dp1p0
        USUB8   a, a, beta
        SEL     filt, m00, filt

        ;// Check |q1-q0|<Beta
        USUB8   dq1q0, q_1, q_0
        USUB8   a, q_0, q_1
        SEL     a, a, dq1q0
        USUB8   a, a, beta
        SEL     filt, m00, filt

        ;// Check ap<Beta
        USUB8   dp2p0, p_2, p_0
        USUB8   a, p_0, p_2
        SEL     a, a, dp2p0
        USUB8   a, a, beta
        SEL     apflg, m00, filt        ;// apflg = filt && (ap<beta)

        ;// Check aq<Beta
        ;// t2 shares r14 with m00, so the zero mask is lost here. The GE
        ;// flags left by the last USUB8 are consumed by the SEL at the top
        ;// of bSGE4 / bSLT4, which uses t7 as the zero operand instead.
        USUB8   dq2q0, q_2, q_0
        USUB8   t2, q_0, q_2
        SEL     t2, t2, dq2q0
        USUB8   t2, t2, beta
        MOV     t7,#0                   ;// t7 shares r7 with beta; beta is
                                        ;// reloaded from the stack after the edge


        BLT     bSLT4                   ;// bS < 4 (flags from CMP bS, #4 above)
;//-------------------Filter--------------------
bSGE4
        ;//---------bSGE4 Execution---------------
        SEL     t1, t7, filt            ;// aqflg = filt && (aq<beta)
        CMP     filt, #0
        ORR     apqflg, apflg, t1, LSL #1
        ;// No lane passed the checks: restore pQ0/srcdstStep and skip.
        M_LDRD  pQ0, srcdstStep, ppQ0Step, EQ
        BEQ     NoFilterFilt0

        BL      armVCM4P10_DeblockingLumabSGE4_unsafe

        ;//---------Store result---------------

        LDR     maskb,=MASK_2

        ;// P0b = [r0p0 r1p0 r2p0 r3p0]
        ;// P1b = [r0p1 r1p1 r2p1 r3p1]
        ;// P2b = [r0p2 r1p2 r2p2 r3p2]
        ;// P3b = [r0p3 r1p3 r2p3 r3p3]

        ;// Fetch the p3 column saved earlier, then reuse the slot: Q0b of
        ;// this edge is stored as the p3 column of the next edge.
        M_LDR   P3b, pP_3
        M_STR   Q0b, pP_3

        ;//------Pack p0-p3------
        AND     tunpk0, maskb, P0b
        AND     tunpk2, maskb, P0b, LSL#8
        UXTAB16 tunpk0, tunpk0, P1b, ROR#8
        UXTAB16 tunpk2, tunpk2, P1b

        AND     tunpk3, maskb, P2b
        AND     tunpk8, maskb, P2b, LSL#8
        UXTAB16 tunpk3, tunpk3, P3b, ROR#8
        UXTAB16 tunpk8, tunpk8, P3b

        ;// tunpk0 = [r0p0 r0p1 r2p0 r2p1]
        ;// tunpk2 = [r1p0 r1p1 r3p0 r3p1]
        ;// tunpk3 = [r0p2 r0p3 r2p2 r2p3]
        ;// tunpk8 = [r1p2 r1p3 r3p2 r3p3]

        ;// Q1b is in r0 and P2b in r1; both are overwritten by the M_LDRD
        ;// below, so Q1b is copied out first (into p_2, where it also
        ;// serves as the next edge's p2 column). P2b was consumed above.
        MOV     p_2, Q1b
        M_LDRD  pQ0b, Stepb, ppQ0Step

        PKHTB   row9, tunpk0, tunpk3, ASR#16
        PKHBT   row7, tunpk3, tunpk0, LSL#16
        PKHTB   row3, tunpk2, tunpk8, ASR#16
        PKHBT   row6, tunpk8, tunpk2, LSL#16

        ;// row9 = [r0p0 r0p1 r0p2 r0p3]
        ;// row3 = [r1p0 r1p1 r1p2 r1p3]
        ;// row7 = [r2p0 r2p1 r2p2 r2p3]
        ;// row6 = [r3p0 r3p1 r3p2 r3p3]

        M_STR   row9, [pQ0b], Stepb     ;// row 0; pQ0b now at row 1
        STR     row7, [pQ0b, Stepb]     ;// row 2
        STR     row6, [pQ0b, Stepb, LSL #1] ;// row 3
        STR     row3, [pQ0b], #4        ;// row 1; pQ0b now at the row-1 q word

        ;// q3 column as saved before filtering. Q3b shares r3 with p_0, so
        ;// this load also provides the next edge's p0 column.
        M_LDR   Q3b, pQ_3

        ;// Q0b = [r0q0 r1q0 r2q0 r3q0]
        ;// Q1b = [r0q1 r1q1 r2q1 r3q1]
        ;// Q2b = [r0q2 r1q2 r2q2 r3q2]
        ;// Q3b = [r0q3 r1q3 r2q3 r3q3]

        ;//------Pack q0-q3------
        ;// p_2 holds the copy of Q1b made above.
        AND     tunpk0, maskb, p_2
        AND     tunpk2, maskb, p_2, LSL#8
        UXTAB16 tunpk0, tunpk0, Q0b, ROR#8
        UXTAB16 tunpk2, tunpk2, Q0b

        AND     tunpk3, maskb, Q3b
        AND     tunpk8, maskb, Q3b, LSL#8
        UXTAB16 tunpk3, tunpk3, Q2b, ROR#8
        UXTAB16 tunpk8, tunpk8, Q2b

        ;// tunpk0 = [r0q1 r0q0 r2q1 r2q0]
        ;// tunpk2 = [r1q1 r1q0 r3q1 r3q0]
        ;// tunpk3 = [r0q3 r0q2 r2q3 r2q2]
        ;// tunpk8 = [r1q3 r1q2 r3q3 r3q2]

        PKHTB   row8, tunpk3, tunpk0, ASR#16
        PKHBT   row7, tunpk0, tunpk3, LSL#16
        PKHTB   row4, tunpk8, tunpk2, ASR#16
        PKHBT   row6, tunpk2, tunpk8, LSL#16

        ;// row8 = [r0q3 r0q2 r0q1 r0q0]
        ;// row4 = [r1q3 r1q2 r1q1 r1q0]
        ;// row7 = [r2q3 r2q2 r2q1 r2q0]
        ;// row6 = [r3q3 r3q2 r3q1 r3q0]

        STR     row4, [pQ0b]            ;// row 1
        STR     row7, [pQ0b, Stepb]     ;// row 2
        STR     row6, [pQ0b, Stepb, LSL #1] ;// row 3

        ;// Step back to row 0. pQ0 ends up 4 bytes past this edge's base,
        ;// i.e. at the base for the next edge.
        SUB     pQ0, pQ0b, Stepb
        MOV     p_1, Q2b                ;// next edge's p1 column

        STR     row8, [pQ0]             ;// row 0

        M_LDRD  XY, pBS, pXYBS
        M_LDR   pThresholds, ppThresholds
        M_LDRD  alpha, beta, pAlphaBeta1

        ;// Advance the edge counter: carry set => 4th edge of the row group.
        ADDS    XY, XY, XY
        ADD     pThresholds, #4
        M_STR   pThresholds, ppThresholds
        M_STR   XY, pXYBS
        BCC     LoopX
        B       ExitLoopY

;//---------- Exit of LoopX --------------
;//---- for the case of no filtering -----

NoFilterFilt0
        ;// pQ0 was restored to the edge base; move to the next edge's base.
        ADD     pQ0, pQ0, #4
NoFilterBS0
        ;// Load counter for LoopX
        M_LDRD  XY, pBS, pXYBS
        M_LDR   pThresholds, ppThresholds
        M_LDRD  alpha, beta, pAlphaBeta1

        ;// Align the pointer
        ADDS    XY, XY, XY
        ADD     pThresholds, pThresholds, #4
        M_STR   pThresholds, ppThresholds
        M_STR   XY, pXYBS
        ;// Branches to LoopY rather than LoopX: the p-side registers for
        ;// the next edge were not produced on this path, so they are
        ;// reloaded from memory at the advanced pQ0.
        BCC     LoopY
        B       ExitLoopY

bSLT4
        ;//---------bSLT4 Execution---------------
        SEL     aqflg, t7, filt         ;// aqflg = filt && (aq<beta)
        M_LDR   ptC0, ppThresholds
        CMP     filt, #0
        ;// No lane passed the checks: restore pQ0/srcdstStep and skip.
        M_LDRD  pQ0, srcdstStep, ppQ0Step, EQ
        BEQ     NoFilterFilt0

        ;// One threshold byte is read per edge; the pointer advances by 4.
        LDRB    tC0, [ptC0], #4
        M_STR   ptC0, ppThresholds

        BL      armVCM4P10_DeblockingLumabSLT4_unsafe

        ;//---------Store result---------------
        ;//--------Pack p1,p0,q1,q0------------

        ;//Load destination pointer
        LDR     maska,=MASK_2
        M_STR   Q0a, pP_3               ;// next edge's p3 column
        ;// Must precede the first write to tunpk2, which shares r10 with q_2.
        MOV     p_1, q_2                ;// next edge's p1 column

        ;// P1a = [r0p1 r1p1 r2p1 r3p1]
        ;// P0a = [r0p0 r1p0 r2p0 r3p0]
        ;// Q0a = [r0q0 r1q0 r2q0 r3q0]
        ;// Q1a = [r0q1 r1q1 r2q1 r3q1]

        AND     tunpk1, maska, P0a
        AND     tunpk2, maska, P0a, LSL#8
        UXTAB16 tunpk1, tunpk1, P1a, ROR#8
        UXTAB16 tunpk2, tunpk2, P1a

        ;// P0a shares r1 with Stepa; it has been fully consumed above.
        M_LDRD  pQ0a, Stepa, ppQ0Step

        AND     tunpk9, maska, Q1a
        AND     tunpk3, maska, Q1a, LSL#8
        UXTAB16 tunpk9, tunpk9, Q0a, ROR#8
        UXTAB16 tunpk3, tunpk3, Q0a

        ;// tunpk1 = [r0p0 r0p1 r2p0 r2p1]
        ;// tunpk2 = [r1p0 r1p1 r3p0 r3p1]
        ;// tunpk9 = [r0q1 r0q0 r2q1 r2q0]
        ;// tunpk3 = [r1q1 r1q0 r3q1 r3q0]

        ;// Only p1,p0,q0,q1 are written back on this path, as halfword
        ;// stores at byte offsets +2 (p1,p0) and +4 (q0,q1) from the base.
        MOV     t4, tunpk1, LSR #16
        MOV     t0, tunpk9, LSR #16

        STRH    t4,[pQ0a, #2]!          ;//Stores [r0p0 r0p1]
        STRH    t0,[pQ0a, #2]           ;//Stores [r0q0 r0q1]

        MOV     t4, tunpk2, LSR #16
        MOV     t0, tunpk3, LSR #16

        M_STRH  t4,[pQ0a, Stepa]!       ;//Stores [r1p0 r1p1]
        STRH    t0,[pQ0a, #2]           ;//Stores [r1q0 r1q1]

        M_STRH  tunpk1,[pQ0a, Stepa]!   ;//Stores [r2p0 r2p1]
        STRH    tunpk2,[pQ0a, Stepa]    ;//Stores [r3p0 r3p1]
        STRH    tunpk9,[pQ0a, #2]!      ;//Stores [r2q0 r2q1]
        STRH    tunpk3,[pQ0a, Stepa]    ;//Stores [r3q0 r3q1]

        ;// pQ0a is at base + 4 + 2*Step; step back two rows so that pQ0 is
        ;// the base for the next edge.
        SUB     pQ0, pQ0a, Stepa, LSL #1

        ;// Load counter
        M_LDRD  XY, pBS, pXYBS

        ;// Reload Pixels
        M_LDR   p_0, pQ_3               ;// saved q3 column -> next edge's p0
        MOV     p_2, Q1a                ;// next edge's p2 column

        M_LDRD  alpha, beta, pAlphaBeta1

        ADDS    XY, XY, XY
        M_STR   XY, pXYBS
        BCC     LoopX

;//-------- Common Exit of LoopY -----------------
        ;// Align the pointers
        M_LDR   pThresholds, ppThresholds
ExitLoopY
        ;// Reached after the 4th edge of a row group. pQ0 has advanced by
        ;// 16 bytes and pBS / pThresholds by 16; rewind so that pQ0 moves
        ;// down four rows and pBS / pThresholds net +1.
        ;// None of the instructions below has an S suffix, so the Z flag
        ;// from the last "ADDS XY, XY, XY" is still what BNE tests
        ;// (assuming M_STR / M_LDRD expand to plain stores/loads).
        SUB     pQ0, pQ0, #16
        ADD     pQ0, pQ0, srcdstStep, LSL #2
        SUB     pBS, pBS, #15
        SUB     pThresholds, pThresholds, #15
        M_STR   pThresholds, ppThresholds

        ;// First edge of the next row group uses alpha[0] / beta[0].
        M_LDRD  alpha, beta, pAlphaBeta0

        BNE     LoopY                   ;// XY != 0: more row groups remain
        MOV     r0, #OMX_Sts_NoErr

        M_END
;//-----------------End Filter--------------------

    ENDIF

        END