1 ;// 2 ;// 3 ;// File Name: omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s 4 ;// OpenMAX DL: v1.0.2 5 ;// Revision: 12290 6 ;// Date: Wednesday, April 9, 2008 7 ;// 8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 9 ;// 10 ;// 11 ;// 12 13 INCLUDE omxtypes_s.h 14 INCLUDE armCOMM_s.h 15 16 M_VARIANTS CortexA8 17 18 IMPORT armVCM4P10_DeblockingLumabSLT4_unsafe 19 IMPORT armVCM4P10_DeblockingLumabSGE4_unsafe 20 21 IF CortexA8 22 23 LOOP_COUNT EQU 0x11000000 24 25 26 ;// Function arguments 27 28 pSrcDst RN 0 29 srcdstStep RN 1 30 pAlpha RN 2 31 pBeta RN 3 32 33 pThresholds RN 5 34 pBS RN 4 35 bS10 RN 12 36 37 pAlpha_0 RN 2 38 pBeta_0 RN 3 39 40 pAlpha_1 RN 7 41 pBeta_1 RN 8 42 43 pTmp RN 10 44 pTmpStep RN 11 45 46 ;// Loop 47 48 XY RN 9 49 50 ;// Rows input 51 dRow0 DN D7.U8 52 dRow1 DN D8.U8 53 dRow2 DN D5.U8 54 dRow3 DN D10.U8 55 dRow4 DN D6.U8 56 dRow5 DN D9.U8 57 dRow6 DN D4.U8 58 dRow7 DN D11.U8 59 60 ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2 61 ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3 62 63 ;// Rows output 64 dRown0 DN D7.U8 65 dRown1 DN D24.U8 66 dRown2 DN D30.U8 67 dRown3 DN D10.U8 68 dRown4 DN D6.U8 69 dRown5 DN D25.U8 70 dRown6 DN D29.U8 71 dRown7 DN D11.U8 72 73 ;// dP_0n DN D29.U8 74 ;// dP_1n DN D30.U8 75 ;// dP_2n DN D31.U8 76 ;// 77 ;// dQ_0n DN D24.U8 ;!!;Temp2 78 ;// dQ_1n DN D25.U8 ;!!;Temp2 79 ;// dQ_2n DN D28.U8 ;!!;dQ_0t 80 ;// 81 ;// dRown0 - dP_3, dRown1 - dQ_0n 82 ;// dRown2 - dP_1n, dRown3 - dQ_2 83 ;// dRown4 - dP_2, dRown5 - dQ_1n 84 ;// dRown6 - dP_0n, dRown7 - dQ_3 85 86 dRow0n DN D7.U8 87 dRow1n DN D24.U8 88 dRow2n DN D30.U8 89 dRow3n DN D28.U8 90 dRow4n DN D31.U8 91 dRow5n DN D25.U8 92 dRow6n DN D29.U8 93 dRow7n DN D11.U8 94 95 ;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n 96 ;// dRow4n - dP_2, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3 97 98 ;// Pixels 99 dP_0 DN D4.U8 100 dP_1 DN D5.U8 101 dP_2 DN D6.U8 102 dP_3 DN D7.U8 103 dQ_0 DN D8.U8 104 dQ_1 DN D9.U8 105 dQ_2 DN D10.U8 106 dQ_3 DN D11.U8 107 108 109 ;// Filtering Decision 110 dAlpha DN D0.U8 111 dBeta DN D2.U8 112 113 dFilt DN D16.U8 114 dAqflg DN D12.U8 115 dApflg DN D17.U8 116 117 dAp0q0 DN D13.U8 118 dAp1p0 DN D12.U8 119 dAq1q0 DN D18.U8 120 dAp2p0 DN D19.U8 121 dAq2q0 DN D17.U8 122 123 ;// bSLT4 124 dTC0 DN D18.U8 125 dTC1 DN D19.U8 126 dTC01 DN D18.U8 127 128 dTCs DN D31.S8 129 dTC DN D31.U8 130 131 dMask_0 DN D14.U8 132 dMask_1 DN D15.U8 133 134 Mask_0 RN 6 135 136 dTemp DN D19.U8 137 138 ;// Computing P0,Q0 139 qDq0p0 QN Q10.S16 140 qDp1q1 QN Q11.S16 141 qDelta QN Q10.S16 ; reuse qDq0p0 142 dDelta DN D20.S8 143 144 145 ;// Computing P1,Q1 146 dRp0q0 DN D24.U8 147 148 dMaxP DN D23.U8 149 dMinP DN D22.U8 150 151 dMaxQ DN D19.U8 152 dMinQ DN D21.U8 153 154 dDeltaP DN D26.U8 155 dDeltaQ DN D27.U8 156 157 qP_0n QN Q14.S16 158 qQ_0n QN Q12.S16 159 160 dQ_0n DN D24.U8 161 dQ_1n DN D25.U8 162 dP_0n DN D29.U8 163 dP_1n DN D30.U8 164 165 ;// bSGE4 166 167 qSp0q0 QN Q10.U16 168 169 qSp2q1 QN Q11.U16 170 qSp0q0p1 QN Q12.U16 171 qSp3p2 QN Q13.U16 172 dHSp0q1 DN D28.U8 173 174 qSq2p1 QN Q11.U16 175 qSp0q0q1 QN Q12.U16 176 qSq3q2 QN Q13.U16 ;!! 177 dHSq0p1 DN D28.U8 ;!! 178 179 qTemp1 QN Q11.U16 ;!!;qSp2q1 180 qTemp2 QN Q12.U16 ;!!;qSp0q0p1 181 182 dP_0t DN D28.U8 ;!!;dHSp0q1 183 dQ_0t DN D22.U8 ;!!;Temp1 184 185 dP_0n DN D29.U8 186 dP_1n DN D30.U8 187 dP_2n DN D31.U8 188 189 dQ_0n DN D24.U8 ;!!;Temp2 190 dQ_1n DN D25.U8 ;!!;Temp2 191 dQ_2n DN D28.U8 ;!!;dQ_0t 192 193 194 ;// Function header 195 M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15 196 197 ;//Arguments on the stack 198 M_ARG ppThresholds, 4 199 M_ARG ppBS, 4 200 201 ;// d0-dAlpha_0 202 ;// d2-dBeta_0 203 204 ADD pAlpha_1, pAlpha_0, #1 205 ADD pBeta_1, pBeta_0, #1 206 207 VLD1 {dAlpha[]}, [pAlpha_0] 208 SUB pSrcDst, pSrcDst, #4 209 VLD1 {dBeta[]}, [pBeta_0] 210 211 M_LDR pBS, ppBS 212 M_LDR pThresholds, ppThresholds 213 214 MOV Mask_0,#0 215 216 ;dMask_0-14 217 ;dMask_1-15 218 219 VMOV dMask_0, #0 220 VMOV dMask_1, #1 221 222 LDR XY,=LOOP_COUNT 223 224 ADD pTmpStep, srcdstStep, srcdstStep 225 226 ;// p0-p3 - d4-d7 227 ;// q0-q3 - d8-d11 228 LoopY 229 LoopX 230 LDRH bS10, [pBS], #4 231 232 CMP bS10, #0 233 BEQ NoFilterBS0 234 235 ;// Load 8 rows of data 236 ADD pTmp, pSrcDst, srcdstStep 237 VLD1 dRow0, [pSrcDst], pTmpStep 238 VLD1 dRow1, [pTmp], pTmpStep 239 VLD1 dRow2, [pSrcDst], pTmpStep 240 VZIP.8 dRow0, dRow1 241 VLD1 dRow3, [pTmp], pTmpStep 242 VLD1 dRow4, [pSrcDst], pTmpStep 243 VZIP.8 dRow2, dRow3 244 VLD1 dRow5, [pTmp], pTmpStep 245 VLD1 dRow6, [pSrcDst], pTmpStep 246 VLD1 dRow7, [pTmp], pTmpStep 247 VZIP.8 dRow4, dRow5 248 VZIP.16 dRow1, dRow3 249 250 251 ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0] 252 ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1] 253 ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2] 254 ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3] 255 ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4] 256 ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5] 257 ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6] 258 ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7] 259 260 ;// 8x8 Transpose 261 262 VZIP.8 dRow6, dRow7 263 264 SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 265 VZIP.16 dRow0, dRow2 266 VZIP.16 dRow5, dRow7 267 268 269 VZIP.16 dRow4, dRow6 270 VZIP.32 dRow1, dRow5 271 VZIP.32 dRow2, dRow6 272 VZIP.32 dRow3, dRow7 273 VZIP.32 dRow0, dRow4 274 275 276 ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2 277 ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3 278 279 ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0] 280 ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0] 281 ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0] 282 ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0] 283 284 ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0] 285 ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0] 286 ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0] 287 ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0] 288 289 VABD dAp0q0, dP_0, dQ_0 290 VABD dAp1p0, dP_1, dP_0 291 292 VABD dAq1q0, dQ_1, dQ_0 293 VABD dAp2p0, dP_2, dP_0 294 295 TST bS10, #0xff 296 VCGT dFilt, dAlpha, dAp0q0 297 298 VMAX dAp1p0, dAq1q0, dAp1p0 299 VABD dAq2q0, dQ_2, dQ_0 300 301 VMOVEQ.U32 dFilt[0], Mask_0 302 TST bS10, #0xff00 303 304 VCGT dAp2p0, dBeta, dAp2p0 305 VCGT dAp1p0, dBeta, dAp1p0 306 307 VMOVEQ.U32 dFilt[1], Mask_0 308 309 VCGT dAq2q0, dBeta, dAq2q0 310 VAND dFilt, dFilt, dAp1p0 311 TST bS10, #4 312 313 VAND dAqflg, dFilt, dAq2q0 314 VAND dApflg, dFilt, dAp2p0 315 316 BNE bSGE4 317 bSLT4 318 ;// bS < 4 Filtering 319 320 BL armVCM4P10_DeblockingLumabSLT4_unsafe 321 322 ;// Transpose 323 324 VZIP.8 dP_3, dP_2 325 VZIP.8 dP_1n, dP_0n 326 VZIP.8 dQ_0n, dQ_1n 327 VZIP.8 dQ_2, dQ_3 328 329 330 VZIP.16 dP_3, dP_1n 331 ADD pTmp, pSrcDst, srcdstStep 332 VZIP.16 dQ_0n, dQ_2 333 VZIP.16 dQ_1n, dQ_3 334 VZIP.16 dP_2, dP_0n 335 336 VZIP.32 dP_3, dQ_0n 337 VZIP.32 dP_1n, dQ_2 338 VZIP.32 dP_2, dQ_1n 339 VZIP.32 dP_0n, dQ_3 340 341 ;// dRown0 - dP_3, dRown1 - dQ_0n 342 ;// dRown2 - dP_1n, dRown3 - dQ_2 343 ;// dRown4 - dP_2, dRown5 - dQ_1n 344 ;// dRown6 - dP_0n, dRown7 - dQ_3 345 346 VST1 dRown0, [pSrcDst], pTmpStep 347 VST1 dRown1, [pTmp], pTmpStep 348 VST1 dRown2, [pSrcDst], pTmpStep 349 VST1 dRown3, [pTmp], pTmpStep 350 ;1 351 VST1 dRown4, [pSrcDst], pTmpStep 352 VST1 dRown5, [pTmp], pTmpStep 353 ADDS XY, XY, XY 354 VST1 dRown6, [pSrcDst], pTmpStep 355 ADD pThresholds, pThresholds, #2 356 VST1 dRown7, [pTmp], srcdstStep 357 358 SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 359 VLD1 {dAlpha[]}, [pAlpha_1] 360 ADD pSrcDst, pSrcDst, #4 361 VLD1 {dBeta[]}, [pBeta_1] 362 363 BCC LoopX 364 B ExitLoopY 365 366 NoFilterBS0 367 ADD pSrcDst, pSrcDst, #4 368 ADDS XY, XY, XY 369 VLD1 {dAlpha[]}, [pAlpha_1] 370 ADD pThresholds, pThresholds, #4 371 VLD1 {dBeta[]}, [pBeta_1] 372 BCC LoopX 373 B ExitLoopY 374 bSGE4 375 ;// bS >= 4 Filtering 376 377 BL armVCM4P10_DeblockingLumabSGE4_unsafe 378 379 ;// Transpose 380 381 VZIP.8 dP_3, dP_2n 382 VZIP.8 dP_1n, dP_0n 383 VZIP.8 dQ_0n, dQ_1n 384 VZIP.8 dQ_2n, dQ_3 385 386 VZIP.16 dP_3, dP_1n 387 ADD pTmp, pSrcDst, srcdstStep 388 VZIP.16 dQ_0n, dQ_2n 389 VZIP.16 dQ_1n, dQ_3 390 VZIP.16 dP_2n, dP_0n 391 392 VZIP.32 dP_3, dQ_0n 393 VZIP.32 dP_1n, dQ_2n 394 VZIP.32 dP_2n, dQ_1n 395 VZIP.32 dP_0n, dQ_3 396 397 ;// dRow0n - dP_3, dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n 398 ;// dRow4n - dP_2, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3 399 400 VST1 dRow0n, [pSrcDst], pTmpStep 401 VST1 dRow1n, [pTmp], pTmpStep 402 VST1 dRow2n, [pSrcDst], pTmpStep 403 VST1 dRow3n, [pTmp], pTmpStep 404 VST1 dRow4n, [pSrcDst], pTmpStep 405 VST1 dRow5n, [pTmp], pTmpStep 406 ADDS XY,XY,XY 407 VST1 dRow6n, [pSrcDst], pTmpStep 408 ADD pThresholds, pThresholds, #4 409 VST1 dRow7n, [pTmp], pTmpStep 410 411 SUB pSrcDst, pSrcDst, srcdstStep, LSL #3 412 VLD1 {dAlpha[]}, [pAlpha_1] 413 ADD pSrcDst, pSrcDst, #4 414 VLD1 {dBeta[]}, [pBeta_1] 415 416 BCC LoopX 417 418 ExitLoopY 419 SUB pBS, pBS, #14 420 SUB pThresholds, pThresholds, #14 421 SUB pSrcDst, pSrcDst, #16 422 VLD1 {dAlpha[]}, [pAlpha_0] 423 ADD pSrcDst, pSrcDst, srcdstStep, LSL #3 424 VLD1 {dBeta[]}, [pBeta_0] 425 BNE LoopY 426 427 MOV r0, #OMX_Sts_NoErr 428 429 M_END 430 431 ENDIF 432 433 434 END 435 436 437