;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        ;// Helpers called with BL from the two filtering paths below.
        ;// Their register contract is not visible in this file; the D/Q
        ;// aliases declared here are presumably shared with them - verify
        ;// against the helper source before changing any register number.
        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe

        IF CortexA8

;// LOOP_COUNT seeds XY, which is doubled with ADDS once per 8x8 block:
;//   0x11000000 -> 0x22000000 -> 0x44000000 -> 0x88000000
;//              -> 0x10000000 (carry set, non-zero)
;//              -> 0x20000000 -> 0x40000000 -> 0x80000000
;//              -> 0x00000000 (carry set, zero)
;// Carry clear keeps the code in LoopX (BCC); carry set with a non-zero
;// result starts the second LoopY pass (BNE); carry set with zero ends.
;// Net effect: 4 LoopX iterations per LoopY pass, 2 LoopY passes.
LOOP_COUNT  EQU 0x11000000


;// Function arguments

pSrcDst     RN 0
srcdstStep  RN 1
pAlpha      RN 2
pBeta       RN 3

pThresholds RN 5
pBS         RN 4
bS10        RN 12               ;// halfword loaded from pBS (two bS bytes)

pAlpha_0    RN 2                ;// same register as pAlpha
pBeta_0     RN 3                ;// same register as pBeta

pAlpha_1    RN 7                ;// pAlpha_0 + 1
pBeta_1     RN 8                ;// pBeta_0 + 1

pTmp        RN 10               ;// second row pointer (pSrcDst + srcdstStep)
pTmpStep    RN 11               ;// 2 * srcdstStep

;// Loop

XY          RN 9                ;// combined X/Y loop counter, see LOOP_COUNT

;// Rows input
dRow0       DN D7.U8
dRow1       DN D8.U8
dRow2       DN D5.U8
dRow3       DN D10.U8
dRow4       DN D6.U8
dRow5       DN D9.U8
dRow6       DN D4.U8
dRow7       DN D11.U8

;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3

;// Rows output
;// (used for the stores on the bS < 4 path)
dRown0      DN D7.U8
dRown1      DN D24.U8
dRown2      DN D30.U8
dRown3      DN D10.U8
dRown4      DN D6.U8
dRown5      DN D25.U8
dRown6      DN D29.U8
dRown7      DN D11.U8

;// dP_0n      DN D29.U8
;// dP_1n      DN D30.U8
;// dP_2n      DN D31.U8
;//
;// dQ_0n      DN D24.U8   ;!!;Temp2
;// dQ_1n      DN D25.U8   ;!!;Temp2
;// dQ_2n      DN D28.U8   ;!!;dQ_0t
;//
;// dRown0 - dP_3,  dRown1 - dQ_0n
;// dRown2 - dP_1n, dRown3 - dQ_2
;// dRown4 - dP_2,  dRown5 - dQ_1n
;// dRown6 - dP_0n, dRown7 - dQ_3

;// (used for the stores on the bS >= 4 path)
dRow0n      DN D7.U8
dRow1n      DN D24.U8
dRow2n      DN D30.U8
dRow3n      DN D28.U8
dRow4n      DN D31.U8
dRow5n      DN D25.U8
dRow6n      DN D29.U8
dRow7n      DN D11.U8

;// dRow0n - dP_3,  dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
;// (dRow4n and dP_2n are both D31)

;// Pixels
dP_0        DN D4.U8
dP_1        DN D5.U8
dP_2        DN D6.U8
dP_3        DN D7.U8
dQ_0        DN D8.U8
dQ_1        DN D9.U8
dQ_2        DN D10.U8
dQ_3        DN D11.U8


;// Filtering Decision
dAlpha      DN D0.U8
dBeta       DN D2.U8

dFilt       DN D16.U8
dAqflg      DN D12.U8
dApflg      DN D17.U8

dAp0q0      DN D13.U8
dAp1p0      DN D12.U8
dAq1q0      DN D18.U8
dAp2p0      DN D19.U8
dAq2q0      DN D17.U8

;// bSLT4
dTC0        DN D18.U8
dTC1        DN D19.U8
dTC01       DN D18.U8

dTCs        DN D31.S8
dTC         DN D31.U8

dMask_0     DN D14.U8
dMask_1     DN D15.U8

Mask_0      RN 6

dTemp       DN D19.U8

;// Computing P0,Q0
qDq0p0      QN Q10.S16
qDp1q1      QN Q11.S16
qDelta      QN Q10.S16          ; reuse qDq0p0
dDelta      DN D20.S8


;// Computing P1,Q1
dRp0q0      DN D24.U8

dMaxP       DN D23.U8
dMinP       DN D22.U8

dMaxQ       DN D19.U8
dMinQ       DN D21.U8

dDeltaP     DN D26.U8
dDeltaQ     DN D27.U8

qP_0n       QN Q14.S16
qQ_0n       QN Q12.S16

dQ_0n       DN D24.U8
dQ_1n       DN D25.U8
dP_0n       DN D29.U8
dP_1n       DN D30.U8

;// bSGE4

qSp0q0      QN Q10.U16

qSp2q1      QN Q11.U16
qSp0q0p1    QN Q12.U16
qSp3p2      QN Q13.U16
dHSp0q1     DN D28.U8

qSq2p1      QN Q11.U16
qSp0q0q1    QN Q12.U16
qSq3q2      QN Q13.U16          ;!!
dHSq0p1     DN D28.U8           ;!!

qTemp1      QN Q11.U16          ;!!;qSp2q1
qTemp2      QN Q12.U16          ;!!;qSp0q0p1

dP_0t       DN D28.U8           ;!!;dHSp0q1
dQ_0t       DN D22.U8           ;!!;Temp1

;// NOTE: dP_0n/dP_1n/dQ_0n/dQ_1n repeat the definitions made in the
;// "Computing P1,Q1" group above, with the same register numbers.
dP_0n       DN D29.U8
dP_1n       DN D30.U8
dP_2n       DN D31.U8

dQ_0n       DN D24.U8           ;!!;Temp2
dQ_1n       DN D25.U8           ;!!;Temp2
dQ_2n       DN D28.U8           ;!!;dQ_0t


        ;//-------------------------------------------------------------
        ;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I   (CortexA8 build)
        ;//
        ;// In  (registers): r0 = pSrcDst, r1 = srcdstStep,
        ;//                  r2 = pAlpha,  r3 = pBeta
        ;// In  (stack)    : ppThresholds, ppBS (declared with M_ARG)
        ;// Out            : r0 = OMX_Sts_NoErr
        ;//
        ;// The code shown here performs no argument validation.
        ;//
        ;// Flow: two LoopY passes, each running LoopX four times.  Every
        ;// LoopX iteration handles one block of 8 rows x 8 bytes that
        ;// starts 4 bytes before the current pSrcDst position:
        ;//   1. load one halfword of bS; if it is zero, skip the block;
        ;//   2. load 8 rows and transpose so each D register holds one
        ;//      pixel column (p3..p0, q0..q3) across the 8 rows;
        ;//   3. build the filter masks from alpha/beta and the bS bytes;
        ;//   4. call the bS<4 or bS>=4 helper, transpose back, store.
        ;// The first block of each pass uses alpha/beta element 0, the
        ;// remaining three use element 1.
        ;//
        ;// Register save/restore is done by the M_START/M_END macros
        ;// from armCOMM_s.h (presumably r4-r11 and d8-d15 given the
        ;// "r11, d15" operands - confirm against the macro definition).
        ;//-------------------------------------------------------------

        ;// Function header
        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15

        ;//Arguments on the stack
        M_ARG   ppThresholds, 4
        M_ARG   ppBS, 4

        ;// d0-dAlpha_0
        ;// d2-dBeta_0

        ADD         pAlpha_1, pAlpha_0, #1
        ADD         pBeta_1, pBeta_0, #1

        ;// Broadcast alpha[0] / beta[0] to all lanes and step pSrcDst back
        ;// 4 bytes so that an 8-byte row load covers p3..p0 and q0..q3.
        VLD1        {dAlpha[]}, [pAlpha_0]
        SUB         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_0]

        M_LDR       pBS, ppBS
        M_LDR       pThresholds, ppThresholds

        ;// Mask_0 = 0 is the value moved into dFilt lanes whose bS is 0.
        MOV         Mask_0,#0

        ;dMask_0-14
        ;dMask_1-15

        ;// dMask_0 / dMask_1 are not read in this file; presumably they
        ;// are inputs to the imported helpers - confirm.
        VMOV        dMask_0, #0
        VMOV        dMask_1, #1

        LDR         XY,=LOOP_COUNT

        ;// Rows are accessed through two pointers (pSrcDst and pTmp), each
        ;// stepping two rows at a time.
        ADD         pTmpStep, srcdstStep, srcdstStep

        ;// p0-p3 - d4-d7
        ;// q0-q3 - d8-d11
LoopY
LoopX
        ;// Load two bS bytes; pBS advances by 4 per block.
        LDRH        bS10, [pBS], #4

        CMP         bS10, #0
        BEQ         NoFilterBS0

        ;// Load 8 rows of data
        ;// (the first transpose steps are interleaved with the loads)
        ADD         pTmp, pSrcDst, srcdstStep
        VLD1        dRow0, [pSrcDst], pTmpStep
        VLD1        dRow1, [pTmp], pTmpStep
        VLD1        dRow2, [pSrcDst], pTmpStep
        VZIP.8      dRow0, dRow1
        VLD1        dRow3, [pTmp], pTmpStep
        VLD1        dRow4, [pSrcDst], pTmpStep
        VZIP.8      dRow2, dRow3
        VLD1        dRow5, [pTmp], pTmpStep
        VLD1        dRow6, [pSrcDst], pTmpStep
        VLD1        dRow7, [pTmp], pTmpStep
        VZIP.8      dRow4, dRow5
        VZIP.16     dRow1, dRow3


        ;// Row contents as loaded from memory (before any VZIP):
        ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
        ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
        ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
        ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
        ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
        ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
        ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
        ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]

        ;// 8x8 Transpose

        VZIP.8      dRow6, dRow7

        ;// The four post-incremented loads moved pSrcDst down 8 rows;
        ;// rewind it to the top row of this block for the stores.
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VZIP.16     dRow0, dRow2
        VZIP.16     dRow5, dRow7


        VZIP.16     dRow4, dRow6
        VZIP.32     dRow1, dRow5
        VZIP.32     dRow2, dRow6
        VZIP.32     dRow3, dRow7
        VZIP.32     dRow0, dRow4


        ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
        ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3

        ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0]
        ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0]
        ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0]
        ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0]

        ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0]
        ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0]
        ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0]
        ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0]

        ;// Filtering decision.  Per lane:
        ;//   dFilt  = (alpha > |p0-q0|) & (beta > max(|p1-p0|,|q1-q0|))
        ;//   dApflg = dFilt & (beta > |p2-p0|)
        ;//   dAqflg = dFilt & (beta > |q2-q0|)
        ;// The TST results are consumed by the VMOVEQ / BNE that follow
        ;// them; the NEON instructions in between leave the ARM flags
        ;// untouched.
        VABD        dAp0q0, dP_0, dQ_0
        VABD        dAp1p0, dP_1, dP_0

        VABD        dAq1q0, dQ_1, dQ_0
        VABD        dAp2p0, dP_2, dP_0

        TST         bS10, #0xff
        VCGT        dFilt, dAlpha, dAp0q0

        VMAX        dAp1p0, dAq1q0, dAp1p0
        VABD        dAq2q0, dQ_2, dQ_0

        ;// Low bS byte is zero: clear the filter mask for rows 0-3.
        VMOVEQ.U32  dFilt[0], Mask_0
        TST         bS10, #0xff00

        VCGT        dAp2p0, dBeta, dAp2p0
        VCGT        dAp1p0, dBeta, dAp1p0

        ;// High bS byte is zero: clear the filter mask for rows 4-7.
        VMOVEQ.U32  dFilt[1], Mask_0

        VCGT        dAq2q0, dBeta, dAq2q0
        VAND        dFilt, dFilt, dAp1p0
        ;// Bit 2 of bS10 selects the bS >= 4 path (taken by BNE below).
        TST         bS10, #4

        VAND        dAqflg, dFilt, dAq2q0
        VAND        dApflg, dFilt, dAp2p0

        BNE         bSGE4
bSLT4
        ;// bS < 4 Filtering

        BL          armVCM4P10_DeblockingLumabSLT4_unsafe

        ;// Transpose
        ;// (back to row order; the registers named *n are the ones this
        ;// path stores as filtered output, P_2/P_3/Q_2/Q_3 are stored
        ;// from their input registers)

        VZIP.8      dP_3,  dP_2
        VZIP.8      dP_1n, dP_0n
        VZIP.8      dQ_0n, dQ_1n
        VZIP.8      dQ_2,  dQ_3


        VZIP.16     dP_3,  dP_1n
        ADD         pTmp, pSrcDst, srcdstStep
        VZIP.16     dQ_0n, dQ_2
        VZIP.16     dQ_1n, dQ_3
        VZIP.16     dP_2,  dP_0n

        VZIP.32     dP_3,  dQ_0n
        VZIP.32     dP_1n, dQ_2
        VZIP.32     dP_2,  dQ_1n
        VZIP.32     dP_0n, dQ_3

        ;// dRown0 - dP_3,  dRown1 - dQ_0n
        ;// dRown2 - dP_1n, dRown3 - dQ_2
        ;// dRown4 - dP_2,  dRown5 - dQ_1n
        ;// dRown6 - dP_0n, dRown7 - dQ_3

        VST1        dRown0, [pSrcDst], pTmpStep
        VST1        dRown1, [pTmp], pTmpStep
        VST1        dRown2, [pSrcDst], pTmpStep
        VST1        dRown3, [pTmp], pTmpStep
        ;1
        VST1        dRown4, [pSrcDst], pTmpStep
        VST1        dRown5, [pTmp], pTmpStep
        ;// Advance the loop counter; the carry/zero flags set here are
        ;// consumed by BCC below and by BNE at ExitLoopY.
        ADDS        XY, XY, XY
        VST1        dRown6, [pSrcDst], pTmpStep
        ;// Only +2 here, versus +4 on the other two paths; the helper
        ;// presumably advances pThresholds by the remaining 2 - confirm
        ;// against armVCM4P10_DeblockingLumabSLT4_unsafe.
        ADD         pThresholds, pThresholds, #2
        ;// pTmp is recomputed before its next use, so the post-increment
        ;// amount on this last store is not observable.
        VST1        dRown7, [pTmp], srcdstStep

        ;// Back to the top row, then 4 bytes right for the next block.
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_1]

        BCC         LoopX
        B           ExitLoopY

NoFilterBS0
        ;// Both bS bytes are zero: leave the pixels untouched and only
        ;// advance the pointers, the loop counter and alpha/beta.
        ADD         pSrcDst, pSrcDst, #4
        ADDS        XY, XY, XY
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pThresholds, pThresholds, #4
        VLD1        {dBeta[]}, [pBeta_1]
        BCC         LoopX
        B           ExitLoopY
bSGE4
        ;// bS >= 4 Filtering

        BL          armVCM4P10_DeblockingLumabSGE4_unsafe

        ;// Transpose
        ;// (back to row order; on this path P_2 and Q_2 are also taken
        ;// from their *n output registers)

        VZIP.8      dP_3,  dP_2n
        VZIP.8      dP_1n, dP_0n
        VZIP.8      dQ_0n, dQ_1n
        VZIP.8      dQ_2n, dQ_3

        VZIP.16     dP_3,  dP_1n
        ADD         pTmp, pSrcDst, srcdstStep
        VZIP.16     dQ_0n, dQ_2n
        VZIP.16     dQ_1n, dQ_3
        VZIP.16     dP_2n, dP_0n

        VZIP.32     dP_3,  dQ_0n
        VZIP.32     dP_1n, dQ_2n
        VZIP.32     dP_2n, dQ_1n
        VZIP.32     dP_0n, dQ_3

        ;// dRow0n - dP_3,  dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
        ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3

        VST1        dRow0n, [pSrcDst], pTmpStep
        VST1        dRow1n, [pTmp], pTmpStep
        VST1        dRow2n, [pSrcDst], pTmpStep
        VST1        dRow3n, [pTmp], pTmpStep
        VST1        dRow4n, [pSrcDst], pTmpStep
        VST1        dRow5n, [pTmp], pTmpStep
        ;// Advance the loop counter; flags are consumed by BCC below and
        ;// by BNE at ExitLoopY.
        ADDS        XY,XY,XY
        VST1        dRow6n, [pSrcDst], pTmpStep
        ADD         pThresholds, pThresholds, #4
        VST1        dRow7n, [pTmp], pTmpStep

        ;// Back to the top row, then 4 bytes right for the next block.
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_1]

        BCC         LoopX

ExitLoopY
        ;// End of one LoopY pass (four blocks done).
        ;//   pBS         : advanced 4*4 = 16, rewound 14 -> net +2
        ;//   pThresholds : rewound 14; net +2 if each block advanced it
        ;//                 by 4 (see the note on the bS < 4 path)
        ;//   pSrcDst     : advanced 4*4 = 16, rewound 16, then moved
        ;//                 down 8 rows
        ;// alpha/beta are reloaded from element 0 for the first block of
        ;// the next pass.  None of these instructions alter the flags, so
        ;// BNE still tests the result of the last ADDS on XY.
        SUB         pBS, pBS, #14
        SUB         pThresholds, pThresholds, #14
        SUB         pSrcDst, pSrcDst, #16
        VLD1        {dAlpha[]}, [pAlpha_0]
        ADD         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dBeta[]}, [pBeta_0]
        BNE         LoopY

        MOV         r0, #OMX_Sts_NoErr

        M_END

        ENDIF


        END

