;//
;//
;// File Name:  omxVCM4P10_FilterDeblockingLuma_HorEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

;// ---------------------------------------------------------------------------
;// omxVCM4P10_FilterDeblockingLuma_HorEdge_I  (armasm syntax, CortexA8 / NEON)
;//
;// Filters luma pixels in place across four horizontal edges, each 16 pixels
;// wide and 4 rows apart.  The work is done in 8 spans of 8 pixels:
;// LoopX handles the two spans of one edge, LoopY steps to the next edge.
;//
;// In:    r0      pSrcDst      first row below the first edge (q0 row)
;//        r1      srcdstStep   distance in bytes between rows
;//        r2      pAlpha       two bytes: [0] first edge, [1] other three edges
;//        r3      pBeta        two bytes: [0] first edge, [1] other three edges
;//        stack   ppThresholds pointer to thresholds, 2 bytes consumed per span
;//        stack   ppBS         pointer to bS values, 2 bytes consumed per span
;// Out:   r0 = OMX_Sts_NoErr
;//
;// Per span: eight rows p3..p0, q0..q3 are loaded (8 bytes each), the filter
;// mask and the p/q side flags are computed, then one of the two imported
;// "_unsafe" helpers is called and the rows it produced are stored back.
;// The helpers share this file's register allocation; everything they read
;// or write is passed in the registers named below, not through memory.
;// ---------------------------------------------------------------------------

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe

        IF CortexA8

;// Loop control word.  Every span ends with ADDS XY, XY, XY (shift left by 1).
;// The top byte is 0101 0101b, so the carry out alternates 0,1,0,1,...:
;//   carry clear -> second span of the same edge (BCC LoopX)
;//   carry set   -> edge finished, fall into ExitLoopY
;// The value becomes zero exactly on the 8th shift, which makes BNE LoopY
;// fall through after the fourth edge.
LOOP_COUNT  EQU 0x55000000


;// Function arguments

pSrcDst     RN 0
srcdstStep  RN 1
pAlpha      RN 2
pBeta       RN 3

pThresholds RN 5
pBS         RN 4
bS10        RN 12               ;// halfword holding the two bS bytes of a span

pAlpha_0    RN 2                ;// same register as pAlpha
pBeta_0     RN 3                ;// same register as pBeta

pAlpha_1    RN 7                ;// pAlpha + 1
pBeta_1     RN 8                ;// pBeta + 1



;// Loop

XY          RN 9                ;// LOOP_COUNT shift register

pTmp        RN 6                ;// second row pointer / saved span base
step        RN 10               ;// 2 * srcdstStep

;// Pixels
dP_0        DN D4.U8
dP_1        DN D5.U8
dP_2        DN D6.U8
dP_3        DN D7.U8
dQ_0        DN D8.U8
dQ_1        DN D9.U8
dQ_2        DN D10.U8
dQ_3        DN D11.U8


;// Filtering Decision
dAlpha      DN D0.U8
dBeta       DN D2.U8

dFilt       DN D16.U8
dAqflg      DN D12.U8           ;// shares D12 with dAp1p0
dApflg      DN D17.U8           ;// shares D17 with dAq2q0

dAp0q0      DN D13.U8
dAp1p0      DN D12.U8
dAq1q0      DN D18.U8
dAp2p0      DN D19.U8
dAq2q0      DN D17.U8

;// bSLT4
;// NOTE(review): the aliases from here down to dQ_2n are, apart from the
;// result rows stored below, not referenced by any instruction in this file;
;// presumably they document the register use of the imported helpers.
dTC0        DN D18.U8
dTC1        DN D19.U8
dTC01       DN D18.U8

dTCs        DN D31.S8
dTC         DN D31.U8

dMask_0     DN D14.U8
dMask_1     DN D15.U8

Mask_0      RN 11               ;// holds 0; used to clear dFilt lanes

dTemp       DN D19.U8

;// Computing P0,Q0
qDq0p0      QN Q10.S16
qDp1q1      QN Q11.S16
qDelta      QN Q10.S16          ;// reuse qDq0p0
dDelta      DN D20.S8


;// Computing P1,Q1
dRp0q0      DN D24.U8

dMaxP       DN D23.U8
dMinP       DN D22.U8

dMaxQ       DN D19.U8
dMinQ       DN D21.U8

dDeltaP     DN D26.U8
dDeltaQ     DN D27.U8

qP_0n       QN Q14.S16
qQ_0n       QN Q12.S16

dQ_0n       DN D24.U8
dQ_1n       DN D25.U8
dP_0n       DN D29.U8
dP_1n       DN D30.U8

;// bSGE4

qSp0q0      QN Q10.U16

qSp2q1      QN Q11.U16
qSp0q0p1    QN Q12.U16
qSp3p2      QN Q13.U16
dHSp0q1     DN D28.U8

qSq2p1      QN Q11.U16
qSp0q0q1    QN Q12.U16
qSq3q2      QN Q13.U16          ;// same register as qSp3p2
dHSq0p1     DN D28.U8           ;// same register as dHSp0q1

qTemp1      QN Q11.U16          ;// same register as qSp2q1
qTemp2      QN Q12.U16          ;// same register as qSp0q0p1

dP_0t       DN D28.U8           ;// same register as dHSp0q1
dQ_0t       DN D22.U8           ;// low half of qTemp1

;// Result rows.  The first two P names and the first two Q names repeat the
;// bindings already given in the bSLT4 group above (same registers).
dP_0n       DN D29.U8
dP_1n       DN D30.U8
dP_2n       DN D31.U8

dQ_0n       DN D24.U8           ;// low half of qTemp2
dQ_1n       DN D25.U8           ;// high half of qTemp2
dQ_2n       DN D28.U8           ;// same register as dP_0t


        ;// Function header
        ;// M_START/M_ARG/M_LDR/M_END are macros from armCOMM_s.h.
        ;// NOTE(review): "r11, d15" presumably asks the prologue to preserve
        ;// the callee-saved registers up to r11 and d15 - confirm in armCOMM_s.h.
        M_START omxVCM4P10_FilterDeblockingLuma_HorEdge_I, r11, d15

        ;//Arguments on the stack
        M_ARG   ppThresholds, 4
        M_ARG   ppBS, 4

        ;// d0-dAlpha_0
        ;// d2-dBeta_0

        ADD         pAlpha_1, pAlpha_0, #1
        ADD         pBeta_1, pBeta_0, #1

        VLD1        {dAlpha[]}, [pAlpha_0]                  ;// broadcast alpha[0] to all lanes
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #2    ;// back up 4 rows: pSrcDst -> p3 row
        VLD1        {dBeta[]}, [pBeta_0]                    ;// broadcast beta[0] to all lanes

        M_LDR       pBS, ppBS
        M_LDR       pThresholds, ppThresholds

        MOV         Mask_0,#0

        ;// dMask_0 = d14, dMask_1 = d15.  They are only initialised here.
        ;// NOTE(review): presumably consumed by the imported helpers - confirm.

        VMOV        dMask_0, #0
        VMOV        dMask_1, #1

        ADD         step, srcdstStep, srcdstStep            ;// step = 2 rows

        LDR         XY,=LOOP_COUNT

        ;// p0-p3 - d4-d7
        ;// q0-q3 - d8-d11
LoopY
LoopX
        ;// Invariant at the top of every span: pSrcDst -> p3 row of the span.
        LDRH        bS10, [pBS], #2                 ;// low byte: columns 0-3, high byte: columns 4-7
        ADD         pTmp, pSrcDst, srcdstStep       ;// pTmp -> p2 row
        CMP         bS10, #0
        BEQ         NoFilterBS0                     ;// both bS bytes zero: leave the span untouched

        ;// Two interleaved pointers, each advancing 2 rows per load:
        ;//   pSrcDst reads p3, p1, q0, q2 (then q3), pTmp reads p2, p0, q1.
        VLD1        dP_3, [pSrcDst], step
        VLD1        dP_2, [pTmp], step
        VLD1        dP_1, [pSrcDst], step
        VLD1        dP_0, [pTmp], step
        VLD1        dQ_0, [pSrcDst], step
        VABD        dAp1p0, dP_0, dP_1              ;// |p1 - p0|
        VLD1        dQ_1, [pTmp]
        VABD        dAp0q0, dQ_0, dP_0              ;// |p0 - q0|
        VLD1        dQ_2, [pSrcDst], srcdstStep     ;// single-row step: pSrcDst -> q3 row

        VABD        dAq1q0, dQ_1, dQ_0              ;// |q1 - q0|
        VABD        dAp2p0, dP_2, dP_0              ;// |p2 - p0|
        VCGT        dFilt, dAlpha, dAp0q0           ;// |p0 - q0| < alpha

        TST         bS10, #0xff                     ;// Z set when the low bS byte is 0
        VMAX        dAp1p0, dAq1q0, dAp1p0          ;// max(|p1 - p0|, |q1 - q0|)
        VABD        dAq2q0, dQ_2, dQ_0              ;// |q2 - q0|

        ;// The NEON data-processing ops above do not write the flags, so Z
        ;// still comes from the TST that precedes each VMOVEQ.
        VMOVEQ.U32  dFilt[0], Mask_0                ;// bS == 0: disable columns 0-3
        TST         bS10, #0xff00                   ;// Z set when the high bS byte is 0

        VCGT        dAp2p0, dBeta, dAp2p0           ;// |p2 - p0| < beta
        VCGT        dAp1p0, dBeta, dAp1p0           ;// both |p1 - p0| and |q1 - q0| < beta

        VMOVEQ.U32  dFilt[1], Mask_0                ;// bS == 0: disable columns 4-7

        VCGT        dAq2q0, dBeta, dAq2q0           ;// |q2 - q0| < beta
        VLD1        dQ_3, [pSrcDst]                 ;// no writeback: pSrcDst stays on q3 row
        VAND        dFilt, dFilt, dAp1p0
        TST         bS10, #4                        ;// bit 2 of the low bS byte picks the path

        ;// Order matters: dApflg overwrites D17, which still holds dAq2q0
        ;// until dAqflg has been formed.
        VAND        dAqflg, dFilt, dAq2q0
        VAND        dApflg, dFilt, dAp2p0

        BNE         bSGE4
bSLT4
        ;// bS < 4 Filtering
        ;// pSrcDst is on the q3 row; move back 5 rows to the p1 row.
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #2
        SUB         pSrcDst, pSrcDst, srcdstStep

        ;// pThresholds is not advanced in this path.
        ;// NOTE(review): presumably the helper post-increments it - confirm.
        BL          armVCM4P10_DeblockingLumabSLT4_unsafe

        ;// Result Storage: rows p1, p0, q0, q1
        VST1        dP_1n, [pSrcDst], srcdstStep
        VST1        dP_0n, [pSrcDst], srcdstStep
        SUB         pTmp, pSrcDst, srcdstStep, LSL #2       ;// q0 row - 4 rows = p3 row of this span
        VST1        dQ_0n, [pSrcDst], srcdstStep
        ADDS        XY, XY, XY                              ;// C: edge done, Z: all edges done
        VST1        dQ_1n, [pSrcDst]
        ADD         pSrcDst, pTmp, #8                       ;// p3 row of the next 8-pixel span

        BCC         LoopX
        B           ExitLoopY

NoFilterBS0
        ADD         pSrcDst, pSrcDst, #8                    ;// next 8-pixel span, same row
        ADDS        XY, XY, XY                              ;// C: edge done, Z: all edges done
        ADD         pThresholds, pThresholds, #2            ;// skip this span's thresholds
        BCC         LoopX
        B           ExitLoopY
bSGE4
        ;// bS >= 4 Filtering
        ;// pSrcDst is on the q3 row; move back 6 rows to the p2 row.
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #2
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #1
        BL          armVCM4P10_DeblockingLumabSGE4_unsafe

        ;// Result Storage: rows p2, p1, p0, q0, q1, q2
        VST1        dP_2n, [pSrcDst], srcdstStep
        VST1        dP_1n, [pSrcDst], srcdstStep
        VST1        dP_0n, [pSrcDst], srcdstStep
        SUB         pTmp, pSrcDst, srcdstStep, LSL #2       ;// q0 row - 4 rows = p3 row of this span
        VST1        dQ_0n, [pSrcDst], srcdstStep
        ADDS        XY,XY,XY                                ;// C: edge done, Z: all edges done
        VST1        dQ_1n, [pSrcDst], srcdstStep
        ADD         pThresholds, pThresholds, #2            ;// skip this span's thresholds
        VST1        dQ_2n, [pSrcDst]

        ADD         pSrcDst, pTmp, #8                       ;// p3 row of the next 8-pixel span
        BCC         LoopX

ExitLoopY

        ;// Nothing from here to the BNE writes the flags, so Z is still the
        ;// result of the last ADDS XY, XY, XY.
        SUB         pSrcDst, pSrcDst, #16                   ;// back to column 0
        VLD1        {dAlpha[]}, [pAlpha_1]                  ;// alpha[1] for the remaining edges
        ADD         pSrcDst, pSrcDst, srcdstStep, LSL #2    ;// 4 rows down: p3 row of the next edge
        VLD1        {dBeta[]}, [pBeta_1]                    ;// beta[1] for the remaining edges
        BNE         LoopY

        MOV         r0, #OMX_Sts_NoErr

        M_END

        ENDIF




        END