;//
;//
;// File Name:  omxVCM4P10_FilterDeblockingChroma_VerEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Syntax: ARM armasm (RN/DN/QN register aliases, M_* macros from armCOMM_s.h).
;// Target: ARMv7-A with NEON; the body is only assembled for the CortexA8 variant.
;//
;// Overview (derived from the code below):
;//   The routine runs its main loop twice. Each pass loads an 8-row x 8-byte
;//   tile that starts 4 bytes to the left of the current edge position,
;//   transposes it so that each pixel column lands in its own D register,
;//   builds a per-row filter mask from alpha, beta and the bS values, calls the
;//   external bS<4 and/or bS==4 helpers, and stores back one byte per row on
;//   each side of the edge (columns p0 and q0). The second pass works on the
;//   edge 4 bytes to the right of the first one.
;//
;// Register arguments (see the RN aliases below):
;//   r0 = pSrcDst, r1 = srcdstStep, r2 = pAlpha, r3 = pBeta
;// Stack arguments (declared with M_ARG): ppThresholds, ppBS
;// Return: r0 = OMX_Sts_NoErr on every exit path; no argument checking is done
;//         in this file.
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

    IF CortexA8

        IMPORT  armVCM4P10_DeblockingChromabSGE4_unsafe
        IMPORT  armVCM4P10_DeblockingChromabSLT4_unsafe

;// Loop counter seed: doubled with ADDS on every pass, so it becomes
;// 0x80000000 after the first pass (Z clear) and 0 after the second (Z set),
;// giving exactly two passes.
LOOP_COUNT  EQU 0x40000000
;// Per-byte masks applied to the four packed bS bytes held in bS3210.
MASK_3      EQU 0x03030303
MASK_4      EQU 0x04040404

;// Function arguments

pSrcDst     RN 0
srcdstStep  RN 1
pAlpha      RN 2
pBeta       RN 3

pThresholds RN 5
pBS         RN 4
bS3210      RN 6
;// NOTE: pSrcDst_P/pTmp share r10 and pSrcDst_Q/pTmp2 share r12.
pSrcDst_P   RN 10
pSrcDst_Q   RN 12

pTmp        RN 10
pTmp2       RN 12
;// NOTE: step lives in r14 (lr), so every BL/BLNE overwrites it.
step        RN 14

;// Loop

XY          RN 7

;// Rows input
;// The row registers are chosen so that after the transpose the columns end
;// up in D4..D11 in order (p0,p1,p2,p3,q0,q1,q2,q3).
dRow0       DN D7.U8
dRow1       DN D8.U8
dRow2       DN D5.U8
dRow3       DN D10.U8
dRow4       DN D6.U8
dRow5       DN D9.U8
dRow6       DN D4.U8
dRow7       DN D11.U8


;// Pixels
dP_0        DN D4.U8
dP_1        DN D5.U8
dP_2        DN D6.U8
dQ_0        DN D8.U8
dQ_1        DN D9.U8
dQ_2        DN D10.U8

;// Filtering Decision
dAlpha      DN D0.U8
dBeta       DN D2.U8

dFilt       DN D16.U8
;// NOTE: dAqflg shares D12 with dAp1p0, and dApflg shares D17 with dAq2q0.
dAqflg      DN D12.U8
dApflg      DN D17.U8

;// NOTE: dAp0q0 shares D13 with dP_0t, and dAp2p0 shares D19 with dTemp.
dAp0q0      DN D13.U8
dAp1p0      DN D12.U8
dAq1q0      DN D18.U8
dAp2p0      DN D19.U8
dAq2q0      DN D17.U8

;// Q13 is the D26:D27 pair; dBS3210/dFilt_bs are its low half, dMask_bs its
;// high half.
qBS3210     QN Q13.U16
dBS3210     DN D26
dMask_bs    DN D27
dFilt_bs    DN D26.U16

;// bSLT4
dMask_0     DN D14.U8
dMask_1     DN D15.U8
dMask_4     DN D1.U16

Mask_4      RN 8
Mask_3      RN 9

dTemp       DN D19.U8

;// Result
dP_0t       DN D13.U8
dQ_0t       DN D31.U8

dP_0n       DN D29.U8
dQ_0n       DN D24.U8


        ;// Function header
        ;// M_START/M_ARG/M_LDR/M_EXIT/M_END are macros that are not defined in
        ;// this file (presumably in armCOMM_s.h). The "r12, d15" arguments
        ;// presumably name the highest core/NEON registers to preserve -
        ;// TODO confirm against the macro definition.
        M_START omxVCM4P10_FilterDeblockingChroma_VerEdge_I, r12, d15

        ;//Arguments on the stack
        M_ARG   ppThresholds, 4
        M_ARG   ppBS, 4

        ;// d0-dAlpha_0
        ;// d2-dBeta_0

        ;// Original author note: load alpha1,beta1 somewhere to avoid more loads
        ;// Broadcast the first alpha/beta byte to every lane. The post-increment
        ;// leaves pAlpha/pBeta pointing at the next byte, which NoFilterBS0
        ;// reloads from.
        VLD1        {dAlpha[]}, [pAlpha]!
        ;// Step back 4 bytes so each 8-byte row load covers 4 bytes on either
        ;// side of the edge.
        SUB         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta]!

        M_LDR       pBS, ppBS
        M_LDR       pThresholds, ppThresholds

        LDR         Mask_4, =MASK_4
        LDR         Mask_3, =MASK_3

        ;// dMask_0 - D14
        ;// dMask_1 - D15
        ;// dMask_4 - D1 (16-bit lanes)
        ;// dMask_0 and dMask_1 are not read again in this file; presumably the
        ;// *_unsafe helpers consume them - TODO confirm.

        VMOV        dMask_0, #0
        VMOV        dMask_1, #1
        VMOV        dMask_4, #4

        LDR         XY, =LOOP_COUNT

        ;// After the transpose below:
        ;// p0-p3 - d4-d7
        ;// q0-q3 - d8-d11


LoopY
        ;// Fetch four packed bS bytes and advance pBS by 8, so the second pass
        ;// reads the bytes at offset 8 (presumably the bS values of the second
        ;// edge - verify against the caller's bS layout).
        LDR         bS3210, [pBS], #8
        ;// Two row pointers, each stepping by 2*srcdstStep: pSrcDst walks the
        ;// even rows and pTmp the odd rows.
        ADD         pTmp, pSrcDst, srcdstStep
        ADD         step, srcdstStep, srcdstStep

        ;// Load 8 rows of 8 bytes; pSrcDst ends up 8 rows below its start.
        VLD1        dRow0, [pSrcDst], step
        VLD1        dRow1, [pTmp], step
        VLD1        dRow2, [pSrcDst], step
        VLD1        dRow3, [pTmp], step
        VLD1        dRow4, [pSrcDst], step
        VLD1        dRow5, [pTmp], step
        VLD1        dRow6, [pSrcDst], step
        VLD1        dRow7, [pTmp], step


        ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
        ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
        ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
        ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
        ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
        ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
        ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
        ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]

        ;// 8x8 Transpose
        ;// Three rounds of interleaving (8-, 16-, then 32-bit) turn the eight
        ;// row vectors into eight column vectors, one byte lane per row.
        VZIP.8      dRow0, dRow1
        VZIP.8      dRow2, dRow3
        VZIP.8      dRow4, dRow5
        VZIP.8      dRow6, dRow7

        VZIP.16     dRow0, dRow2
        VZIP.16     dRow1, dRow3
        VZIP.16     dRow4, dRow6
        VZIP.16     dRow5, dRow7

        VZIP.32     dRow0, dRow4
        VZIP.32     dRow2, dRow6
        VZIP.32     dRow3, dRow7
        VZIP.32     dRow1, dRow5


        ;// Early out: if all four bS bytes are zero there is nothing to
        ;// filter. The two VABDs do not touch the flags set by CMP; they sit
        ;// between CMP and BEQ, presumably for scheduling.
        ;// (pSrcDst is rewound later, on whichever path is taken.)

        CMP         bS3210, #0
        VABD        dAp2p0, dP_2, dP_0
        VABD        dAp0q0, dP_0, dQ_0
        BEQ         NoFilterBS0

        VABD        dAp1p0, dP_1, dP_0
        VABD        dAq1q0, dQ_1, dQ_0

        ;// Move the four bS bytes into NEON and widen them to 16-bit lanes, so
        ;// each bS value spans two byte lanes (two rows).
        VMOV.U32    dBS3210[0], bS3210
        ;// dFilt = alpha > |p0 - q0|
        VCGT        dFilt, dAlpha, dAp0q0
        ;// dAp1p0 = max(|p1 - p0|, |q1 - q0|)
        VMAX        dAp1p0, dAq1q0, dAp1p0
        VMOVL       qBS3210, dBS3210.U8
        VABD        dAq2q0, dQ_2, dQ_0
        ;// dMask_bs = (bS > 0) per 16-bit lane
        VCGT        dMask_bs.S16, dBS3210.S16, #0

        ;// beta > max(|p1 - p0|, |q1 - q0|)
        VCGT        dAp1p0, dBeta, dAp1p0
        ;// beta > |p2 - p0|
        VCGT        dAp2p0, dBeta, dAp2p0
        VAND        dFilt, dMask_bs.U8

        ;// NE if any bS byte has bit 0 or bit 1 set; consumed by the first
        ;// BLNE below. None of the NEON instructions in between change the
        ;// ARM condition flags.
        TST         bS3210, Mask_3

        ;// beta > |q2 - q0|
        VCGT        dAq2q0, dBeta, dAq2q0
        ;// Final mask: bS > 0, alpha test and beta test all pass.
        VAND        dFilt, dFilt, dAp1p0

        ;// Order matters here because of register sharing: dAqflg overwrites
        ;// D12 (dAp1p0, already folded into dFilt) and reads D17 (dAq2q0)
        ;// before dApflg overwrites D17.
        ;// These flags are not read again in this file; presumably they are
        ;// inputs to the helpers - TODO confirm.
        VAND        dAqflg, dFilt, dAq2q0
        VAND        dApflg, dFilt, dAp2p0

        ;// bS < 4 Filtering
        BLNE        armVCM4P10_DeblockingChromabSLT4_unsafe

        ;// NE if any bS byte has bit 2 set; consumed by the second BLNE.
        TST         bS3210, Mask_4

        ;// Rewind pSrcDst by 8 rows to the top of the tile (no flag update).
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        ;// dFilt_bs = lanes whose widened bS has bit 2 set.
        VTST        dFilt_bs, dFilt_bs, dMask_4

        ;// bS == 4 Filtering
        BLNE        armVCM4P10_DeblockingChromabSGE4_unsafe

        ;// Where dFilt_bs is set, take the dP_0t/dQ_0t values in place of
        ;// dP_0n/dQ_0n. Presumably dP_0n/dQ_0n come from the bS<4 helper and
        ;// dP_0t/dQ_0t from the bS==4 helper - confirm against the helper
        ;// source.
        ;// NOTE(review): unlike NoFilterBS0, this path does not itself reload
        ;// dAlpha/dBeta or advance pThresholds before the next pass. Presumably
        ;// the helpers take care of that - confirm.
        VBIT        dP_0n, dP_0t, dFilt_bs
        VBIT        dQ_0n, dQ_0t, dFilt_bs

        ;// Result Storage
        ;// p0 is the byte at offset 3 of each row of the tile.
        ADD         pSrcDst_P, pSrcDst, #3
        ;// Where dFilt is clear, put the original p0 back.
        VBIF        dP_0n, dP_0, dFilt

        ADD         pTmp2, pSrcDst_P, srcdstStep
        ;// Recompute step: r14 is overwritten whenever a BLNE above is taken.
        ADD         step, srcdstStep, srcdstStep
        ;// Where dFilt is clear, put the original q0 back.
        VBIF        dQ_0n, dQ_0, dFilt

        ;// Advance the pass counter. The resulting Z flag is consumed by the
        ;// BNE after the stores; nothing in between updates the flags.
        ADDS        XY, XY, XY

        ;// One p0 byte per row: even rows through pSrcDst_P, odd rows through
        ;// pTmp2.
        VST1        {dP_0n[0]}, [pSrcDst_P], step
        VST1        {dP_0n[1]}, [pTmp2], step
        VST1        {dP_0n[2]}, [pSrcDst_P], step
        VST1        {dP_0n[3]}, [pTmp2], step
        VST1        {dP_0n[4]}, [pSrcDst_P], step
        VST1        {dP_0n[5]}, [pTmp2], step
        VST1        {dP_0n[6]}, [pSrcDst_P], step
        VST1        {dP_0n[7]}, [pTmp2], step

        ;// q0 is the byte at offset 4; r12 and r10 swap roles here (r12 takes
        ;// the even rows, r10 the odd rows).
        ADD         pSrcDst_Q, pSrcDst, #4
        ADD         pTmp, pSrcDst_Q, srcdstStep

        VST1        {dQ_0n[0]}, [pSrcDst_Q], step
        VST1        {dQ_0n[1]}, [pTmp], step
        VST1        {dQ_0n[2]}, [pSrcDst_Q], step
        VST1        {dQ_0n[3]}, [pTmp], step
        VST1        {dQ_0n[4]}, [pSrcDst_Q], step
        VST1        {dQ_0n[5]}, [pTmp], step
        VST1        {dQ_0n[6]}, [pSrcDst_Q], step
        VST1        {dQ_0n[7]}, [pTmp], step

        ;// Move 4 bytes right for the second pass.
        ADD         pSrcDst, pSrcDst, #4

        BNE         LoopY

        MOV         r0, #OMX_Sts_NoErr

        M_EXIT

        ;// All four bS bytes were zero: store nothing, just set up the state
        ;// for the next pass.
NoFilterBS0
        ;// Reload alpha from the byte pAlpha now points at (pAlpha was
        ;// post-incremented once at entry).
        VLD1        {dAlpha[]}, [pAlpha]
        ;// Move 4 bytes right and rewind the 8 rows consumed by the loads.
        ADD         pSrcDst, pSrcDst, #4
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        ;// Advance the pass counter; Z stays valid until BNE because the
        ;// instructions in between do not update the flags.
        ADDS        XY, XY, XY
        VLD1        {dBeta[]}, [pBeta]
        ;// Skip 4 bytes of pThresholds for the pass that was not filtered.
        ADD         pThresholds, pThresholds, #4
        BNE         LoopY

        MOV         r0, #OMX_Sts_NoErr

        M_END

    ENDIF


        END