;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe

        M_VARIANTS CortexA8

        IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Note: no store is performed in this "unsafe" helper; the
;// interpolated 4x4 block is left in dAcc0-dAcc3 (d0, d2, d4, d6)
;// and the caller is expected to write it to pDst.

;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16
qCoeff5         QN 14.S32
qCoeff20        QN 15.S32

qSrc01          QN 0.U8
dSrc0           DN 0.U8
dSrc1           DN 1.U8

dSrcb           DN 4.U8
dSrcc           DN 2.U8
dSrcd           DN 3.U8
dSrce           DN 5.U8
dSrcf           DN 1.U8

qSrcb           QN 2.S16
qSrcc           QN 1.S16
dSrcB           DN 4.S16
dSrcC           DN 2.S16

qRes0           QN 5.S16
qRes1           QN 6.S16
qRes2           QN 7.S16
qRes3           QN 8.S16
qRes4           QN 9.S16
qRes5           QN 10.S16
qRes6           QN 11.S16
qRes7           QN 12.S16
qRes8           QN 13.S16

dRes0           DN 10.S16
dRes1           DN 12.S16
dRes2           DN 14.S16
dRes3           DN 16.S16
dRes4           DN 18.S16
dRes5           DN 20.S16
dRes6           DN 22.S16
dRes7           DN 24.S16
dRes8           DN 26.S16

qAcc01          QN 5.S32
qAcc23          QN 6.S32
qAcc45          QN 2.S32
qAcc67          QN 3.S32
qSumBE          QN 0.S32
qSumCD          QN 1.S32

dTempAcc0       DN 0.U16
dTempAcc1       DN 2.U16
dTempAcc2       DN 4.U16
dTempAcc3       DN 6.U16

qTAcc0          QN 0.U16
qTAcc1          QN 1.U16
qTAcc2          QN 2.U16
qTAcc3          QN 3.U16

dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8

dTmp0           DN 8.S16
dTmp1           DN 9.S16
qTmp0           QN 4.S32

        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMOV        dCoeff20, #20
        VMOV        dCoeff5, #5

;// Row0
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes0, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMLA        dRes0, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted by the VSUB below

;// Row1
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes1, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes0, dRes0, dTmp0         ;// Row0: Acc -= 5*(b+e)

        VMLA        dRes1, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes1, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted by the VSUB below

;// Row2
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes2, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes1, dRes1, dTmp0         ;// Row1: Acc -= 5*(b+e)

        VMLA        dRes2, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes2, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted by the VSUB below
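
;// Note on the commented-out VMLS instructions: each row originally
;// ended with a single VMLS. It has been split into a VMUL here plus
;// a VSUB issued during the following row, most likely to break the
;// multiply-accumulate dependency chain and hide the multiply latency
;// behind the next VLD1 on Cortex-A8. Rows 3-8 below repeat the same
;// pattern.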
;// Row3
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes3, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes2, dRes2, dTmp0         ;// Row2: Acc -= 5*(b+e)

        VMLA        dRes3, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes3, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted by the VSUB below

;// Row4
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes4, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes3, dRes3, dTmp0         ;// Row3: Acc -= 5*(b+e)

        VMLA        dRes4, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes4, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted by the VSUB below

;// Row5
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes5, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes4, dRes4, dTmp0         ;// Row4: Acc -= 5*(b+e)

        VMLA        dRes5, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes5, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted by the VSUB below

;// Row6
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes6, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes5, dRes5, dTmp0         ;// Row5: Acc -= 5*(b+e)

        VMLA        dRes6, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes6, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted by the VSUB below

;// Row7
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes7, dSrc0, dSrcf         ;// Acc = a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes6, dRes6, dTmp0         ;// Row6: Acc -= 5*(b+e)

        VMLA        dRes7, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes7, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted by the VSUB below

;// Row8
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes8, dSrc0, dSrcf         ;// Acc = a+f

        VSUB        dRes7, dRes7, dTmp0         ;// Row7: Acc -= 5*(b+e)

        VMLA        dRes8, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;       VMLS        dRes8, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted by the VSUB below

        VMOV        qCoeff20, #20
        VMOV        qCoeff5, #5
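
;// Vertical pass: the same H.264 6-tap half-sample filter
;// (1, -5, 20, 20, -5, 1) is now applied down the nine 16-bit
;// intermediate rows dRes0-dRes8. Each "Col" step below produces one
;// 4-pixel output row (all four columns at once):
;//     Acc = (a+f) + 20*(c+d) - 5*(b+e)
;// The final VQRSHRUN #10 computes saturate((Acc + 512) >> 10), the
;// rounding and shift H.264 specifies for the half-diagonal position.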
;// Col0
        VADDL       qAcc01, dRes0, dRes5        ;// Acc = a+f
        VADDL       qSumCD, dRes2, dRes3        ;// c+d
        VADDL       qSumBE, dRes1, dRes4        ;// b+e

        VSUB        dRes8, dRes8, dTmp0         ;// Row8: Acc -= 5*(b+e)

        VMLA        qAcc01, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
;       VMLS        qAcc01, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// 5*(b+e), subtracted by the VSUB below

;// Col1
        VADDL       qAcc23, dRes1, dRes6        ;// Acc = a+f
        VADDL       qSumCD, dRes3, dRes4        ;// c+d
        VADDL       qSumBE, dRes2, dRes5        ;// b+e
        VMLA        qAcc23, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc01, qAcc01, qTmp0       ;// Col0: Acc -= 5*(b+e)

;       VMLS        qAcc23, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// 5*(b+e), subtracted by the VSUB below

;// Col2
        VADDL       qAcc45, dRes2, dRes7        ;// Acc = a+f
        VADDL       qSumCD, dRes4, dRes5        ;// c+d
        VADDL       qSumBE, dRes3, dRes6        ;// b+e
        VMLA        qAcc45, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc23, qAcc23, qTmp0       ;// Col1: Acc -= 5*(b+e)

;       VMLS        qAcc45, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// 5*(b+e), subtracted by the VSUB below

;// Col3
        VADDL       qAcc67, dRes3, dRes8        ;// Acc = a+f
        VADDL       qSumCD, dRes5, dRes6        ;// c+d
        VADDL       qSumBE, dRes4, dRes7        ;// b+e
        VMLA        qAcc67, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc45, qAcc45, qTmp0       ;// Col2: Acc -= 5*(b+e)

        VMLS        qAcc67, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)

        VQRSHRUN    dTempAcc0, qAcc01, #10      ;// saturate((Acc + 512) >> 10) to U16
        VQRSHRUN    dTempAcc1, qAcc23, #10
        VQRSHRUN    dTempAcc2, qAcc45, #10
        VQRSHRUN    dTempAcc3, qAcc67, #10

        VQMOVN      dAcc0, qTAcc0               ;// narrow to U8; result left in registers
        VQMOVN      dAcc1, qTAcc1
        VQMOVN      dAcc2, qTAcc2
        VQMOVN      dAcc3, qTAcc3

        M_END

        ENDIF

        END