;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// Description:
;// Half-pel diagonal (horizontal and vertical half-sample) luma
;// interpolation for a 4x4 block. The H.264 6-tap filter (1,-5,20,20,-5,1)
;// is first applied horizontally to nine source rows (intermediate results
;// dRes0-dRes8), then vertically across those rows. The vertical sums are
;// rounded, shifted right by 10 and saturated to 8 bits, leaving one row of
;// the 4x4 result in each of dAcc0-dAcc3; this _unsafe variant does not
;// store to pDst. On this Cortex-A8 path most VMLS steps of the filter are
;// split into a VMUL plus a VSUB scheduled a few instructions later.
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe

        M_VARIANTS CortexA8

    IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16
qCoeff5         QN 14.S32
qCoeff20        QN 15.S32

qSrc01          QN 0.U8
dSrc0           DN 0.U8
dSrc1           DN 1.U8

dSrcb           DN 4.U8
dSrcc           DN 2.U8
dSrcd           DN 3.U8
dSrce           DN 5.U8
dSrcf           DN 1.U8

qSrcb           QN 2.S16
qSrcc           QN 1.S16
dSrcB           DN 4.S16
dSrcC           DN 2.S16

qRes0           QN 5.S16
qRes1           QN 6.S16
qRes2           QN 7.S16
qRes3           QN 8.S16
qRes4           QN 9.S16
qRes5           QN 10.S16
qRes6           QN 11.S16
qRes7           QN 12.S16
qRes8           QN 13.S16

dRes0           DN 10.S16
dRes1           DN 12.S16
dRes2           DN 14.S16
dRes3           DN 16.S16
dRes4           DN 18.S16
dRes5           DN 20.S16
dRes6           DN 22.S16
dRes7           DN 24.S16
dRes8           DN 26.S16

qAcc01          QN 5.S32
qAcc23          QN 6.S32
qAcc45          QN 2.S32
qAcc67          QN 3.S32
qSumBE          QN 0.S32
qSumCD          QN 1.S32

dTempAcc0       DN 0.U16
dTempAcc1       DN 2.U16
dTempAcc2       DN 4.U16
dTempAcc3       DN 6.U16

qTAcc0          QN 0.U16
qTAcc1          QN 1.U16
qTAcc2          QN 2.U16
qTAcc3          QN 3.U16

dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8

dTmp0           DN 8.S16
dTmp1           DN 9.S16
qTmp0           QN 4.S32

        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMOV        dCoeff20, #20
        VMOV        dCoeff5, #5

        ;// Row0
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes0, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]
        VMLA        dRes0, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes0, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted from Acc below

        ;// Row1
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes1, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes0, dRes0, dTmp0         ;// TeRi: Acc -= 5*(b+e)

        VMLA        dRes1, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes1, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted from Acc below

        ;// Row2
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes2, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes1, dRes1, dTmp0         ;// Acc -= 5*(b+e)

        VMLA        dRes2, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes2, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted from Acc below

        ;// Row3
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes3, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes2, dRes2, dTmp0         ;// Acc -= 5*(b+e)

        VMLA        dRes3, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes3, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted from Acc below

        ;// Row4
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes4, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes3, dRes3, dTmp0         ;// Acc -= 5*(b+e)

        VMLA        dRes4, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes4, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted from Acc below

        ;// Row5
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes5, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes4, dRes4, dTmp0         ;// Acc -= 5*(b+e)

        VMLA        dRes5, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes5, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted from Acc below

        ;// Row6
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes6, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes5, dRes5, dTmp0         ;// Acc -= 5*(b+e)

        VMLA        dRes6, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes6, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted from Acc below

        ;// Row7
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes7, dSrc0, dSrcf         ;// Acc=a+f
        VLD1        qSrc01, [pSrc], srcStep     ;// [a0 a1 a2 a3 ..]

        VSUB        dRes6, dRes6, dTmp0         ;// Acc -= 5*(b+e)

        VMLA        dRes7, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes7, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted from Acc below

        ;// Row8
        VEXT        dSrcb, dSrc0, dSrc1, #1     ;// [b0 b1 b2 b3 ..]
        VEXT        dSrcc, dSrc0, dSrc1, #2
        VEXT        dSrcd, dSrc0, dSrc1, #3
        VEXT        dSrce, dSrc0, dSrc1, #4
        VEXT        dSrcf, dSrc0, dSrc1, #5     ;// [f0 f1 f2 f3 ..]
        VADDL       qSrcc, dSrcc, dSrcd         ;// c+d
        VADDL       qSrcb, dSrcb, dSrce         ;// b+e
        VADDL       qRes8, dSrc0, dSrcf         ;// Acc=a+f

        VSUB        dRes7, dRes7, dTmp0         ;// Acc -= 5*(b+e)

        VMLA        dRes8, dSrcC, dCoeff20      ;// Acc += 20*(c+d)
;        VMLS        dRes8, dSrcB, dCoeff5       ;// Acc -= 5*(b+e)
        VMUL        dTmp0, dSrcB, dCoeff5       ;// 5*(b+e), subtracted from Acc below

        VMOV        qCoeff20, #20
        VMOV        qCoeff5, #5

        ;// Col0
        VADDL       qAcc01, dRes0, dRes5        ;// Acc = a+f
        VADDL       qSumCD, dRes2, dRes3        ;// c+d
        VADDL       qSumBE, dRes1, dRes4        ;// b+e

        VSUB        dRes8, dRes8, dTmp0         ;// Acc -= 5*(b+e)

        VMLA        qAcc01, qSumCD, qCoeff20    ;// Acc += 20*(c+d)
;        VMLS        qAcc01, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// 5*(b+e), subtracted from Acc below

        ;// Col1
        VADDL       qAcc23, dRes1, dRes6        ;// Acc = a+f
        VADDL       qSumCD, dRes3, dRes4        ;// c+d
        VADDL       qSumBE, dRes2, dRes5        ;// b+e
        VMLA        qAcc23, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc01, qAcc01, qTmp0       ;// Acc -= 5*(b+e)

;        VMLS        qAcc23, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// 5*(b+e), subtracted from Acc below

        ;// Col2
        VADDL       qAcc45, dRes2, dRes7        ;// Acc = a+f
        VADDL       qSumCD, dRes4, dRes5        ;// c+d
        VADDL       qSumBE, dRes3, dRes6        ;// b+e
        VMLA        qAcc45, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc23, qAcc23, qTmp0       ;// Acc -= 5*(b+e)

;        VMLS        qAcc45, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)
        VMUL        qTmp0, qSumBE, qCoeff5      ;// 5*(b+e), subtracted from Acc below

        ;// Col3
        VADDL       qAcc67, dRes3, dRes8        ;// Acc = a+f
        VADDL       qSumCD, dRes5, dRes6        ;// c+d
        VADDL       qSumBE, dRes4, dRes7        ;// b+e
        VMLA        qAcc67, qSumCD, qCoeff20    ;// Acc += 20*(c+d)

        VSUB        qAcc45, qAcc45, qTmp0       ;// Acc -= 5*(b+e)

        VMLS        qAcc67, qSumBE, qCoeff5     ;// Acc -= 5*(b+e)

        ;// Round and shift down by 10, saturating to unsigned 16 bits
        VQRSHRUN    dTempAcc0, qAcc01, #10
        VQRSHRUN    dTempAcc1, qAcc23, #10
        VQRSHRUN    dTempAcc2, qAcc45, #10
        VQRSHRUN    dTempAcc3, qAcc67, #10

        ;// Saturate and narrow to 8 bits; one output row per dAccN
        VQMOVN      dAcc0, qTAcc0
        VQMOVN      dAcc1, qTAcc1
        VQMOVN      dAcc2, qTAcc2
        VQMOVN      dAcc3, qTAcc3

        M_END

    ENDIF



    END