;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe

        IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

Temp            RN 12

;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

dSrc0           DN 7.U8
dSrc1           DN 8.U8
dSrc2           DN 9.U8
dSrc3           DN 10.U8
dSrc4           DN 11.U8
dSrc5           DN 12.U8
dSrc6           DN 13.U8
dSrc7           DN 14.U8
dSrc8           DN 15.U8

qSumBE01        QN 8.S16
qSumCD01        QN 9.S16
dSumBE0         DN 16.S16
dSumCD0         DN 18.S16

qAcc01          QN 0.S16
qAcc23          QN 1.S16
qAcc45          QN 2.S16
qAcc67          QN 3.S16

dRes0           DN 0.S16
dRes1           DN 2.S16
dRes2           DN 4.S16
dRes3           DN 6.S16

dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8

dTmp0           DN 20.S16
dTmp1           DN 21.S16
dTmp2           DN 22.S16
dTmp3           DN 23.S16
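
;// A sketch of the computation below (assuming the standard H.264 6-tap
;// luma half-pel filter, which matches the 5/20 coefficients and the
;// round-shift-by-5 used here). Per column x, with a..f the six
;// vertically adjacent source pels:
;//
;//     acc    =       a[x] + f[x]
;//     acc   -=  5 * (b[x] + e[x])
;//     acc   += 20 * (c[x] + d[x])
;//     out[x] = clip255((acc + 16) >> 5)
;//
;// This "unsafe" variant performs no store: the four clamped output rows
;// are left in dAcc0-dAcc3 for the caller, and pDst/dstStep are unused.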

        VLD1        dSrc0, [pSrc], srcStep      ;// [a0 a1 a2 a3 .. ]
        ADD         Temp, pSrc, srcStep, LSL #2
        VLD1        dSrc1, [pSrc], srcStep      ;// [b0 b1 b2 b3 .. ]
        ;// One cycle stall
        VLD1        dSrc5, [Temp], srcStep      ;// [f0 f1 f2 f3 .. ]
        ;// One cycle stall
        VLD1        dSrc2, [pSrc], srcStep      ;// [c0 c1 c2 c3 .. ]
        VADDL       qAcc01, dSrc0, dSrc5        ;// Acc = a+f
        VLD1        dSrc3, [pSrc], srcStep      ;// [d0 d1 d2 d3 .. ]
        ;// One cycle stall
        VLD1        dSrc6, [Temp], srcStep      ;// TeRi

        VLD1        dSrc4, [pSrc], srcStep      ;// [e0 e1 e2 e3 .. ]
        VLD1        dSrc7, [Temp], srcStep      ;// TeRi
        VADDL       qSumBE01, dSrc1, dSrc4      ;// b+e
        VADDL       qSumCD01, dSrc2, dSrc3      ;// c+d
        VLD1        dSrc8, [Temp], srcStep      ;// TeRi
        VMLS        dRes0, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
;        VMLA        dRes0, dSumCD0, dCoeff20    ;// Acc += 20*(c+d)
        VMUL        dTmp0, dSumCD0, dCoeff20    ;// Tmp = 20*(c+d), folded into Acc below

;        VLD1        dSrc6, [Temp], srcStep
        VADDL       qSumBE01, dSrc2, dSrc5      ;// b+e
        VADDL       qSumCD01, dSrc3, dSrc4      ;// c+d
        VADDL       qAcc23, dSrc1, dSrc6        ;// Acc = a+f
        VMLS        dRes1, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
;        VMLA        dRes1, dSumCD0, dCoeff20    ;// Acc += 20*(c+d)
        VMUL        dTmp1, dSumCD0, dCoeff20    ;// Tmp = 20*(c+d), folded into Acc below

;        VLD1        dSrc7, [Temp], srcStep
        VADDL       qSumBE01, dSrc3, dSrc6      ;// b+e
        VADDL       qSumCD01, dSrc4, dSrc5      ;// c+d
        VADDL       qAcc45, dSrc2, dSrc7        ;// Acc = a+f
        VMLS        dRes2, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
;        VMLA        dRes2, dSumCD0, dCoeff20    ;// Acc += 20*(c+d)
        VMUL        dTmp2, dSumCD0, dCoeff20    ;// Tmp = 20*(c+d), folded into Acc below

;        VLD1        dSrc8, [Temp], srcStep     ;// [i0 i1 i2 i3 .. ]
        VADDL       qSumBE01, dSrc4, dSrc7      ;// b+e
        VADDL       qAcc67, dSrc3, dSrc8        ;// Acc = a+f
        VADDL       qSumCD01, dSrc5, dSrc6      ;// c+d
        VMLS        dRes3, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
        VADD        dRes0, dRes0, dTmp0         ;// fold in 20*(c+d) for row 0
        VADD        dRes1, dRes1, dTmp1         ;// fold in 20*(c+d) for row 1
        VADD        dRes2, dRes2, dTmp2         ;// fold in 20*(c+d) for row 2
        VMLA        dRes3, dSumCD0, dCoeff20    ;// Acc += 20*(c+d)
;        VMUL        dTmp3, dSumCD0, dCoeff20    ;// Acc += 20*(c+d)
;        VADD        dRes3, dRes3, dTmp3

        VQRSHRUN    dAcc0, qAcc01, #5           ;// Acc = clip255((Acc + 16) >> 5)
        VQRSHRUN    dAcc1, qAcc23, #5
        VQRSHRUN    dAcc2, qAcc45, #5
        VQRSHRUN    dAcc3, qAcc67, #5

        M_END

        ENDIF


        END