;//
;// 
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;// 

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
        M_VARIANTS CortexA8
        
        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe

    IF CortexA8
        
        ;//---------------------------------------------------------------------
        ;// armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ;//
        ;// Vertical half-sample luma interpolation for one 4x4 block
        ;// (H.264 / OpenMAX DL "HalfVer" case). Loads 9 consecutive source
        ;// rows a..i (8 bytes each) and, for each of the 4 output rows,
        ;// applies the 6-tap FIR (a - 5*b + 20*c + 20*d - 5*e + f) down each
        ;// column; VQRSHRUN then performs the rounding right shift by 5 with
        ;// saturation to unsigned 8-bit.
        ;//
        ;// In:   pSrc    (r0)  - source pointer (top row of the read window)
        ;//       srcStep (r1)  - source row stride in bytes
        ;//       pDst    (r2), dstStep (r3) - declared but not referenced in
        ;//                       the visible code (no store is performed here)
        ;// Out:  four interpolated rows left in d0, d2, d4, d6 (dAcc0..dAcc3)
        ;//
        ;// "unsafe" contract - NOTE(review), confirm against callers:
        ;//  - dCoeff5 (d30) and dCoeff20 (d31) are presumably preloaded with
        ;//    the filter taps 5 and 20 by the caller; they are not set here.
        ;//  - Each VLD1 reads 8 bytes per row, i.e. wider than the nominal
        ;//    4-pixel block width.
        ;//---------------------------------------------------------------------

        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0        ;// source pointer, advances through rows a..e
srcStep         RN 1        ;// source row stride (bytes)
pDst            RN 2        ;// unused in visible code
dstStep         RN 3        ;// unused in visible code

Temp            RN 12       ;// second read pointer: pSrc + 4*srcStep, rows f..i

;// Declare Neon registers
dCoeff5         DN 30.S16   ;// filter tap 5  (assumed preloaded - see header)
dCoeff20        DN 31.S16   ;// filter tap 20 (assumed preloaded - see header)

;// Nine consecutive source rows a..i, one VLD1 each
dSrc0           DN 7.U8     ;// row a
dSrc1           DN 8.U8     ;// row b
dSrc2           DN 9.U8     ;// row c
dSrc3           DN 10.U8    ;// row d
dSrc4           DN 11.U8    ;// row e
dSrc5           DN 12.U8    ;// row f
dSrc6           DN 13.U8    ;// row g
dSrc7           DN 14.U8    ;// row h
dSrc8           DN 15.U8    ;// row i

;// Widened partial sums; only the low D half of each Q register is consumed
qSumBE01        QN 8.S16
qSumCD01        QN 9.S16
dSumBE0         DN 16.S16   ;// low half of qSumBE01: per-column b+e
dSumCD0         DN 18.S16   ;// low half of qSumCD01: per-column c+d

;// 16-bit accumulators, one Q register per output row
qAcc01          QN 0.S16
qAcc23          QN 1.S16
qAcc45          QN 2.S16
qAcc67          QN 3.S16

;// Aliases of the accumulator low halves used while building each row
dRes0           DN 0.S16
dRes1           DN 2.S16
dRes2           DN 4.S16
dRes3           DN 6.S16

;// Final saturated U8 results (same registers as the accumulators),
;// left for the caller in d0, d2, d4, d6
dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8

;// Scratch for the 20*(c+d) products; kept separate from the accumulator
;// (VMUL + deferred VADD instead of VMLA) to shorten the dependency chain
dTmp0           DN 20.S16
dTmp1           DN 21.S16
dTmp2           DN 22.S16
dTmp3           DN 23.S16

        ;// Loads are issued through two pointers (pSrc: rows a..e,
        ;// Temp: rows f..i) and interleaved with the arithmetic below.
        VLD1        dSrc0, [pSrc], srcStep      ;// [a0 a1 a2 a3 .. ]
        ADD         Temp, pSrc, srcStep, LSL #2 ;// Temp = pSrc + 4*srcStep
        VLD1        dSrc1, [pSrc], srcStep      ;// [b0 b1 b2 b3 .. ]
        ;// One cycle stall
        VLD1        dSrc5, [Temp], srcStep      ;// [f0 f1 f2 f3 .. ]
        ;// One cycle stall
        VLD1        dSrc2, [pSrc], srcStep      ;// [c0 c1 c2 c3 .. ]

        ;// Output row 0: filter taps over source rows a..f
        VADDL       qAcc01, dSrc0, dSrc5        ;// Acc = a+f
        VLD1        dSrc3, [pSrc], srcStep      ;// [d0 d1 d2 d3 .. ]
        ;// One cycle stall
        VLD1        dSrc6, [Temp], srcStep      ;// [g0 g1 g2 g3 .. ] TeRi

        VLD1        dSrc4, [pSrc], srcStep      ;// [e0 e1 e2 e3 .. ]
        VLD1        dSrc7, [Temp], srcStep      ;// [h0 h1 h2 h3 .. ] TeRi
        VADDL       qSumBE01, dSrc1, dSrc4      ;// b+e
        VADDL       qSumCD01, dSrc2, dSrc3      ;// c+d
        VLD1        dSrc8, [Temp], srcStep      ;// [i0 i1 i2 i3 .. ] TeRi
        VMLS        dRes0, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
;        VMLA        dRes0, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp0, dSumCD0, dCoeff20    ;// 20*(c+d); folded into Acc below

        ;// Output row 1: filter taps over source rows b..g
;        VLD1        dSrc6, [Temp], srcStep
        VADDL       qSumBE01, dSrc2, dSrc5      ;// b+e
        VADDL       qSumCD01, dSrc3, dSrc4      ;// c+d
        VADDL       qAcc23, dSrc1, dSrc6        ;// Acc = a+f
        VMLS        dRes1, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
;        VMLA        dRes1, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp1, dSumCD0, dCoeff20    ;// 20*(c+d); folded into Acc below

        ;// Output row 2: filter taps over source rows c..h
;        VLD1        dSrc7, [Temp], srcStep
        VADDL       qSumBE01, dSrc3, dSrc6      ;// b+e
        VADDL       qSumCD01, dSrc4, dSrc5      ;// c+d
        VADDL       qAcc45, dSrc2, dSrc7        ;// Acc = a+f
        VMLS        dRes2, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
;        VMLA        dRes2, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp2, dSumCD0, dCoeff20    ;// 20*(c+d); folded into Acc below

        ;// Output row 3: filter taps over source rows d..i. The last row can
        ;// use VMLA directly; nothing depends on its result before VQRSHRUN.
;        VLD1        dSrc8, [Temp], srcStep     ;// [i0 i1 i2 i3 .. ]
        VADDL       qSumBE01, dSrc4, dSrc7      ;// b+e
        VADDL       qAcc67, dSrc3, dSrc8        ;// Acc = a+f
        VADDL       qSumCD01, dSrc5, dSrc6      ;// c+d
        VMLS        dRes3, dSumBE0, dCoeff5     ;// Acc -= 5*(b+e)
        VADD        dRes0, dRes0, dTmp0         ;// fold deferred 20*(c+d), row 0
        VADD        dRes1, dRes1, dTmp1         ;// fold deferred 20*(c+d), row 1
        VADD        dRes2, dRes2, dTmp2         ;// fold deferred 20*(c+d), row 2
        VMLA        dRes3, dSumCD0, dCoeff20    ;// Acc += 20*(c+d)
;        VMUL        dTmp3, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
;        VADD        dRes3, dRes3, dTmp3

        ;// (Acc + 16) >> 5, saturated to [0,255]: rounding narrow per row
        VQRSHRUN    dAcc0, qAcc01, #5
        VQRSHRUN    dAcc1, qAcc23, #5
        VQRSHRUN    dAcc2, qAcc45, #5
        VQRSHRUN    dAcc3, qAcc67, #5

        M_END
        
    ENDIF
        
        
        
        END
        