;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe

;// Assembler: ARM RVCT (armasm). ';' starts a comment; ';//' is the house
;// doc-comment style. Register aliases are declared with RN (core), and
;// DN/QN (NEON D/Q registers with an element-type suffix).

DEBUG_ON    SETL {FALSE}

    IF CortexA8

;//-------------------------------------------------------------------------
;// armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe
;//
;// Horizontal half-pel luma interpolation for one 4x4 block, using the
;// H.264 / MPEG-4 Part 10 six-tap FIR filter (1,-5,20,20,-5,1).
;// For each output sample, with input taps a..f (a = x-2 .. f = x+3):
;//
;//     out = Sat8( (a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5 )
;//
;// built here as:  Acc = (a+f) + 20*(c+d) - 5*(b+e), then VQRSHRUN #5
;// (rounding add of 16 and unsigned saturation are folded into VQRSHRUN).
;//
;// In:    r0 = pSrc    - source pointer (reads 4 rows of 16 bytes; the
;//                       row base is the 'a' tap, i.e. 2 bytes left of the
;////                     interpolation centre)
;//        r1 = srcStep - source row stride in bytes
;//        r2 = pDst    - declared but NOT written in this routine
;//        r3 = dstStep - declared but NOT used in this routine
;// Out:   interpolated rows 0..3 are LEFT IN NEON REGISTERS
;//        d22, d24, d26, d28 (dAcc0/2/4/6) as 8 x U8 each (only the low
;//        4 lanes are the valid 4x4 results, see dTemp note below).
;//        pSrc has been advanced by 4*srcStep.
;// Note:  "_unsafe" - no bounds checks, no stores; the caller is expected
;//        to store/consume d22..d28 (hence pDst/dstStep being untouched).
;// NOTE(review): dCoeff5 (d30) and dCoeff20 (d31) are never loaded here -
;//        presumably preloaded with constants 5 and 20 by the caller; the
;//        VMLA/VMLS usage only makes sense if so - confirm against caller.
;// NOTE(review): each "TeRi" VMUL + later VSUB pair replaces the adjacent
;//        commented-out VMLS; looks like a Cortex-A8 scheduling/pipeline
;//        workaround - confirm before re-fusing into VMLS.
;//-------------------------------------------------------------------------

        M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare Neon registers
dCoeff5         DN 30.S16       ;// assumed = 5 (see NOTE above)
dCoeff20        DN 31.S16       ;// assumed = 20 (see NOTE above)

;// One 16-byte row of source per register pair (rows A..D of the block)
qSrcA01         QN 11.U8
qSrcB01         QN 12.U8
qSrcC01         QN 13.U8
qSrcD01         QN 14.U8

dSrcA0          DN 22.U8
dSrcA1          DN 23.U8
dSrcB0          DN 24.U8
dSrcB1          DN 25.U8
dSrcC0          DN 26.U8
dSrcC1          DN 27.U8
dSrcD0          DN 28.U8
dSrcD1          DN 29.U8

;// Shifted views of the current row (taps b, e, f), built with VEXT.
;// dSrcb (d12) aliases the low half of qSrcB01/qTemp23: it is always
;// consumed (VADDL into qTemp23) before those aliases are written.
dSrcb           DN 12.U8
dSrce           DN 13.U8
dSrcf           DN 10.U8

;// Taps c and d, kept in distinct registers per row (rows 0..3)
dSrc0c          DN 14.U8
dSrc1c          DN 16.U8
dSrc2c          DN 18.U8
dSrc3c          DN 20.U8

dSrc0d          DN 15.U8
dSrc1d          DN 17.U8
dSrc2d          DN 19.U8
dSrc3d          DN 21.U8

;// Widened (U8->S16) tap sums. dTemp0/dTemp2 are the LOW halves of
;// qTemp01/qTemp23 (d8 of q4, d12 of q6): only 4 output samples per row
;// are needed for a 4x4 block, so the high halves are ignored.
qTemp01         QN 4.S16
qTemp23         QN 6.S16
dTemp0          DN 8.S16
dTemp2          DN 12.S16

;// 16-bit accumulators; dRes0/2/4/6 are the low halves of qRes01..qRes67
;// and alias dSrcA0..dSrcD0 - each row's source is dead once accumulated.
qRes01          QN 11.S16
qRes23          QN 12.S16
qRes45          QN 13.S16
qRes67          QN 14.S16

dRes0           DN 22.S16
dRes2           DN 24.S16
dRes4           DN 26.S16
dRes6           DN 28.S16

;// Final narrowed U8 results (same physical regs as dRes0/2/4/6)
dAcc0           DN 22.U8
dAcc2           DN 24.U8
dAcc4           DN 26.U8
dAcc6           DN 28.U8

dResult0        DN 22.U32
dResult2        DN 24.U32
dResult4        DN 26.U32
dResult6        DN 28.U32

        ;// ---- Row A: gather taps and start the accumulation ----
        VLD1        qSrcA01, [pSrc], srcStep    ;// Load A register [a0 a1 a2 a3 ..]
        ;// One cycle stall
        VEXT        dSrcf, dSrcA0, dSrcA1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcA0, dSrcA1, #1   ;// [b0 b1 b2 b3 ..]
;        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
        VEXT        dSrc0c, dSrcA0, dSrcA1, #2  ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc0d, dSrcA0, dSrcA1, #3  ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcA0, dSrcA1, #4   ;// [e0 e1 e2 e3 ..]
        VADDL       qRes01, dSrcA0, dSrcf       ;// Acc = a+f      (widen U8->S16)
        VADDL       qTemp01, dSrc0c, dSrc0d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e  (overwrites dSrcb alias - already consumed)

        ;// ---- Row B load overlapped with row A arithmetic ----
        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
;        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
        VMLA        dRes0, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
;        VMLS        dRes0, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// 5*(b+e)  (TeRi: split VMLS, subtracted below)

        VEXT        dSrcf, dSrcB0, dSrcB1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcB0, dSrcB1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc1c, dSrcB0, dSrcB1, #2
        VEXT        dSrc1d, dSrcB0, dSrcB1, #3
        VEXT        dSrce, dSrcB0, dSrcB1, #4
        VADDL       qRes23, dSrcB0, dSrcf       ;// Acc = a+f

        VSUB        dRes0, dRes0, dTemp0        ;// Acc -= 5*(b+e)  (row A complete, pre-shift)

        VADDL       qTemp01, dSrc1c, dSrc1d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        ;// ---- Row C load overlapped with row B arithmetic ----
        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
;        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]

        VMLA        dRes2, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
;        VMLS        dRes2, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// 5*(b+e)  (TeRi: split VMLS, subtracted below)

        VEXT        dSrcf, dSrcC0, dSrcC1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcC0, dSrcC1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc2c, dSrcC0, dSrcC1, #2
        VEXT        dSrc2d, dSrcC0, dSrcC1, #3
        VEXT        dSrce, dSrcC0, dSrcC1, #4
        VADDL       qRes45, dSrcC0, dSrcf       ;// Acc = a+f

        VSUB        dRes2, dRes2, dTemp0        ;// Acc -= 5*(b+e)  (row B complete, pre-shift)

        VADDL       qTemp01, dSrc2c, dSrc2d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        ;// ---- Row D load overlapped with row C arithmetic ----
        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]

        VMLA        dRes4, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
;        VMLS        dRes4, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// 5*(b+e)  (TeRi: split VMLS, subtracted below)


        VEXT        dSrcf, dSrcD0, dSrcD1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcD0, dSrcD1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc3c, dSrcD0, dSrcD1, #2
        VEXT        dSrc3d, dSrcD0, dSrcD1, #3
        VEXT        dSrce, dSrcD0, dSrcD1, #4
        VADDL       qRes67, dSrcD0, dSrcf       ;// Acc = a+f

        VSUB        dRes4, dRes4, dTemp0        ;// Acc -= 5*(b+e)  (row C complete, pre-shift)

        ;// ---- Row D: no further load to hide, so the fused VMLS is kept ----
        VADDL       qTemp01, dSrc3c, dSrc3d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
        VMLA        dRes6, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
        VMLS        dRes6, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)

        ;// Round (+16), shift (>>5), saturate to U8; results stay in
        ;// d22/d24/d26/d28 for the caller.
        VQRSHRUN    dAcc0, qRes01, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc2, qRes23, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc4, qRes45, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc6, qRes67, #5           ;// Acc = Sat ((Acc + 16) / 32)

        M_END

    ENDIF


        END