;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// Description:
;// Half-pel horizontal interpolation of four rows of a 4x4 luma block,
;// using the 6-tap filter (a - 5*b + 20*c + 20*d - 5*e + f), rounded and
;// saturated as Sat((Acc + 16) >> 5). The packed results are left in
;// dAcc0, dAcc2, dAcc4 and dAcc6; no store through pDst is performed in
;// this unsafe helper.
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe

DEBUG_ON    SETL {FALSE}

    IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

qSrcA01         QN 11.U8
qSrcB01         QN 12.U8
qSrcC01         QN 13.U8
qSrcD01         QN 14.U8

dSrcA0          DN 22.U8
dSrcA1          DN 23.U8
dSrcB0          DN 24.U8
dSrcB1          DN 25.U8
dSrcC0          DN 26.U8
dSrcC1          DN 27.U8
dSrcD0          DN 28.U8
dSrcD1          DN 29.U8

dSrcb           DN 12.U8
dSrce           DN 13.U8
dSrcf           DN 10.U8

dSrc0c          DN 14.U8
dSrc1c          DN 16.U8
dSrc2c          DN 18.U8
dSrc3c          DN 20.U8

dSrc0d          DN 15.U8
dSrc1d          DN 17.U8
dSrc2d          DN 19.U8
dSrc3d          DN 21.U8

qTemp01         QN 4.S16
qTemp23         QN 6.S16
dTemp0          DN 8.S16
dTemp2          DN 12.S16

qRes01          QN 11.S16
qRes23          QN 12.S16
qRes45          QN 13.S16
qRes67          QN 14.S16

dRes0           DN 22.S16
dRes2           DN 24.S16
dRes4           DN 26.S16
dRes6           DN 28.S16

dAcc0           DN 22.U8
dAcc2           DN 24.U8
dAcc4           DN 26.U8
dAcc6           DN 28.U8

dResult0        DN 22.U32
dResult2        DN 24.U32
dResult4        DN 26.U32
dResult6        DN 28.U32

        VLD1        qSrcA01, [pSrc], srcStep        ;// Load A register [a0 a1 a2 a3 ..]
        ;// One cycle stall
        VEXT        dSrcf, dSrcA0, dSrcA1, #5       ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcA0, dSrcA1, #1       ;// [b0 b1 b2 b3 ..]
;        VLD1        qSrcB01, [pSrc], srcStep       ;// Load B register [a0 a1 a2 a3 ..]
        VEXT        dSrc0c, dSrcA0, dSrcA1, #2      ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc0d, dSrcA0, dSrcA1, #3      ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcA0, dSrcA1, #4       ;// [e0 e1 e2 e3 ..]
        VADDL       qRes01, dSrcA0, dSrcf           ;// Acc = a+f
        VADDL       qTemp01, dSrc0c, dSrc0d         ;// c+d
        VADDL       qTemp23, dSrcb, dSrce           ;// b+e

        VLD1        qSrcB01, [pSrc], srcStep        ;// Load B register [a0 a1 a2 a3 ..]
;        VLD1        qSrcC01, [pSrc], srcStep       ;// Load C register [a0 a1 a2 a3 ..]
        VMLA        dRes0, dTemp0, dCoeff20         ;// Acc += 20*(c+d)
;        VMLS        dRes0, dTemp2, dCoeff5         ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5         ;// 5*(b+e)           TeRi

        VEXT        dSrcf, dSrcB0, dSrcB1, #5       ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcB0, dSrcB1, #1       ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc1c, dSrcB0, dSrcB1, #2      ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc1d, dSrcB0, dSrcB1, #3      ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcB0, dSrcB1, #4       ;// [e0 e1 e2 e3 ..]
        VADDL       qRes23, dSrcB0, dSrcf           ;// Acc = a+f

        VSUB        dRes0, dRes0, dTemp0            ;// Acc -= 5*(b+e)    TeRi

        VADDL       qTemp01, dSrc1c, dSrc1d         ;// c+d
        VADDL       qTemp23, dSrcb, dSrce           ;// b+e

        VLD1        qSrcC01, [pSrc], srcStep        ;// Load C register [a0 a1 a2 a3 ..]
;        VLD1        qSrcD01, [pSrc], srcStep       ;// Load D register [a0 a1 a2 a3 ..]

        VMLA        dRes2, dTemp0, dCoeff20         ;// Acc += 20*(c+d)
;        VMLS        dRes2, dTemp2, dCoeff5         ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5         ;// 5*(b+e)           TeRi

        VEXT        dSrcf, dSrcC0, dSrcC1, #5       ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcC0, dSrcC1, #1       ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc2c, dSrcC0, dSrcC1, #2      ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc2d, dSrcC0, dSrcC1, #3      ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcC0, dSrcC1, #4       ;// [e0 e1 e2 e3 ..]
        VADDL       qRes45, dSrcC0, dSrcf           ;// Acc = a+f

        VSUB        dRes2, dRes2, dTemp0            ;// Acc -= 5*(b+e)    TeRi

        VADDL       qTemp01, dSrc2c, dSrc2d         ;// c+d
        VADDL       qTemp23, dSrcb, dSrce           ;// b+e

        VLD1        qSrcD01, [pSrc], srcStep        ;// Load D register [a0 a1 a2 a3 ..]

        VMLA        dRes4, dTemp0, dCoeff20         ;// Acc += 20*(c+d)
;        VMLS        dRes4, dTemp2, dCoeff5         ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5         ;// 5*(b+e)           TeRi

        VEXT        dSrcf, dSrcD0, dSrcD1, #5       ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcD0, dSrcD1, #1       ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc3c, dSrcD0, dSrcD1, #2      ;// [c0 c1 c2 c3 ..]
        VEXT        dSrc3d, dSrcD0, dSrcD1, #3      ;// [d0 d1 d2 d3 ..]
        VEXT        dSrce, dSrcD0, dSrcD1, #4       ;// [e0 e1 e2 e3 ..]
        VADDL       qRes67, dSrcD0, dSrcf           ;// Acc = a+f

        VSUB        dRes4, dRes4, dTemp0            ;// Acc -= 5*(b+e)    TeRi

        VADDL       qTemp01, dSrc3c, dSrc3d         ;// c+d
        VADDL       qTemp23, dSrcb, dSrce           ;// b+e
        VMLA        dRes6, dTemp0, dCoeff20         ;// Acc += 20*(c+d)
        VMLS        dRes6, dTemp2, dCoeff5          ;// Acc -= 5*(b+e)

        VQRSHRUN    dAcc0, qRes01, #5               ;// Acc = Sat((Acc + 16) / 32)
        VQRSHRUN    dAcc2, qRes23, #5               ;// Acc = Sat((Acc + 16) / 32)
        VQRSHRUN    dAcc4, qRes45, #5               ;// Acc = Sat((Acc + 16) / 32)
        VQRSHRUN    dAcc6, qRes67, #5               ;// Acc = Sat((Acc + 16) / 32)

        M_END

    ENDIF

        END