;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//



;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files
;// (For example tables)
;// NOTE(review): armVCM4P10_TransformResidual4x4 is imported but only
;// referenced from a commented-out BL below (the transform is inlined).

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_TransformResidual4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixU16
        IMPORT armVCM4P10_QPModuloTable

        M_VARIANTS CortexA8

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Static Function: armVCM4P10_DequantLumaAC4x4
;// (no standalone body in this file; it is inlined into the function below)

;// Guarding implementation by the processor name


;//---------------------------------------------------------------------------
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;//
;// Register arguments (see the RN aliases below):
;//   r0 = ppSrc   passed through unchanged to armVCM4P10_UnpackBlock4x4
;//   r1 = pPred   prediction block; four 32-bit words are read, one per row,
;//                advancing by predStep between rows
;//   r2 = pDC     pointer to a signed 16-bit DC value; may be zero (NULL)
;//                only on the AC != 0 path
;//   r3 = pDst    destination; four 32-bit words are written, one per row,
;//                advancing by dstStep between rows
;// Stack arguments (declared with M_ARG, 4 bytes each, in this order):
;//   predStep, dstStep, QP, AC
;//
;// Flow:
;//   AC != 0 : unpack coefficients into the 32-byte stack buffer pBuffer,
;//             scale them with the QP-selected VMatrix row shifted left by
;//             the QPDivTable entry, replace coefficient 0 with *pDC when
;//             pDC != 0, run the 4x4 inverse transform and apply a rounding
;//             right shift by 6.
;//   AC == 0 : every residual value is (*pDC + 32) >> 6.
;//   Both    : residual + prediction, saturated to unsigned 8 bit, is stored
;//             to pDst.
;//
;// Return: r0 = OMX_Sts_NoErr (the only value this code returns).
;//
;// NOTE(review): no argument validation is performed here; in particular the
;// AC == 0 path loads from pDC without a NULL test. Confirm callers guarantee
;// valid arguments.
;//---------------------------------------------------------------------------

;// Guarding implementation by the processor name

        IF CortexA8


;// ARM Registers

;//Input Registers
ppSrc           RN 0
pPred           RN 1
pDC             RN 2
pDst            RN 3


;//Output Registers
result          RN 0

;//Local Scratch Registers
;// Several aliases below share one physical register; this is safe only
;// because of the instruction order in the function body.

;//Registers used in armVCM4P10_DequantLumaAC4x4
pQPdiv          RN 10
pQPmod          RN 11
pVRow           RN 2
QPmod           RN 12
shift           RN 14
index0          RN 1
index1          RN 10

;//Registers used in DequantTransformResidualFromPairAndAdd
pDelta          RN 4
pDeltaTmp       RN 6
AC              RN 5 ;//Load from stack
pPredTemp       RN 7
pDCTemp         RN 8
pDstTemp        RN 9
pDeltaArg1      RN 1
pDeltaArg0      RN 0
QP              RN 1 ;//Load from stack
DCval           RN 10
predstep        RN 1
dstStep         RN 10
PredVal1        RN 3
PredVal2        RN 5




;// Neon Registers

;// Registers used in armVCM4P10_DequantLumaAC4x4

dVmatrix        DN D6.8
dindexRow0      DN D7.32
dindexRow1      DN D9.32
dByteIndexRow0  DN D7.8
dByteIndexRow1  DN D9.8
dVRow0          DN D8.8
dVRow1          DN D4.8
dVRow0U16       DN D8.U16
dVRow1U16       DN D4.U16
dVRow2U16       DN D8.U16
dVRow3U16       DN D4.U16

dShift          DN D5.U16
dSrcRow0        DN D0.I16
dSrcRow1        DN D1.I16
dSrcRow2        DN D2.I16
dSrcRow3        DN D3.I16
dDqntRow0       DN D0.I16
dDqntRow1       DN D1.I16
dDqntRow2       DN D2.I16
dDqntRow3       DN D3.I16

;// Registers used in TransformResidual4x4

;// Packed Input pixels
dIn0            DN D0.S16
dIn1            DN D1.S16
dIn2            DN D2.S16
dIn3            DN D3.S16
qIn01           QN Q0.32
qIn23           QN Q1.32

;// Intermediate calculations
dZero           DN D4.S16
de0             DN D5.S16
de1             DN D6.S16
de2             DN D7.S16
de3             DN D8.S16
dIn1RS          DN D7.S16
dIn3RS          DN D8.S16
df0             DN D0.S16
df1             DN D1.S16
df2             DN D2.S16
df3             DN D3.S16
qf01            QN Q0.32
qf23            QN Q1.32
dg0             DN D5.S16
dg1             DN D6.S16
dg2             DN D7.S16
dg3             DN D8.S16
df1RS           DN D7.S16
df3RS           DN D8.S16

;// Output pixels
dh0             DN D0.S16
dh1             DN D1.S16
dh2             DN D2.S16
dh3             DN D3.S16

;// Registers used in DequantTransformResidualFromPairAndAdd

dDeltaRow0      DN D0.S16
dDeltaRow1      DN D1.S16
dDeltaRow2      DN D2.S16
dDeltaRow3      DN D3.S16
qDeltaRow01     QN Q0.S16
qDeltaRow23     QN Q1.S16

dPredValRow01   DN D4.U8
dPredValRow23   DN D5.U8

qSumRow01       QN Q3.S16
qSumRow23       QN Q4.S16
dDstRow01       DN D0.U8
dDstRow23       DN D1.U8
dDstRow0        DN D0.32[0]
dDstRow1        DN D0.32[1]
dDstRow2        DN D1.32[0]
dDstRow3        DN D1.32[1]


        ;// Allocate stack memory required by the function
        ;// (32 bytes = the 4x4 block of 16-bit values loaded by the VLD1 below)
        M_ALLOC8 pBuffer, 32


        ;// Write function header
        ;// NOTE(review): the r11,d9 operands presumably tell the armCOMM_s.h
        ;// macro which callee-saved core/NEON registers to preserve; r14 is
        ;// used as scratch ('shift') below, so it must be saved too - confirm
        ;// against armCOMM_s.h.
        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9

        ;// Define stack arguments
        M_ARG   predStepOnStack, 4
        M_ARG   dstStepOnStack,4
        M_ARG   QPOnStack, 4
        M_ARG   ACOnStack,4


        M_ADR   pDelta,pBuffer
        M_LDR   AC,ACOnStack


        ;// Save registers r1,r2,r3 before function call
        ;// (r1-r3 are also reused as scratch aliases later in this function)
        MOV     pPredTemp,pPred
        MOV     pDCTemp,pDC
        MOV     pDstTemp,pDst

        ;// AC == 0 : take the DC-only shortcut
        CMP     AC,#0
        BEQ     DCcase
        MOV     pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_UnpackBlock4x4

        ;// r0 still holds ppSrc at this point
        BL      armVCM4P10_UnpackBlock4x4

        ;//--------------------------------------------------------
        ;// armVCM4P10_DequantLumaAC4x4 : static function inlined
        ;//--------------------------------------------------------

        ;//BL      armVCM4P10_DequantLumaAC4x4
        M_LDR   QP,QPOnStack ;// Set up r1 for armVCM4P10_DequantLumaAC4x4

        LDR     pQPmod,=armVCM4P10_QPModuloTable
        LDR     pQPdiv,=armVCM4P10_QPDivTable
        LDR     pVRow,=armVCM4P10_VMatrixU16


        LDRSB   QPmod,[pQPmod,QP] ;// (QP%6) * 6
        LDRSB   shift,[pQPdiv,QP] ;// Shift = QP / 6

        ;// index1 and index0 overwrite pQPdiv (r10) and QP (r1); both table
        ;// lookups above are already done.
        LDR     index1,=0x03020504
        LDR     index0,=0x05040100 ;// Indexes into dVmatrix
        ADD     pVRow,pVRow,QPmod  ;// QPmod is used as a byte offset into the table
        VDUP    dindexRow0,index0
        VDUP    dindexRow1,index1
        VDUP    dShift,shift

        ;// Load all 4x4 pVRow[] values
        VLD1    dVmatrix,[pVRow] ;// dVmatrix = [0d|0c|0b|0a]


        ;// Lanes in the comments below are listed from highest to lowest
        VTBL    dVRow0,dVmatrix,dByteIndexRow0 ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
        VTBL    dVRow1,dVmatrix,dByteIndexRow1 ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]
        CMP     pDCTemp,#0 ;// Flags are consumed by LDRSHNE and VMOVNE below
        ;// Load all the 4x4 'src' values
        VLD1    { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta]

        VSHL    dVRow0U16,dVRow0U16,dShift
        VSHL    dVRow1U16,dVRow1U16,dShift
        LDRSHNE DCval,[pDCTemp] ;// Only when pDC != 0


        ;// Multiply src[] with pVRow[]
        ;// (dVRow2U16/dVRow3U16 alias the same registers as dVRow0U16/dVRow1U16)
        VMUL    dDqntRow0,dSrcRow0,dVRow0U16
        VMUL    dDqntRow1,dSrcRow1,dVRow1U16
        VMUL    dDqntRow2,dSrcRow2,dVRow2U16
        VMUL    dDqntRow3,dSrcRow3,dVRow3U16



        ;//-------------------------------------------------------------
        ;// TransformResidual4x4 : Inlined to avoid Load/Stores
        ;//-------------------------------------------------------------


        ;//BL      armVCM4P10_TransformResidual4x4
        ;//STRHNE  DCval,[pDelta]
        VMOVNE  dIn0[0],DCval ;// pDC != 0 : coefficient 0 is replaced by *pDC



        ;//*****************************************************************
        ;// Transpose the input pixels : perform Row ops as Col ops
        ;//*****************************************************************

        VTRN    dIn0,dIn1
        VTRN    dIn2,dIn3
        VTRN    qIn01,qIn23


        ;// dZero shares D4 with dVRow1/dVRow3U16, which are dead after the
        ;// VMULs above.
        VMOV    dZero,#0 ;// Used to right shift by 1


        ;//****************************************
        ;// Row Operations (Performed on columns)
        ;//****************************************


        VADD    de0,dIn0,dIn2      ;// e0 = d0 + d2
        VSUB    de1,dIn0,dIn2      ;// e1 = d0 - d2
        VHADD   dIn1RS,dIn1,dZero  ;// (d1>>1) : halving add with the zero register
        VHADD   dIn3RS,dIn3,dZero  ;// (d3>>1)
        VSUB    de2,dIn1RS,dIn3    ;// e2 = (d1>>1) - d3
        VADD    de3,dIn1,dIn3RS    ;// e3 = d1 + (d3>>1)
        VADD    df0,de0,de3        ;// f0 = e0 + e3
        VADD    df1,de1,de2        ;// f1 = e1 + e2
        VSUB    df2,de1,de2        ;// f2 = e1 - e2
        VSUB    df3,de0,de3        ;// f3 = e0 - e3



        ;//*****************************************************************
        ;// Transpose the resultant matrix
        ;//*****************************************************************

        VTRN    df0,df1
        VTRN    df2,df3
        VTRN    qf01,qf23


        ;//*******************************
        ;// Column Operations
        ;//*******************************


        VADD    dg0,df0,df2        ;// g0 = f0 + f2
        VSUB    dg1,df0,df2        ;// g1 = f0 - f2
        VHADD   df1RS,df1,dZero    ;// (f1>>1) : halving add with the zero register
        VHADD   df3RS,df3,dZero    ;// (f3>>1)
        VSUB    dg2,df1RS,df3      ;// g2 = (f1>>1) - f3
        VADD    dg3,df1,df3RS      ;// g3 = f1 + (f3>>1)
        VADD    dh0,dg0,dg3        ;// h0 = g0 + g3
        VADD    dh1,dg1,dg2        ;// h1 = g1 + g2
        VSUB    dh2,dg1,dg2        ;// h2 = g1 - g2
        VSUB    dh3,dg0,dg3        ;// h3 = g0 - g3


        ;//************************************************
        ;// Calculate final value (colOp[i][j] + 32)>>6
        ;//************************************************

        VRSHR   dh0,#6
        VRSHR   dh1,#6
        VRSHR   dh2,#6
        VRSHR   dh3,#6


        B       OutDCcase


DCcase
        ;// Calculate the Transformed DCvalue : (DCval+32)>>6
        ;// NOTE(review): pDC is dereferenced here without a NULL test.
        LDRSH   DCval,[pDCTemp]
        ADD     DCval,DCval,#32
        ASR     DCval,DCval,#6

        VDUP    dDeltaRow0, DCval ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3]  = DCval
        VDUP    dDeltaRow1, DCval ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7]  = DCval
        VDUP    dDeltaRow2, DCval ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
        VDUP    dDeltaRow3, DCval ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval


OutDCcase
        ;// Both paths arrive here with the residual rows in D0-D3
        ;// (dDeltaRow0-3 share registers with dh0-3).
        M_LDR   predstep,predStepOnStack
        M_LDR   dstStep,dstStepOnStack ;// Overwrites DCval (r10), no longer needed

        ;// Read the prediction one 32-bit word per row and pack two rows
        ;// into each D register
        LDR     PredVal1,[pPredTemp],predstep
        LDR     PredVal2,[pPredTemp],predstep
        VMOV    dPredValRow01,PredVal1,PredVal2

        LDR     PredVal1,[pPredTemp],predstep
        LDR     PredVal2,[pPredTemp]
        VMOV    dPredValRow23,PredVal1,PredVal2


        ;// Widen the unsigned 8-bit prediction, add it to the 16-bit residual,
        ;// then narrow with unsigned saturation back to 8 bit
        VADDW   qSumRow01,qDeltaRow01,dPredValRow01
        VADDW   qSumRow23,qDeltaRow23,dPredValRow23
        VQMOVUN dDstRow01,qSumRow01
        VQMOVUN dDstRow23,qSumRow23


        ;// Store one 32-bit lane per destination row
        VST1    dDstRow0,[pDstTemp],dstStep
        VST1    dDstRow1,[pDstTemp],dstStep
        VST1    dDstRow2,[pDstTemp],dstStep
        VST1    dDstRow3,[pDstTemp]

        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

End


        ;// Write function tail

        M_END

        ENDIF ;//CORTEXA8



        END