;//
;//
;// File Name:  omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//



;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files
;// (For example tables)
;// NOTE: armVCM4P10_TransformResidual4x4 is imported but only referenced
;// from comments below; its work is done inline in this file.

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_TransformResidual4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixU16
        IMPORT armVCM4P10_QPModuloTable

        M_VARIANTS CortexA8

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;// Static Function: armVCM4P10_DequantLumaAC4x4
;// (no separate body in this file: it is inlined into the function below)


;//----------------------------------------------------------------------
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;//
;// Produces one 4x4 block of 8-bit output by adding a 4x4 block of
;// 16-bit residual values to a 4x4 block of 8-bit prediction values and
;// saturating the sums to unsigned 8 bits.
;//
;// The residual is built in one of two ways, selected by AC:
;//   AC != 0 : armVCM4P10_UnpackBlock4x4 fills the 32-byte stack buffer,
;//             the 16 values are scaled (dequantised), element 0 is
;//             replaced by *pDC when pDC is non-zero, and the 4x4
;//             inverse transform with a final rounding shift by 6 is
;//             applied.
;//   AC == 0 : all 16 residual values are set to (*pDC + 32) >> 6.
;//
;// In:   r0      = ppSrc    (not modified before the call to
;//                           armVCM4P10_UnpackBlock4x4; not read when AC == 0)
;//       r1      = pPred    (read as four 32-bit words, predStep bytes apart)
;//       r2      = pDC      (signed 16-bit value; dereferenced unconditionally
;//                           when AC == 0, only if non-zero when AC != 0)
;//       r3      = pDst     (written as four 32-bit words, dstStep bytes apart)
;//       [stack] = predStep, dstStep, QP, AC   (declared in that order)
;// Out:  r0      = OMX_Sts_NoErr (the only value this code returns)
;//
;// No argument validation is performed here.
;// NOTE(review): register save/restore is done by M_START/M_END from
;// armCOMM_s.h (invoked with r11,d9); the code below uses r4-r12, r14
;// and D0-D9 -- confirm the macro covers the callee-saved ones.
;//----------------------------------------------------------------------

;// Guarding implementation by the processor name

    IF CortexA8


;// ARM Registers
;//
;// Several aliases below name the same physical register (for example
;// r1 = pPred / index0 / pDeltaArg1 / QP / predstep and
;// r10 = pQPdiv / index1 / DCval / dstStep). Their live ranges in the
;// code do not overlap; keep that in mind before reordering anything.

;//Input Registers
ppSrc       RN  0
pPred       RN  1
pDC         RN  2
pDst        RN  3


;//Output Registers
result      RN  0

;//Local Scratch Registers

;//Registers used in armVCM4P10_DequantLumaAC4x4
pQPdiv      RN  10
pQPmod      RN  11
pVRow       RN  2
QPmod       RN  12
shift       RN  14
index0      RN  1
index1      RN  10

;//Registers used in DequantTransformResidualFromPairAndAdd
pDelta      RN  4
pDeltaTmp   RN  6
AC          RN  5                   ;//Load from stack
pPredTemp   RN  7
pDCTemp     RN  8
pDstTemp    RN  9
pDeltaArg1  RN  1
pDeltaArg0  RN  0
QP          RN  1                   ;//Load from stack
DCval       RN  10
predstep    RN  1
dstStep     RN  10
PredVal1    RN  3
PredVal2    RN  5




;// Neon Registers
;//
;// As with the ARM registers, many names share one D/Q register and
;// differ only in the element type attached to it.

;// Registers used in armVCM4P10_DequantLumaAC4x4

dVmatrix        DN  D6.8
dindexRow0      DN  D7.32
dindexRow1      DN  D9.32
dByteIndexRow0  DN  D7.8
dByteIndexRow1  DN  D9.8
dVRow0          DN  D8.8
dVRow1          DN  D4.8
dVRow0U16       DN  D8.U16
dVRow1U16       DN  D4.U16
dVRow2U16       DN  D8.U16          ;// same register as dVRow0U16
dVRow3U16       DN  D4.U16          ;// same register as dVRow1U16

dShift          DN  D5.U16
dSrcRow0        DN  D0.I16
dSrcRow1        DN  D1.I16
dSrcRow2        DN  D2.I16
dSrcRow3        DN  D3.I16
dDqntRow0       DN  D0.I16
dDqntRow1       DN  D1.I16
dDqntRow2       DN  D2.I16
dDqntRow3       DN  D3.I16

;// Registers used in TransformResidual4x4

;// Packed Input pixels
dIn0            DN  D0.S16
dIn1            DN  D1.S16
dIn2            DN  D2.S16
dIn3            DN  D3.S16
qIn01           QN  Q0.32
qIn23           QN  Q1.32

;// Intermediate calculations
dZero           DN  D4.S16
de0             DN  D5.S16
de1             DN  D6.S16
de2             DN  D7.S16
de3             DN  D8.S16
dIn1RS          DN  D7.S16
dIn3RS          DN  D8.S16
df0             DN  D0.S16
df1             DN  D1.S16
df2             DN  D2.S16
df3             DN  D3.S16
qf01            QN  Q0.32
qf23            QN  Q1.32
dg0             DN  D5.S16
dg1             DN  D6.S16
dg2             DN  D7.S16
dg3             DN  D8.S16
df1RS           DN  D7.S16
df3RS           DN  D8.S16

;// Output pixels
dh0             DN  D0.S16
dh1             DN  D1.S16
dh2             DN  D2.S16
dh3             DN  D3.S16

;// Registers used in DequantTransformResidualFromPairAndAdd

dDeltaRow0      DN  D0.S16
dDeltaRow1      DN  D1.S16
dDeltaRow2      DN  D2.S16
dDeltaRow3      DN  D3.S16
qDeltaRow01     QN  Q0.S16
qDeltaRow23     QN  Q1.S16

dPredValRow01   DN  D4.U8
dPredValRow23   DN  D5.U8

qSumRow01       QN  Q3.S16
qSumRow23       QN  Q4.S16
dDstRow01       DN  D0.U8
dDstRow23       DN  D1.U8
dDstRow0        DN  D0.32[0]
dDstRow1        DN  D0.32[1]
dDstRow2        DN  D1.32[0]
dDstRow3        DN  D1.32[1]


        ;// Allocate stack memory required by the function
        ;// 32 bytes = the 4x4 block of 16-bit values loaded from pDelta below
        M_ALLOC8 pBuffer, 32


        ;// Write function header
        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9

        ;// Define stack arguments
        M_ARG   predStepOnStack, 4
        M_ARG   dstStepOnStack,4
        M_ARG   QPOnStack,  4
        M_ARG   ACOnStack,4


        M_ADR   pDelta,pBuffer
        M_LDR   AC,ACOnStack


        ;// Save registers r1,r2,r3 before function call
        ;// (r1-r3 are also reused as scratch aliases later on)
        MOV     pPredTemp,pPred
        MOV     pDCTemp,pDC
        MOV     pDstTemp,pDst

        CMP     AC,#0
        BEQ     DCcase                              ;// AC == 0: DC-only path
        MOV     pDeltaArg1,pDelta                   ;// Set up r1 for armVCM4P10_UnpackBlock4x4

        ;// NOTE(review): presumably fills the 16 halfwords at pDelta from
        ;// the stream referenced by ppSrc (still in r0) -- confirm against
        ;// armVCM4P10_UnpackBlock4x4.
        BL      armVCM4P10_UnpackBlock4x4

        ;//--------------------------------------------------------
        ;// armVCM4P10_DequantLumaAC4x4 : static function inlined
        ;//--------------------------------------------------------

        ;//BL      armVCM4P10_DequantLumaAC4x4
        M_LDR   QP,QPOnStack                        ;// Set up r1 for armVCM4P10_DequantLumaAC4x4

        LDR     pQPmod,=armVCM4P10_QPModuloTable
        LDR     pQPdiv,=armVCM4P10_QPDivTable
        LDR     pVRow,=armVCM4P10_VMatrixU16


        LDRSB   QPmod,[pQPmod,QP]                   ;// (QP%6) * 6
        LDRSB   shift,[pQPdiv,QP]                   ;// Shift = QP / 6

        ;// r10 (pQPdiv) and r1 (QP) are no longer needed, so they are
        ;// reused here as index1 / index0.
        LDR     index1,=0x03020504
        LDR     index0,=0x05040100                  ;// Indexes into dVmatrix
        ADD     pVRow,pVRow,QPmod                   ;// QPmod is used as a byte offset into the table
        VDUP    dindexRow0,index0
        VDUP    dindexRow1,index1
        VDUP    dShift,shift

        ;// Load all 4x4 pVRow[] values
        VLD1    dVmatrix,[pVRow]                    ;// dVmatrix = [0d|0c|0b|0a]


        ;// The byte indexes select 16-bit entries of dVmatrix:
        ;// 0x05040100 -> entries 0 and 2, 0x03020504 -> entries 2 and 1.
        VTBL    dVRow0,dVmatrix,dByteIndexRow0      ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
        VTBL    dVRow1,dVmatrix,dByteIndexRow1      ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]

        ;// The condition set here is consumed by LDRSHNE and VMOVNE below;
        ;// no flag-setting instruction sits in between.
        CMP     pDCTemp,#0
        ;// Load all the 4x4 'src' values
        VLD1    { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta]

        VSHL    dVRow0U16,dVRow0U16,dShift          ;// scale factors << (QP / 6)
        VSHL    dVRow1U16,dVRow1U16,dShift
        LDRSHNE DCval,[pDCTemp]                     ;// only when pDC != 0


        ;// Multiply src[] with pVRow[]
        ;// (dVRow2U16/dVRow3U16 are the same registers as dVRow0U16/dVRow1U16)
        VMUL    dDqntRow0,dSrcRow0,dVRow0U16
        VMUL    dDqntRow1,dSrcRow1,dVRow1U16
        VMUL    dDqntRow2,dSrcRow2,dVRow2U16
        VMUL    dDqntRow3,dSrcRow3,dVRow3U16



        ;//-------------------------------------------------------------
        ;// TransformResidual4x4 : Inlined to avoid Load/Stores
        ;//-------------------------------------------------------------


        ;//BL      armVCM4P10_TransformResidual4x4
        ;//STRHNE  DCval,[pDelta]
        ;// Register equivalent of the store above: when pDC != 0,
        ;// element 0 of row 0 is overwritten with *pDC.
        VMOVNE  dIn0[0],DCval



        ;//*****************************************************************
        ;// Transpose the input pixels : perform Row ops as Col ops
        ;//*****************************************************************

        VTRN    dIn0,dIn1
        VTRN    dIn2,dIn3
        VTRN    qIn01,qIn23


        ;// D4 held dVRow1U16/dVRow3U16 until the VMULs above; it is free now.
        VMOV    dZero,#0                            ;// Used to right shift by 1


        ;//****************************************
        ;// Row Operations (Performed on columns)
        ;//****************************************


        VADD    de0,dIn0,dIn2                       ;// e0 = d0 + d2
        VSUB    de1,dIn0,dIn2                       ;// e1 = d0 - d2
        VHADD   dIn1RS,dIn1,dZero                   ;// (d1>>1): halving add with dZero, a register holding 0
        VHADD   dIn3RS,dIn3,dZero                   ;// (d3>>1)
        VSUB    de2,dIn1RS,dIn3                     ;// e2 = (d1>>1) - d3
        VADD    de3,dIn1,dIn3RS                     ;// e3 = d1 + (d3>>1)
        VADD    df0,de0,de3                         ;// f0 = e0 + e3
        VADD    df1,de1,de2                         ;// f1 = e1 + e2
        VSUB    df2,de1,de2                         ;// f2 = e1 - e2
        VSUB    df3,de0,de3                         ;// f3 = e0 - e3



        ;//*****************************************************************
        ;// Transpose the resultant matrix
        ;//*****************************************************************

        VTRN    df0,df1
        VTRN    df2,df3
        VTRN    qf01,qf23


        ;//*******************************
        ;// Column Operations
        ;//*******************************


        VADD    dg0,df0,df2                         ;// g0 = f0 + f2
        VSUB    dg1,df0,df2                         ;// g1 = f0 - f2
        VHADD   df1RS,df1,dZero                     ;// (f1>>1): halving add with dZero, a register holding 0
        VHADD   df3RS,df3,dZero                     ;// (f3>>1)
        VSUB    dg2,df1RS,df3                       ;// g2 = (f1>>1) - f3
        VADD    dg3,df1,df3RS                       ;// g3 = f1 + (f3>>1)
        VADD    dh0,dg0,dg3                         ;// h0 = g0 + g3
        VADD    dh1,dg1,dg2                         ;// h1 = g1 + g2
        VSUB    dh2,dg1,dg2                         ;// h2 = g1 - g2
        VSUB    dh3,dg0,dg3                         ;// h3 = g0 - g3


        ;//************************************************
        ;// Calculate final value (colOp[i][j] + 32)>>6
        ;//************************************************

        VRSHR   dh0,#6
        VRSHR   dh1,#6
        VRSHR   dh2,#6
        VRSHR   dh3,#6


        B       OutDCcase                           ;// residual is in D0-D3; skip the DC-only path


DCcase
        ;// Calculate the Transformed DCvalue : (DCval+32)>>6
        ;// pDC is dereferenced without a zero check on this path.
        LDRSH   DCval,[pDCTemp]
        ADD     DCval,DCval,#32
        ASR     DCval,DCval,#6

        VDUP    dDeltaRow0, DCval                   ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3]  = DCval
        VDUP    dDeltaRow1, DCval                   ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7]  = DCval
        VDUP    dDeltaRow2, DCval                   ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
        VDUP    dDeltaRow3, DCval                   ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval


OutDCcase
        ;// Both paths arrive here with the 4x4 residual in D0-D3
        ;// (dDeltaRow0..3 / dh0..3 name the same registers).
        M_LDR   predstep,predStepOnStack
        M_LDR   dstStep,dstStepOnStack

        ;// Four prediction rows, one 32-bit word each; the pointer is
        ;// post-incremented by predstep after all but the last load.
        LDR     PredVal1,[pPredTemp],predstep
        LDR     PredVal2,[pPredTemp],predstep
        VMOV    dPredValRow01,PredVal1,PredVal2     ;// rows 0 and 1 packed into one D register

        LDR     PredVal1,[pPredTemp],predstep
        LDR     PredVal2,[pPredTemp]
        VMOV    dPredValRow23,PredVal1,PredVal2     ;// rows 2 and 3 packed into one D register


        ;// 16-bit residual + widened 8-bit prediction, then saturating
        ;// narrow of the signed sums to unsigned 8 bits.
        VADDW   qSumRow01,qDeltaRow01,dPredValRow01
        VADDW   qSumRow23,qDeltaRow23,dPredValRow23
        VQMOVUN dDstRow01,qSumRow01
        VQMOVUN dDstRow23,qSumRow23


        ;// One 32-bit lane per output row; the pointer is post-incremented
        ;// by dstStep after all but the last store.
        VST1    dDstRow0,[pDstTemp],dstStep
        VST1    dDstRow1,[pDstTemp],dstStep
        VST1    dDstRow2,[pDstTemp],dstStep
        VST1    dDstRow3,[pDstTemp]

        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

End


        ;// Write function tail

        M_END

    ENDIF                                           ;//CORTEXA8



    END