;//
;// (c) Copyright 2007 ARM Limited. All Rights Reserved.
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;// Syntax: ARM assembler (armasm) source; ';' starts a comment.
;// Contents:
;//   armVCM4P10_DequantLumaAC4x4                        (ARM1136JS and ARM1136JS_U bodies)
;//   omxVCM4P10_DequantTransformResidualFromPairAndAdd  (ARM1136JS body only)
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files
;// (For example tables)

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_TransformResidual4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixU16
        IMPORT armVCM4P10_QPModuloTable

        M_VARIANTS ARM1136JS, ARM1136JS_U

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//**********************************************************************************************
;//
;// Static Function: armVCM4P10_DequantLumaAC4x4
;//
;// Scales the sixteen 16-bit coefficients of a 4x4 block in place.
;//
;// In:   r0 = pSrcDst  Pointer to the 16 halfword coefficients. They are read with a single
;//                     LDMIA of 8 words and written back with a single STMIA, so the buffer
;//                     is presumably expected to be word aligned - confirm with callers.
;//       r1 = QP       Byte index into armVCM4P10_QPModuloTable and armVCM4P10_QPDivTable.
;// Out:  pSrcDst[i] = low 16 bits of pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift)
;//       for i = 0..15, where pVRow = armVCM4P10_VMatrixU16 + QPModuloTable[QP] (byte offset)
;//       and Shift = QPDivTable[QP].
;//       No return value is set.
;// Uses: r1-r12 and r14 as scratch. M_START/M_END are macros from the included headers;
;//       the 'r11' argument is presumably the highest callee-saved register they
;//       preserve - see armCOMM_s.h.
;//
;// Register aliases are deliberately reused once the earlier value is dead:
;//       r1 : QP       -> rowLuma01      r4 : pQPdiv -> rowLuma23
;//       r2 : pVRow    -> temp1          r5 : pQPmod -> SrcDst00
;//       r3 : shift    -> temp2          r6 : QPmod  -> SrcDst02
;// so the order of the table loads, the LDMIA and the two LSLs below must be preserved.
;//
;//**********************************************************************************************

;// Guarding implementation by the processor name

    IF ARM1136JS

;//Input Registers
pSrcDst         RN 0
QP              RN 1


;//Output Registers


;//Local Scratch Registers
pQPdiv          RN 4
pQPmod          RN 5
pVRow           RN 2
QPmod           RN 6
shift           RN 3
rowLuma01       RN 1
rowLuma23       RN 4

SrcDst00        RN 5
SrcDst02        RN 6
SrcDst10        RN 7
SrcDst12        RN 8
SrcDst20        RN 9
SrcDst22        RN 10
SrcDst30        RN 11
SrcDst32        RN 12

temp1           RN 2
temp2           RN 3
temp3           RN 14


        ;// Allocate stack memory required by the function

        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11

        LDR     pQPmod,=armVCM4P10_QPModuloTable
        LDR     pQPdiv,=armVCM4P10_QPDivTable
        LDR     pVRow,=armVCM4P10_VMatrixU16

        LDRSB   QPmod,[pQPmod,QP]                           ;// (QP%6) * 6
        LDRSB   shift,[pQPdiv,QP]                           ;// Shift = QP / 6

        ;// Halfword loads only: this variant never issues a word load from the table.
        ;// The writeback (!) leaves pVRow pointing at the selected row.
        LDRH    rowLuma01,[pVRow,QPmod]!                    ;// rowLuma01 = [00|0a]
        LDRH    temp3,[pVRow,#2]                            ;// temp3     = [00|0b]
        LDRH    rowLuma23,[pVRow,#4]                        ;// rowLuma23 = [00|0c]
        ORR     rowLuma01,rowLuma01,temp3,LSL #16           ;// rowLuma01 = [0b|0a]

        ;// Load all the 16 'src' values
        ;// (this overwrites r5/r6, i.e. pQPmod and QPmod, which are dead by now)
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8]
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;//
        ;//*********************************************************************************************

        LSL     rowLuma01,rowLuma01,shift
        LSL     rowLuma23,rowLuma23,shift


        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the Loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;//
        ;// We then pack the two 16 bit multiplication result into a word and store at one go
        ;//
        ;// Each SrcDst register holds two coefficients: the even-indexed one in the bottom
        ;// halfword and the odd-indexed one in the top halfword. The odd product goes to a
        ;// temp register first, because the even product overwrites the SrcDst register.
        ;//
        ;//**********************************************************************************************


        ;// Row 1

        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)

        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)

        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values


        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)

        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23                 ;// pSrcDst[6] * (pVRow[2]<<Shift)

        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values


        ;// Row 3

        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
        SMULBB  SrcDst20,SrcDst20,rowLuma01                 ;// pSrcDst[8] * (pVRow[0]<<Shift)

        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16             ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
        SMULBB  SrcDst22,SrcDst22,rowLuma01                 ;// pSrcDst[10] * (pVRow[0]<<Shift)

        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values



        ;// Row 4

        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23                 ;// pSrcDst[12] * (pVRow[2]<<Shift)

        ;// temp3 is used here because temp2 still holds the pSrcDst[11] product
        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23                 ;// pSrcDst[14] * (pVRow[2]<<Shift)

        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


        ;// Write all 16 scaled coefficients back in place
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;// Set return value



        ;// Write function tail
        M_END

    ENDIF                                                   ;//ARM1136JS


;//**********************************************************************************************
;//
;// Static Function: armVCM4P10_DequantLumaAC4x4   (ARM1136JS_U variant)
;//
;// Same contract, register usage and instruction schedule as the ARM1136JS body:
;//
;// In:   r0 = pSrcDst  Pointer to the 16 halfword coefficients (LDMIA/STMIA of 8 words).
;//       r1 = QP       Byte index into armVCM4P10_QPModuloTable and armVCM4P10_QPDivTable.
;// Out:  pSrcDst[i] = low 16 bits of pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift)
;// Uses: r1-r12 and r14 as scratch (see the alias list below; aliases are reused once the
;//       earlier value is dead, so the load order must be preserved).
;//
;// The only difference is how the multipliers are fetched: pVRow[0] and pVRow[1] come from
;// one word load at (table + QPmod). QPmod is a multiple of 6 (see the LDRSB comment), so
;// that address is not always word aligned - presumably the reason this body is guarded by
;// ARM1136JS_U; confirm against the variant definitions in armCOMM_s.h.
;//
;//**********************************************************************************************

;// Guarding implementation by the processor name

    IF ARM1136JS_U

;//Input Registers
pSrcDst         RN 0
QP              RN 1


;//Output Registers


;//Local Scratch Registers
pQPdiv          RN 4
pQPmod          RN 5
pVRow           RN 2
QPmod           RN 6
shift           RN 3
rowLuma01       RN 1
rowLuma23       RN 4

SrcDst00        RN 5
SrcDst02        RN 6
SrcDst10        RN 7
SrcDst12        RN 8
SrcDst20        RN 9
SrcDst22        RN 10
SrcDst30        RN 11
SrcDst32        RN 12

temp1           RN 2
temp2           RN 3
temp3           RN 14


        ;// Allocate stack memory required by the function

        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11

        LDR     pQPmod,=armVCM4P10_QPModuloTable
        LDR     pQPdiv,=armVCM4P10_QPDivTable
        LDR     pVRow,=armVCM4P10_VMatrixU16

        LDRSB   QPmod,[pQPmod,QP]                           ;// (QP%6) * 6
        LDRSB   shift,[pQPdiv,QP]                           ;// Shift = QP / 6

        LDR     rowLuma01,[pVRow,QPmod]!                    ;// rowLuma01 = [0b|0a]

        ;// Only the bottom halfword of rowLuma23 is ever consumed (every use below is the
        ;// 'B' second operand of SMULTB/SMULBB), so a halfword load is sufficient. A word
        ;// load here would also fetch the halfword that follows pVRow[2]; for the last
        ;// table row that lies beyond the row - presumably beyond the end of
        ;// armVCM4P10_VMatrixU16 (verify against the table definition) - and the access
        ;// would be unaligned whenever the row address is not a multiple of 4.
        LDRH    rowLuma23,[pVRow,#4]                        ;// rowLuma23 = [00|0c]

        ;// Load all the 16 'src' values
        ;// (this overwrites r5/r6, i.e. pQPmod and QPmod, which are dead by now)
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8]
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;//
        ;//*********************************************************************************************

        LSL     rowLuma01,rowLuma01,shift
        LSL     rowLuma23,rowLuma23,shift


        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the Loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;//
        ;// We then pack the two 16 bit multiplication result into a word and store at one go
        ;//
        ;// Each SrcDst register holds two coefficients: the even-indexed one in the bottom
        ;// halfword and the odd-indexed one in the top halfword. The odd product goes to a
        ;// temp register first, because the even product overwrites the SrcDst register.
        ;//
        ;//**********************************************************************************************


        ;// Row 1

        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)

        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)

        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values


        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)

        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23                 ;// pSrcDst[6] * (pVRow[2]<<Shift)

        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values


        ;// Row 3

        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
        SMULBB  SrcDst20,SrcDst20,rowLuma01                 ;// pSrcDst[8] * (pVRow[0]<<Shift)

        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16             ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
        SMULBB  SrcDst22,SrcDst22,rowLuma01                 ;// pSrcDst[10] * (pVRow[0]<<Shift)

        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values



        ;// Row 4

        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23                 ;// pSrcDst[12] * (pVRow[2]<<Shift)

        ;// temp3 is used here because temp2 still holds the pSrcDst[11] product
        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23                 ;// pSrcDst[14] * (pVRow[2]<<Shift)

        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16


        ;// Write all 16 scaled coefficients back in place
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}


        ;// Set return value



        ;// Write function tail
        M_END

    ENDIF                                                   ;//ARM1136JS_U





;//**********************************************************************************************
;//
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;//
;// Builds a 4x4 block of 16-bit residuals in a 32-byte stack buffer, adds it to a 4x4 block
;// of predicted bytes, clips each sum to [0,255] and stores the 4x4 result.
;//
;// In:   r0      = ppSrc     Passed unchanged as r0 to armVCM4P10_UnpackBlock4x4 (AC path only).
;//       r1      = pPred     Predicted pixels; one word (4 bytes) is read per row.
;//       r2      = pDC       Pointer to a signed 16-bit DC value. May be NULL on the AC path
;//                           (the DC overwrite is then skipped).
;//       r3      = pDst      Destination; one word (4 bytes) is written per row.
;//       [stack] = predStep  Byte step added to pPred after each row.
;//       [stack] = dstStep   Byte step added to pDst after each row.
;//       [stack] = QP        Passed as r1 to armVCM4P10_DequantLumaAC4x4 (AC path only).
;//       [stack] = AC        Zero selects the DC-only path, non-zero the AC path.
;// Out:  r0 = OMX_Sts_NoErr (always).
;//
;// AC path      : UnpackBlock4x4(ppSrc, pDelta); DequantLumaAC4x4(pDelta, QP);
;//                if (pDC != NULL) pDelta[0] = *pDC; TransformResidual4x4(pDelta, pDelta).
;// DC-only path : every pDelta[i] = (*pDC + 32) >> 6.
;//
;// NOTE(review): the DC-only path dereferences pDC without a NULL check, and no argument
;//               validation is performed anywhere in this routine - confirm that callers
;//               guarantee valid arguments.
;// NOTE(review): pPred and pDst rows are accessed with word LDR/STR - presumably they are
;//               required to be 4-byte aligned; confirm against the API contract.
;//
;// Register aliases are reused heavily once the earlier value is dead, e.g.
;//       r8 : pDCTemp -> PredVal      r10 : DCval     -> dstStep
;//       r1 : pPred/pDeltaArg1/QP -> predstep      r11 : DCvalCopy -> DeltaVal2
;// so statement order matters.
;//
;//**********************************************************************************************

;// Guarding implementation by the processor name

    IF ARM1136JS

;//Input Registers
ppSrc           RN 0
pPred           RN 1
pDC             RN 2
pDst            RN 3


;//Output Registers
result          RN 0

;//Local Scratch Registers
pDelta          RN 4
pDeltaTmp       RN 6
AC              RN 5                                        ;//Load from stack
pPredTemp       RN 7
pDCTemp         RN 8
pDstTemp        RN 9
pDeltaArg1      RN 1
pDeltaArg0      RN 0
QP              RN 1                                        ;//Load from stack
DCval           RN 10
DCvalCopy       RN 11
predstep        RN 1
dstStep         RN 10
ycounter        RN 0
PredVal1        RN 3
PredVal2        RN 5
DeltaVal1       RN 2
DeltaVal2       RN 11
PredVal         RN 8
tmpDeltaVal     RN 6
sum1            RN 12
sum2            RN 14



        ;// Allocate stack memory required by the function
        ;// (32 bytes = 16 halfword residuals)
        M_ALLOC8 pBuffer, 32


        ;// Write function header
        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11

        ;// Define stack arguments
        M_ARG   predStepOnStack, 4
        M_ARG   dstStepOnStack,4
        M_ARG   QPOnStack, 4
        M_ARG   ACOnStack,4


        M_ADR   pDelta,pBuffer
        M_LDR   AC,ACOnStack


        ;// Save registers r1,r2,r3 before function call
        MOV     pPredTemp,pPred
        MOV     pDCTemp,pDC
        MOV     pDstTemp,pDst

        CMP     AC,#0
        BEQ     DCcase
        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4

        BL      armVCM4P10_UnpackBlock4x4

        M_LDR   QP,QPOnStack                                ;// Set up r1 for DequantLumaAC4x4
        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for DequantLumaAC4x4

        BL      armVCM4P10_DequantLumaAC4x4


        ;// If a DC value is supplied it replaces the dequantised pDelta[0].
        ;// The two MOVs do not alter the flags set by CMP, so STRHNE still sees them.
        CMP     pDCTemp,#0
        LDRSHNE DCval,[pDCTemp]
        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for armVCM4P10_TransformResidual4x4
        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_TransformResidual4x4
        STRHNE  DCval,[pDelta]

        BL      armVCM4P10_TransformResidual4x4
        B       OutDCcase


DCcase
        LDRSH   DCval,[pDCTemp]
        ADD     DCval,DCval,#32                             ;// Rounding offset for the >>6 below
        ASR     DCval,DCval,#6
        PKHBT   DCval,DCval,DCval,LSL #16                   ;// Duplicating the Lower halfword
        MOV     DCvalCopy, DCval                            ;// Needed for STRD
        STRD    DCval, [pDelta, #0]                         ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3]  = DCval
        STRD    DCval, [pDelta, #8]                         ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7]  = DCval
        STRD    DCval, [pDelta, #16]                        ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
        STRD    DCval, [pDelta, #24]                        ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval


OutDCcase
        M_LDR   predstep,predStepOnStack
        M_LDR   dstStep,dstStepOnStack

        LDMIA   pDelta!,{tmpDeltaVal,DeltaVal2}             ;// Pre load
        MOV     ycounter,#4                                 ;// Counter for the PredPlusDeltaLoop
        LDR     PredVal,[pPredTemp]                         ;// Pre load

        ;// One iteration per row. Entering an iteration:
        ;//   tmpDeltaVal = [B|A], DeltaVal2 = [D|C]  (four 16-bit residuals of the row)
        ;//   PredVal     = [d c b a]                 (four predicted bytes of the row)
        ;// The flags set by SUBS are consumed by LDRGT, LDMGTIA and BGT; no instruction in
        ;// between updates the condition flags, so the pre-loads for the next row are
        ;// skipped on the last iteration.
PredPlusDeltaLoop


        SUBS    ycounter,ycounter,#1
        ADD     pPredTemp,pPredTemp,predstep                ;// Increment pPred ptr

        PKHBT   DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16     ;// Deltaval1 = [C A]
        PKHTB   DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16     ;// DeltaVal2 = [D B]

        UXTB16  PredVal1,PredVal                            ;// PredVal1 = [0c0a]
        UXTB16  PredVal2,PredVal,ROR #8                     ;// PredVal2 = [0d0b]

        LDRGT   PredVal,[pPredTemp]                         ;// Pre load

        QADD16  sum2,DeltaVal2,PredVal2                     ;// Add and saturate to 16 bits
        QADD16  sum1,DeltaVal1,PredVal1

        USAT16  sum2,#8,sum2                                ;// armClip(0,255,sum2)
        USAT16  sum1,#8,sum1

        LDMGTIA pDelta!,{tmpDeltaVal,DeltaVal2}             ;// Pre load

        ORR     sum1,sum1,sum2,LSL #8                       ;// sum1 = [dcba]
        STR     sum1,[pDstTemp]

        ADD     pDstTemp,pDstTemp,dstStep                   ;// Increment pDst ptr
        BGT     PredPlusDeltaLoop


        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

End


        ;// Write function tail

        M_END

    ENDIF                                                   ;//ARM1136JS


;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd

;// Guarding implementation by the processor name
;// NOTE(review): no further guarded body follows, so this file provides
;//               omxVCM4P10_DequantTransformResidualFromPairAndAdd only under ARM1136JS.




    END