;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;// Syntax: ARM (armasm) assembler. ';' starts a comment that runs to the end
;// of the line, so every statement must sit on its own line.
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import/Export symbols required from/to other files
;// (For example tables)

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixQPModTable

        M_VARIANTS ARM1136JS

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//----------------------------------------------------------------------------
;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// In-place 4x4 inverse transform + dequantisation of a block of packed
;// 16-bit coefficients (two coefficients per 32-bit word, eight words).
;//
;// In:   r0 = pData  pointer to the 4x4 block; read with one LDMIA and
;//                   overwritten with one STMIA
;//       r1 = QP     byte index into armVCM4P10_QPDivTable and
;//                   armVCM4P10_VMatrixQPModTable
;// Out:  the block at pData holds the transformed, dequantised values
;// Uses: r0-r12 and r14 as working registers; pData is parked in the stack
;//       slot pDataOnStack while r0 is reused.
;//       NOTE(review): register save/restore is done by M_START/M_END from
;//       armCOMM_s.h (invoked with r11) - presumably r4-r11 and lr are
;//       preserved; confirm against the macro definition.
;//
;// Many RN aliases below deliberately share a physical register. The
;// instruction order in the body relies on each value being consumed before
;// the alias that shares its register is written - do not reorder without
;// re-checking the aliasing.
;//----------------------------------------------------------------------------

;// Guarding implementation by the processor name

    IF ARM1136JS


;//Input Registers
pData       RN  0
QP          RN  1

;//Output Registers


;//Local Scratch Registers

;// Packed Input pixels
in00        RN  2                   ;// Src[0]  & Src[1]
in02        RN  3                   ;// Src[2]  & Src[3]
in10        RN  4                   ;// Src[4]  & Src[5]
in12        RN  5                   ;// Src[6]  & Src[7]
in20        RN  6                   ;// Src[8]  & Src[9]
in22        RN  7                   ;// Src[10] & Src[11]
in30        RN  8                   ;// Src[12] & Src[13]
in32        RN  9                   ;// Src[14] & Src[15]

;// Transpose for Row operations (Rows to cols)
trRow00     RN  2
trRow10     RN  10
trRow02     RN  3
trRow12     RN  5
trRow20     RN  11
trRow30     RN  12
trRow32     RN  14
trRow22     RN  7

;// Intermediate calculations
rowSum1     RN  4
rowSum2     RN  6
rowDiff1    RN  8
rowDiff2    RN  9


;// Row operated pixels
rowOp00     RN  2
rowOp10     RN  10
rowOp20     RN  11
rowOp30     RN  12
rowOp02     RN  3
rowOp12     RN  5
rowOp22     RN  7
rowOp32     RN  14

;// Transpose for column operations
trCol00     RN  2
trCol02     RN  3
trCol10     RN  4
trCol12     RN  5
trCol20     RN  6
trCol22     RN  7
trCol30     RN  8
trCol32     RN  9

;// Intermediate calculations
colSum1     RN  10
colSum2     RN  11
colDiff1    RN  12
colDiff2    RN  14


;// Column operated pixels
colOp00     RN  2
colOp02     RN  3
colOp10     RN  4
colOp12     RN  5
colOp20     RN  6
colOp22     RN  7
colOp30     RN  8
colOp32     RN  9

;// Temporary scratch variables
pQPDivTable RN  0                   ;// shares r0 with pData and Round
pQPModTable RN  11                  ;// shares r11 with colSum2
Shift       RN  10                  ;// shares r10 with colSum1
Scale       RN  14                  ;// shares r14 with colDiff2
Round       RN  0

temp1       RN  10
temp2       RN  11
temp3       RN  12
temp4       RN  1                   ;// shares r1 with QP (QP is dead by then)



;// InvTransformed and Dequantized pixels
out00       RN  2
out02       RN  3
out10       RN  4
out12       RN  5
out20       RN  6
out22       RN  7
out30       RN  8
out32       RN  9




        ;// Allocate stack memory required by the function:
        ;// one slot (pDataOnStack) used to park pData while r0 is reused
        M_ALLOC4    pDataOnStack, 4

        ;// Write function header
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11

        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 8 registers                           *
        ;// Transpose the 4x4 matrix                                      *
        ;// Perform the row operations (on columns) using SIMD            *
        ;// Transpose the 4x4 result matrix                               *
        ;// Perform the column operations                                 *
        ;// Scale (dequantise) every element                              *
        ;// Store the 4x4 block at one go                                 *
        ;******************************************************************

        ;// Load all the 4x4 pixels

        LDMIA   pData,{in00,in02,in10,in12,in20,in22,in30,in32}

        ;//*****************************************************************
        ;//
        ;// Transpose the matrix in order to perform row ops as column ops
        ;// Input:  in[][]    = original matrix
        ;// Output: trRow[][] = transposed matrix
        ;//
        ;// Built from four pairs of pack-halfword instructions. With a
        ;// 16-bit shift:
        ;//   PKHBT d,a,b,LSL #16 -> d = | low half of b  | low half of a  |
        ;//   PKHTB d,a,b,ASR #16 -> d = | high half of a | high half of b |
        ;// Registers are shown below as | high halfword | low halfword |.
        ;//
        ;// In each pair the instruction that overwrites one of its own
        ;// source registers comes second.
        ;//
        ;//*****************************************************************

        ;// Pair 1: from in00/in10 (Src[0],Src[1],Src[4],Src[5])

        PKHTB   trRow10,in10,in00,ASR #16               ;// trRow10 = | Src[5]  | Src[1]  |
        PKHBT   trRow00,in00,in10,LSL #16               ;// trRow00 = | Src[4]  | Src[0]  |

        ;// Pair 2: from in02/in12 (Src[2],Src[3],Src[6],Src[7])

        PKHTB   trRow30,in12,in02,ASR #16               ;// trRow30 = | Src[7]  | Src[3]  |
        PKHBT   trRow20,in02,in12,LSL #16               ;// trRow20 = | Src[6]  | Src[2]  |

        ;// Pair 3: from in20/in30 (Src[8],Src[9],Src[12],Src[13])
        ;// (writes r3/r5, i.e. in02/in12, which pair 2 has already consumed)

        PKHBT   trRow02,in20,in30,LSL #16               ;// trRow02 = | Src[12] | Src[8]  |
        PKHTB   trRow12,in30,in20,ASR #16               ;// trRow12 = | Src[13] | Src[9]  |

        ;// Pair 4: from in22/in32 (Src[10],Src[11],Src[14],Src[15])

        PKHTB   trRow32,in32,in22,ASR #16               ;// trRow32 = | Src[15] | Src[11] |
        PKHBT   trRow22,in22,in32,LSL #16               ;// trRow22 = | Src[14] | Src[10] |


        ;****************************************
        ;// Row Operations (Performed on columns)
        ;****************************************
        ;// c0..c3 below are the four transposed rows trRow0x..trRow3x.
        ;// Each SADD16/SSUB16 processes two 16-bit lanes at once.

        ;// SIMD operations on first two columns(two rows of the original matrix)

        SADD16      rowSum1,trRow00,trRow10             ;// (c0+c1)
        SADD16      rowSum2,trRow20,trRow30             ;// (c2+c3)
        SSUB16      rowDiff1,trRow00,trRow10            ;// (c0-c1)
        SSUB16      rowDiff2,trRow20,trRow30            ;// (c2-c3)
        SADD16      rowOp00,rowSum1,rowSum2             ;// (c0+c1+c2+c3)
        SSUB16      rowOp10,rowSum1,rowSum2             ;// (c0+c1-c2-c3)
        SSUB16      rowOp20,rowDiff1,rowDiff2           ;// (c0-c1-c2+c3)
        SADD16      rowOp30,rowDiff1,rowDiff2           ;// (c0-c1+c2-c3)


        ;// SIMD operations on next two columns(next two rows of the original matrix)

        SADD16      rowSum1,trRow02,trRow12             ;// (c0+c1)
        SADD16      rowSum2,trRow22,trRow32             ;// (c2+c3)
        SSUB16      rowDiff1,trRow02,trRow12            ;// (c0-c1)
        SSUB16      rowDiff2,trRow22,trRow32            ;// (c2-c3)
        SADD16      rowOp02,rowSum1,rowSum2             ;// (c0+c1+c2+c3)
        SSUB16      rowOp12,rowSum1,rowSum2             ;// (c0+c1-c2-c3)
        SSUB16      rowOp22,rowDiff1,rowDiff2           ;// (c0-c1-c2+c3)
        SADD16      rowOp32,rowDiff1,rowDiff2           ;// (c0-c1+c2-c3)



        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;// Input:  rowOp[][]
        ;// Output: trCol[][]
        ;//
        ;// Same four-pair pattern as the first transpose. P[i][j] below
        ;// denotes lane j of row-operated row i, i.e.
        ;//   rowOp{i}0 = | P[i][1] | P[i][0] |
        ;//   rowOp{i}2 = | P[i][3] | P[i][2] |
        ;*****************************************************************

        ;// Pair 1: from rowOp00/rowOp10

        PKHTB   trCol10,rowOp10,rowOp00,ASR #16         ;// trCol10 = | P[1][1] | P[0][1] |
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16         ;// trCol00 = | P[1][0] | P[0][0] |

        ;// Pair 2: from rowOp02/rowOp12

        PKHTB   trCol30,rowOp12,rowOp02,ASR #16         ;// trCol30 = | P[1][3] | P[0][3] |
        PKHBT   trCol20,rowOp02,rowOp12,LSL #16         ;// trCol20 = | P[1][2] | P[0][2] |

        ;// Pair 3: from rowOp20/rowOp30
        ;// (writes r3/r5, i.e. rowOp02/rowOp12, which pair 2 has already consumed)

        PKHBT   trCol02,rowOp20,rowOp30,LSL #16         ;// trCol02 = | P[3][0] | P[2][0] |
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16         ;// trCol12 = | P[3][1] | P[2][1] |

        ;// Pair 4: from rowOp22/rowOp32

        PKHTB   trCol32,rowOp32,rowOp22,ASR #16         ;// trCol32 = | P[3][3] | P[2][3] |
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16         ;// trCol22 = | P[3][2] | P[2][2] |


        ;*******************************
        ;// Column Operations
        ;*******************************

        ;//--------------------------------------------------------------------------------------
        ;// Store pData(RN0) on stack and restore it only at the final store back
        ;// This frees up a register (RN0) which is used to reduce number of intermediate stalls
        ;//--------------------------------------------------------------------------------------
        M_STR       pData,pDataOnStack


        ;// SIMD operations on first two columns(two rows of the original matrix)

        SADD16      colSum1,trCol00,trCol10             ;// (c0+c1)
        SADD16      colSum2,trCol20,trCol30             ;// (c2+c3)
        SSUB16      colDiff1,trCol00,trCol10            ;// (c0-c1)
        SSUB16      colDiff2,trCol20,trCol30            ;// (c2-c3)
        SADD16      colOp00,colSum1,colSum2             ;// (c0+c1+c2+c3)
        SSUB16      colOp10,colSum1,colSum2             ;// (c0+c1-c2-c3)
        SSUB16      colOp20,colDiff1,colDiff2           ;// (c0-c1-c2+c3)
        SADD16      colOp30,colDiff1,colDiff2           ;// (c0-c1+c2-c3)


        ;// SIMD operations on next two columns(next two rows of the original matrix)
        ;// The table loads are interleaved with the arithmetic. Each load
        ;// targets a register only after its previous alias has been read for
        ;// the last time (r11 after colSum2, r10 after colSum1, r14 after colDiff2).

        LDR         pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer
        SADD16      colSum1,trCol02,trCol12             ;// (c0+c1)
        SADD16      colSum2,trCol22,trCol32             ;// (c2+c3)
        SSUB16      colDiff1,trCol02,trCol12            ;// (c0-c1)
        SSUB16      colDiff2,trCol22,trCol32            ;// (c2-c3)
        SADD16      colOp02,colSum1,colSum2             ;// (c0+c1+c2+c3)
        SSUB16      colOp12,colSum1,colSum2             ;// (c0+c1-c2-c3)
        LDR         pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
        LDRSB       Shift, [pQPDivTable, QP]            ;// Shift = pQPDivTable[QP]
        SSUB16      colOp22,colDiff1,colDiff2           ;// (c0-c1-c2+c3)
        SADD16      colOp32,colDiff1,colDiff2           ;// (c0-c1+c2-c3)


        LDRSB       Scale, [pQPModTable, QP]            ;// Scale = pQPModTable[QP]

        ;//----------------------------------------------------------------------
        ;//
        ;// <Dequantize> improves on the c-reference code
        ;// Both the cases i.e., Shift>=0 and Shift<0 cases are covered together
        ;// We do not subtract 2 from Shift as in C reference, instead perform a
        ;// Scale << Shift once in the beginning and do a right shift by a
        ;// constant 2 after the Multiplication. The value of Round would be 2
        ;//
        ;// By doing this we avoid the Branches required and also
        ;// reduce the code size substantially
        ;//
        ;// Per element:  out = (coef * (Scale << Shift) + 2) >> 2
        ;//
        ;// Bottom-lane results (temp1/temp3) are shifted with an explicit
        ;// ASR #2. Top-lane results (temp2/temp4) are not: PKHBT ..., LSL #14
        ;// places bits [17:2] of the product into the upper halfword, which is
        ;// the low 16 bits of (product >> 2).
        ;//
        ;//----------------------------------------------------------------------

        MOV         Round, #2                           ;// Round = 2
        LSL         Scale, Scale, Shift                 ;// Scale = Scale << Shift


        ;// Row 1
        SMLABB  temp1, colOp00, Scale, Round            ;// Temp1 = B(colOp00) * Scale + Round
        SMLABB  temp3, colOp02, Scale, Round            ;// Temp3 = B(colOp02) * Scale + Round
        SMLATB  temp2, colOp00, Scale, Round            ;// Temp2 = T(colOp00) * Scale + Round
        SMLATB  temp4, colOp02, Scale, Round            ;// Temp4 = T(colOp02) * Scale + Round

        ASR     temp1, temp1, #2                        ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                        ;// Temp3 = Temp3 >> 2
        PKHBT   out00,  temp1, temp2, LSL #14           ;// out00 = | Temp2 >> 2 | Temp1 |
        PKHBT   out02,  temp3, temp4, LSL #14           ;// out02 = | Temp4 >> 2 | Temp3 |


        ;// Row 2
        SMLABB  temp1, colOp10, Scale, Round            ;// Temp1 = B(colOp10) * Scale + Round
        SMLABB  temp3, colOp12, Scale, Round            ;// Temp3 = B(colOp12) * Scale + Round
        SMLATB  temp2, colOp10, Scale, Round            ;// Temp2 = T(colOp10) * Scale + Round
        SMLATB  temp4, colOp12, Scale, Round            ;// Temp4 = T(colOp12) * Scale + Round

        ASR     temp1, temp1, #2                        ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                        ;// Temp3 = Temp3 >> 2
        PKHBT   out10,  temp1, temp2, LSL #14           ;// out10 = | Temp2 >> 2 | Temp1 |
        PKHBT   out12,  temp3, temp4, LSL #14           ;// out12 = | Temp4 >> 2 | Temp3 |

        ;// Row 3
        SMLABB  temp1, colOp20, Scale, Round            ;// Temp1 = B(colOp20) * Scale + Round
        SMLABB  temp3, colOp22, Scale, Round            ;// Temp3 = B(colOp22) * Scale + Round
        SMLATB  temp2, colOp20, Scale, Round            ;// Temp2 = T(colOp20) * Scale + Round
        SMLATB  temp4, colOp22, Scale, Round            ;// Temp4 = T(colOp22) * Scale + Round

        ASR     temp1, temp1, #2                        ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                        ;// Temp3 = Temp3 >> 2
        PKHBT   out20,  temp1, temp2, LSL #14           ;// out20 = | Temp2 >> 2 | Temp1 |
        PKHBT   out22,  temp3, temp4, LSL #14           ;// out22 = | Temp4 >> 2 | Temp3 |

        ;// Row 4
        SMLABB  temp1, colOp30, Scale, Round            ;// Temp1 = B(colOp30) * Scale + Round
        SMLABB  temp3, colOp32, Scale, Round            ;// Temp3 = B(colOp32) * Scale + Round
        SMLATB  temp2, colOp30, Scale, Round            ;// Temp2 = T(colOp30) * Scale + Round
        SMLATB  temp4, colOp32, Scale, Round            ;// Temp4 = T(colOp32) * Scale + Round

        ;// Round (r0) has been read for the last time above, so r0 can
        ;// take pData back now.
        M_LDR   pData,pDataOnStack                      ;// Restore pData pointer from stack
        ASR     temp1, temp1, #2                        ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                        ;// Temp3 = Temp3 >> 2
        PKHBT   out30,  temp1, temp2, LSL #14           ;// out30 = | Temp2 >> 2 | Temp1 |
        PKHBT   out32,  temp3, temp4, LSL #14           ;// out32 = | Temp4 >> 2 | Temp3 |



        ;***************************
        ;// Store all the 4x4 pixels
        ;***************************

store_coeff

        STMIA   pData,{out00,out02,out10,out12,out20,out22,out30,out32}



        ;// Set return value


        ;// Write function tail
        M_END

    ENDIF                                               ;//ARM1136JS



;//----------------------------------------------------------------------------
;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Calls armVCM4P10_UnpackBlock4x4 with r0/r1 exactly as received
;// (ppSrc, pDst), then calls armVCM4P10_InvTransformDequantLumaDC4x4
;// on pDst with QP, and returns OMX_Sts_NoErr.
;//
;// In:   r0 = ppSrc   passed through unchanged to armVCM4P10_UnpackBlock4x4
;//       r1 = pDst    destination block; also the in-place buffer for the
;//                    transform/dequant step
;//       r2 = QP      forwarded in r1 to the transform/dequant step
;// Out:  r0 = OMX_Sts_NoErr (always)
;//
;// pDst and QP are parked in r4/r5 across the first call.
;// NOTE(review): this assumes armVCM4P10_UnpackBlock4x4 preserves r4 and r5
;// - it is defined in another file; confirm.
;//----------------------------------------------------------------------------

;//Input Registers
ppSrc       RN  0
pDst        RN  1
QPR2        RN  2

;//Output Registers
result      RN  0

;//Local Scratch Registers
pDstR4      RN  4
pDstR0      RN  0
QPR1        RN  1
QPR5        RN  5

;// Guarding implementation by the processor name

    IF ARM1136JS

        ;// Allocate stack memory required by the function


        ;// Write function header
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        MOV     pDstR4,pDst                         ;// Saving register r1
        MOV     QPR5,QPR2                           ;// Saving register r2
        BL      armVCM4P10_UnpackBlock4x4

        MOV     pDstR0,pDstR4                       ;// Setting up register r0
        MOV     QPR1,QPR5                           ;// Setting up register r1
        BL      armVCM4P10_InvTransformDequantLumaDC4x4


        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

        ;// Write function tail
        M_END


    ENDIF                                           ;//ARM1136JS


    END