;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;// Contents:
;//   armVCM4P10_InvTransformDequantLumaDC4x4   - in-place 4x4 transform + dequant
;//   omxVCM4P10_TransformDequantLumaDCFromPair - unpack a block, then run the above
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import/Export symbols required from/to other files
;// (For example tables)

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixQPModTable

        M_VARIANTS ARM1136JS

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//=====================================================================
;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// In:   r0 = pData  pointer to sixteen 16-bit coefficients (8 words),
;//                   read with one LDMIA and overwritten with one STMIA
;//       r1 = QP     byte index into armVCM4P10_QPDivTable and
;//                   armVCM4P10_VMatrixQPModTable
;//
;// Work: 1. load the block, two 16-bit values per register
;//       2. transpose, run the add/sub butterfly along the rows
;//       3. transpose back, run the same butterfly along the columns
;//       4. dequantize each value: (v * (Scale << Shift) + 2) >> 2
;//       5. store the block back to pData
;//
;// Register usage is dense: every physical register carries several
;// roles over the lifetime of the function, so the instruction order
;// below must be respected when editing.
;//=====================================================================

;// Guarding implementation by the processor name

        IF ARM1136JS


;//---------------------------------------------------------------------
;// Register roles
;//---------------------------------------------------------------------

;// Arguments
pData           RN 0
QP              RN 1

;// Row-major view of the block. The low halfword holds the lower
;// numbered column. These eight registers hold the freshly loaded
;// input, the block after the second transpose, the column-pass
;// results and finally the dequantized output.
row0Cols01      RN 2            ;// elements  0 and  1
row0Cols23      RN 3            ;// elements  2 and  3
row1Cols01      RN 4            ;// elements  4 and  5
row1Cols23      RN 5            ;// elements  6 and  7
row2Cols01      RN 6            ;// elements  8 and  9
row2Cols23      RN 7            ;// elements 10 and 11
row3Cols01      RN 8            ;// elements 12 and 13
row3Cols23      RN 9            ;// elements 14 and 15

;// Transposed view used by the row pass: one column of two adjacent
;// rows per register (low halfword = the upper of the two rows).
col0Rows01      RN 2
col1Rows01      RN 10
col2Rows01      RN 11
col3Rows01      RN 12
col0Rows23      RN 3
col1Rows23      RN 5
col2Rows23      RN 7
col3Rows23      RN 14

;// Butterfly scratch for the row pass
hSum01          RN 4
hSum23          RN 6
hDif01          RN 8
hDif23          RN 9

;// Butterfly scratch for the column pass
vSum01          RN 10
vSum23          RN 11
vDif01          RN 12
vDif23          RN 14

;// Dequantization
pDivLut         RN 0            ;// base of armVCM4P10_QPDivTable
pModLut         RN 11           ;// base of armVCM4P10_VMatrixQPModTable
qShift          RN 10
qScale          RN 14
qRound          RN 0

prodLoB         RN 10           ;// product for bottom half of the Cols01 word
prodLoT         RN 11           ;// product for top half of the Cols01 word
prodHiB         RN 12           ;// product for bottom half of the Cols23 word
prodHiT         RN 1            ;// product for top half of the Cols23 word


;//---------------------------------------------------------------------
;// LUMADC_BFLY_X4 e0, e1, e2, e3, sumLo, sumHi, difLo, difHi
;//
;// Dual 16-bit, in-place 4-point butterfly:
;//     e0 <- e0 + e1 + e2 + e3
;//     e1 <- e0 + e1 - e2 - e3
;//     e2 <- e0 - e1 - e2 + e3
;//     e3 <- e0 - e1 + e2 - e3
;// The last four arguments are scratch registers and are overwritten.
;//---------------------------------------------------------------------
        MACRO
        LUMADC_BFLY_X4 $e0, $e1, $e2, $e3, $sumLo, $sumHi, $difLo, $difHi
        SADD16  $sumLo, $e0, $e1
        SADD16  $sumHi, $e2, $e3
        SSUB16  $difLo, $e0, $e1
        SSUB16  $difHi, $e2, $e3
        SADD16  $e0, $sumLo, $sumHi
        SSUB16  $e1, $sumLo, $sumHi
        SSUB16  $e2, $difLo, $difHi
        SADD16  $e3, $difLo, $difHi
        MEND

;//---------------------------------------------------------------------
;// LUMADC_DEQUANT_ROW wLo, wHi
;//
;// Dequantizes the four packed 16-bit values held in wLo and wHi in
;// place. Each value v becomes the low 16 bits of
;//     (v * bottom_halfword(qScale) + qRound) >> 2
;// Overwrites prodLoB, prodLoT, prodHiB and prodHiT.
;//---------------------------------------------------------------------
        MACRO
        LUMADC_DEQUANT_ROW $wLo, $wHi
        SMLABB  prodLoB, $wLo, qScale, qRound
        SMLABB  prodHiB, $wHi, qScale, qRound
        SMLATB  prodLoT, $wLo, qScale, qRound
        SMLATB  prodHiT, $wHi, qScale, qRound
        ASR     prodLoB, prodLoB, #2
        ASR     prodHiB, prodHiB, #2
        ;// Bits [31:16] of (x << 14) are bits [17:2] of x, so the top
        ;// halfword receives its ">> 2" as part of the pack itself.
        PKHBT   $wLo, prodLoB, prodLoT, LSL #14
        PKHBT   $wHi, prodHiB, prodHiT, LSL #14
        MEND


        ;// Allocate stack memory required by the function:
        ;// one 4-byte slot in which pData is parked during the column pass
        M_ALLOC4 pDataOnStack, 4

        ;// Write function header
        ;// NOTE(review): r11 presumably names the highest callee-saved
        ;// register for armCOMM_s.h to preserve - confirm against the macro.
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11

        ;// ---- 1. Fetch the whole 4x4 block --------------------------------

        LDMIA   pData, {row0Cols01,row0Cols23,row1Cols01,row1Cols23,row2Cols01,row2Cols23,row3Cols01,row3Cols23}

        ;// ---- 2a. First transpose -----------------------------------------
        ;// Done as four 2x2 halfword transposes so that the row pass can
        ;// operate on packed columns. Where a destination aliases one of
        ;// its own sources the PKHTB is issued first, because it is the
        ;// PKHBT that overwrites that source.

        ;// columns 0/1 of rows 0/1
        PKHTB   col1Rows01, row1Cols01, row0Cols01, ASR #16     ;// [ e5 : e1 ]
        PKHBT   col0Rows01, row0Cols01, row1Cols01, LSL #16     ;// [ e4 : e0 ]

        ;// columns 2/3 of rows 0/1
        PKHTB   col3Rows01, row1Cols23, row0Cols23, ASR #16     ;// [ e7 : e3 ]
        PKHBT   col2Rows01, row0Cols23, row1Cols23, LSL #16     ;// [ e6 : e2 ]

        ;// columns 0/1 of rows 2/3
        PKHBT   col0Rows23, row2Cols01, row3Cols01, LSL #16     ;// [ e12 : e8 ]
        PKHTB   col1Rows23, row3Cols01, row2Cols01, ASR #16     ;// [ e13 : e9 ]

        ;// columns 2/3 of rows 2/3
        PKHTB   col3Rows23, row3Cols23, row2Cols23, ASR #16     ;// [ e15 : e11 ]
        PKHBT   col2Rows23, row2Cols23, row3Cols23, LSL #16     ;// [ e14 : e10 ]

        ;// ---- 2b. Row pass (two rows per butterfly) -----------------------

        LUMADC_BFLY_X4 col0Rows01, col1Rows01, col2Rows01, col3Rows01, hSum01, hSum23, hDif01, hDif23
        LUMADC_BFLY_X4 col0Rows23, col1Rows23, col2Rows23, col3Rows23, hSum01, hSum23, hDif01, hDif23

        ;// ---- 3a. Second transpose: back to the row-major layout ----------

        PKHTB   row1Cols01, col1Rows01, col0Rows01, ASR #16
        PKHBT   row0Cols01, col0Rows01, col1Rows01, LSL #16

        PKHTB   row3Cols01, col1Rows23, col0Rows23, ASR #16
        PKHBT   row2Cols01, col0Rows23, col1Rows23, LSL #16

        PKHBT   row0Cols23, col2Rows01, col3Rows01, LSL #16
        PKHTB   row1Cols23, col3Rows01, col2Rows01, ASR #16

        PKHTB   row3Cols23, col3Rows23, col2Rows23, ASR #16
        PKHBT   row2Cols23, col2Rows23, col3Rows23, LSL #16

        ;// ---- 3b. Column pass ---------------------------------------------

        ;// Park pData (r0) on the stack until the final store. The freed
        ;// register is used below for the table pointer and rounding
        ;// constant, which helps to reduce intermediate stalls.
        M_STR   pData, pDataOnStack

        ;// columns 0 and 1
        LUMADC_BFLY_X4 row0Cols01, row1Cols01, row2Cols01, row3Cols01, vSum01, vSum23, vDif01, vDif23

        ;// Columns 2 and 3: the same butterfly, written out in full
        ;// because the table look-ups are interleaved with it. Each
        ;// look-up destination is a scratch register whose butterfly
        ;// value has already been consumed at that point.
        LDR     pDivLut, =armVCM4P10_QPDivTable                 ;// QP division look-up table
        SADD16  vSum01, row0Cols23, row1Cols23                  ;// r0 + r1
        SADD16  vSum23, row2Cols23, row3Cols23                  ;// r2 + r3
        SSUB16  vDif01, row0Cols23, row1Cols23                  ;// r0 - r1
        SSUB16  vDif23, row2Cols23, row3Cols23                  ;// r2 - r3
        SADD16  row0Cols23, vSum01, vSum23                      ;// r0 + r1 + r2 + r3
        SSUB16  row1Cols23, vSum01, vSum23                      ;// r0 + r1 - r2 - r3
        LDR     pModLut, =armVCM4P10_VMatrixQPModTable          ;// QP modulo look-up table
        LDRSB   qShift, [pDivLut, QP]                           ;// qShift = QPDivTable[QP]
        SSUB16  row2Cols23, vDif01, vDif23                      ;// r0 - r1 - r2 + r3
        SADD16  row3Cols23, vDif01, vDif23                      ;// r0 - r1 + r2 - r3

        LDRSB   qScale, [pModLut, QP]                           ;// qScale = VMatrixQPModTable[QP]

        ;// ---- 4. Dequantize -----------------------------------------------
        ;// Per the original notes this departs from the C reference,
        ;// which subtracts 2 from the shift and branches on its sign.
        ;// Here the scale is pre-shifted once, every product is rounded
        ;// with +2 and shifted right by the constant 2. That covers both
        ;// sign cases without branches and keeps the code small.

        MOV     qRound, #2
        LSL     qScale, qScale, qShift                          ;// qScale <<= qShift

        LUMADC_DEQUANT_ROW row0Cols01, row0Cols23               ;// row 0
        LUMADC_DEQUANT_ROW row1Cols01, row1Cols23               ;// row 1
        LUMADC_DEQUANT_ROW row2Cols01, row2Cols23               ;// row 2

        ;// Row 3 is expanded by hand so that the pData reload can sit
        ;// between the multiplies and the packing. qRound shares r0
        ;// with pData and is dead after the last SMLATB.
        SMLABB  prodLoB, row3Cols01, qScale, qRound
        SMLABB  prodHiB, row3Cols23, qScale, qRound
        SMLATB  prodLoT, row3Cols01, qScale, qRound
        SMLATB  prodHiT, row3Cols23, qScale, qRound

        M_LDR   pData, pDataOnStack                             ;// restore the block pointer
        ASR     prodLoB, prodLoB, #2
        ASR     prodHiB, prodHiB, #2
        PKHBT   row3Cols01, prodLoB, prodLoT, LSL #14
        PKHBT   row3Cols23, prodHiB, prodHiT, LSL #14

        ;// ---- 5. Write the whole 4x4 block back ---------------------------

store_coeff

        STMIA   pData, {row0Cols01,row0Cols23,row1Cols01,row1Cols23,row2Cols01,row2Cols23,row3Cols01,row3Cols23}

        ;// No return value is set up here.

        ;// Write function tail
        M_END

        ENDIF                                                   ;//ARM1136JS


;//=====================================================================
;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// In:   r0 = ppSrc  passed through untouched to armVCM4P10_UnpackBlock4x4
;//       r1 = pDst   passed to armVCM4P10_UnpackBlock4x4, then handed to
;//                   armVCM4P10_InvTransformDequantLumaDC4x4 as its block
;//       r2 = QP     handed to armVCM4P10_InvTransformDequantLumaDC4x4
;// Out:  r0 = OMX_Sts_NoErr (always)
;//=====================================================================

;// Arguments
ppSrc           RN 0
pDst            RN 1
qpArg           RN 2

;// Return value
status          RN 0

;// Copies of pDst and QP held in r4/r5 across the first call
pDstKeep        RN 4
qpKeep          RN 5

;// Argument registers of the second call
blockArg        RN 0
qpArg1          RN 1

;// Guarding implementation by the processor name

        IF ARM1136JS

        ;// No stack locals are needed by this function.

        ;// Write function header
        ;// NOTE(review): r5 presumably asks armCOMM_s.h to preserve
        ;// r4-r5 - confirm against the macro.
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        MOV     pDstKeep, pDst                                  ;// keep r1 over the call
        MOV     qpKeep, qpArg                                   ;// keep r2 over the call
        BL      armVCM4P10_UnpackBlock4x4                       ;// r0 = ppSrc, r1 = pDst

        MOV     blockArg, pDstKeep                              ;// r0 = block to transform
        MOV     qpArg1, qpKeep                                  ;// r1 = QP
        BL      armVCM4P10_InvTransformDequantLumaDC4x4

        ;// Set return value
        MOV     status, #OMX_Sts_NoErr

        ;// Write function tail
        M_END

        ENDIF                                                   ;//ARM1136JS


        END