1 ;// 2 ;// 3 ;// File Name: omxVCM4P2_MCReconBlock_s.s 4 ;// OpenMAX DL: v1.0.2 5 ;// Revision: 9641 6 ;// Date: Thursday, February 7, 2008 7 ;// 8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved. 9 ;// 10 ;// 11 ;// 12 ;// Description: 13 ;// 14 ;// 15 16 ;// Include standard headers 17 INCLUDE omxtypes_s.h 18 INCLUDE armCOMM_s.h 19 20 ;// Import symbols required from other files 21 22 M_VARIANTS ARM1136JS 23 24 ;// *************************************************************************** 25 ;// ARM1136JS implementation 26 ;// *************************************************************************** 27 IF ARM1136JS 28 29 ;// *************************************************************************** 30 ;// MACRO DEFINITIONS 31 ;// *************************************************************************** 32 ;// Description: 33 ;// 34 ;// dest[j] = (x[j] + y[j] + round) >> 1, j=0..3 35 ;// 36 ;// Similar to UHADD8 instruction, but with a rounding value of 1 added to 37 ;// each sum before dividing by two, if round is 1 38 ;// 39 ;// Syntax: 40 ;// M_UHADD8R $dest, $x, $y, $round, $mask 41 ;// 42 ;// Inputs: 43 ;// $x four packed bytes, x[3] : x[2] : x[1] : x[0] 44 ;// $y four packed bytes, y[3] : y[2] : y[1] : y[0] 45 ;// $round 0 if no rounding to be added, 1 if rounding to be done 46 ;// $mask some register set to 0x80808080 47 ;// 48 ;// Outputs: 49 ;// $dest four packed bytes, z[3] : z[2] : z[1] : z[0] 50 51 MACRO 52 M_UHADD8R $dest, $x, $y, $round, $mask 53 IF $round = 1 54 IF $dest /= $y 55 MVN $dest, $x 56 UHSUB8 $dest, $y, $dest 57 EOR $dest, $dest, $mask 58 ELSE 59 MVN $dest, $y 60 UHSUB8 $dest, $x, $dest 61 EOR $dest, $dest, $mask 62 ENDIF 63 ELSE 64 UHADD8 $dest, $x, $y 65 ENDIF 66 MEND 67 ;// *************************************************************************** 68 ;// Description: 69 ;// Load 8 bytes from $pSrc (aligned or unaligned locations) 70 ;// 71 ;// Syntax: 72 ;// M_LOAD_X $pSrc, $srcStep, $out0, $out1, $scratch, $offset 73 ;// 74 ;// Inputs: 75 ;// $pSrc 4 byte aligned source pointer to an address just less than 76 ;// or equal to the data location 77 ;// $srcStep The stride on source 78 ;// $scratch A scratch register, used internally for temp calculations 79 ;// $offset Difference of source data location to the source pointer 80 ;// Use when $offset != 0 (unaligned load) 81 ;// 82 ;// Outputs: 83 ;// $pSrc In case the macro accepts stride, it increments the pSrc by 84 ;// that value, else unchanged 85 ;// $out0 four packed bytes, z[3] : z[2] : z[1] : z[0] 86 ;// $out1 four packed bytes, z[7] : z[6] : z[5] : z[4] 87 ;// 88 ;// Note: {$out0, $out1, $scratch} should be registers with ascending 89 ;// register numbering. In case offset is 0, $scratch is not modified. 90 91 MACRO 92 M_LOAD_X $pSrc, $srcStep, $out0, $out1, $scratch, $offset 93 IF $offset = 0 94 LDM $pSrc, {$out0, $out1} 95 ADD $pSrc, $pSrc, $srcStep 96 ELSE 97 LDM $pSrc, {$out0, $out1, $scratch} 98 ADD $pSrc, $pSrc, $srcStep 99 100 MOV $out0, $out0, LSR #8 * $offset 101 ORR $out0, $out0, $out1, LSL #(32 - 8 * ($offset)) 102 MOV $out1, $out1, LSR #8 * $offset 103 ORR $out1, $out1, $scratch, LSL #(32 - 8 * ($offset)) 104 ENDIF 105 MEND 106 107 ;// *************************************************************************** 108 ;// Description: 109 ;// Loads three words for X interpolation, update pointer to next row. For 110 ;// X interpolation, given a truncated-4byteAligned source pointer, 111 ;// invariably three continous words are required from there to get the 112 ;// nine bytes from the source pointer for filtering. 113 ;// 114 ;// Syntax: 115 ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3 116 ;// 117 ;// Inputs: 118 ;// $pSrc 4 byte aligned source pointer to an address just less than 119 ;// or equal to the data location 120 ;// 121 ;// $srcStep The stride on source 122 ;// 123 ;// $offset Difference of source data location to the source pointer 124 ;// Use when $offset != 0 (unaligned load) 125 ;// 126 ;// Outputs: 127 ;// $pSrc Incremented by $srcStep 128 ;// 129 ;// $word0, $word1, $word2, $word3 130 ;// Three of these are outputs based on the $offset parameter. 131 ;// The outputs are specifically generated to be processed by 132 ;// the M_EXT_XINT macro. Following is the illustration to show 133 ;// how the nine bytes are spanned for different offsets from 134 ;// notTruncatedForAlignmentSourcePointer. 135 ;// 136 ;// ------------------------------------------------------ 137 ;// | Offset | Aligned Ptr | word0 | word1 | word2 | word3 | 138 ;// |------------------------------------------------------| 139 ;// | 0 | 0 | 0123 | 4567 | 8xxx | | 140 ;// | 1 | -1 | x012 | 3456 | 78xx | | 141 ;// | 2 | -2 | xx01 | 2345 | 678x | | 142 ;// | 3 | -3 | xxx0 | | 1234 | 5678 | 143 ;// ------------------------------------------------------ 144 ;// 145 ;// where the numbering (0-8) is to designate the 9 bytes from 146 ;// start of a particular row. The illustration doesn't take in 147 ;// account the positioning of bytes with in the word and the 148 ;// macro combination with M_EXT_XINT will work only in little 149 ;// endian environs 150 ;// 151 ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending 152 ;// register numbering 153 154 MACRO 155 M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3 156 IF $offset /= 3 157 LDM $pSrc, {$word0, $word1, $word2} 158 ELSE 159 LDM $pSrc, {$word0, $word2, $word3} 160 ENDIF 161 ADD $pSrc, $pSrc, $srcStep 162 MEND 163 164 ;// *************************************************************************** 165 ;// Description: 166 ;// Extract four registers of four pixels for X interpolation 167 ;// 168 ;// Syntax: 169 ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3 170 ;// 171 ;// Inputs: 172 ;// $offset Difference of source data location to the source pointer 173 ;// Use when $offset != 0 (unaligned load) 174 ;// 175 ;// $word0, $word1, $word2, $word3 176 ;// Three of these are inputs based on the $offset parameter. 177 ;// The inputs are specifically selected to be processed by 178 ;// the M_EXT_XINT macro. 179 ;// 180 ;// ------------------------------------------------------ 181 ;// | Offset | Aligned Ptr | word0 | word1 | word2 | word3 | 182 ;// |------------------------------------------------------| 183 ;// | 0 | 0 | 0123 | 4567 | 8xxx | yyyy | 184 ;// | 1 | -1 | x012 | 3456 | 78xx | yyyy | 185 ;// | 2 | -2 | xx01 | 2345 | 678x | yyyy | 186 ;// | 3 | -3 | xxx0 | yyyy | 1234 | 5678 | 187 ;// ------------------------------------------------------ 188 ;// 189 ;// Outputs: 190 ;// $word0, $word1, $word2, $word3 191 ;// Bytes from the original source pointer (not truncated for 192 ;// 4 byte alignment) as shown in the table. 193 ;// ------------------------------- 194 ;// | word0 | word1 | word2 | word3 | 195 ;// |-------------------------------| 196 ;// | 0123 | 4567 | 1234 | 5678 | 197 ;// ------------------------------- 198 ;// 199 ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending 200 ;// register numbering 201 202 MACRO 203 M_EXT_XINT $offset, $word0, $word1, $word2, $word3 204 IF $offset = 0 205 ; $word0 and $word1 are ok 206 ; $word2, $word3 are just 8 shifted versions 207 MOV $word3, $word1, LSR #8 208 ORR $word3, $word3, $word2, LSL #24 209 MOV $word2, $word0, LSR #8 210 ORR $word2, $word2, $word1, LSL #24 211 ELIF $offset = 3 212 ; $word2 and $word3 are ok (taken care while loading itself) 213 ; set $word0 & $word1 214 MOV $word0, $word0, LSR #24 215 ORR $word0, $word0, $word2, LSL #8 216 MOV $word1, $word2, LSR #24 217 ORR $word1, $word1, $word3, LSL #8 218 ELSE 219 MOV $word0, $word0, LSR #8 * $offset 220 ORR $word0, $word0, $word1, LSL #(32 - 8 * ($offset)) 221 MOV $word1, $word1, LSR #8 * $offset 222 ORR $word1, $word1, $word2, LSL #(32 - 8 * ($offset)) 223 224 MOV $word3, $word1, LSR #8 225 ORR $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1)) 226 MOV $word2, $word0, LSR #8 227 ORR $word2, $word2, $word1, LSL #24 228 ENDIF 229 MEND 230 231 ;// *************************************************************************** 232 ;// Description: 233 ;// Computes half-sum and xor of two inputs and puts them in the input 234 ;// registers in that order 235 ;// 236 ;// Syntax: 237 ;// M_HSUM_XOR $v0, $v1, $tmp 238 ;// 239 ;// Inputs: 240 ;// $v0 a, first input 241 ;// $v1 b, second input 242 ;// $tmp scratch register 243 ;// 244 ;// Outputs: 245 ;// $v0 (a + b)/2 246 ;// $v1 a ^ b 247 248 MACRO 249 M_HSUM_XOR $v0, $v1, $tmp 250 UHADD8 $tmp, $v0, $v1 ;// s0 = a + b 251 EOR $v1, $v0, $v1 ;// l0 = a ^ b 252 MOV $v0, $tmp ;// s0 253 MEND 254 ;// *************************************************************************** 255 ;// Description: 256 ;// Calculates average of 4 values (a,b,c,d) for HalfPixelXY predict type in 257 ;// mcReconBlock module. Very specific to the implementation of 258 ;// M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp" as scratch register and 259 ;// "yMask" for mask variable "0x1010101x" set in it. In yMask 4 lsbs are 260 ;// not significant and are used by the callee for row counter (y) 261 ;// 262 ;// Some points to note are: 263 ;// 1. Input is pair of pair-averages and Xors 264 ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another 265 ;// running average 266 ;// 3. Output is in the first argument 267 ;// 268 ;// Syntax: 269 ;// M_AVG4 $sum0, $lsb0, $sum1, $lsb1, $rndVal 270 ;// 271 ;// Inputs: 272 ;// $sum0 (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged 273 ;// $lsb0 (a ^ b) 274 ;// $sum1 (c + d) >> 1. Not modified 275 ;// $lsb1 (c ^ d) Not modified 276 ;// $rndVal Assembler Variable. 0 for rounding, 1 for no rounding 277 ;// 278 ;// Outputs: 279 ;// $sum0 (a + b + c + d + 1) / 4 : If no rounding 280 ;// (a + b + c + d + 2) / 4 : If rounding 281 282 MACRO 283 M_AVG4 $sum0, $lsb0, $sum1, $lsb1, $rndVal 284 LCLS OP1 285 LCLS OP2 286 IF $rndVal = 0 ;// rounding case 287 OP1 SETS "AND" 288 OP2 SETS "ORR" 289 ELSE ;// Not rounding case 290 OP1 SETS "ORR" 291 OP2 SETS "AND" 292 ENDIF 293 294 LCLS lsb2 295 LCLS sum2 296 LCLS dest 297 298 lsb2 SETS "tmp" 299 sum2 SETS "$lsb0" 300 dest SETS "$sum0" 301 302 $OP1 $lsb0, $lsb0, $lsb1 ;// e0 = e0 & e1 303 EOR $lsb2, $sum0, $sum1 ;// e2 = s0 ^ s1 304 $OP2 $lsb2, $lsb2, $lsb0 ;// e2 = e2 | e0 305 AND $lsb2, $lsb2, yMask, LSR # 4 ;// e2 = e2 & mask 306 UHADD8 $sum2, $sum0, $sum1 ;// s2 = (s0 + s1)/2 307 UADD8 $dest, $sum2, $lsb2 ;// dest = s2 + e2 308 MEND 309 ;// *************************************************************************** 310 ;// Motion compensation handler macros 311 ;// *************************************************************************** 312 ;// Description: 313 ;// Implement motion compensation routines using the named registers in 314 ;// callee function. Each of the following 4 implement the 4 predict type 315 ;// Each handles 8 cases each ie all the combinations of 4 types of source 316 ;// alignment offsets and 2 types of rounding flag 317 ;// 318 ;// Syntax: 319 ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset 320 ;// M_MCRECONBLOCK_HalfPixelX $rndVal, $offset 321 ;// M_MCRECONBLOCK_HalfPixelY $rndVal, $offset 322 ;// M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset 323 ;// 324 ;// Inputs: 325 ;// $rndVal Assembler Variable. 0 for rounding, 1 for no rounding 326 ;// $offset $pSrc MOD 4 value. Offset from 4 byte aligned location. 327 ;// 328 ;// Outputs: 329 ;// Outputs come in the named registers of the callee functions 330 ;// The macro loads the data from the source pointer, processes it and 331 ;// stores in the destination pointer. Does the whole prediction cycle 332 ;// of Motion Compensation routine for a particular predictType 333 ;// After this only residue addition to the predicted values remain 334 335 MACRO 336 M_MCRECONBLOCK_IntegerPixel $rndVal, $offset 337 ;// Algorithmic Description: 338 ;// This handles motion compensation for IntegerPixel predictType. Both 339 ;// rounding cases are handled by the same code base. It is just a copy 340 ;// from source to destination. Two lines are done per loop to reduce 341 ;// stalls. Loop has been software pipelined as well for that purpose. 342 ;// 343 ;// M_LOAD_X loads a whole row in two registers and then they are stored 344 345 CaseIntegerPixelRnd0Offset$offset 346 CaseIntegerPixelRnd1Offset$offset 347 M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp3, $offset 348 M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset 349 YloopIntegerPixelOffset$offset 350 SUBS y, y, #2 351 STRD tmp1, tmp2, [pDst], dstStep 352 STRD tmp3, tmp4, [pDst], dstStep 353 M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp3, $offset 354 M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset 355 BGT YloopIntegerPixelOffset$offset 356 357 B SwitchPredictTypeEnd 358 MEND 359 ;// *************************************************************************** 360 MACRO 361 M_MCRECONBLOCK_HalfPixelX $rndVal, $offset 362 ;// Algorithmic Description: 363 ;// This handles motion compensation for HalfPixelX predictType. The two 364 ;// rounding cases are handled by the different code base and spanned by 365 ;// different macro calls. Loop has been software pipelined to reduce 366 ;// stalls. 367 ;// 368 ;// Filtering involves averaging a pixel with the next horizontal pixel. 369 ;// M_LOAD_XINT and M_EXT_XINT combination generate 4 registers, 2 with 370 ;// all pixels in a row with 4 pixel in each register and another 2 371 ;// registers with pixels corresponding to one horizontally shifted pixel 372 ;// corresponding to the initial row pixels. These are set of packed 373 ;// registers appropriate to do 4 lane SIMD. 374 ;// After that M_UHADD8R macro does the averaging taking care of the 375 ;// rounding as required 376 377 CaseHalfPixelXRnd$rndVal.Offset$offset 378 IF $rndVal = 0 379 LDR mask, =0x80808080 380 ENDIF 381 382 M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4 383 YloopHalfPixelXRnd$rndVal.Offset$offset 384 SUBS y, y, #1 385 M_EXT_XINT $offset, tmp1, tmp2, tmp3, tmp4 386 M_UHADD8R tmp5, tmp1, tmp3, (1-$rndVal), mask 387 M_UHADD8R tmp6, tmp2, tmp4, (1-$rndVal), mask 388 STRD tmp5, tmp6, [pDst], dstStep 389 M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4 390 BGT YloopHalfPixelXRnd$rndVal.Offset$offset 391 392 B SwitchPredictTypeEnd 393 MEND 394 ;// *************************************************************************** 395 MACRO 396 M_MCRECONBLOCK_HalfPixelY $rndVal, $offset 397 ;// Algorithmic Description: 398 ;// This handles motion compensation for HalfPixelY predictType. The two 399 ;// rounding cases are handled by the different code base and spanned by 400 ;// different macro calls. PreLoading is used to avoid reload of same data. 401 ;// 402 ;// Filtering involves averaging a pixel with the next vertical pixel. 403 ;// M_LOAD_X generates 2 registers with all pixels in a row with 4 pixel in 404 ;// each register. These are set of packed registers appropriate to do 405 ;// 4 lane SIMD. After that M_UHADD8R macro does the averaging taking care 406 ;// of the rounding as required 407 408 CaseHalfPixelYRnd$rndVal.Offset$offset 409 IF $rndVal = 0 410 LDR mask, =0x80808080 411 ENDIF 412 413 M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load 414 YloopHalfPixelYRnd$rndVal.Offset$offset 415 SUBS y, y, #2 416 ;// Processing one line 417 M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset 418 M_UHADD8R tmp1, tmp1, tmp3, (1-$rndVal), mask 419 M_UHADD8R tmp2, tmp2, tmp4, (1-$rndVal), mask 420 STRD tmp1, tmp2, [pDst], dstStep 421 ;// Processing another line 422 M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp5, $offset 423 M_UHADD8R tmp3, tmp3, tmp1, (1-$rndVal), mask 424 M_UHADD8R tmp4, tmp4, tmp2, (1-$rndVal), mask 425 STRD tmp3, tmp4, [pDst], dstStep 426 427 BGT YloopHalfPixelYRnd$rndVal.Offset$offset 428 429 B SwitchPredictTypeEnd 430 MEND 431 ;// *************************************************************************** 432 MACRO 433 M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset 434 ;// Algorithmic Description: 435 ;// This handles motion compensation for HalfPixelXY predictType. The two 436 ;// rounding cases are handled by the different code base and spanned by 437 ;// different macro calls. PreLoading is used to avoid reload of same data. 438 ;// 439 ;// Filtering involves averaging a pixel with the next vertical, horizontal 440 ;// and right-down diagonal pixels. Just as in HalfPixelX case, M_LOAD_XINT 441 ;// and M_EXT_XINT combination generates 4 registers with a row and its 442 ;// 1 pixel right shifted version, with 4 pixels in one register. Another 443 ;// call of that macro-combination gets another row. Then M_HSUM_XOR is 444 ;// called to get mutual half-sum and xor combinations of a row with its 445 ;// shifted version as they are inputs to the M_AVG4 macro which computes 446 ;// the 4 element average with rounding. Note that it is the half-sum/xor 447 ;// values that are preserved for next row as they can be re-used in the 448 ;// next call to the M_AVG4 and saves recomputation. 449 ;// Due to lack of register, the row counter and a masking value required 450 ;// in M_AVG4 are packed into a single register yMask where the last nibble 451 ;// holds the row counter values and rest holds the masking variable left 452 ;// shifted by 4 453 454 CaseHalfPixelXYRnd$rndVal.Offset$offset 455 LDR yMask, =((0x01010101 << 4) + 8) 456 457 M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b' 458 M_EXT_XINT $offset, t00, t01, t10, t11 459 M_HSUM_XOR t00, t10, tmp ;// s0, l0 460 M_HSUM_XOR t01, t11, tmp ;// s0', l0' 461 462 YloopHalfPixelXYRnd$rndVal.Offset$offset 463 ;// Processsing one line 464 ;// t00, t01, t10, t11 required from previous loop 465 M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d' 466 SUB yMask, yMask, #2 467 M_EXT_XINT $offset, t20, t21, t30, t31 468 M_HSUM_XOR t20, t30, tmp ;// s1, l1 469 M_HSUM_XOR t21, t31, tmp ;// s1', l1' 470 M_AVG4 t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1 471 M_AVG4 t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1' 472 STRD t00, t01, [pDst], dstStep ;// store the average 473 474 ;// Processsing another line 475 ;// t20, t21, t30, t31 required from above 476 M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b' 477 TST yMask, #7 478 M_EXT_XINT $offset, t00, t01, t10, t11 479 M_HSUM_XOR t00, t10, tmp 480 M_HSUM_XOR t01, t11, tmp 481 M_AVG4 t20, t30, t00, t10, $rndVal 482 M_AVG4 t21, t31, t01, t11, $rndVal 483 STRD t20, t21, [pDst], dstStep 484 485 BGT YloopHalfPixelXYRnd$rndVal.Offset$offset 486 487 IF $offset/=3 :LOR: $rndVal/=1 488 B SwitchPredictTypeEnd 489 ENDIF 490 MEND 491 ;// *************************************************************************** 492 ;// Motion compensation handler macros end here 493 ;// *************************************************************************** 494 ;// Description: 495 ;// Populates all 4 kinds of offsets "cases" for each predictType and rndVal 496 ;// combination in the "switch" to prediction processing code segment 497 ;// 498 ;// Syntax: 499 ;// M_CASE_OFFSET $rnd, $predictType 500 ;// 501 ;// Inputs: 502 ;// $rnd 0 for rounding, 1 for no rounding 503 ;// $predictType The prediction mode 504 ;// 505 ;// Outputs: 506 ;// Populated list of "M_CASE"s for the "M_SWITCH" macro 507 508 MACRO 509 M_CASE_OFFSET $rnd, $predictType 510 M_CASE Case$predictType.Rnd$rnd.Offset0 511 M_CASE Case$predictType.Rnd$rnd.Offset1 512 M_CASE Case$predictType.Rnd$rnd.Offset2 513 M_CASE Case$predictType.Rnd$rnd.Offset3 514 MEND 515 ;// *************************************************************************** 516 ;// Description: 517 ;// Populates all 2 kinds of rounding "cases" for each predictType in the 518 ;// "switch" to prediction processing code segment 519 ;// 520 ;// Syntax: 521 ;// M_CASE_OFFSET $predictType 522 ;// 523 ;// Inputs: 524 ;// $predictType The prediction mode 525 ;// 526 ;// Outputs: 527 ;// Populated list of "M_CASE_OFFSET" macros 528 529 MACRO 530 M_CASE_MCRECONBLOCK $predictType 531 M_CASE_OFFSET 0, $predictType ;// 0 for rounding 532 M_CASE_OFFSET 1, $predictType ;// 1 for no rounding 533 MEND 534 ;// *************************************************************************** 535 ;// Description: 536 ;// Populates all 8 kinds of rounding and offset combinations handling macros 537 ;// for the specified predictType. In case of "IntegerPixel" predictType, 538 ;// rounding is not required so same code segment handles both cases 539 ;// 540 ;// Syntax: 541 ;// M_MCRECONBLOCK $predictType 542 ;// 543 ;// Inputs: 544 ;// $predictType The prediction mode 545 ;// 546 ;// Outputs: 547 ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for specified 548 ;// predictType. Each 549 ;// M_MCRECONBLOCK_<predictType> $rnd, $offset 550 ;// is an code segment (starting with a label indicating the predictType, 551 ;// rounding and offset combination) 552 ;// Four calls of this macro with the 4 prediction modes populate all the 32 553 ;// handlers 554 555 MACRO 556 M_MCRECONBLOCK $predictType 557 M_MCRECONBLOCK_$predictType 0, 0 558 M_MCRECONBLOCK_$predictType 0, 1 559 M_MCRECONBLOCK_$predictType 0, 2 560 M_MCRECONBLOCK_$predictType 0, 3 561 IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference 562 M_MCRECONBLOCK_$predictType 1, 0 563 M_MCRECONBLOCK_$predictType 1, 1 564 M_MCRECONBLOCK_$predictType 1, 2 565 M_MCRECONBLOCK_$predictType 1, 3 566 ENDIF 567 MEND 568 ;// *************************************************************************** 569 ;// Input/Output Registers 570 pSrc RN 0 571 srcStep RN 1 572 arg_pSrcResidue RN 2 573 pSrcResidue RN 12 574 pDst RN 3 575 dstStep RN 2 576 predictType RN 10 577 rndVal RN 11 578 mask RN 11 579 580 ;// Local Scratch Registers 581 zero RN 12 582 y RN 14 583 584 tmp1 RN 4 585 tmp2 RN 5 586 tmp3 RN 6 587 tmp4 RN 7 588 tmp5 RN 8 589 tmp6 RN 9 590 tmp7 RN 10 591 tmp8 RN 11 592 tmp9 RN 12 593 594 t00 RN 4 595 t01 RN 5 596 t10 RN 6 597 t11 RN 7 598 t20 RN 8 599 t21 RN 9 600 t30 RN 10 601 t31 RN 11 602 tmp RN 12 603 604 yMask RN 14 605 606 dst RN 1 607 return RN 0 608 609 ;// Allocate memory on stack 610 M_ALLOC4 Stk_pDst, 4 611 M_ALLOC4 Stk_pSrcResidue, 4 612 ;// Function header 613 M_START omxVCM4P2_MCReconBlock, r11 614 ;// Define stack arguments 615 M_ARG Arg_dstStep, 4 616 M_ARG Arg_predictType, 4 617 M_ARG Arg_rndVal, 4 618 ;// Save on stack 619 M_STR pDst, Stk_pDst 620 M_STR arg_pSrcResidue, Stk_pSrcResidue 621 ;// Load argument from the stack 622 M_LDR dstStep, Arg_dstStep 623 M_LDR predictType, Arg_predictType 624 M_LDR rndVal, Arg_rndVal 625 626 MOV y, #8 627 628 AND tmp1, pSrc, #3 629 ORR predictType, tmp1, predictType, LSL #3 630 ORR predictType, predictType, rndVal, LSL #2 631 ;// Truncating source pointer to align to 4 byte location 632 BIC pSrc, pSrc, #3 633 634 ;// Implementation takes care of all combinations of different 635 ;// predictTypes, rounding cases and source pointer offsets to alignment 636 ;// of 4 bytes in different code bases unless one of these parameter wasn't 637 ;// making any difference to the implementation. Below M_CASE_MCRECONBLOCK 638 ;// macros branch into 8 M_CASE macros for all combinations of the 2 639 ;// rounding cases and 4 offsets of the pSrc pointer to the 4 byte 640 ;// alignment. 641 M_SWITCH predictType 642 M_CASE_MCRECONBLOCK IntegerPixel 643 M_CASE_MCRECONBLOCK HalfPixelX 644 M_CASE_MCRECONBLOCK HalfPixelY 645 M_CASE_MCRECONBLOCK HalfPixelXY 646 M_ENDSWITCH 647 648 ;// The M_MCRECONBLOCK macros populate the code bases by calling all 8 649 ;// particular macros (4 in case of IntegerPixel as rounding makes no 650 ;// difference there) to generate the code for all cases of rounding and 651 ;// offsets. LTORG is used to segment the code as code size bloated beyond 652 ;// 4KB. 653 M_MCRECONBLOCK IntegerPixel 654 M_MCRECONBLOCK HalfPixelX 655 LTORG 656 M_MCRECONBLOCK HalfPixelY 657 M_MCRECONBLOCK HalfPixelXY 658 SwitchPredictTypeEnd 659 660 ;// Residue Addition 661 ;// This is done in 2 lane SIMD though loads are further optimized and 662 ;// 4 bytes are loaded in case of destination buffer. Algorithmic 663 ;// details are in inlined comments 664 M_LDR pSrcResidue, Stk_pSrcResidue 665 CMP pSrcResidue, #0 666 BEQ pSrcResidueConditionEnd 667 pSrcResidueNotNull 668 M_LDR pDst, Stk_pDst 669 MOV y, #8 670 SUB dstStep, dstStep, #4 671 Yloop_pSrcResidueNotNull 672 SUBS y, y, #1 673 LDR dst, [pDst] ;// dst = [dcba] 674 LDMIA pSrcResidue!, {tmp1, tmp2} ;// tmp1=[DC] tmp2=[BA] 675 PKHBT tmp3, tmp1, tmp2, LSL #16 ;// Deltaval1 = [C A] 676 PKHTB tmp4, tmp2, tmp1, ASR #16 ;// DeltaVal2 = [D B] 677 UXTB16 tmp1, dst ;// tmp1 = [0c0a] 678 UXTB16 tmp2, dst, ROR #8 ;// tmp2 = [0d0b] 679 QADD16 tmp1, tmp1, tmp3 ;// Add and saturate to 16 bits 680 QADD16 tmp2, tmp2, tmp4 681 USAT16 tmp1, #8, tmp1 682 USAT16 tmp2, #8, tmp2 ;// armClip(0, 255, tmp2) 683 ORR tmp1, tmp1, tmp2, LSL #8 ;// tmp1 = [dcba] 684 STR tmp1, [pDst], #4 685 686 LDR dst, [pDst] 687 LDMIA pSrcResidue!, {tmp1, tmp2} 688 PKHBT tmp3, tmp1, tmp2, LSL #16 689 PKHTB tmp4, tmp2, tmp1, ASR #16 690 UXTB16 tmp1, dst 691 UXTB16 tmp2, dst, ROR #8 692 QADD16 tmp1, tmp1, tmp3 693 QADD16 tmp2, tmp2, tmp4 694 USAT16 tmp1, #8, tmp1 695 USAT16 tmp2, #8, tmp2 696 ORR tmp1, tmp1, tmp2, LSL #8 697 STR tmp1, [pDst], dstStep 698 699 BGT Yloop_pSrcResidueNotNull 700 pSrcResidueConditionEnd 701 702 MOV return, #OMX_Sts_NoErr 703 704 M_END 705 ENDIF ;// ARM1136JS 706 707 ;// *************************************************************************** 708 ;// CortexA8 implementation 709 ;// *************************************************************************** 710 END 711 ;// *************************************************************************** 712 ;// omxVCM4P2_MCReconBlock ends 713 ;// *************************************************************************** 714