;//
;//
;// File Name:  omxVCM4P2_MCReconBlock_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// NEON implementation of omxVCM4P2_MCReconBlock. Everything below is
;// assembled only when the CortexA8 variant is selected (IF CortexA8).
;//

;// Include standard headers
        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files

        M_VARIANTS CortexA8

;// ***************************************************************************
;// ARM1136JS implementation
;// ***************************************************************************

;// ***************************************************************************
;// CortexA8 implementation
;// ***************************************************************************
        IF CortexA8
;// ***************************************************************************
;// MACRO DEFINITIONS
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "IntegerPixel" predictType. Both
;// rounding cases are handled. Just copies a block from pSrc to pDst
;//
;// Syntax:
;// M_MCRECONBLOCK_IntegerPixel
;//
;// Inputs: None
;// Outputs: None
;//
;// Details:
;// - Defines the labels CaseIntegerPixel_Rnd0 and CaseIntegerPixel_Rnd1 at the
;//   same address, so both rounding cases execute the same copy.
;// - Reads 8 rows of one D register (8 bytes) each from pSrc, stepping by
;//   srcStep, and writes them to pDst, stepping by dstStep.
;// - On exit pSrc has advanced by 8*srcStep and pDst by 8*dstStep.
;// - Uses registers dRow0..dRow7 and leaves through a branch to
;//   SwitchPredictTypeEnd.

        MACRO
        M_MCRECONBLOCK_IntegerPixel
CaseIntegerPixel_Rnd0
CaseIntegerPixel_Rnd1

        ;// Load the eight source rows (post-increment pSrc by srcStep)
        VLD1        dRow0, [pSrc], srcStep
        VLD1        dRow1, [pSrc], srcStep
        VLD1        dRow2, [pSrc], srcStep
        VLD1        dRow3, [pSrc], srcStep
        VLD1        dRow4, [pSrc], srcStep
        VLD1        dRow5, [pSrc], srcStep
        VLD1        dRow6, [pSrc], srcStep
        VLD1        dRow7, [pSrc], srcStep

        ;// Store them unchanged; @64 tells the core pDst is 8-byte aligned
        VST1        dRow0, [pDst@64], dstStep
        VST1        dRow1, [pDst@64], dstStep
        VST1        dRow2, [pDst@64], dstStep
        VST1        dRow3, [pDst@64], dstStep
        VST1        dRow4, [pDst@64], dstStep
        VST1        dRow5, [pDst@64], dstStep
        VST1        dRow6, [pDst@64], dstStep
        VST1        dRow7, [pDst@64], dstStep

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "HalfPixelX" predictType. The two
;// rounding cases are handled by the parameter "$rndVal". Averages between
;// a pixel and pixel right to it, rounding it based on $rndVal. The
;// rounding is implemented by using opCode switching between "VRHADD" and
;// "VHADD" instructions.
;//
;// Syntax:
;// M_MCRECONBLOCK_HalfPixelX $rndVal
;//
;// Inputs:
;//     $rndVal: 0 for rounding and 1 for no rounding
;// Outputs: None
;//
;// Details:
;// - Each expansion defines one label, CaseHalfPixelX_Rnd<$rndVal>.
;// - The local string variable M_VHADDR holds the mnemonic that is expanded
;//   as $M_VHADDR: "VRHADD" (rounding halving add) when $rndVal is 0, "VHADD"
;//   (truncating halving add) otherwise.
;// - Per row: dRowN/dRowNShft are loaded as a 16-byte pair, VEXT #1 turns
;//   dRowNShft into source bytes 1..8, and the halving add averages every
;//   pixel with its right-hand neighbour.
;// - NOTE(review): each row load fetches 16 bytes although only bytes 0..8
;//   are consumed; presumably callers guarantee that the extra bytes are
;//   readable - confirm.
;// - On exit pSrc has advanced by 8*srcStep and pDst by 8*dstStep.

        MACRO
        M_MCRECONBLOCK_HalfPixelX $rndVal

        LCLS        M_VHADDR
        IF $rndVal = 0
M_VHADDR SETS "VRHADD"
        ELSE
M_VHADDR SETS "VHADD"
        ENDIF

CaseHalfPixelX_Rnd$rndVal

        ;// Load each row plus the following 8 bytes, then build the row
        ;// shifted left by one pixel
        VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
        VEXT        dRow0Shft, dRow0, dRow0Shft, #1
        VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
        VEXT        dRow1Shft, dRow1, dRow1Shft, #1
        VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
        VEXT        dRow2Shft, dRow2, dRow2Shft, #1
        VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
        VEXT        dRow3Shft, dRow3, dRow3Shft, #1
        VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
        VEXT        dRow4Shft, dRow4, dRow4Shft, #1
        VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
        VEXT        dRow5Shft, dRow5, dRow5Shft, #1
        VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
        VEXT        dRow6Shft, dRow6, dRow6Shft, #1
        VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
        VEXT        dRow7Shft, dRow7, dRow7Shft, #1

        ;// Average row with shifted row; each store is issued one halving
        ;// add after the one that produced its data
        $M_VHADDR   dRow0, dRow0, dRow0Shft
        $M_VHADDR   dRow1, dRow1, dRow1Shft
        VST1        dRow0, [pDst@64], dstStep
        $M_VHADDR   dRow2, dRow2, dRow2Shft
        VST1        dRow1, [pDst@64], dstStep
        $M_VHADDR   dRow3, dRow3, dRow3Shft
        VST1        dRow2, [pDst@64], dstStep
        $M_VHADDR   dRow4, dRow4, dRow4Shft
        VST1        dRow3, [pDst@64], dstStep
        $M_VHADDR   dRow5, dRow5, dRow5Shft
        VST1        dRow4, [pDst@64], dstStep
        $M_VHADDR   dRow6, dRow6, dRow6Shft
        VST1        dRow5, [pDst@64], dstStep
        $M_VHADDR   dRow7, dRow7, dRow7Shft
        VST1        dRow6, [pDst@64], dstStep
        VST1        dRow7, [pDst@64], dstStep

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "HalfPixelY" predictType. The two
;// rounding cases are handled by the parameter "$rndVal". Averages between
;// a pixel and pixel below it, rounding it based on $rndVal.
;// The rounding is implemented by using opCode switching between "VRHADD" and
;// "VHADD" instructions.
;//
;// Syntax:
;// M_MCRECONBLOCK_HalfPixelY $rndVal
;//
;// Inputs:
;//     $rndVal: 0 for rounding and 1 for no rounding
;// Outputs: None
;//
;// Details:
;// - Each expansion defines one label, CaseHalfPixelY_Rnd<$rndVal>.
;// - The local string variable M_VHADDR holds the mnemonic that is expanded
;//   as $M_VHADDR: "VRHADD" when $rndVal is 0, "VHADD" otherwise.
;// - Nine source rows (dRow0..dRow8) are loaded; output row N is the halving
;//   add of source rows N and N+1.
;// - On exit pSrc has advanced by 9*srcStep and pDst by 8*dstStep.

        MACRO
        M_MCRECONBLOCK_HalfPixelY $rndVal

        LCLS        M_VHADDR
        IF $rndVal = 0
M_VHADDR SETS "VRHADD"
        ELSE
M_VHADDR SETS "VHADD"
        ENDIF

CaseHalfPixelY_Rnd$rndVal
        VLD1        dRow0, [pSrc], srcStep
        VLD1        dRow1, [pSrc], srcStep
        VLD1        dRow2, [pSrc], srcStep
        VLD1        dRow3, [pSrc], srcStep
        VLD1        dRow4, [pSrc], srcStep
        VLD1        dRow5, [pSrc], srcStep
        VLD1        dRow6, [pSrc], srcStep
        VLD1        dRow7, [pSrc], srcStep
        $M_VHADDR   dRow0, dRow0, dRow1
        VLD1        dRow8, [pSrc], srcStep          ;// ninth row, needed for output row 7
        $M_VHADDR   dRow1, dRow1, dRow2
        VST1        dRow0, [pDst@64], dstStep
        $M_VHADDR   dRow2, dRow2, dRow3
        VST1        dRow1, [pDst@64], dstStep
        $M_VHADDR   dRow3, dRow3, dRow4
        VST1        dRow2, [pDst@64], dstStep
        $M_VHADDR   dRow4, dRow4, dRow5
        VST1        dRow3, [pDst@64], dstStep
        $M_VHADDR   dRow5, dRow5, dRow6
        VST1        dRow4, [pDst@64], dstStep
        $M_VHADDR   dRow6, dRow6, dRow7
        VST1        dRow5, [pDst@64], dstStep
        $M_VHADDR   dRow7, dRow7, dRow8
        VST1        dRow6, [pDst@64], dstStep
        VST1        dRow7, [pDst@64], dstStep

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "HalfPixelXY" predictType. Both
;// rounding cases are handled.
;// Typical computation for a row goes like this
;//     1. VLD1    {dRow0, dRow0Shft}, [pSrc], srcStep ;// Load the row and next 8 bytes
;//     2. VEXT    dRow0Shft, dRow0, dRow0Shft, #1     ;// Generate the shifted row
;//     3. VADDL   qSum0, dRow0, dRow0Shft             ;// Generate the sum of row and shifted row
;//     5. VADD    qSum0, qSum0, qSum1                 ;// Add to the sum of next row (odd row sum has rounding value added to it)
;//     6. VSHRN   dRow0, qSum0, #2                    ;// Divide by 4
;//     7. VST1    dRow0, [pDst@64], dstStep           ;// Store
;// Odd rows undergo following computation after step 3
;//     4. VADD    qSum1, qSum1, qRound
;// This saves for adding rounding value to each final sum (overall saves 4
;// instructions).
;// There is reuse of registers for qSum6, qSum7 & qSum8. Overall scheduling takes
;// care of this and also minimizes stalls. Rounding value was modified in
;// ARM register rndVal (originally used for rounding flag) before the switch.
;// It is then populated into all lanes in this macro. No branching out to
;// label "SwitchPredictTypeEnd" is required in the end of the macro as these
;// are the last of switch cases.
;//
;// Syntax:
;// M_MCRECONBLOCK_HalfPixelXY
;//
;// Inputs: None
;// Outputs: None
;//
;// Details:
;// - Defines the labels CaseHalfPixelXY_Rnd0 and CaseHalfPixelXY_Rnd1 at the
;//   same address; the two cases differ only in the value held in rndVal.
;// - qSumN is the 16-bit sum of source row N and the same row shifted by one
;//   pixel. Output row N is (qSumN + qSumN+1 + rounding) >> 2; the rounding
;//   constant is pre-added to the odd sums (1, 3, 5, 7) only, so every pair
;//   of adjacent sums contains it exactly once.
;// - qSum6/qSum7/qSum8 are Q0/Q1/Q2, which overlap D0..D5 (dRow0..dRow2Shft).
;//   They are written only after the corresponding output rows 0..2 have been
;//   stored, so the instruction order below must not be changed.
;// - On exit pSrc has advanced by 9*srcStep and pDst by 8*dstStep.

        MACRO
        M_MCRECONBLOCK_HalfPixelXY

CaseHalfPixelXY_Rnd0
CaseHalfPixelXY_Rnd1
        VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
        VDUP        qRound, rndVal                      ;// rounding constant in every 16-bit lane
        VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
        VEXT        dRow0Shft, dRow0, dRow0Shft, #1
        VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
        VEXT        dRow1Shft, dRow1, dRow1Shft, #1
        VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
        VEXT        dRow2Shft, dRow2, dRow2Shft, #1
        VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
        VADDL       qSum0, dRow0, dRow0Shft
        VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
        VADDL       qSum1, dRow1, dRow1Shft
        VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
        VEXT        dRow3Shft, dRow3, dRow3Shft, #1
        VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
        VEXT        dRow4Shft, dRow4, dRow4Shft, #1
        VLD1        {dRow8, dRow8Shft}, [pSrc], srcStep
        VADD        qSum1, qSum1, qRound                ;// odd sum carries the rounding
        VADDL       qSum2, dRow2, dRow2Shft
        VEXT        dRow5Shft, dRow5, dRow5Shft, #1
        VADD        qSum0, qSum0, qSum1                 ;// output row 0 = rows 0 + 1
        VADDL       qSum3, dRow3, dRow3Shft
        VEXT        dRow6Shft, dRow6, dRow6Shft, #1
        VADD        qSum1, qSum1, qSum2                 ;// output row 1 = rows 1 + 2
        VSHRN       dRow0, qSum0, #2
        VADDL       qSum4, dRow4, dRow4Shft
        VSHRN       dRow1, qSum1, #2
        VADD        qSum3, qSum3, qRound
        VADDL       qSum5, dRow5, dRow5Shft
        VST1        dRow0, [pDst@64], dstStep
        VEXT        dRow7Shft, dRow7, dRow7Shft, #1
        VST1        dRow1, [pDst@64], dstStep
        VEXT        dRow8Shft, dRow8, dRow8Shft, #1
        VADD        qSum5, qSum5, qRound
        VADD        qSum2, qSum2, qSum3                 ;// output row 2 = rows 2 + 3
        VADD        qSum3, qSum3, qSum4                 ;// output row 3 = rows 3 + 4
        VADD        qSum4, qSum4, qSum5                 ;// output row 4 = rows 4 + 5
        VSHRN       dRow2, qSum2, #2
        VSHRN       dRow3, qSum3, #2
        VSHRN       dRow4, qSum4, #2
        VADDL       qSum6, dRow6, dRow6Shft             ;// Q0 reused: output rows 0/1 already stored
        VADDL       qSum7, dRow7, dRow7Shft             ;// Q1 reused: output row 1 already stored
        VST1        dRow2, [pDst@64], dstStep
        VADDL       qSum8, dRow8, dRow8Shft             ;// Q2 reused: output row 2 stored just above
        VADD        qSum7, qSum7, qRound
        VST1        dRow3, [pDst@64], dstStep
        VST1        dRow4, [pDst@64], dstStep
        VADD        qSum5, qSum5, qSum6                 ;// output row 5 = rows 5 + 6
        VADD        qSum6, qSum6, qSum7                 ;// output row 6 = rows 6 + 7
        VADD        qSum7, qSum7, qSum8                 ;// output row 7 = rows 7 + 8
        VSHRN       dRow5, qSum5, #2
        VSHRN       dRow6, qSum6, #2
        VSHRN       dRow7, qSum7, #2
        VST1        dRow5, [pDst@64], dstStep
        VST1        dRow6, [pDst@64], dstStep
        VST1        dRow7, [pDst@64], dstStep

        MEND
;// ***************************************************************************

;// Input/Output Registers
pSrc            RN 0
srcStep         RN 1
pSrcResidue     RN 2
pDst            RN 3
dstStep         RN 4
predictType     RN 5
rndVal          RN 6

;// Local Scratch Registers
;// Both alias r0 (pSrc); they are used only after the source has been read.
pDstCopy        RN 0
return          RN 0

;// Neon Registers
;// Source row N and its companion register (the next 8 bytes, later the row
;// shifted by one pixel) occupy an even/odd D-register pair.
dRow0           DN D0.U8
dRow0Shft       DN D1.U8
dRow1           DN D2.U8
dRow1Shft       DN D3.U8
dRow2           DN D4.U8
dRow2Shft       DN D5.U8
dRow3           DN D6.U8
dRow3Shft       DN D7.U8
dRow4           DN D8.U8
dRow4Shft       DN D9.U8
dRow5           DN D10.U8
dRow5Shft       DN D11.U8
dRow6           DN D12.U8
dRow6Shft       DN D13.U8
dRow7           DN D14.U8
dRow7Shft       DN D15.U8
dRow8           DN D16.U8
dRow8Shft       DN D17.U8


;// 16-bit row sums for the HalfPixelXY case. qSum6..qSum8 reuse Q0..Q2,
;// i.e. the registers that also hold dRow0..dRow2Shft.
qSum0           QN Q9.U16
qSum1           QN Q10.U16
qSum2           QN Q11.U16
qSum3           QN Q12.U16
qSum4           QN Q13.U16
qSum5           QN Q14.U16
qSum6           QN Q0.U16
qSum7           QN Q1.U16
qSum8           QN Q2.U16

qRound          QN Q15.U16

;// Destination rows used by the residue-add code; these alias D0 upwards
dDst0           DN D0.U8
dDst1           DN D1.U8
dDst2           DN D2.U8
dDst3           DN D3.U8
dDst4           DN D4.U8
dDst5           DN D5.U8
dDst6           DN D6.U8
dDst7           DN D7.U8

;// Residue rows: one Q register (eight signed 16-bit values) per row
qRes0           QN Q4.S16
qRes1           QN Q5.S16
qRes2           QN Q6.S16
qRes3           QN Q7.S16
qRes4           QN Q8.S16
qRes5           QN Q9.S16
qRes6           QN Q10.S16
qRes7           QN Q11.S16

;// ---------------------------------------------------------------------------
;// Function: omxVCM4P2_MCReconBlock
;//
;// Builds an 8-row by 8-byte predicted block at pDst from pSrc using one of
;// the interpolation macros and, when pSrcResidue is not zero, adds the
;// residue to that block in place.
;//
;// Register arguments: pSrc (r0), srcStep (r1), pSrcResidue (r2), pDst (r3)
;// Stack arguments:    dstStep, predictType, rndVal (loaded with M_LDR below)
;// Result:             r0 ("return") = OMX_Sts_NoErr, unconditionally
;//
;// No argument validation is performed in this code.
;// NOTE(review): "r6, d15" presumably tells M_START which core/NEON registers
;// to preserve across the call - confirm against armCOMM_s.h.
;// ---------------------------------------------------------------------------

        ;// Function header
        M_START     omxVCM4P2_MCReconBlock, r6, d15
        ;// Define stack arguments
        M_ARG       Arg_dstStep,        4
        M_ARG       Arg_predictType,    4
        M_ARG       Arg_rndVal,         4
        ;// Load argument from the stack
        M_LDR       dstStep, Arg_dstStep
        M_LDR       predictType, Arg_predictType
        M_LDR       rndVal, Arg_rndVal
        ADD         predictType, rndVal, predictType, LSL #1   ;// switch index = 2*predictType + rndVal
        RSB         rndVal, rndVal, #2              ;// preparing rndVal for HalfPixelXY

        ;// The following is implementation of switching to different code segments
        ;// based on different predictType and rndVal flags. The corresponding
        ;// labels (e.g. CaseIntegerPixel_Rnd0) are embedded in the macros following
        ;// M_ENDSWITCH (e.g. M_MCRECONBLOCK_IntegerPixel). While "M_MCRECONBLOCK_IntegerPixel"
        ;// and "M_MCRECONBLOCK_HalfPixelXY" handle for both rounding cases;
        ;// "M_MCRECONBLOCK_HalfPixelX" and "M_MCRECONBLOCK_HalfPixelY" macros handle
        ;// the two rounding cases in separate code bases.
        ;// All these together implement the interpolation functionality
        ;//
        ;// NOTE(review): M_SWITCH presumably jumps to the M_CASE entry whose
        ;// position equals the index computed above; the index is not range
        ;// checked here - confirm how M_SWITCH treats values outside 0..7.

        M_SWITCH    predictType
            M_CASE      CaseIntegerPixel_Rnd0
            M_CASE      CaseIntegerPixel_Rnd1
            M_CASE      CaseHalfPixelX_Rnd0
            M_CASE      CaseHalfPixelX_Rnd1
            M_CASE      CaseHalfPixelY_Rnd0
            M_CASE      CaseHalfPixelY_Rnd1
            M_CASE      CaseHalfPixelXY_Rnd0
            M_CASE      CaseHalfPixelXY_Rnd1
        M_ENDSWITCH

        ;// Case bodies. All but the last end with a branch to
        ;// SwitchPredictTypeEnd; the HalfPixelXY body falls through to it.
        M_MCRECONBLOCK_IntegerPixel
        M_MCRECONBLOCK_HalfPixelX 0
        M_MCRECONBLOCK_HalfPixelX 1
        M_MCRECONBLOCK_HalfPixelY 0
        M_MCRECONBLOCK_HalfPixelY 1
        M_MCRECONBLOCK_HalfPixelXY
SwitchPredictTypeEnd

        ;// After interpolation is done, residue needs to be added. This is done
        ;// only in case "pSrcResidue" parameter to the function is not NULL.
        ;// Following is a completely unrolled code to do so. Each row and
        ;// corresponding residue is loaded and residue is added and value
        ;// stored

        CMP         pSrcResidue, #0
        SUBNE       pDst, pDst, dstStep, LSL #3     ;// Restoring pDst
        MOVNE       pDstCopy, pDst                  ;// second pointer for the write-back stores
        BEQ         pSrcResidueConditionEnd
pSrcResidueNotNull
        ;// pDst re-reads the predicted rows, pSrcResidue walks the residue
        ;// (one Q register per row, with write-back), and pDstCopy stores the
        ;// sums after VQMOVUN has narrowed them with unsigned saturation.
        VLD1        dDst0, [pDst@64], dstStep
        VLD1        qRes0, [pSrcResidue@128]!
        VLD1        dDst1, [pDst@64], dstStep
        VLD1        qRes1, [pSrcResidue@128]!
        VLD1        dDst2, [pDst@64], dstStep
        VLD1        qRes2, [pSrcResidue@128]!
        VADDW       qRes0, qRes0, dDst0
        VLD1        dDst3, [pDst@64], dstStep
        VADDW       qRes1, qRes1, dDst1
        VLD1        qRes3, [pSrcResidue@128]!
        VADDW       qRes2, qRes2, dDst2
        VLD1        dDst4, [pDst@64], dstStep
        VQMOVUN     dDst0, qRes0
        VLD1        qRes4, [pSrcResidue@128]!
        VADDW       qRes3, qRes3, dDst3
        VLD1        dDst5, [pDst@64], dstStep
        VQMOVUN     dDst1, qRes1
        VLD1        qRes5, [pSrcResidue@128]!
        VADDW       qRes4, qRes4, dDst4
        VLD1        dDst6, [pDst@64], dstStep
        VQMOVUN     dDst2, qRes2
        VLD1        qRes6, [pSrcResidue@128]!
        VADDW       qRes5, qRes5, dDst5
        VLD1        dDst7, [pDst@64], dstStep
        VQMOVUN     dDst3, qRes3
        VLD1        qRes7, [pSrcResidue@128]!
        VADDW       qRes6, qRes6, dDst6
        VST1        dDst0, [pDstCopy@64], dstStep
        VQMOVUN     dDst4, qRes4
        VST1        dDst1, [pDstCopy@64], dstStep
        VADDW       qRes7, qRes7, dDst7
        VST1        dDst2, [pDstCopy@64], dstStep
        VQMOVUN     dDst5, qRes5
        VST1        dDst3, [pDstCopy@64], dstStep
        VQMOVUN     dDst6, qRes6
        VST1        dDst4, [pDstCopy@64], dstStep
        VQMOVUN     dDst7, qRes7
        VST1        dDst5, [pDstCopy@64], dstStep
        VST1        dDst6, [pDstCopy@64], dstStep
        VST1        dDst7, [pDstCopy@64], dstStep

pSrcResidueConditionEnd
        MOV         return, #OMX_Sts_NoErr

        M_END
        ENDIF ;// CortexA8
        END
;// ***************************************************************************
;// omxVCM4P2_MCReconBlock ends
;// ***************************************************************************