1 ;// 2 ;// Copyright (C) 2007-2008 ARM Limited 3 ;// 4 ;// Licensed under the Apache License, Version 2.0 (the "License"); 5 ;// you may not use this file except in compliance with the License. 6 ;// You may obtain a copy of the License at 7 ;// 8 ;// http://www.apache.org/licenses/LICENSE-2.0 9 ;// 10 ;// Unless required by applicable law or agreed to in writing, software 11 ;// distributed under the License is distributed on an "AS IS" BASIS, 12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 ;// See the License for the specific language governing permissions and 14 ;// limitations under the License. 15 ;// 16 ;// 17 ;// 18 ;// File Name: omxVCM4P2_MCReconBlock_s.s 19 ;// OpenMAX DL: v1.0.2 20 ;// Revision: 12290 21 ;// Date: Wednesday, April 9, 2008 22 ;// 23 ;// 24 ;// 25 ;// 26 ;// Description: 27 ;// 28 ;// 29 30 ;// Include standard headers 31 INCLUDE omxtypes_s.h 32 INCLUDE armCOMM_s.h 33 34 ;// Import symbols required from other files 35 36 M_VARIANTS CortexA8 37 38 ;// *************************************************************************** 39 ;// ARM1136JS implementation 40 ;// *************************************************************************** 41 42 ;// *************************************************************************** 43 ;// CortexA8 implementation 44 ;// *************************************************************************** 45 IF CortexA8 46 ;// *************************************************************************** 47 ;// MACRO DEFINITIONS 48 ;// *************************************************************************** 49 ;// Description: 50 ;// Does interpolation for the case of "IntegerPixel" predictType. Both 51 ;// rounding cases are handled. 
;//             Just copies a block from pSrc to pDst
;//
;// Syntax:
;// M_MCRECONBLOCK_IntegerPixel
;//
;// Inputs: None
;// Outputs: None

        MACRO
        M_MCRECONBLOCK_IntegerPixel
;// Both rounding cases land here: a plain copy is unaffected by the
;// rounding flag, so the Rnd0 and Rnd1 switch targets share one path.
CaseIntegerPixel_Rnd0
CaseIntegerPixel_Rnd1

        ;// Load the eight 8-byte source rows, advancing pSrc by srcStep per row.
        VLD1        dRow0, [pSrc], srcStep
        VLD1        dRow1, [pSrc], srcStep
        VLD1        dRow2, [pSrc], srcStep
        VLD1        dRow3, [pSrc], srcStep
        VLD1        dRow4, [pSrc], srcStep
        VLD1        dRow5, [pSrc], srcStep
        VLD1        dRow6, [pSrc], srcStep
        VLD1        dRow7, [pSrc], srcStep

        ;// Store the eight rows to pDst (asserted 64-bit aligned via @64),
        ;// advancing pDst by dstStep per row.
        VST1        dRow0, [pDst@64], dstStep
        VST1        dRow1, [pDst@64], dstStep
        VST1        dRow2, [pDst@64], dstStep
        VST1        dRow3, [pDst@64], dstStep
        VST1        dRow4, [pDst@64], dstStep
        VST1        dRow5, [pDst@64], dstStep
        VST1        dRow6, [pDst@64], dstStep
        VST1        dRow7, [pDst@64], dstStep

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "HalfPixelX" predictType. The two
;// rounding cases are handled by the parameter "$rndVal". Averages between
;// a pixel and pixel right to it, rounding it based on $rndVal. The
;// rounding is implemented by using opCode switching between "VRHADD" and
;// "VHADD" instructions.
;//
;// Syntax:
;// M_MCRECONBLOCK_HalfPixelX $rndVal
;//
;// Inputs:
;// $rndVal: 0 for rounding and 1 for no rounding
;// Outputs: None

        MACRO
        M_MCRECONBLOCK_HalfPixelX $rndVal

        ;// Choose the halving-add opcode at assembly time:
        ;//   $rndVal = 0 -> VRHADD : rounding average,   (a + b + 1) >> 1
        ;//   $rndVal = 1 -> VHADD  : truncating average, (a + b) >> 1
        LCLS M_VHADDR
    IF $rndVal = 0
M_VHADDR SETS "VRHADD"
    ELSE
M_VHADDR SETS "VHADD"
    ENDIF

CaseHalfPixelX_Rnd$rndVal

        ;// Per row: load 16 bytes (row + the byte to its right), then build
        ;// the one-pixel-right-shifted copy of the row with VEXT.
        VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
        VEXT        dRow0Shft, dRow0, dRow0Shft, #1
        VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
        VEXT        dRow1Shft, dRow1, dRow1Shft, #1
        VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
        VEXT        dRow2Shft, dRow2, dRow2Shft, #1
        VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
        VEXT        dRow3Shft, dRow3, dRow3Shft, #1
        VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
        VEXT        dRow4Shft, dRow4, dRow4Shft, #1
        VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
        VEXT        dRow5Shft, dRow5, dRow5Shft, #1
        VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
        VEXT        dRow6Shft, dRow6, dRow6Shft, #1
        VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
        VEXT        dRow7Shft, dRow7, dRow7Shft, #1
        ;// Average each row with its shifted copy; the stores are interleaved
        ;// with the arithmetic to hide result latency.
        $M_VHADDR   dRow0, dRow0, dRow0Shft
        $M_VHADDR   dRow1, dRow1, dRow1Shft
        VST1        dRow0, [pDst@64], dstStep
        $M_VHADDR   dRow2, dRow2, dRow2Shft
        VST1        dRow1, [pDst@64], dstStep
        $M_VHADDR   dRow3, dRow3, dRow3Shft
        VST1        dRow2, [pDst@64], dstStep
        $M_VHADDR   dRow4, dRow4, dRow4Shft
        VST1        dRow3, [pDst@64], dstStep
        $M_VHADDR   dRow5, dRow5, dRow5Shft
        VST1        dRow4, [pDst@64], dstStep
        $M_VHADDR   dRow6, dRow6, dRow6Shft
        VST1        dRow5, [pDst@64], dstStep
        $M_VHADDR   dRow7, dRow7, dRow7Shft
        VST1        dRow6, [pDst@64], dstStep
        VST1        dRow7, [pDst@64], dstStep

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "HalfPixelY" predictType. The two
;// rounding cases are handled by the parameter "$rndVal". Averages between
;// a pixel and pixel below it, rounding it based on $rndVal.
;// The
;// rounding is implemented by using opCode switching between "VRHADD" and
;// "VHADD" instructions.
;//
;// Syntax:
;// M_MCRECONBLOCK_HalfPixelY $rndVal
;//
;// Inputs:
;// $rndVal: 0 for rounding and 1 for no rounding
;// Outputs: None

        MACRO
        M_MCRECONBLOCK_HalfPixelY $rndVal

        ;// Choose the halving-add opcode at assembly time:
        ;//   $rndVal = 0 -> VRHADD : rounding average,   (a + b + 1) >> 1
        ;//   $rndVal = 1 -> VHADD  : truncating average, (a + b) >> 1
        LCLS M_VHADDR
    IF $rndVal = 0
M_VHADDR SETS "VRHADD"
    ELSE
M_VHADDR SETS "VHADD"
    ENDIF

CaseHalfPixelY_Rnd$rndVal
        ;// Vertical interpolation needs 9 source rows; the first 8 are
        ;// loaded up front, the 9th is loaded once averaging has begun.
        VLD1        dRow0, [pSrc], srcStep
        VLD1        dRow1, [pSrc], srcStep
        VLD1        dRow2, [pSrc], srcStep
        VLD1        dRow3, [pSrc], srcStep
        VLD1        dRow4, [pSrc], srcStep
        VLD1        dRow5, [pSrc], srcStep
        VLD1        dRow6, [pSrc], srcStep
        VLD1        dRow7, [pSrc], srcStep
        ;// Average each row with the row below it; loads/stores are
        ;// interleaved with the arithmetic to hide result latency.
        $M_VHADDR   dRow0, dRow0, dRow1
        VLD1        dRow8, [pSrc], srcStep
        $M_VHADDR   dRow1, dRow1, dRow2
        VST1        dRow0, [pDst@64], dstStep
        $M_VHADDR   dRow2, dRow2, dRow3
        VST1        dRow1, [pDst@64], dstStep
        $M_VHADDR   dRow3, dRow3, dRow4
        VST1        dRow2, [pDst@64], dstStep
        $M_VHADDR   dRow4, dRow4, dRow5
        VST1        dRow3, [pDst@64], dstStep
        $M_VHADDR   dRow5, dRow5, dRow6
        VST1        dRow4, [pDst@64], dstStep
        $M_VHADDR   dRow6, dRow6, dRow7
        VST1        dRow5, [pDst@64], dstStep
        $M_VHADDR   dRow7, dRow7, dRow8
        VST1        dRow6, [pDst@64], dstStep
        VST1        dRow7, [pDst@64], dstStep

        B           SwitchPredictTypeEnd
        MEND
;// ***************************************************************************
;// Description:
;// Does interpolation for the case of "HalfPixelXY" predictType. Both
;// rounding cases are handled.
;// Typical computation for a row goes like this
;// 1. VLD1 {dRow0, dRow0Shft}, [pSrc], srcStep ;// Load the row and next 8 bytes
;// 2. VEXT dRow0Shft, dRow0, dRow0Shft, #1     ;// Generate the shifted row
;// 3. VADDL qSum0, dRow0, dRow0Shft            ;// Generate the sum of row and shifted row
;// 5. VADD qSum0, qSum0, qSum1                 ;// Add to the sum of next row (odd row sum has rounding value added to it)
;// 6.
;//    VSHRN dRow0, qSum0, #2                   ;// Divide by 4
;// 7. VST1 dRow0, [pDst@64], dstStep           ;// Store
;// Odd rows undergo following computation after step 3
;// 4. VADD qSum1, qSum1, qRound
;// This saves for adding rounding value to each final sum (overall saves 4
;// instructions).
;// There is reuse of registers for qSum6, qSum7 & qSum8. Overall scheduling takes
;// care of this and also minimizes stalls. Rounding value was modified in
;// ARM register rndVal (originally used for rounding flag) before the switch.
;// It is then populated into all lanes in this macro. No branching out to
;// label "SwitchPredictTypeEnd" is required in the end of the macro as these
;// are the last of switch cases.
;//
;// Syntax:
;// M_MCRECONBLOCK_HalfPixelXY
;//
;// Inputs: None
;// Outputs: None

        MACRO
        M_MCRECONBLOCK_HalfPixelXY

;// Both rounding cases share this path; the difference is carried in the
;// precomputed per-lane constant held in rndVal (broadcast via VDUP below).
CaseHalfPixelXY_Rnd0
CaseHalfPixelXY_Rnd1
        ;// Load 9 rows of 16 bytes and build the one-pixel-right-shifted
        ;// copy of each; loads, VEXTs and sums are interleaved for scheduling.
        VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
        VDUP        qRound, rndVal                  ;// broadcast rounding constant to all 8 lanes
        VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
        VEXT        dRow0Shft, dRow0, dRow0Shft, #1
        VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
        VEXT        dRow1Shft, dRow1, dRow1Shft, #1
        VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
        VEXT        dRow2Shft, dRow2, dRow2Shft, #1
        VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
        VADDL       qSum0, dRow0, dRow0Shft         ;// widen to 16-bit: row + shifted row
        VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
        VADDL       qSum1, dRow1, dRow1Shft
        VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
        VEXT        dRow3Shft, dRow3, dRow3Shft, #1
        VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
        VEXT        dRow4Shft, dRow4, dRow4Shft, #1
        VLD1        {dRow8, dRow8Shft}, [pSrc], srcStep
        VADD        qSum1, qSum1, qRound            ;// fold rounding into the odd row sum (shared by two outputs)
        VADDL       qSum2, dRow2, dRow2Shft
        VEXT        dRow5Shft, dRow5, dRow5Shft, #1
        VADD        qSum0, qSum0, qSum1             ;// combine with next row's sum -> 4-pixel total
        VADDL       qSum3, dRow3, dRow3Shft
        VEXT        dRow6Shft, dRow6, dRow6Shft, #1
        VADD        qSum1, qSum1, qSum2
        VSHRN       dRow0, qSum0, #2                ;// divide by 4 and narrow back to 8-bit
        VADDL       qSum4, dRow4, dRow4Shft
        VSHRN       dRow1, qSum1, #2
        VADD        qSum3, qSum3, qRound
        VADDL       qSum5, dRow5, dRow5Shft
        VST1        dRow0, [pDst@64], dstStep
        VEXT        dRow7Shft, dRow7, dRow7Shft, #1
        VST1        dRow1, [pDst@64], dstStep
        VEXT        dRow8Shft, dRow8, dRow8Shft, #1
        VADD        qSum5, qSum5, qRound
        VADD        qSum2, qSum2, qSum3
        VADD        qSum3, qSum3, qSum4
        VADD        qSum4, qSum4, qSum5
        VSHRN       dRow2, qSum2, #2
        VSHRN       dRow3, qSum3, #2
        VSHRN       dRow4, qSum4, #2
        ;// qSum6-qSum8 alias Q0-Q2 (i.e. dRow0-dRow5), whose contents are
        ;// dead by this point; see the register-reuse note above.
        VADDL       qSum6, dRow6, dRow6Shft
        VADDL       qSum7, dRow7, dRow7Shft
        VST1        dRow2, [pDst@64], dstStep
        VADDL       qSum8, dRow8, dRow8Shft
        VADD        qSum7, qSum7, qRound
        VST1        dRow3, [pDst@64], dstStep
        VST1        dRow4, [pDst@64], dstStep
        VADD        qSum5, qSum5, qSum6
        VADD        qSum6, qSum6, qSum7
        VADD        qSum7, qSum7, qSum8
        VSHRN       dRow5, qSum5, #2
        VSHRN       dRow6, qSum6, #2
        VSHRN       dRow7, qSum7, #2
        VST1        dRow5, [pDst@64], dstStep
        VST1        dRow6, [pDst@64], dstStep
        VST1        dRow7, [pDst@64], dstStep

        MEND
;// ***************************************************************************

;// Input/Output Registers (argument order per the OpenMAX DL prototype:
;// pSrc, srcStep, pSrcResidue in r0-r2; pDst in r3; rest on the stack)
pSrc            RN 0
srcStep         RN 1
pSrcResidue     RN 2
pDst            RN 3
dstStep         RN 4
predictType     RN 5
rndVal          RN 6

;// Local Scratch Registers (reuse r0 once pSrc is no longer needed)
pDstCopy        RN 0
return          RN 0

;// Neon Registers
;// dRowN holds source row N; dRowNShft holds the same row shifted left by
;// one byte (i.e. the pixel to the right), built with VEXT.
dRow0           DN D0.U8
dRow0Shft       DN D1.U8
dRow1           DN D2.U8
dRow1Shft       DN D3.U8
dRow2           DN D4.U8
dRow2Shft       DN D5.U8
dRow3           DN D6.U8
dRow3Shft       DN D7.U8
dRow4           DN D8.U8
dRow4Shft       DN D9.U8
dRow5           DN D10.U8
dRow5Shft       DN D11.U8
dRow6           DN D12.U8
dRow6Shft       DN D13.U8
dRow7           DN D14.U8
dRow7Shft       DN D15.U8
dRow8           DN D16.U8
dRow8Shft       DN D17.U8


;// 16-bit widened row sums for the HalfPixelXY case.
;// NOTE: qSum6-qSum8 deliberately alias Q0-Q2 (dRow0-dRow5); the
;// HalfPixelXY schedule only uses them after those rows are consumed.
qSum0           QN Q9.U16
qSum1           QN Q10.U16
qSum2           QN Q11.U16
qSum3           QN Q12.U16
qSum4           QN Q13.U16
qSum5           QN Q14.U16
qSum6           QN Q0.U16
qSum7           QN Q1.U16
qSum8           QN Q2.U16

qRound          QN Q15.U16

;// Destination rows for the residue-add pass (alias D0-D7; the
;// interpolated rows are re-read from pDst by then, so this is safe).
dDst0           DN D0.U8
dDst1           DN D1.U8
dDst2           DN D2.U8
dDst3           DN D3.U8
dDst4           DN D4.U8
dDst5           DN D5.U8
dDst6           DN D6.U8
dDst7           DN D7.U8

;// Signed 16-bit residue rows loaded from pSrcResidue.
qRes0           QN Q4.S16
qRes1           QN Q5.S16
qRes2           QN Q6.S16
qRes3           QN Q7.S16
qRes4           QN Q8.S16
qRes5           QN Q9.S16
qRes6           QN Q10.S16
qRes7           QN Q11.S16

;// Function header
;// omxVCM4P2_MCReconBlock: 8x8 motion-compensated block reconstruction.
;// Interpolates an 8x8 prediction from pSrc into pDst according to
;// predictType/rndVal, then (if pSrcResidue != NULL) adds the 16-bit
;// residue with unsigned saturation.
;// NOTE(review): M_START's "r6, d15" presumably declares the registers
;// preserved by the M_START/M_END prologue/epilogue - confirm against
;// armCOMM_s.h.
        M_START omxVCM4P2_MCReconBlock, r6, d15
        ;// Define stack arguments
        M_ARG   Arg_dstStep, 4
        M_ARG   Arg_predictType, 4
        M_ARG   Arg_rndVal, 4
        ;// Load argument from the stack
        M_LDR   dstStep, Arg_dstStep
        M_LDR   predictType, Arg_predictType
        M_LDR   rndVal, Arg_rndVal
        ;// Switch index = 2*predictType + rndVal (8 cases below).
        ADD     predictType, rndVal, predictType, LSL #1
        ;// rndVal = 2 - rndVal: per-lane rounding constant for HalfPixelXY
        ;// (added once per shared odd-row sum before the >>2).
        RSB     rndVal, rndVal, #2          ;// preparing rndVal for HalfPixelXY

        ;// The following is implementation of switching to different code segments
        ;// based on different predictType and rndVal flags. The corresponding
        ;// labels (e.g. CaseIntegerPixel_Rnd0) are embedded in the macros following
        ;// M_ENDSWITCH (e.g. M_MCRECONBLOCK_IntegerPixel). While "M_MCRECONBLOCK_IntegerPixel"
        ;// and "M_MCRECONBLOCK_HalfPixelXY" handle for both rounding cases;
        ;// "M_MCRECONBLOCK_HalfPixelX" and "M_MCRECONBLOCK_HalfPixelY" macros handle
        ;// the two rounding cases in separate code bases.
        ;// All these together implement the interpolation functionality

        M_SWITCH predictType
        M_CASE  CaseIntegerPixel_Rnd0
        M_CASE  CaseIntegerPixel_Rnd1
        M_CASE  CaseHalfPixelX_Rnd0
        M_CASE  CaseHalfPixelX_Rnd1
        M_CASE  CaseHalfPixelY_Rnd0
        M_CASE  CaseHalfPixelY_Rnd1
        M_CASE  CaseHalfPixelXY_Rnd0
        M_CASE  CaseHalfPixelXY_Rnd1
        M_ENDSWITCH

        ;// Expand the case bodies; each macro ends by branching to
        ;// SwitchPredictTypeEnd (except HalfPixelXY, which falls through).
        M_MCRECONBLOCK_IntegerPixel
        M_MCRECONBLOCK_HalfPixelX 0
        M_MCRECONBLOCK_HalfPixelX 1
        M_MCRECONBLOCK_HalfPixelY 0
        M_MCRECONBLOCK_HalfPixelY 1
        M_MCRECONBLOCK_HalfPixelXY
SwitchPredictTypeEnd

        ;// After interpolation is done, residue needs to be added. This is done
        ;// only in case "pSrcResidue" parameter to the function is not NULL.
        ;// Following is a completely unrolled code to do so. Each row and
        ;// corresponding residue is loaded and residue is added and value
        ;// stored

        CMP     pSrcResidue, #0
        ;// pDst was advanced 8*dstStep by the interpolation stores; rewind it
        ;// so the prediction can be re-read, and keep a write cursor in r0.
        SUBNE   pDst, pDst, dstStep, LSL #3 ;// Restoring pDst
        MOVNE   pDstCopy, pDst
        BEQ     pSrcResidueConditionEnd
pSrcResidueNotNull
        ;// For each of the 8 rows: reload the U8 prediction, load the S16
        ;// residue, widen-add (VADDW), then saturate back to U8 (VQMOVUN).
        ;// Loads, adds and stores are interleaved to hide latency.
        VLD1        dDst0, [pDst@64], dstStep
        VLD1        qRes0, [pSrcResidue@128]!
        VLD1        dDst1, [pDst@64], dstStep
        VLD1        qRes1, [pSrcResidue@128]!
        VLD1        dDst2, [pDst@64], dstStep
        VLD1        qRes2, [pSrcResidue@128]!
        VADDW       qRes0, qRes0, dDst0
        VLD1        dDst3, [pDst@64], dstStep
        VADDW       qRes1, qRes1, dDst1
        VLD1        qRes3, [pSrcResidue@128]!
        VADDW       qRes2, qRes2, dDst2
        VLD1        dDst4, [pDst@64], dstStep
        VQMOVUN     dDst0, qRes0
        VLD1        qRes4, [pSrcResidue@128]!
        VADDW       qRes3, qRes3, dDst3
        VLD1        dDst5, [pDst@64], dstStep
        VQMOVUN     dDst1, qRes1
        VLD1        qRes5, [pSrcResidue@128]!
        VADDW       qRes4, qRes4, dDst4
        VLD1        dDst6, [pDst@64], dstStep
        VQMOVUN     dDst2, qRes2
        VLD1        qRes6, [pSrcResidue@128]!
        VADDW       qRes5, qRes5, dDst5
        VLD1        dDst7, [pDst@64], dstStep
        VQMOVUN     dDst3, qRes3
        VLD1        qRes7, [pSrcResidue@128]!
        VADDW       qRes6, qRes6, dDst6
        VST1        dDst0, [pDstCopy@64], dstStep
        VQMOVUN     dDst4, qRes4
        VST1        dDst1, [pDstCopy@64], dstStep
        VADDW       qRes7, qRes7, dDst7
        VST1        dDst2, [pDstCopy@64], dstStep
        VQMOVUN     dDst5, qRes5
        VST1        dDst3, [pDstCopy@64], dstStep
        VQMOVUN     dDst6, qRes6
        VST1        dDst4, [pDstCopy@64], dstStep
        VQMOVUN     dDst7, qRes7
        VST1        dDst5, [pDstCopy@64], dstStep
        VST1        dDst6, [pDstCopy@64], dstStep
        VST1        dDst7, [pDstCopy@64], dstStep

pSrcResidueConditionEnd
        MOV     return, #OMX_Sts_NoErr

        M_END
    ENDIF ;// CortexA8
    END
;// ***************************************************************************
;// omxVCM4P2_MCReconBlock ends
;// ***************************************************************************