Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P2_MCReconBlock_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   12290
      6 ;// Date:       Wednesday, April 9, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 ;// Description:
     13 ;//
     14 ;//
     15 
     16 ;// Include standard headers
     17     INCLUDE omxtypes_s.h
     18     INCLUDE armCOMM_s.h
     19 
     20 ;// Import symbols required from other files
     21 
     22     M_VARIANTS CortexA8
     23 
     24 ;// ***************************************************************************
     25 ;// ARM1136JS implementation (none in this file; M_VARIANTS above lists only CortexA8)
     26 ;// ***************************************************************************
     27 
     28 ;// ***************************************************************************
     29 ;// CortexA8 implementation
     30 ;// ***************************************************************************
     31     IF  CortexA8
     32 ;// ***************************************************************************
     33 ;// MACRO DEFINITIONS
     34 ;// ***************************************************************************
     35     ;// Description:
     36     ;// Handles the "IntegerPixel" predictType for both rounding cases: nothing
     37     ;// is averaged, so it just copies 8 rows of 8 bytes from pSrc to pDst.
     38     ;//
     39     ;// Syntax:
     40     ;// M_MCRECONBLOCK_IntegerPixel
     41     ;//
     42     ;// Inputs: None (no macro parameters; uses pSrc, srcStep, pDst, dstStep)
     43     ;// Outputs: None (pSrc and pDst are left advanced by 8 rows)
     44 
     45     MACRO
     46     M_MCRECONBLOCK_IntegerPixel
     47 CaseIntegerPixel_Rnd0
     48 CaseIntegerPixel_Rnd1
     49     ;// Both case labels mark this same code. Load the 8 source rows first.
     50     VLD1        dRow0, [pSrc], srcStep
     51     VLD1        dRow1, [pSrc], srcStep
     52     VLD1        dRow2, [pSrc], srcStep
     53     VLD1        dRow3, [pSrc], srcStep
     54     VLD1        dRow4, [pSrc], srcStep
     55     VLD1        dRow5, [pSrc], srcStep
     56     VLD1        dRow6, [pSrc], srcStep
     57     VLD1        dRow7, [pSrc], srcStep
     58     ;// Store the rows; @64 qualifies the pDst address as 64-bit aligned.
     59     VST1        dRow0, [pDst@64], dstStep
     60     VST1        dRow1, [pDst@64], dstStep
     61     VST1        dRow2, [pDst@64], dstStep
     62     VST1        dRow3, [pDst@64], dstStep
     63     VST1        dRow4, [pDst@64], dstStep
     64     VST1        dRow5, [pDst@64], dstStep
     65     VST1        dRow6, [pDst@64], dstStep
     66     VST1        dRow7, [pDst@64], dstStep
     67 
     68     B           SwitchPredictTypeEnd    ;// Continue with the residue stage
     69     MEND
     70 ;// ***************************************************************************
     71     ;// Description:
     72     ;// Does interpolation for the "HalfPixelX" predictType. Each expansion
     73     ;// handles one rounding case, chosen by the parameter "$rndVal". Every
     74     ;// output pixel is the average of a pixel and the pixel to its right.
     75     ;// Rounding is selected at assembly time by switching the opcode between
     76     ;// "VRHADD" and "VHADD". Each row is loaded as 16 bytes; 9 of them are used.
     77     ;//
     78     ;// Syntax:
     79     ;// M_MCRECONBLOCK_HalfPixelX $rndVal
     80     ;//
     81     ;// Inputs:
     82     ;//     $rndVal: 0 selects VRHADD (rounding), 1 selects VHADD (no rounding)
     83     ;// Outputs: None (pSrc and pDst are left advanced by 8 rows)
     84 
     85     MACRO
     86     M_MCRECONBLOCK_HalfPixelX $rndVal
     87     ;// M_VHADDR is a local string variable holding the averaging mnemonic
     88     LCLS M_VHADDR
     89     IF $rndVal = 0
     90 M_VHADDR SETS "VRHADD"
     91     ELSE
     92 M_VHADDR SETS "VHADD"
     93     ENDIF
     94 
     95 CaseHalfPixelX_Rnd$rndVal
     96     ;// Load 16 bytes per row and build the copy shifted by one pixel
     97     VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep    ;// Row 0 and the 8 bytes after it
     98     VEXT        dRow0Shft, dRow0, dRow0Shft, #1        ;// dRow0Shft = bytes 1..8 of the row
     99     VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
    100     VEXT        dRow1Shft, dRow1, dRow1Shft, #1
    101     VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
    102     VEXT        dRow2Shft, dRow2, dRow2Shft, #1
    103     VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
    104     VEXT        dRow3Shft, dRow3, dRow3Shft, #1
    105     VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
    106     VEXT        dRow4Shft, dRow4, dRow4Shft, #1
    107     VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
    108     VEXT        dRow5Shft, dRow5, dRow5Shft, #1
    109     VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
    110     VEXT        dRow6Shft, dRow6, dRow6Shft, #1
    111     VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
    112     VEXT        dRow7Shft, dRow7, dRow7Shft, #1
    113     $M_VHADDR   dRow0, dRow0, dRow0Shft                ;// Average row with its shifted copy
    114     $M_VHADDR   dRow1, dRow1, dRow1Shft
    115     VST1        dRow0, [pDst@64], dstStep              ;// Each store trails its average by one instruction
    116     $M_VHADDR   dRow2, dRow2, dRow2Shft
    117     VST1        dRow1, [pDst@64], dstStep
    118     $M_VHADDR   dRow3, dRow3, dRow3Shft
    119     VST1        dRow2, [pDst@64], dstStep
    120     $M_VHADDR   dRow4, dRow4, dRow4Shft
    121     VST1        dRow3, [pDst@64], dstStep
    122     $M_VHADDR   dRow5, dRow5, dRow5Shft
    123     VST1        dRow4, [pDst@64], dstStep
    124     $M_VHADDR   dRow6, dRow6, dRow6Shft
    125     VST1        dRow5, [pDst@64], dstStep
    126     $M_VHADDR   dRow7, dRow7, dRow7Shft
    127     VST1        dRow6, [pDst@64], dstStep
    128     VST1        dRow7, [pDst@64], dstStep
    129 
    130     B           SwitchPredictTypeEnd    ;// Continue with the residue stage
    131     MEND
    132 ;// ***************************************************************************
    133     ;// Description:
    134     ;// Does interpolation for the "HalfPixelY" predictType. Each expansion
    135     ;// handles one rounding case, chosen by the parameter "$rndVal". Every
    136     ;// output pixel is the average of a pixel and the pixel below it, so nine
    137     ;// source rows are loaded to produce eight output rows. Rounding is selected
    138     ;// at assembly time by switching the opcode between "VRHADD" and "VHADD".
    139     ;//
    140     ;// Syntax:
    141     ;// M_MCRECONBLOCK_HalfPixelY $rndVal
    142     ;//
    143     ;// Inputs:
    144     ;//     $rndVal: 0 selects VRHADD (rounding), 1 selects VHADD (no rounding)
    145     ;// Outputs: None (pSrc is left advanced by 9 rows, pDst by 8 rows)
    146 
    147     MACRO
    148     M_MCRECONBLOCK_HalfPixelY $rndVal
    149     ;// M_VHADDR is a local string variable holding the averaging mnemonic
    150     LCLS M_VHADDR
    151     IF $rndVal = 0
    152 M_VHADDR SETS "VRHADD"
    153     ELSE
    154 M_VHADDR SETS "VHADD"
    155     ENDIF
    156 
    157 CaseHalfPixelY_Rnd$rndVal
    158     VLD1        dRow0, [pSrc], srcStep
    159     VLD1        dRow1, [pSrc], srcStep
    160     VLD1        dRow2, [pSrc], srcStep
    161     VLD1        dRow3, [pSrc], srcStep
    162     VLD1        dRow4, [pSrc], srcStep
    163     VLD1        dRow5, [pSrc], srcStep
    164     VLD1        dRow6, [pSrc], srcStep
    165     VLD1        dRow7, [pSrc], srcStep
    166     $M_VHADDR   dRow0, dRow0, dRow1         ;// Output row i = average of rows i and i+1
    167     VLD1        dRow8, [pSrc], srcStep      ;// Ninth row, needed only for output row 7
    168     $M_VHADDR   dRow1, dRow1, dRow2
    169     VST1        dRow0, [pDst@64], dstStep
    170     $M_VHADDR   dRow2, dRow2, dRow3
    171     VST1        dRow1, [pDst@64], dstStep
    172     $M_VHADDR   dRow3, dRow3, dRow4
    173     VST1        dRow2, [pDst@64], dstStep
    174     $M_VHADDR   dRow4, dRow4, dRow5
    175     VST1        dRow3, [pDst@64], dstStep
    176     $M_VHADDR   dRow5, dRow5, dRow6
    177     VST1        dRow4, [pDst@64], dstStep
    178     $M_VHADDR   dRow6, dRow6, dRow7
    179     VST1        dRow5, [pDst@64], dstStep
    180     $M_VHADDR   dRow7, dRow7, dRow8
    181     VST1        dRow6, [pDst@64], dstStep
    182     VST1        dRow7, [pDst@64], dstStep
    183 
    184     B           SwitchPredictTypeEnd    ;// Continue with the residue stage
    185     MEND
    186 ;// ***************************************************************************
    187     ;// Description:
    188     ;// Does interpolation for the case of "HalfPixelXY" predictType. Both
    189     ;// rounding cases are handled.
    190     ;// Typical computation for a row goes like this
    191     ;//     1. VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep ;// Load the row and next 8 bytes
    192     ;//     2. VEXT        dRow0Shft, dRow0, dRow0Shft, #1     ;// Generate the shifted row
    193     ;//     3. VADDL       qSum0, dRow0, dRow0Shft             ;// Generate the sum of row and shifted row
    194     ;//     5. VADD        qSum0, qSum0, qSum1                 ;// Add to the sum of next row (odd row sum has rounding value added to it)
    195     ;//     6. VSHRN       dRow0, qSum0, #2                    ;// Divide by 4
    196     ;//     7. VST1        dRow0, [pDst@64], dstStep           ;// Store
    197     ;// Odd rows undergo the following computation after step 3
    198     ;//     4. VADD        qSum1, qSum1, qRound
    199     ;// This avoids adding the rounding value to each final sum (overall saves 4
    200     ;// instructions).
    201     ;// There is reuse of registers for qSum6, qSum7 & qSum8. Overall scheduling takes
    202     ;// care of this and also minimizes stalls. Rounding value was modified in
    203     ;// ARM register rndVal (originally used for rounding flag) before the switch.
    204     ;// It is then populated into all lanes in this macro. No branching out to
    205     ;// label "SwitchPredictTypeEnd" is required in the end of the macro as these
    206     ;// are the last of switch cases.
    207     ;//
    208     ;// Syntax:
    209     ;// M_MCRECONBLOCK_HalfPixelXY
    210     ;//
    211     ;// Inputs: None (no macro parameters; register rndVal must hold the rounding value)
    212     ;// Outputs: None (pSrc is left advanced by 9 rows, pDst by 8 rows)
    213 
    214     MACRO
    215     M_MCRECONBLOCK_HalfPixelXY
    216 
    217 CaseHalfPixelXY_Rnd0
    218 CaseHalfPixelXY_Rnd1
    219     VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
    220     VDUP        qRound, rndVal                      ;// Rounding value into every 16-bit lane
    221     VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
    222     VEXT        dRow0Shft, dRow0, dRow0Shft, #1
    223     VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
    224     VEXT        dRow1Shft, dRow1, dRow1Shft, #1
    225     VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
    226     VEXT        dRow2Shft, dRow2, dRow2Shft, #1
    227     VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
    228     VADDL       qSum0, dRow0, dRow0Shft             ;// 16-bit sums of row 0 and shifted row 0
    229     VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
    230     VADDL       qSum1, dRow1, dRow1Shft
    231     VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
    232     VEXT        dRow3Shft, dRow3, dRow3Shft, #1
    233     VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
    234     VEXT        dRow4Shft, dRow4, dRow4Shft, #1
    235     VLD1        {dRow8, dRow8Shft}, [pSrc], srcStep ;// Ninth row, needed only for output row 7
    236     VADD        qSum1, qSum1, qRound                ;// Odd row sum carries the rounding value
    237     VADDL       qSum2, dRow2, dRow2Shft
    238     VEXT        dRow5Shft, dRow5, dRow5Shft, #1
    239     VADD        qSum0, qSum0, qSum1                 ;// Rows 0+1, rounding included via qSum1
    240     VADDL       qSum3, dRow3, dRow3Shft
    241     VEXT        dRow6Shft, dRow6, dRow6Shft, #1
    242     VADD        qSum1, qSum1, qSum2                 ;// Rows 1+2, rounding included via qSum1
    243     VSHRN       dRow0, qSum0, #2                    ;// Divide by 4 and narrow to bytes
    244     VADDL       qSum4, dRow4, dRow4Shft
    245     VSHRN       dRow1, qSum1, #2
    246     VADD        qSum3, qSum3, qRound
    247     VADDL       qSum5, dRow5, dRow5Shft
    248     VST1        dRow0, [pDst@64], dstStep
    249     VEXT        dRow7Shft, dRow7, dRow7Shft, #1
    250     VST1        dRow1, [pDst@64], dstStep
    251     VEXT        dRow8Shft, dRow8, dRow8Shft, #1
    252     VADD        qSum5, qSum5, qRound
    253     VADD        qSum2, qSum2, qSum3
    254     VADD        qSum3, qSum3, qSum4
    255     VADD        qSum4, qSum4, qSum5
    256     VSHRN       dRow2, qSum2, #2
    257     VSHRN       dRow3, qSum3, #2
    258     VSHRN       dRow4, qSum4, #2
    259     VADDL       qSum6, dRow6, dRow6Shft             ;// qSum6 reuses Q0 (D0:D1); dRow0 is already stored
    260     VADDL       qSum7, dRow7, dRow7Shft             ;// qSum7 reuses Q1 (D2:D3); dRow1 is already stored
    261     VST1        dRow2, [pDst@64], dstStep           ;// Must precede the qSum8 write below (same D4)
    262     VADDL       qSum8, dRow8, dRow8Shft             ;// qSum8 reuses Q2 (D4:D5)
    263     VADD        qSum7, qSum7, qRound
    264     VST1        dRow3, [pDst@64], dstStep
    265     VST1        dRow4, [pDst@64], dstStep
    266     VADD        qSum5, qSum5, qSum6
    267     VADD        qSum6, qSum6, qSum7
    268     VADD        qSum7, qSum7, qSum8
    269     VSHRN       dRow5, qSum5, #2
    270     VSHRN       dRow6, qSum6, #2
    271     VSHRN       dRow7, qSum7, #2
    272     VST1        dRow5, [pDst@64], dstStep
    273     VST1        dRow6, [pDst@64], dstStep
    274     VST1        dRow7, [pDst@64], dstStep
    275     ;// No branch here: execution falls through to whatever follows the expansion
    276     MEND
    277 ;// ***************************************************************************
    278 
    279 ;// Input/Output Registers (r4-r6 are loaded from stack arguments in the function body)
    280 pSrc                  RN 0
    281 srcStep               RN 1
    282 pSrcResidue           RN 2
    283 pDst                  RN 3
    284 dstStep               RN 4
    285 predictType           RN 5
    286 rndVal                RN 6
    287 
    288 ;// Local Scratch Registers (both share r0 with pSrc)
    289 pDstCopy              RN 0
    290 return                RN 0
    291 
    292 ;// Neon Registers: source rows and their one-pixel-shifted companions
    293 dRow0                 DN D0.U8
    294 dRow0Shft             DN D1.U8
    295 dRow1                 DN D2.U8
    296 dRow1Shft             DN D3.U8
    297 dRow2                 DN D4.U8
    298 dRow2Shft             DN D5.U8
    299 dRow3                 DN D6.U8
    300 dRow3Shft             DN D7.U8
    301 dRow4                 DN D8.U8
    302 dRow4Shft             DN D9.U8
    303 dRow5                 DN D10.U8
    304 dRow5Shft             DN D11.U8
    305 dRow6                 DN D12.U8
    306 dRow6Shft             DN D13.U8
    307 dRow7                 DN D14.U8
    308 dRow7Shft             DN D15.U8
    309 dRow8                 DN D16.U8
    310 dRow8Shft             DN D17.U8
    311 
    312     ;// 16-bit row sums used by the HalfPixelXY macro
    313 qSum0                 QN Q9.U16
    314 qSum1                 QN Q10.U16
    315 qSum2                 QN Q11.U16
    316 qSum3                 QN Q12.U16
    317 qSum4                 QN Q13.U16
    318 qSum5                 QN Q14.U16
    319 qSum6                 QN Q0.U16     ;// Q0 = D0:D1, shared with dRow0/dRow0Shft
    320 qSum7                 QN Q1.U16     ;// Q1 = D2:D3, shared with dRow1/dRow1Shft
    321 qSum8                 QN Q2.U16     ;// Q2 = D4:D5, shared with dRow2/dRow2Shft
    322 
    323 qRound                QN Q15.U16    ;// Rounding value, filled by VDUP from rndVal
    324     ;// Residue stage: destination rows read back from pDst (share D0-D7 with the row registers)
    325 dDst0                 DN D0.U8
    326 dDst1                 DN D1.U8
    327 dDst2                 DN D2.U8
    328 dDst3                 DN D3.U8
    329 dDst4                 DN D4.U8
    330 dDst5                 DN D5.U8
    331 dDst6                 DN D6.U8
    332 dDst7                 DN D7.U8
    333     ;// Residue stage: one row of signed 16-bit residues per register
    334 qRes0                 QN Q4.S16
    335 qRes1                 QN Q5.S16
    336 qRes2                 QN Q6.S16
    337 qRes3                 QN Q7.S16
    338 qRes4                 QN Q8.S16
    339 qRes5                 QN Q9.S16     ;// Q9-Q11 are also named qSum0-qSum2
    340 qRes6                 QN Q10.S16
    341 qRes7                 QN Q11.S16
    342 
    343     ;// Function header. r6 and d15 are presumably the highest ARM/NEON registers M_START preserves - confirm in armCOMM_s.h
    344     M_START     omxVCM4P2_MCReconBlock, r6, d15
    345     ;// Define stack arguments
    346     M_ARG       Arg_dstStep,        4
    347     M_ARG       Arg_predictType,    4
    348     M_ARG       Arg_rndVal,         4
    349     ;// Load the three stack arguments into dstStep, predictType and rndVal
    350     M_LDR       dstStep, Arg_dstStep
    351     M_LDR       predictType, Arg_predictType
    352     M_LDR       rndVal, Arg_rndVal
    353     ADD         predictType, rndVal, predictType, LSL #1    ;// Switch index = predictType*2 + rndVal
    354     RSB         rndVal, rndVal, #2              ;// preparing rndVal for HalfPixelXY: rndVal = 2 - rndVal
    355 
    356     ;// The following is implementation of switching to different code segments
    357     ;// based on different predictType and rndVal flags. The corresponding
    358     ;// labels (e.g. CaseIntegerPixel_Rnd0) are embedded in the macros following
    359     ;// M_ENDSWITCH (e.g. M_MCRECONBLOCK_IntegerPixel). While "M_MCRECONBLOCK_IntegerPixel"
    360     ;// and "M_MCRECONBLOCK_HalfPixelXY" handle for both rounding cases;
    361     ;// "M_MCRECONBLOCK_HalfPixelX" and "M_MCRECONBLOCK_HalfPixelY" macros handle
    362     ;// the two rounding cases in separate code bases.
    363     ;// All these together implement the interpolation functionality
    364     ;// The case order below must match the index computed above (predictType*2 + rndVal)
    365     M_SWITCH    predictType
    366         M_CASE      CaseIntegerPixel_Rnd0
    367         M_CASE      CaseIntegerPixel_Rnd1
    368         M_CASE      CaseHalfPixelX_Rnd0
    369         M_CASE      CaseHalfPixelX_Rnd1
    370         M_CASE      CaseHalfPixelY_Rnd0
    371         M_CASE      CaseHalfPixelY_Rnd1
    372         M_CASE      CaseHalfPixelXY_Rnd0
    373         M_CASE      CaseHalfPixelXY_Rnd1
    374     M_ENDSWITCH
    375     ;// Expand the case bodies. HalfPixelXY must stay last: it has no branch and falls through to the label below
    376     M_MCRECONBLOCK_IntegerPixel
    377     M_MCRECONBLOCK_HalfPixelX 0
    378     M_MCRECONBLOCK_HalfPixelX 1
    379     M_MCRECONBLOCK_HalfPixelY 0
    380     M_MCRECONBLOCK_HalfPixelY 1
    381     M_MCRECONBLOCK_HalfPixelXY
    382 SwitchPredictTypeEnd
    383 
    384     ;// After interpolation is done, residue needs to be added. This is done
    385     ;// only in case "pSrcResidue" parameter to the function is not NULL.
    386     ;// Following is a completely unrolled code to do so. Each row and
    387     ;// corresponding residue is loaded and residue is added and value
    388     ;// stored. The sum is saturated to unsigned 8 bits before it is stored.
    389 
    390     CMP         pSrcResidue, #0
    391     SUBNE       pDst, pDst, dstStep, LSL #3     ;// Restoring pDst: rewind the 8 rows just written
    392     MOVNE       pDstCopy, pDst                  ;// Separate write pointer in r0 (pSrc is no longer needed)
    393     BEQ         pSrcResidueConditionEnd         ;// NULL residue pointer: skip the residue stage
    394 pSrcResidueNotNull
    395     VLD1        dDst0, [pDst@64], dstStep       ;// Read back interpolated row 0
    396     VLD1        qRes0, [pSrcResidue@128]!       ;// 8 residues (16 bytes) for row 0, pointer written back
    397     VLD1        dDst1, [pDst@64], dstStep
    398     VLD1        qRes1, [pSrcResidue@128]!
    399     VLD1        dDst2, [pDst@64], dstStep
    400     VLD1        qRes2, [pSrcResidue@128]!
    401     VADDW       qRes0, qRes0, dDst0             ;// 16-bit residue + widened 8-bit pixel
    402     VLD1        dDst3, [pDst@64], dstStep
    403     VADDW       qRes1, qRes1, dDst1
    404     VLD1        qRes3, [pSrcResidue@128]!
    405     VADDW       qRes2, qRes2, dDst2
    406     VLD1        dDst4, [pDst@64], dstStep
    407     VQMOVUN     dDst0, qRes0                    ;// Saturate the signed sum to unsigned 8 bits
    408     VLD1        qRes4, [pSrcResidue@128]!
    409     VADDW       qRes3, qRes3, dDst3
    410     VLD1        dDst5, [pDst@64], dstStep
    411     VQMOVUN     dDst1, qRes1
    412     VLD1        qRes5, [pSrcResidue@128]!
    413     VADDW       qRes4, qRes4, dDst4
    414     VLD1        dDst6, [pDst@64], dstStep
    415     VQMOVUN     dDst2, qRes2
    416     VLD1        qRes6, [pSrcResidue@128]!
    417     VADDW       qRes5, qRes5, dDst5
    418     VLD1        dDst7, [pDst@64], dstStep
    419     VQMOVUN     dDst3, qRes3
    420     VLD1        qRes7, [pSrcResidue@128]!
    421     VADDW       qRes6, qRes6, dDst6
    422     VST1        dDst0, [pDstCopy@64], dstStep   ;// Write results back through the copy of pDst
    423     VQMOVUN     dDst4, qRes4
    424     VST1        dDst1, [pDstCopy@64], dstStep
    425     VADDW       qRes7, qRes7, dDst7
    426     VST1        dDst2, [pDstCopy@64], dstStep
    427     VQMOVUN     dDst5, qRes5
    428     VST1        dDst3, [pDstCopy@64], dstStep
    429     VQMOVUN     dDst6, qRes6
    430     VST1        dDst4, [pDstCopy@64], dstStep
    431     VQMOVUN     dDst7, qRes7
    432     VST1        dDst5, [pDstCopy@64], dstStep
    433     VST1        dDst6, [pDstCopy@64], dstStep
    434     VST1        dDst7, [pDstCopy@64], dstStep
    435 
    436 pSrcResidueConditionEnd
    437     MOV         return, #OMX_Sts_NoErr          ;// Every path through this function returns OMX_Sts_NoErr in r0
    438 
    439     M_END
    440     ENDIF ;// CortexA8
    441     END
    442 ;// ***************************************************************************
    443 ;// omxVCM4P2_MCReconBlock ends
    444 ;// ***************************************************************************
    445