Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P2_MCReconBlock_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 ;// Description:
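     27 ;// Cortex-A8 (NEON) implementation of omxVCM4P2_MCReconBlock: builds an 8x8
     28 ;// prediction from pSrc (integer- or half-pixel, chosen by predictType and rndVal), optionally adds the residue at pSrcResidue, and writes the result to pDst.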
     29 
     30 ;// Include standard headers
     31     INCLUDE omxtypes_s.h
     32     INCLUDE armCOMM_s.h
     33 
     34 ;// Import symbols required from other files
     35 
     36     M_VARIANTS CortexA8
     37 
     38 ;// ***************************************************************************
     39 ;// ARM1136JS implementation
     40 ;// ***************************************************************************
     41 
     42 ;// ***************************************************************************
     43 ;// CortexA8 implementation
     44 ;// ***************************************************************************
     45     IF  CortexA8
     46 ;// ***************************************************************************
     47 ;// MACRO DEFINITIONS
     48 ;// ***************************************************************************
     49     ;// Description:
     50     ;// Handles the "IntegerPixel" predictType. No interpolation is needed, so
     51     ;// the block (8 rows of 8 bytes) is simply copied from pSrc to pDst. The
            ;// rounding flag makes no difference here, which is why both switch labels
            ;// (CaseIntegerPixel_Rnd0 and CaseIntegerPixel_Rnd1) sit on the same code.
     52     ;//
     53     ;// Syntax:
     54     ;// M_MCRECONBLOCK_IntegerPixel
     55     ;//
     56     ;// Inputs: None (no macro parameters; works on the pSrc, srcStep, pDst and
            ;//         dstStep register aliases)
     57     ;// Outputs: None (on exit pSrc has advanced by 8*srcStep, pDst by 8*dstStep,
            ;//          and dRow0-dRow7 hold the copied rows)
     58 
     59     MACRO
     60     M_MCRECONBLOCK_IntegerPixel
     61 CaseIntegerPixel_Rnd0
     62 CaseIntegerPixel_Rnd1
     63 
            ;// Load all eight source rows; each load post-increments pSrc by srcStep
     64     VLD1        dRow0, [pSrc], srcStep
     65     VLD1        dRow1, [pSrc], srcStep
     66     VLD1        dRow2, [pSrc], srcStep
     67     VLD1        dRow3, [pSrc], srcStep
     68     VLD1        dRow4, [pSrc], srcStep
     69     VLD1        dRow5, [pSrc], srcStep
     70     VLD1        dRow6, [pSrc], srcStep
     71     VLD1        dRow7, [pSrc], srcStep
     72 
            ;// Store the rows unchanged; @64 states that pDst is 64-bit aligned, and
            ;// each store post-increments pDst by dstStep
     73     VST1        dRow0, [pDst@64], dstStep
     74     VST1        dRow1, [pDst@64], dstStep
     75     VST1        dRow2, [pDst@64], dstStep
     76     VST1        dRow3, [pDst@64], dstStep
     77     VST1        dRow4, [pDst@64], dstStep
     78     VST1        dRow5, [pDst@64], dstStep
     79     VST1        dRow6, [pDst@64], dstStep
     80     VST1        dRow7, [pDst@64], dstStep
     81 
     82     B           SwitchPredictTypeEnd                ;// skip the other predictType cases expanded after this one
     83     MEND
     84 ;// ***************************************************************************
     85     ;// Description:
     86     ;// Does interpolation for the case of "HalfPixelX" predictType. Each output
     87     ;// pixel is the average of a source pixel and the pixel to its right. The two
     88     ;// rounding cases are selected at assembly time by the parameter "$rndVal",
     89     ;// which switches the opcode between "VRHADD" ((a + b + 1) >> 1) and
     90     ;// "VHADD" ((a + b) >> 1), so the macro is expanded once per rounding case.
     91     ;//
     92     ;// Syntax:
     93     ;// M_MCRECONBLOCK_HalfPixelX $rndVal
     94     ;//
     95     ;// Inputs:
     96     ;//     $rndVal: 0 selects the rounding average (VRHADD), 1 the truncating average (VHADD)
     97     ;// Outputs: None
            ;// NOTE(review): each row load fetches 16 bytes from pSrc although only the
            ;// first 9 are used (8 pixels plus the right neighbour of the last one).
     98 
     99     MACRO
    100     M_MCRECONBLOCK_HalfPixelX $rndVal
    101 
            ;// M_VHADDR is a macro-local string holding the averaging mnemonic to emit
    102     LCLS M_VHADDR
    103     IF $rndVal = 0
    104 M_VHADDR SETS "VRHADD"
    105     ELSE
    106 M_VHADDR SETS "VHADD"
    107     ENDIF
    108 
    109 CaseHalfPixelX_Rnd$rndVal
    110 
            ;// For each row: load 16 bytes into the (dRowN, dRowNShft) pair, then use
            ;// VEXT #1 to turn dRowNShft into the row shifted left by one pixel
            ;// (bytes 1..8 of the 16 loaded). pSrc post-increments by srcStep.
    111     VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
    112     VEXT        dRow0Shft, dRow0, dRow0Shft, #1
    113     VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
    114     VEXT        dRow1Shft, dRow1, dRow1Shft, #1
    115     VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
    116     VEXT        dRow2Shft, dRow2, dRow2Shft, #1
    117     VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
    118     VEXT        dRow3Shft, dRow3, dRow3Shft, #1
    119     VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
    120     VEXT        dRow4Shft, dRow4, dRow4Shft, #1
    121     VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
    122     VEXT        dRow5Shft, dRow5, dRow5Shft, #1
    123     VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
    124     VEXT        dRow6Shft, dRow6, dRow6Shft, #1
    125     VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
    126     VEXT        dRow7Shft, dRow7, dRow7Shft, #1
            ;// Average every row with its shifted copy. The store of row N is issued
            ;// after the average of row N+1 (interleaving presumably chosen to hide
            ;// NEON latency on Cortex-A8 - not verifiable from this file alone).
    127     $M_VHADDR   dRow0, dRow0, dRow0Shft
    128     $M_VHADDR   dRow1, dRow1, dRow1Shft
    129     VST1        dRow0, [pDst@64], dstStep
    130     $M_VHADDR   dRow2, dRow2, dRow2Shft
    131     VST1        dRow1, [pDst@64], dstStep
    132     $M_VHADDR   dRow3, dRow3, dRow3Shft
    133     VST1        dRow2, [pDst@64], dstStep
    134     $M_VHADDR   dRow4, dRow4, dRow4Shft
    135     VST1        dRow3, [pDst@64], dstStep
    136     $M_VHADDR   dRow5, dRow5, dRow5Shft
    137     VST1        dRow4, [pDst@64], dstStep
    138     $M_VHADDR   dRow6, dRow6, dRow6Shft
    139     VST1        dRow5, [pDst@64], dstStep
    140     $M_VHADDR   dRow7, dRow7, dRow7Shft
    141     VST1        dRow6, [pDst@64], dstStep
    142     VST1        dRow7, [pDst@64], dstStep
    143 
    144     B           SwitchPredictTypeEnd                ;// skip the remaining predictType cases
    145     MEND
    146 ;// ***************************************************************************
    147     ;// Description:
    148     ;// Does interpolation for the case of "HalfPixelY" predictType. Each output
    149     ;// pixel is the average of a source pixel and the pixel below it, so nine
    150     ;// source rows are read to produce eight output rows. The two rounding cases
    151     ;// are selected at assembly time by the parameter "$rndVal", which switches
    152     ;// the opcode between "VRHADD" ((a + b + 1) >> 1) and "VHADD" ((a + b) >> 1).
    153     ;//
    154     ;// Syntax:
    155     ;// M_MCRECONBLOCK_HalfPixelY $rndVal
    156     ;//
    157     ;// Inputs:
    158     ;//     $rndVal: 0 selects the rounding average (VRHADD), 1 the truncating average (VHADD)
    159     ;// Outputs: None
    160 
    161     MACRO
    162     M_MCRECONBLOCK_HalfPixelY $rndVal
    163 
            ;// M_VHADDR is a macro-local string holding the averaging mnemonic to emit
    164     LCLS M_VHADDR
    165     IF $rndVal = 0
    166 M_VHADDR SETS "VRHADD"
    167     ELSE
    168 M_VHADDR SETS "VHADD"
    169     ENDIF
    170 
    171 CaseHalfPixelY_Rnd$rndVal
            ;// Load source rows 0..7 (8 bytes each); pSrc post-increments by srcStep
    172     VLD1        dRow0, [pSrc], srcStep
    173     VLD1        dRow1, [pSrc], srcStep
    174     VLD1        dRow2, [pSrc], srcStep
    175     VLD1        dRow3, [pSrc], srcStep
    176     VLD1        dRow4, [pSrc], srcStep
    177     VLD1        dRow5, [pSrc], srcStep
    178     VLD1        dRow6, [pSrc], srcStep
    179     VLD1        dRow7, [pSrc], srcStep
            ;// Output row N = average(row N, row N+1). Row N is overwritten in place,
            ;// which is safe because row N-1 was already averaged before row N changes.
    180     $M_VHADDR   dRow0, dRow0, dRow1
    181     VLD1        dRow8, [pSrc], srcStep              ;// ninth row: lower neighbour of row 7
    182     $M_VHADDR   dRow1, dRow1, dRow2
    183     VST1        dRow0, [pDst@64], dstStep
    184     $M_VHADDR   dRow2, dRow2, dRow3
    185     VST1        dRow1, [pDst@64], dstStep
    186     $M_VHADDR   dRow3, dRow3, dRow4
    187     VST1        dRow2, [pDst@64], dstStep
    188     $M_VHADDR   dRow4, dRow4, dRow5
    189     VST1        dRow3, [pDst@64], dstStep
    190     $M_VHADDR   dRow5, dRow5, dRow6
    191     VST1        dRow4, [pDst@64], dstStep
    192     $M_VHADDR   dRow6, dRow6, dRow7
    193     VST1        dRow5, [pDst@64], dstStep
    194     $M_VHADDR   dRow7, dRow7, dRow8
    195     VST1        dRow6, [pDst@64], dstStep
    196     VST1        dRow7, [pDst@64], dstStep
    197 
    198     B           SwitchPredictTypeEnd                ;// skip the remaining predictType cases
    199     MEND
    200 ;// ***************************************************************************
    201     ;// Description:
    202     ;// Does interpolation for the case of "HalfPixelXY" predictType. Both
    203     ;// rounding cases are handled by the same code; only the value in qRound differs.
    204     ;// Typical computation for a row goes like this:
    205     ;//     1. VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep ;// Load the row and next 8 bytes
    206     ;//     2. VEXT        dRow0Shft, dRow0, dRow0Shft, #1     ;// Generate the shifted row
    207     ;//     3. VADDL       qSum0, dRow0, dRow0Shft             ;// Generate the sum of row and shifted row
    208     ;//     5. VADD        qSum0, qSum0, qSum1                 ;// Add to the sum of next row (odd row sum has rounding value added to it)
    209     ;//     6. VSHRN       dRow0, qSum0, #2                    ;// Divide by 4
    210     ;//     7. VST1        dRow0, [pDst@64], dstStep           ;// Store
    211     ;// Odd rows additionally undergo the following step between steps 3 and 5:
    212     ;//     4. VADD        qSum1, qSum1, qRound
    213     ;// Every output row is the sum of one even and one odd row sum, so adding the
    214     ;// rounding value to the four odd row sums covers all eight outputs (saves 4 instructions).
    215     ;// qSum6, qSum7 & qSum8 reuse the registers of dRow0..dRow2 and their Shft partners. The
    216     ;// instruction order takes care of this and is also meant to minimize stalls. The rounding
    217     ;// value was prepared in ARM register rndVal (originally the rounding flag) before the switch.
    218     ;// It is then populated into all lanes in this macro. No branching out to
    219     ;// label "SwitchPredictTypeEnd" is required in the end of the macro as these
    220     ;// are the last of switch cases.
    221     ;//
    222     ;// Syntax:
    223     ;// M_MCRECONBLOCK_HalfPixelXY
    224     ;//
    225     ;// Inputs: None
    226     ;// Outputs: None
    227 
    228     MACRO
    229     M_MCRECONBLOCK_HalfPixelXY
    230 
    231 CaseHalfPixelXY_Rnd0
    232 CaseHalfPixelXY_Rnd1
    233     VLD1        {dRow0, dRow0Shft}, [pSrc], srcStep
    234     VDUP        qRound, rndVal                      ;// broadcast the rounding constant held in rndVal to all 16-bit lanes
    235     VLD1        {dRow1, dRow1Shft}, [pSrc], srcStep
    236     VEXT        dRow0Shft, dRow0, dRow0Shft, #1
    237     VLD1        {dRow2, dRow2Shft}, [pSrc], srcStep
    238     VEXT        dRow1Shft, dRow1, dRow1Shft, #1
    239     VLD1        {dRow3, dRow3Shft}, [pSrc], srcStep
    240     VEXT        dRow2Shft, dRow2, dRow2Shft, #1
    241     VLD1        {dRow4, dRow4Shft}, [pSrc], srcStep
    242     VADDL       qSum0, dRow0, dRow0Shft             ;// widen to 16 bit: row0 + row0 shifted
    243     VLD1        {dRow5, dRow5Shft}, [pSrc], srcStep
    244     VADDL       qSum1, dRow1, dRow1Shft
    245     VLD1        {dRow6, dRow6Shft}, [pSrc], srcStep
    246     VEXT        dRow3Shft, dRow3, dRow3Shft, #1
    247     VLD1        {dRow7, dRow7Shft}, [pSrc], srcStep
    248     VEXT        dRow4Shft, dRow4, dRow4Shft, #1
    249     VLD1        {dRow8, dRow8Shft}, [pSrc], srcStep ;// ninth row: lower neighbour of row 7
    250     VADD        qSum1, qSum1, qRound                ;// odd row sum carries the rounding value
    251     VADDL       qSum2, dRow2, dRow2Shft
    252     VEXT        dRow5Shft, dRow5, dRow5Shft, #1
    253     VADD        qSum0, qSum0, qSum1                 ;// output row 0 = sum0 + sum1 (+round)
    254     VADDL       qSum3, dRow3, dRow3Shft
    255     VEXT        dRow6Shft, dRow6, dRow6Shft, #1
    256     VADD        qSum1, qSum1, qSum2                 ;// output row 1 = sum1 (+round) + sum2
    257     VSHRN       dRow0, qSum0, #2                    ;// divide by 4 and narrow back to 8 bit
    258     VADDL       qSum4, dRow4, dRow4Shft
    259     VSHRN       dRow1, qSum1, #2
    260     VADD        qSum3, qSum3, qRound
    261     VADDL       qSum5, dRow5, dRow5Shft
    262     VST1        dRow0, [pDst@64], dstStep
    263     VEXT        dRow7Shft, dRow7, dRow7Shft, #1
    264     VST1        dRow1, [pDst@64], dstStep
    265     VEXT        dRow8Shft, dRow8, dRow8Shft, #1
    266     VADD        qSum5, qSum5, qRound
    267     VADD        qSum2, qSum2, qSum3
    268     VADD        qSum3, qSum3, qSum4
    269     VADD        qSum4, qSum4, qSum5
    270     VSHRN       dRow2, qSum2, #2
    271     VSHRN       dRow3, qSum3, #2
    272     VSHRN       dRow4, qSum4, #2
            ;// qSum6/qSum7/qSum8 overlay dRow0/dRow1/dRow2 (and their Shft partners):
            ;// each may only be written after the corresponding output row is stored.
    273     VADDL       qSum6, dRow6, dRow6Shft             ;// overwrites dRow0 pair (row 0 already stored)
    274     VADDL       qSum7, dRow7, dRow7Shft             ;// overwrites dRow1 pair (row 1 already stored)
    275     VST1        dRow2, [pDst@64], dstStep           ;// must precede the write to qSum8 below
    276     VADDL       qSum8, dRow8, dRow8Shft             ;// overwrites dRow2 pair
    277     VADD        qSum7, qSum7, qRound
    278     VST1        dRow3, [pDst@64], dstStep
    279     VST1        dRow4, [pDst@64], dstStep
    280     VADD        qSum5, qSum5, qSum6
    281     VADD        qSum6, qSum6, qSum7
    282     VADD        qSum7, qSum7, qSum8
    283     VSHRN       dRow5, qSum5, #2
    284     VSHRN       dRow6, qSum6, #2
    285     VSHRN       dRow7, qSum7, #2
    286     VST1        dRow5, [pDst@64], dstStep
    287     VST1        dRow6, [pDst@64], dstStep
    288     VST1        dRow7, [pDst@64], dstStep
    289 
            ;// No branch here: execution falls through to SwitchPredictTypeEnd
    290     MEND
    291 ;// ***************************************************************************
    292 
    293 ;// Input/Output Registers
        ;// dstStep, predictType and rndVal are loaded from stack arguments at function entry.
    294 pSrc                  RN 0
    295 srcStep               RN 1
    296 pSrcResidue           RN 2
    297 pDst                  RN 3
    298 dstStep               RN 4
    299 predictType           RN 5
    300 rndVal                RN 6
    301 
    302 ;// Local Scratch Registers
        ;// Both share r0 with pSrc; they are only used once pSrc is no longer needed.
    303 pDstCopy              RN 0
    304 return                RN 0
    305 
    306 ;// Neon Registers
        ;// Source rows as 8 x U8. dRowN and dRowNShft are adjacent D registers so a
        ;// single two-register VLD1 can fill both.
    307 dRow0                 DN D0.U8
    308 dRow0Shft             DN D1.U8
    309 dRow1                 DN D2.U8
    310 dRow1Shft             DN D3.U8
    311 dRow2                 DN D4.U8
    312 dRow2Shft             DN D5.U8
    313 dRow3                 DN D6.U8
    314 dRow3Shft             DN D7.U8
    315 dRow4                 DN D8.U8
    316 dRow4Shft             DN D9.U8
    317 dRow5                 DN D10.U8
    318 dRow5Shft             DN D11.U8
    319 dRow6                 DN D12.U8
    320 dRow6Shft             DN D13.U8
    321 dRow7                 DN D14.U8
    322 dRow7Shft             DN D15.U8
    323 dRow8                 DN D16.U8
    324 dRow8Shft             DN D17.U8
    325 
    326 
        ;// 16-bit row sums for the HalfPixelXY case. qSum6..qSum8 (Q0..Q2) occupy the
        ;// same physical registers as D0..D5, i.e. dRow0..dRow2 and their Shft partners.
    327 qSum0                 QN Q9.U16
    328 qSum1                 QN Q10.U16
    329 qSum2                 QN Q11.U16
    330 qSum3                 QN Q12.U16
    331 qSum4                 QN Q13.U16
    332 qSum5                 QN Q14.U16
    333 qSum6                 QN Q0.U16
    334 qSum7                 QN Q1.U16
    335 qSum8                 QN Q2.U16
    336 
        ;// Rounding constant replicated in every 16-bit lane (HalfPixelXY case)
    337 qRound                QN Q15.U16
    338 
        ;// Predicted rows re-read from pDst in the residue pass; these reuse D0..D7
    339 dDst0                 DN D0.U8
    340 dDst1                 DN D1.U8
    341 dDst2                 DN D2.U8
    342 dDst3                 DN D3.U8
    343 dDst4                 DN D4.U8
    344 dDst5                 DN D5.U8
    345 dDst6                 DN D6.U8
    346 dDst7                 DN D7.U8
    347 
        ;// Residue rows as 8 x S16 (Q4..Q11 = D8..D23, no overlap with dDst0..dDst7).
        ;// qRes5..qRes7 reuse the registers of qSum0..qSum2.
    348 qRes0                 QN Q4.S16
    349 qRes1                 QN Q5.S16
    350 qRes2                 QN Q6.S16
    351 qRes3                 QN Q7.S16
    352 qRes4                 QN Q8.S16
    353 qRes5                 QN Q9.S16
    354 qRes6                 QN Q10.S16
    355 qRes7                 QN Q11.S16
    356 
    357     ;// Function header
            ;// omxVCM4P2_MCReconBlock: interpolates an 8x8 block from pSrc into pDst
            ;// according to predictType and rndVal, then, if pSrcResidue is not NULL,
            ;// adds the 16-bit residue to the result with saturation to 8 bits.
            ;// pSrc, srcStep, pSrcResidue and pDst arrive in r0-r3; the remaining three
            ;// arguments are read from the stack below. Always returns OMX_Sts_NoErr;
            ;// no argument checking is done in this routine.
            ;// NOTE(review): "r6, d15" presumably tells M_START (armCOMM_s.h) the highest
            ;// core and NEON registers to preserve - confirm against the macro definition.
    358     M_START     omxVCM4P2_MCReconBlock, r6, d15
    359     ;// Define stack arguments
    360     M_ARG       Arg_dstStep,        4
    361     M_ARG       Arg_predictType,    4
    362     M_ARG       Arg_rndVal,         4
    363     ;// Load argument from the stack
    364     M_LDR       dstStep, Arg_dstStep
    365     M_LDR       predictType, Arg_predictType
    366     M_LDR       rndVal, Arg_rndVal
    367     ADD         predictType, rndVal, predictType, LSL #1    ;// switch index = predictType*2 + rndVal
    368     RSB         rndVal, rndVal, #2              ;// preparing rndVal for HalfPixelXY: rounding constant = 2 - rndVal
    369 
    370     ;// The following is implementation of switching to different code segments
    371     ;// based on different predictType and rndVal flags. The corresponding
    372     ;// labels (e.g. CaseIntegerPixel_Rnd0) are embedded in the macros following
    373     ;// M_ENDSWITCH (e.g. M_MCRECONBLOCK_IntegerPixel). While "M_MCRECONBLOCK_IntegerPixel"
    374     ;// and "M_MCRECONBLOCK_HalfPixelXY" handle both rounding cases in one code path,
    375     ;// "M_MCRECONBLOCK_HalfPixelX" and "M_MCRECONBLOCK_HalfPixelY" macros handle
    376     ;// the two rounding cases in separate expansions (one per rounding value).
    377     ;// All these together implement the interpolation functionality
            ;// The case order must match the index computed above: two entries
            ;// (Rnd0, Rnd1) per predictType.
    378 
    379     M_SWITCH    predictType
    380         M_CASE      CaseIntegerPixel_Rnd0
    381         M_CASE      CaseIntegerPixel_Rnd1
    382         M_CASE      CaseHalfPixelX_Rnd0
    383         M_CASE      CaseHalfPixelX_Rnd1
    384         M_CASE      CaseHalfPixelY_Rnd0
    385         M_CASE      CaseHalfPixelY_Rnd1
    386         M_CASE      CaseHalfPixelXY_Rnd0
    387         M_CASE      CaseHalfPixelXY_Rnd1
    388     M_ENDSWITCH
    389 
            ;// Case bodies. Every case except the last ends with a branch to
            ;// SwitchPredictTypeEnd; HalfPixelXY falls through to it.
    390     M_MCRECONBLOCK_IntegerPixel
    391     M_MCRECONBLOCK_HalfPixelX 0
    392     M_MCRECONBLOCK_HalfPixelX 1
    393     M_MCRECONBLOCK_HalfPixelY 0
    394     M_MCRECONBLOCK_HalfPixelY 1
    395     M_MCRECONBLOCK_HalfPixelXY
    396 SwitchPredictTypeEnd
    397 
    398     ;// After interpolation is done, residue needs to be added. This is done
    399     ;// only in case "pSrcResidue" parameter to the function is not NULL.
    400     ;// Following is completely unrolled code to do so. For each row the predicted
    401     ;// pixels are read back from pDst, the 8 signed 16-bit residues are loaded and
    402     ;// added (VADDW), and the sum is saturated to unsigned 8 bit (VQMOVUN) and stored.
    403 
    404     CMP         pSrcResidue, #0
    405     SUBNE       pDst, pDst, dstStep, LSL #3     ;// Restoring pDst (interpolation advanced it by 8*dstStep)
    406     MOVNE       pDstCopy, pDst                  ;// second pointer for the write-back; pDst is used for the reads
    407     BEQ         pSrcResidueConditionEnd         ;// NULL residue: the interpolated block is the final result
    408 pSrcResidueNotNull
    409     VLD1        dDst0, [pDst@64], dstStep       ;// predicted row (8 x U8)
    410     VLD1        qRes0, [pSrcResidue@128]!       ;// residue row (8 x S16); pointer advances past the row
    411     VLD1        dDst1, [pDst@64], dstStep
    412     VLD1        qRes1, [pSrcResidue@128]!
    413     VLD1        dDst2, [pDst@64], dstStep
    414     VLD1        qRes2, [pSrcResidue@128]!
    415     VADDW       qRes0, qRes0, dDst0             ;// residue + widened prediction
    416     VLD1        dDst3, [pDst@64], dstStep
    417     VADDW       qRes1, qRes1, dDst1
    418     VLD1        qRes3, [pSrcResidue@128]!
    419     VADDW       qRes2, qRes2, dDst2
    420     VLD1        dDst4, [pDst@64], dstStep
    421     VQMOVUN     dDst0, qRes0                    ;// saturate to 0..255 and narrow to 8 bit
    422     VLD1        qRes4, [pSrcResidue@128]!
    423     VADDW       qRes3, qRes3, dDst3
    424     VLD1        dDst5, [pDst@64], dstStep
    425     VQMOVUN     dDst1, qRes1
    426     VLD1        qRes5, [pSrcResidue@128]!
    427     VADDW       qRes4, qRes4, dDst4
    428     VLD1        dDst6, [pDst@64], dstStep
    429     VQMOVUN     dDst2, qRes2
    430     VLD1        qRes6, [pSrcResidue@128]!
    431     VADDW       qRes5, qRes5, dDst5
    432     VLD1        dDst7, [pDst@64], dstStep
    433     VQMOVUN     dDst3, qRes3
    434     VLD1        qRes7, [pSrcResidue@128]!
    435     VADDW       qRes6, qRes6, dDst6
    436     VST1        dDst0, [pDstCopy@64], dstStep   ;// stores begin only after all eight rows have been read
    437     VQMOVUN     dDst4, qRes4
    438     VST1        dDst1, [pDstCopy@64], dstStep
    439     VADDW       qRes7, qRes7, dDst7
    440     VST1        dDst2, [pDstCopy@64], dstStep
    441     VQMOVUN     dDst5, qRes5
    442     VST1        dDst3, [pDstCopy@64], dstStep
    443     VQMOVUN     dDst6, qRes6
    444     VST1        dDst4, [pDstCopy@64], dstStep
    445     VQMOVUN     dDst7, qRes7
    446     VST1        dDst5, [pDstCopy@64], dstStep
    447     VST1        dDst6, [pDstCopy@64], dstStep
    448     VST1        dDst7, [pDstCopy@64], dstStep
    449 
    450 pSrcResidueConditionEnd
    451     MOV         return, #OMX_Sts_NoErr          ;// unconditional success status in r0
    452 
    453     M_END
    454     ENDIF ;// CortexA8
    455     END
    456 ;// ***************************************************************************
    457 ;// omxVCM4P2_MCReconBlock ends
    458 ;// ***************************************************************************
    459