Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27 
     28         INCLUDE omxtypes_s.h
     29         INCLUDE armCOMM_s.h
     30 
     31         EXPORT armVCM4P10_pIndexTable8x8
     32 
     33 ;// Define the processor variants supported by this file
     34 
     35          M_VARIANTS CortexA8
     36 
     37      AREA table, DATA
     38 ;//-------------------------------------------------------
     39 ;// This table implements a C-style switch statement in asm
     40 ;// by the method of two levels of indexing.
     41 ;//-------------------------------------------------------
        ;// Entry n is the address of the code label that handles predMode == n;
        ;// the function loads pc from entry [predMode] to dispatch.
     42 
     43     M_TABLE armVCM4P10_pIndexTable8x8
     44     DCD  OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR      ;// entries 0 and 1
     45     DCD  OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE    ;// entries 2 and 3
     46 
        ;// 16-bit multipliers read by the OMX_VC_CHROMA_PLANE case.
        ;// The first row is loaded on its own; the next two rows are loaded
        ;// together as one 8-lane vector [-3 .. 4].
     47     M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
     48     DCW   3, 2, 1,4     ;// weights for the edge differences summed into H and V
     49     DCW  -3,-2,-1,0     ;// factors multiplied with b and c, positions 0..3
     50     DCW   1, 2, 3,4     ;// factors multiplied with b and c, positions 4..7
     51 
     52 
     53 
     54     IF CortexA8
     55 
     56 ;//--------------------------------------------
     57 ;// Scratch variable
     58 ;//--------------------------------------------
     59 
     60 pc              RN 15   ;// written directly to dispatch on predMode
     61 return          RN 0    ;// status result; shares r0 with pSrcLeft
     62 pTable          RN 8    ;// address of armVCM4P10_pIndexTable8x8
     63 
     64 ;//--------------------------------------------
     65 ;// Input Arguments
     66 ;//--------------------------------------------
     67 pSrcLeft        RN 0    ;// input pointer
     68 pSrcAbove       RN 1    ;// input pointer
     69 pSrcAboveLeft   RN 2    ;// input pointer
     70 pDst            RN 3    ;// output pointer
     71 leftStep        RN 4    ;// input variable (loaded from stack argument LeftStep)
     72 dstStep         RN 5    ;// input variable (loaded from stack argument DstStep)
     73 predMode        RN 6    ;// input variable (loaded from stack argument PredMode)
     74 availability    RN 7    ;// input variable (loaded from stack argument Availability)
     75 pMultiplierTable    RN  2   ;// shares r2 with pSrcAboveLeft; set only after that pointer was read
     76 
     77 pTmp            RN 9    ;// second row pointer: base + one step (rows 1,3,5,7)
     78 step            RN 10   ;// twice leftStep or twice dstStep
     79 
     80 ;//---------------------
     81 ;// Neon Registers
     82 ;//---------------------
        ;// Names are grouped by prediction mode. Many names alias the same
        ;// D/Q register (Qn overlaps D(2n) and D(2n+1)), so the instruction
        ;// order in the function body must respect which value is live.
     83 
     84 ;// OMX_VC_CHROMA_HOR
        ;// One register per output row; same registers as dSum0..dSum7.
     85 
     86 dLeftVal0       DN  D0.8
     87 dLeftVal1       DN  D1.8
     88 dLeftVal2       DN  D2.8
     89 dLeftVal3       DN  D3.8
     90 dLeftVal4       DN  D4.8
     91 dLeftVal5       DN  D5.8
     92 dLeftVal6       DN  D6.8
     93 dLeftVal7       DN  D7.8
     94 
     95 ;// OMX_VC_CHROMA_VERT
     96 
     97 dAboveVal       DN  D0.U8
     98 
     99 ;// OMX_VC_CHROMA_DC
        ;// D1, D2 and D3 are each viewed with several element sizes below.
    100 
    101 dLeftVal        DN  D1.U8
    102 dSumAboveValU16 DN  D2.U16
    103 dSumAboveValU32 DN  D3.U32
    104 dSumAboveValU8  DN  D3.U8
    105 dSumLeftValU16  DN  D2.U16
    106 dSumLeftValU32  DN  D1.U32
    107 dSumLeftValU8   DN  D1.U8
    108 dSumAboveLeft   DN  D2.U32
    109 dSumAboveLeftU8 DN  D2.U8
    110 dIndexRow0U8    DN  D5.U8
    111 dIndexRow0      DN  D5.U64
    112 dIndexRow4U8    DN  D6.U8
    113 dIndexRow4      DN  D6.U64
    114 dDstRow0        DN  D0.U8
    115 dDstRow4        DN  D4.U8
    116 dConst128U8     DN  D0.U8   ;// same register as dDstRow0
    117 
    118 ;// OMX_VC_CHROMA_PLANE
    119 
    120 dRevAboveVal    DN  D3.U8
    121 dRevAboveValU64 DN  D3.U64
    122 dAboveLeftVal   DN  D2.U8
    123 qAbove7minus0   QN  Q3.S16
    124 qAboveDiff      QN  Q2.S16
    125 dIndex          DN  D8.U8   ;// NOTE(review): not referenced by the function below
    126 dDiffAboveU8    DN  D9.U8
    127 dDiffAboveS16   DN  D9.S16
    128 dAboveDiff0U8   DN  D4.U8   ;// low half of qAboveDiff
    129 dAboveDiff0U64  DN  D4.U64
    130 dAbove7minus0U8 DN  D6.U8   ;// low half of qAbove7minus0
    131 dMultiplier     DN  D10.S16
    132 dHorPred        DN  D11.S16
    133 dRevLeftVal     DN  D3.U8
    134 dRevLeftValU64  DN  D3.U64
    135 qLeft7minus0    QN  Q7.S16
    136 qLeftDiff       QN  Q6.S16
    137 dDiffLeftU8     DN  D16.U8
    138 dDiffLeftS16    DN  D16.S16
    139 dLeftDiff0U8    DN  D12.U8  ;// low half of qLeftDiff
    140 dLeftDiff0U64   DN  D12.U64
    141 dLeft7minus0U8  DN  D14.U8  ;// low half of qLeft7minus0
    142 dVerPred        DN  D3.S16
    143 dHVValS16       DN  D3.S16
    144 dHVValS32       DN  D3.S32
    145 dHVTempS32      DN  D2.S32
    146 qA              QN  Q0.S16
    147 qB              QN  Q2.S16
    148 qC              QN  Q3.S16
    149 qMultiplier     QN  Q5.S16
    150 dMultiplier0    DN  D10.S16 ;// low half of qMultiplier
    151 dMultiplier1    DN  D11.S16 ;// high half of qMultiplier
    152 qC0             QN  Q0.S16
    153 qC1             QN  Q1.S16
    154 qC2             QN  Q4.S16
    155 qC3             QN  Q5.S16
    156 qC4             QN  Q6.S16
    157 qC5             QN  Q7.S16
    158 qC6             QN  Q8.S16
    159 qC7             QN  Q9.S16
    160 qSum0           QN  Q0.S16
    161 qSum1           QN  Q1.S16
    162 qSum2           QN  Q4.S16
    163 qSum3           QN  Q5.S16
    164 qSum4           QN  Q6.S16
    165 qSum5           QN  Q7.S16
    166 qSum6           QN  Q8.S16
    167 qSum7           QN  Q9.S16
    168 dSum0           DN  D0.U8
    169 dSum1           DN  D1.U8
    170 dSum2           DN  D2.U8
    171 dSum3           DN  D3.U8
    172 dSum4           DN  D4.U8
    173 dSum5           DN  D5.U8
    174 dSum6           DN  D6.U8
    175 dSum7           DN  D7.U8
    176 
    177 ;//-----------------------------------------------------------------------------------------------
    178 ;// omxVCM4P10_PredictIntraChroma_8x8 starts
        ;//
        ;// Writes an 8x8 intra-predicted chroma block to pDst, built from the
        ;// neighbouring pixels according to predMode (DC, HOR, VERT or PLANE).
        ;//
        ;// In:   r0 = pSrcLeft       left neighbour column, one byte every leftStep bytes
        ;//       r1 = pSrcAbove      eight neighbour bytes of the row above
        ;//       r2 = pSrcAboveLeft  above-left neighbour byte (read by PLANE only)
        ;//       r3 = pDst           output block, rows dstStep bytes apart
        ;//       stack: leftStep, dstStep, predMode, availability (4 bytes each)
        ;// Out:  r0 = OMX_Sts_NoErr on every path through this routine
        ;//
        ;// NOTE(review): predMode is used as an index into armVCM4P10_pIndexTable8x8
        ;// without a range check, and availability is only examined by the DC
        ;// case - presumably the caller validates both; confirm.
    179 ;//-----------------------------------------------------------------------------------------------
    180 
    181         ;// Write function header
                ;// NOTE(review): "r10, d15" presumably tells M_START to preserve
                ;// registers up to r10 and d15 - confirm against armCOMM_s.h.
    182         M_START omxVCM4P10_PredictIntraChroma_8x8, r10, d15
    183 
    184         ;// Define stack arguments
    185         M_ARG    LeftStep,     4
    186         M_ARG    DstStep,      4
    187         M_ARG    PredMode,     4
    188         M_ARG    Availability, 4
    189 
    190         LDR      pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case
    191 
    192         ;// Load argument from the stack
    193         M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
    194         M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
    195         M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
    196         M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
    197 
    198 
    199         LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode
    200 
        ;// DC case: rows 0-3 and rows 4-7 each get one 8-byte pattern whose two
        ;// 4-byte halves are rounded means of the available neighbours
        ;// (0x80 everywhere when neither edge is available).
    201 OMX_VC_CHROMA_DC
    202 
    203         TST     availability, #OMX_VC_LEFT
    204         BEQ     DCChroma8x8LeftNotAvailable
    205 
                ;// Two interleaved pointers: pSrcLeft walks even rows, pTmp odd rows
    206         ADD     pTmp, pSrcLeft, leftStep
    207         ADD     step, leftStep, leftStep
    208 
    209         ;// Load Left Edge
    210         VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
    211         VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
    212         VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
    213         VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
    214         VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
    215         VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
    216         VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
    217         VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep]
    218 
    219         TST     availability, #OMX_VC_UPPER
    220         BEQ     DCChroma8x8LeftOnlyAvailable
    221 
    222         ;// Load Upper Edge also
    223         VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[0 to 7]
    224 
    225         MOV      return, #OMX_Sts_NoErr                     ;// returnNoError (r0 is free: left edge already loaded)
    226 
    227         VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
    228         VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]
    229 
    230         VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
    231         VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]
    232 
    233         VADD     dSumAboveLeft,dSumAboveValU32,dSumLeftValU32
    234         VRSHR    dSumAboveLeft,dSumAboveLeft,#3             ;// Sum = (Sum + 4) >> 3
    235         VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
    236         VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
    237 
                ;// Build byte indices that pick one mean per 4-byte half via VTBL
    238         VMOV     dIndexRow0U8,#0x0c
    239         VMOV     dIndexRow4U8,#0x04
    240         VSHL     dIndexRow0,dIndexRow0,#32                  ;// index0 = 0x0c0c0c0c00000000
    241         VSHR     dIndexRow4,dIndexRow4,#32                  ;// index4 = 0x0000000004040404
    242         VADD     dIndexRow4U8,dIndexRow4U8,dIndexRow0U8     ;// index4 = 0x0c0c0c0c04040404
    243         VTBL     dDstRow0,{dSumAboveLeftU8,dSumAboveValU8},dIndexRow0U8     ;// x=0..3: mean(above0-3,left0-3)  x=4..7: mean(above4-7)
    244         VTBL     dDstRow4,{dSumLeftValU8,dSumAboveLeftU8},dIndexRow4U8      ;// x=0..3: mean(left4-7)  x=4..7: mean(above4-7,left4-7)
    245 
        ;// Shared store: dDstRow0 to rows 0-3, dDstRow4 to rows 4-7
    246 DCChroma8x8LeftStore
    247         ADD     pTmp, pDst, dstStep
    248         ADD     step, dstStep, dstStep
    249 
    250         VST1     dDstRow0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
    251         VST1     dDstRow0,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
    252         VST1     dDstRow0,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
    253         VST1     dDstRow0,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
    254         VST1     dDstRow4,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
    255         VST1     dDstRow4,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
    256         VST1     dDstRow4,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
    257         VST1     dDstRow4,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7
    258 
    259         M_EXIT
    260 
    261 
        ;// Entered with dLeftVal already loaded; the upper edge is not read
    262 DCChroma8x8LeftOnlyAvailable
    263 
    264         MOV      return, #OMX_Sts_NoErr
    265 
    266         VPADDL   dSumLeftValU16, dLeftVal                   ;// pSrcLeft[ 6+7 | 4+5 | 2+3 | 0+1 ]
    267         VPADDL   dSumLeftValU32, dSumLeftValU16             ;// pSrcLeft[ 4+5+6+7 |  0+1+2+3 ]
    268         VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
    269 
    270         VDUP     dDstRow0,dSumLeftValU8[0]                  ;// rows 0-3: mean(left0-3) in all 8 columns
    271         VDUP     dDstRow4,dSumLeftValU8[4]                  ;// rows 4-7: mean(left4-7) in all 8 columns
    272 
    273         B        DCChroma8x8LeftStore
    274 
    275 
    276 DCChroma8x8LeftNotAvailable
    277 
    278         TST     availability, #OMX_VC_UPPER
    279         BEQ     DCChroma8x8NoneAvailable
    280 
    281         ;// Load Upper Edge
    282         VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[0 to 7]
    283         MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
    284 
    285         VPADDL   dSumAboveValU16, dAboveVal                 ;// pSrcAbove[ 6+7 | 4+5 | 2+3 | 0+1 ]
    286         VPADDL   dSumAboveValU32, dSumAboveValU16           ;// pSrcAbove[ 4+5+6+7 |  0+1+2+3 ]
    287         VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
    288         VMOV     dIndexRow0U8,#0x04
    289         VSHL     dIndexRow0,dIndexRow0,#32                  ;// index = 0x0404040400000000
    290         VTBL     dDstRow0,{dSumAboveValU8},dIndexRow0U8     ;// x=0..3: mean(above0-3)  x=4..7: mean(above4-7)
    291 
    292         B        DCChroma8x8UpperStore
    293 
    294 
    295 DCChroma8x8NoneAvailable
    296 
                ;// dConst128U8 and dDstRow0 are the same register, so the store below writes 0x80
    297         VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080 if(count == 0)
    298         MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
    299 
        ;// Shared store: the same 8 bytes (dDstRow0) go to all eight rows
    300 DCChroma8x8UpperStore
    301 
    302         ADD     pTmp, pDst, dstStep
    303         ADD     step, dstStep, dstStep
    304 
    305         VST1     dDstRow0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
    306         VST1     dDstRow0,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
    307         VST1     dDstRow0,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
    308         VST1     dDstRow0,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
    309         VST1     dDstRow0,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
    310         VST1     dDstRow0,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
    311         VST1     dDstRow0,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
    312         VST1     dDstRow0,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7
    313 
    314         M_EXIT
    315 
    316 
        ;// VERT case: the row above is copied to every output row
        ;// (dAboveVal and dDstRow0 are the same register)
    317 OMX_VC_CHROMA_VERT
    318 
    319         VLD1     dAboveVal,[pSrcAbove]                      ;// pSrcAbove[x]      :0<= x <= 7
    320         MOV      return, #OMX_Sts_NoErr
    321 
    322         B        DCChroma8x8UpperStore
    323 
    324 
        ;// HOR case: each left pixel is replicated across its row. dLeftVal0..7
        ;// are the same registers as dSum0..7, so the PLANE store is reused;
        ;// that store also sets the return value.
    325 OMX_VC_CHROMA_HOR
    326 
    327         ADD     pTmp, pSrcLeft, leftStep
    328         ADD     step, leftStep, leftStep
    329 
    330         VLD1    {dLeftVal0[]},[pSrcLeft],step           ;// pSrcLeft[0*leftStep]
    331         VLD1    {dLeftVal1[]},[pTmp],step               ;// pSrcLeft[1*leftStep]
    332         VLD1    {dLeftVal2[]},[pSrcLeft],step           ;// pSrcLeft[2*leftStep]
    333         VLD1    {dLeftVal3[]},[pTmp],step               ;// pSrcLeft[3*leftStep]
    334         VLD1    {dLeftVal4[]},[pSrcLeft],step           ;// pSrcLeft[4*leftStep]
    335         VLD1    {dLeftVal5[]},[pTmp],step               ;// pSrcLeft[5*leftStep]
    336         VLD1    {dLeftVal6[]},[pSrcLeft],step           ;// pSrcLeft[6*leftStep]
    337         VLD1    {dLeftVal7[]},[pTmp]                    ;// pSrcLeft[7*leftStep]
    338 
    339         B        DCChroma8x8PlaneStore
    340 
    341 
        ;// PLANE case: pDst[y][x] = clip(0,255,(a + b*(x-3) + c*(y-3) + 16) >> 5) with
        ;//   a = 16*(pSrcAbove[7] + pSrcLeft[7]),  b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
        ;//   H = 3*(above6-above0) + 2*(above5-above1) + (above4-above2) + 4*(above7-aboveLeft)
        ;//   V = the same expression over the left column
    342 OMX_VC_CHROMA_PLANE
    343         ADD     pTmp, pSrcLeft, leftStep
    344         ADD     step, leftStep, leftStep
    345 
    346         VLD1    dAboveVal,[pSrcAbove]                       ;// pSrcAbove[x]      :0<= x <= 7
    347         VLD1    dAboveLeftVal[0],[pSrcAboveLeft]
    348 
    349         VLD1    {dLeftVal[0]},[pSrcLeft],step               ;// pSrcLeft[0*leftStep]
    350         VLD1    {dLeftVal[1]},[pTmp],step                   ;// pSrcLeft[1*leftStep]
    351         VLD1    {dLeftVal[2]},[pSrcLeft],step               ;// pSrcLeft[2*leftStep]
    352         VLD1    {dLeftVal[3]},[pTmp],step                   ;// pSrcLeft[3*leftStep]
    353         VLD1    {dLeftVal[4]},[pSrcLeft],step               ;// pSrcLeft[4*leftStep]
    354         VLD1    {dLeftVal[5]},[pTmp],step                   ;// pSrcLeft[5*leftStep]
    355         VLD1    {dLeftVal[6]},[pSrcLeft],step               ;// pSrcLeft[6*leftStep]
    356         VLD1    {dLeftVal[7]},[pTmp]                        ;// pSrcLeft[7*leftStep]
    357 
    358 
    359         VREV64  dRevAboveVal,dAboveVal                      ;// Reverse order of bytes = pSrcAbove[0:1:2:3:4:5:6:7]
    360         VSUBL   qAbove7minus0,dRevAboveVal,dAboveLeftVal    ;// qAbove7minus0[0] = pSrcAbove[7] - pSrcAboveLeft[0]
    361         VSHR    dRevAboveValU64,dRevAboveValU64,#8          ;// pSrcAbove[X:0:1:2:3:4:5:6]
    362         VSUBL   qAboveDiff,dRevAboveVal,dAboveVal           ;// pSrcAbove[6] - pSrcAbove[0]
    363                                                             ;// pSrcAbove[5] - pSrcAbove[1]
    364                                                             ;// pSrcAbove[4] - pSrcAbove[2]
    365 
    366         VREV64  dRevLeftVal,dLeftVal                        ;// Reverse order of bytes = pSrcLeft[0:1:2:3:4:5:6:7]
    367         VSUBL   qLeft7minus0,dRevLeftVal,dAboveLeftVal      ;// qLeft7minus0[0] = pSrcLeft[7] - pSrcAboveLeft[0]
    368         VSHR    dRevLeftValU64,dRevLeftValU64,#8            ;// pSrcLeft[X:0:1:2:3:4:5:6]
    369         VSUBL   qLeftDiff,dRevLeftVal,dLeftVal              ;// pSrcLeft[6] - pSrcLeft[0]
    370                                                             ;// pSrcLeft[5] - pSrcLeft[1]
    371                                                             ;// pSrcLeft[4] - pSrcLeft[2]
    372 
                ;// r2 is reused here: pSrcAboveLeft has already been read above
    373         LDR     pMultiplierTable,=armVCM4P10_MultiplierTableChroma8x8   ;// Used to calculate Hval & Vval
    374         VSHL    dAboveDiff0U64,dAboveDiff0U64,#16                       ;// move up one 16-bit lane so VEXT can append the 7-AboveLeft term
    375         VEXT    dDiffAboveU8,dAboveDiff0U8,dAbove7minus0U8,#2           ;// pSrcAbove[ 7-AboveLeft | 4-2 | 5-1 | 6-0 ]
    376         VLD1    dMultiplier,[pMultiplierTable]!                         ;// first table row: [ 4 | 1 | 2 | 3 ]
    377         VSHL    dLeftDiff0U64,dLeftDiff0U64,#16
    378         VEXT    dDiffLeftU8,dLeftDiff0U8,dLeft7minus0U8,#2              ;// pSrcLeft[ 7-AboveLeft | 4-2 | 5-1 | 6-0 ]
    379 
    380 
    381         VMUL    dHorPred,dDiffAboveS16,dMultiplier                      ;// pSrcAbove[ 4*(7-AboveLeft) | 1*(4-2) | 2*(5-1) | 3*(6-0) ]
    382         VMUL    dVerPred,dDiffLeftS16,dMultiplier
    383         VPADD   dHVValS16,dHorPred,dVerPred
    384 
    385 
    386         VPADDL  dHVValS32,dHVValS16                                     ;// [V|H] in 32 bits each
    387         VSHL    dHVTempS32,dHVValS32,#4                                 ;// 17*H = 16*H + H = (H<<4)+H
    388         VADD    dHVValS32,dHVValS32,dHVTempS32                          ;// [ 17*V  | 17*H ]in 32 bits each
    389         VLD1    {dMultiplier0,dMultiplier1},[pMultiplierTable]          ;// qMultiplier = [ 4|3|2|1|0|-1|-2|-3 ]
    390         VRSHR   dHVValS32,dHVValS32,#5                                  ;// [c|b] = (17*[V|H] + 16) >> 5, read below as 16-bit lanes 2 and 0
    391         VADDL   qA,dAboveVal,dLeftVal                                   ;// lane 7 = pSrcAbove[7] + pSrcLeft[7]
    392         VDUP    qA,qA[7]
    393         VSHL    qA,qA,#4                                                ;// [a|a|a|a|a|a|a|a]
    394         VDUP    qB,dHVValS16[0]                                         ;// [b|b|b|b|b|b|b|b]
    395         VDUP    qC,dHVValS16[2]                                         ;// [c|c|c|c|c|c|c|c]
    396 
    397 
    398         VMUL    qB,qB,qMultiplier                                       ;// lane x = b*(x-3)
    399         VMUL    qC,qC,qMultiplier                                       ;// lane y = c*(y-3)
    400         VADD    qB,qB,qA                                                ;// lane x = a + b*(x-3)
    401 
                ;// Broadcast the per-row term c*(y-3). qC3 overwrites qMultiplier,
                ;// which is no longer needed at this point.
    402         VDUP    qC0,qC[0]
    403         VDUP    qC1,qC[1]
    404         VDUP    qC2,qC[2]
    405         VDUP    qC3,qC[3]
    406         VDUP    qC4,qC[4]
    407         VDUP    qC5,qC[5]
    408         VDUP    qC6,qC[6]
    409         VDUP    qC7,qC[7]
    410 
    411         VADD    qSum0,qB,qC0
    412         VADD    qSum1,qB,qC1
    413         VADD    qSum2,qB,qC2
    414         VADD    qSum3,qB,qC3
    415         VADD    qSum4,qB,qC4
    416         VADD    qSum5,qB,qC5
    417         VADD    qSum6,qB,qC6
    418         VADD    qSum7,qB,qC7
    419 
    420         VQRSHRUN dSum0,qSum0,#5                         ;// (OMX_U8)armClip(0,255,(Sum+16)>>5)
    421         VQRSHRUN dSum1,qSum1,#5
    422         VQRSHRUN dSum2,qSum2,#5
    423         VQRSHRUN dSum3,qSum3,#5
    424         VQRSHRUN dSum4,qSum4,#5
    425         VQRSHRUN dSum5,qSum5,#5
    426         VQRSHRUN dSum6,qSum6,#5
    427         VQRSHRUN dSum7,qSum7,#5
    428 
        ;// Shared by PLANE and HOR: row n is stored from dSum<n> (D0..D7)
    429 DCChroma8x8PlaneStore
    430         ADD     pTmp, pDst, dstStep
    431         ADD     step, dstStep, dstStep
    432 
    433         VST1    dSum0,[pDst],step                    ;// pDst[0*dstStep+x] :0<= x <= 7
    434         VST1    dSum1,[pTmp],step                    ;// pDst[1*dstStep+x] :0<= x <= 7
    435         VST1    dSum2,[pDst],step                    ;// pDst[2*dstStep+x] :0<= x <= 7
    436         VST1    dSum3,[pTmp],step                    ;// pDst[3*dstStep+x] :0<= x <= 7
    437         VST1    dSum4,[pDst],step                    ;// pDst[4*dstStep+x] :0<= x <= 7
    438         VST1    dSum5,[pTmp],step                    ;// pDst[5*dstStep+x] :0<= x <= 7
    439         VST1    dSum6,[pDst],step                    ;// pDst[6*dstStep+x] :0<= x <= 7
    440         VST1    dSum7,[pTmp]                         ;// pDst[7*dstStep+x] :0<= x <= 7
    441 
    442         MOV     return, #OMX_Sts_NoErr               ;// status for both the PLANE and HOR paths
    443         M_END
    444 
    445         ENDIF ;// CortexA8
    446 
    447         END
    448 ;//-----------------------------------------------------------------------------------------------
    449 ;// omxVCM4P10_PredictIntraChroma_8x8 ends
    450 ;//-----------------------------------------------------------------------------------------------
    451