Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_PredictIntra_4x4_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27 
     28         INCLUDE omxtypes_s.h
     29         INCLUDE armCOMM_s.h
     30 
     31 ;// Define the processor variants supported by this file
     32 
     33          M_VARIANTS CortexA8
     34 
     ;//-------------------------------------------------------
     ;// Dispatch table used in place of a C switch statement.
     ;// Entry i holds the address of the code label that handles
     ;// prediction mode i; the function loads pc directly from
     ;// armVCM4P10_pSwitchTable4x4[predMode].
     ;//-------------------------------------------------------

    M_TABLE armVCM4P10_pSwitchTable4x4
    DCD  OMX_VC_4x4_VERT        ;// entry 0
    DCD  OMX_VC_4x4_HOR         ;// entry 1
    DCD  OMX_VC_4x4_DC          ;// entry 2
    DCD  OMX_VC_4x4_DIAG_DL     ;// entry 3
    DCD  OMX_VC_4x4_DIAG_DR     ;// entry 4
    DCD  OMX_VC_4x4_VR          ;// entry 5
    DCD  OMX_VC_4x4_HD          ;// entry 6
    DCD  OMX_VC_4x4_VL          ;// entry 7
    DCD  OMX_VC_4x4_HU          ;// entry 8
     46 
     47 
     48         IF CortexA8
     49 
     ;//--------------------------------------------
     ;// Core (integer) register names
     ;//
     ;// Several names share one physical register. A shared name may
     ;// only be written once the other name's value is no longer needed:
     ;//   r0 : pSrcLeft / return
     ;//   r1 : pSrcAbove / pDst1
     ;//   r4 : leftStep  / pDst2
     ;//   r6 : predMode  / pDst3
     ;//--------------------------------------------

;// Function arguments. r0-r3 are used as received; r4-r7 are filled
;// from the stack arguments with M_LDRD before the mode dispatch.
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable

;// Extra destination row pointers (reuse r1, r4 and r6)
pDst1           RN 1
pDst2           RN 4
pDst3           RN 6

;// Scratch
return          RN 0    ;// status value placed in r0 before M_END
pTable          RN 8    ;// address of armVCM4P10_pSwitchTable4x4
pSrcTmp         RN 9    ;// second walking pointer down the left column
srcStep         RN 10   ;// 2 * leftStep
pDstTmp         RN 11   ;// second walking pointer down the destination
dstep           RN 12   ;// 2 * dstStep
pc              RN 15
     76 
     ;//-------------------
     ;// NEON register names
     ;//
     ;// Names are grouped by the prediction mode that uses them.
     ;// Groups reuse the same D registers: the dispatcher jumps to
     ;// exactly one mode handler per call, so the groups never
     ;// need their values at the same time.
     ;// The suffix after the dot is the element type the name
     ;// selects when it is used as an instruction operand.
     ;//-------------------

;// OMX_VC_4x4_VERT
dAboveU32       DN  D0.U32

;// OMX_VC_4x4_HOR
dLeftVal0       DN  D0.8
dLeftVal1       DN  D1.8
dLeftVal2       DN  D2.8
dLeftVal3       DN  D3.8
dLeftVal0U32    DN  D0.U32
dLeftVal1U32    DN  D1.U32
dLeftVal2U32    DN  D2.U32
dLeftVal3U32    DN  D3.U32

;// OMX_VC_4x4_DC
dLeftVal        DN  D0.U8
dLeftValU32     DN  D0.U32
dSumAboveLeftU16  DN  D1.U16
dSumAboveLeftU32  DN  D1.U32
dSumAboveLeftU64  DN  D1.U64
dSumAboveLeftU8 DN  D1.U8
dSum            DN  D0.U8

dSumLeftValU16  DN  D1.U16
dSumLeftValU32  DN  D1.U32
dSumLeftValU64  DN  D1.U64
dSumLeftValU8   DN  D1.U8

dAboveVal       DN  D0.U8
dSumAboveValU16  DN  D1.U16
dSumAboveValU32  DN  D1.U32
dSumAboveValU64  DN  D1.U64
dSumAboveValU8   DN  D1.U8
dConst128U8     DN  D0.U8

;// OMX_VC_4x4_DIAG_DL
dAbove          DN  D0.U8
dU7             DN  D2.U8
dU3             DN  D2.U8
dAbove0         DN  D3.U8
dAbove1         DN  D4.U8
dAbove2         DN  D5.U8
dTmp            DN  D6.U8
dTmp0           DN  D7.U8
dTmp1           DN  D8.U8
dTmp2           DN  D9.U8
dTmp3           DN  D10.U8
dTmpU32         DN  D6.U32

;// OMX_VC_4x4_DIAG_DR
dLeft           DN  D1.U8
dUL             DN  D2.U8

;// OMX_VC_4x4_VR
dLeft0          DN  D1.U8
dLeft1          DN  D2.U8
dEven0          DN  D3.U8
dEven1          DN  D4.U8
dEven2          DN  D5.U8
dOdd0           DN  D6.U8
dOdd1           DN  D11.U8
dOdd2           DN  D12.U8
dTmp3U32        DN  D10.U32
dTmp2U32        DN  D9.U32

;// OMX_VC_4x4_HD
;// (dTmpU32 is also used here; it is defined once, in the DIAG_DL group)
dTmp1U64        DN  D8.U64
dTmp0U64        DN  D7.U64
dTmpU64         DN  D6.U64
dTmp1U32        DN  D8.U32

;// OMX_VC_4x4_HU
dL3             DN  D2.U8
dLeftHU0        DN  D3.U8
dLeftHU1        DN  D4.U8
dLeftHU2        DN  D5.U8
dTmp0U32        DN  D7.U32
    162 
    163 
    164 
    165 
    ;//-----------------------------------------------------------------------------------------------
    ;// omxVCM4P10_PredictIntra_4x4 starts
    ;//
    ;// Register arguments : r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft, r3 = pDst
    ;// Stack arguments    : LeftStep, DstStep, PredMode, Availability (4 bytes each)
    ;// Result             : r0 = OMX_Sts_NoErr (set at ExitPredict4x4)
    ;//
    ;// The entry code loads the four stack arguments into r4-r7 and then
    ;// jumps to the handler whose address is stored at
    ;// armVCM4P10_pSwitchTable4x4[predMode].
    ;//
    ;// M_START is given r12 and d12, the highest core and NEON registers
    ;// named in this file (dstep = r12, dOdd2 = D12). What the macro saves
    ;// is defined in armCOMM_s.h, which is not visible here.
    ;//
    ;// NOTE(review): predMode is used as a table index without a range
    ;// check; a value outside 0..8 loads pc from beyond the table.
    ;//-----------------------------------------------------------------------------------------------

        ;// Function header
        M_START omxVCM4P10_PredictIntra_4x4, r12,d12

        ;// Stack argument layout
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        ;// Fetch the stack arguments, two words per load
        M_LDRD   leftStep,dstStep,LeftStep          ;// r4 = leftStep, r5 = dstStep
        M_LDRD   predMode,availability,PredMode     ;// r6 = predMode, r7 = availability

        ;// switch (predMode): load the handler address straight into pc
        LDR      pTable,=armVCM4P10_pSwitchTable4x4
        LDR      pc, [pTable, predMode, LSL #2]
    188 
    189 
    ;// Horizontal prediction: every destination row y is filled with
    ;// four copies of the left-column byte pSrcLeft[y*leftStep].
OMX_VC_4x4_HOR

        ;// Two interleaved pointers walk each buffer: one visits rows 0 and 2,
        ;// the other rows 1 and 3, each advancing by twice the row step.
        ADD     srcStep, leftStep, leftStep
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     dstep, dstStep, dstStep
        ADD     pDstTmp, pDst, dstStep

        ;// Load each left byte and replicate it across the whole D register
        VLD1    {dLeftVal0[]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal1[]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal2[]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal3[]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]

        ;// Store the low 32 bits (4 pixels) of each replicated register
        VST1    dLeftVal0U32[0],[pDst],dstep                ;// pDst[0*dstStep+x] : 0 <= x <= 3
        VST1    dLeftVal1U32[0],[pDstTmp],dstep             ;// pDst[1*dstStep+x] : 0 <= x <= 3
        VST1    dLeftVal2U32[0],[pDst]                      ;// pDst[2*dstStep+x] : 0 <= x <= 3
        VST1    dLeftVal3U32[0],[pDstTmp]                   ;// pDst[3*dstStep+x] : 0 <= x <= 3

        B        ExitPredict4x4                             ;// done
    209 
    ;// Vertical prediction: the four bytes at pSrcAbove are copied to
    ;// every destination row.
OMX_VC_4x4_VERT

        ADD     dstep, dstStep, dstStep                     ;// 2 * dstStep
        ADD     pDstTmp, pDst, dstStep                      ;// address of row 1

        VLD1     dAboveU32[0],[pSrcAbove]                   ;// D0[31:0] = pSrcAbove[0..3]

;// Shared tail, also entered from the DC paths.
;// Expects: D0[31:0] = the 4-byte row to write,
;//          pDst = row 0, pDstTmp = row 1, dstep = 2 * dstStep.
DCPredict4x4VertStore

        VST1     dAboveU32[0],[pDst],dstep                  ;// row 0
        VST1     dAboveU32[0],[pDstTmp],dstep               ;// row 1
        VST1     dAboveU32[0],[pDst]                        ;// row 2
        VST1     dAboveU32[0],[pDstTmp]                     ;// row 3

        B        ExitPredict4x4                             ;// done
    225 
    ;// DC prediction: all 16 pixels get one value, chosen by availability.
    ;//   left and upper : (sum of 4 left + 4 above + 4) >> 3
    ;//   left only      : (sum of 4 left  + 2) >> 2
    ;//   upper only     : (sum of 4 above + 2) >> 2
    ;//   neither        : 128
    ;// Every path leaves the value replicated in D0 and finishes through
    ;// DCPredict4x4VertStore, which writes D0[31:0] to the four rows.
OMX_VC_4x4_DC

        ;// Destination addressing is identical on every path, so set it up once.
        ;// (ADD leaves the flags alone and nothing below writes these registers.)
        ADD     pDstTmp, pDst, dstStep                      ;// address of row 1
        ADD     dstep, dstStep, dstStep                     ;// 2 * dstStep

        TST     availability, #OMX_VC_LEFT
        BEQ     DCPredict4x4LeftNotAvailable

        ;// Left column is available: gather L0..L3 into lanes 0..3 of D0
        ADD     srcStep, leftStep, leftStep
        ADD     pSrcTmp, pSrcLeft, leftStep
        VLD1    {dLeftVal[0]},[pSrcLeft],srcStep            ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pSrcTmp],srcStep             ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft]                    ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pSrcTmp]                     ;// pSrcLeft[3*leftStep]

        TST     availability, #OMX_VC_UPPER
        BEQ     DCPredict4x4LeftOnlyAvailable

        ;// Left and upper: put the above row in the upper half of D0 and sum all 8 bytes
        VLD1     dLeftValU32[1],[pSrcAbove]                 ;// D0 = [ A3 A2 A1 A0 | L3 L2 L1 L0 ]
        MOV      return, #OMX_Sts_NoErr                     ;// r0 (pSrcLeft) is no longer needed

        VPADDL   dSumAboveLeftU16, dLeftVal                 ;// 8 x u8  -> 4 x u16 pair sums
        VPADDL   dSumAboveLeftU32, dSumAboveLeftU16         ;// 4 x u16 -> 2 x u32 (above sum | left sum)
        VPADDL   dSumAboveLeftU64, dSumAboveLeftU32         ;// 2 x u32 -> 1 x u64 total
        VRSHR    dSumAboveLeftU64,dSumAboveLeftU64,#3       ;// (total + 4) >> 3
        VDUP     dSum,dSumAboveLeftU8[0]                    ;// replicate the result byte

        B        DCPredict4x4VertStore

DCPredict4x4LeftOnlyAvailable

        MOV      return, #OMX_Sts_NoErr                     ;// r0 (pSrcLeft) is no longer needed

        ;// Only lanes 0..3 of D0 were loaded, so only the low u32 sum is meaningful
        VPADDL   dSumLeftValU16, dLeftVal                   ;// [ XX | L2+L3 | L0+L1 ]
        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// [ XXXX | L0+L1+L2+L3 ]
        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// (sum + 2) >> 2
        VDUP     dSum,dSumLeftValU8[0]                      ;// replicate the result byte

        B        DCPredict4x4VertStore

DCPredict4x4LeftNotAvailable

        TST     availability, #OMX_VC_UPPER
        BEQ     DCPredict4x4NoneAvailable

        ;// Upper row only
        VLD1     dAboveU32[0],[pSrcAbove]                   ;// D0[31:0] = pSrcAbove[0..3]
        MOV      return, #OMX_Sts_NoErr

        VPADDL   dSumAboveValU16, dAboveVal                 ;// [ XX | A2+A3 | A0+A1 ]
        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// [ XXXX | A0+A1+A2+A3 ]
        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// (sum + 2) >> 2
        VDUP     dSum,dSumAboveValU8[0]                     ;// replicate the result byte

        B        DCPredict4x4VertStore

DCPredict4x4NoneAvailable

        ;// No neighbours: every pixel is 0x80
        VMOV     dConst128U8,#0x80                          ;// D0 = 0x8080808080808080
        MOV      return, #OMX_Sts_NoErr

        B        DCPredict4x4VertStore
    298 
    299 
    300 
    ;// Diagonal-down-left prediction.
    ;// Builds three shifted copies of the above row (a, b, c = positions
    ;// i, i+1, i+2), filters them with (a + 2*b + c + 2) >> 2, and writes
    ;// row y from filtered lanes y..y+3.
    ;// Lane diagrams list lanes from high (7) to low (0).
OMX_VC_4x4_DIAG_DL

        TST     availability, #OMX_VC_UPPER_RIGHT
        BEQ     DiagDLUpperRightNotAvailable

        ;// Upper-right available: use all 8 above pixels, padding past U7 with U7
        VLD1    dAbove0,[pSrcAbove]                     ;// [U7|U6|U5|U4|U3|U2|U1|U0]
        VDUP    dU7, dAbove0[7]                         ;// [U7|U7|U7|U7|U7|U7|U7|U7]
        VEXT    dAbove1, dAbove0, dU7, #1               ;// [U7|U7|U6|U5|U4|U3|U2|U1]
        VEXT    dAbove2, dAbove0, dU7, #2               ;// [U7|U7|U7|U6|U5|U4|U3|U2]
        B       DiagDLPredict4x4Store

DiagDLUpperRightNotAvailable
        ;// Upper-right missing: read only U0..U3 (into the upper half of D0)
        ;// and substitute U3 for U4..U7
        VLD1    dAboveU32[1],[pSrcAbove]                ;// [U3|U2|U1|U0| -| -| -| -]
        VDUP    dU3, dAbove[7]                          ;// [U3|U3|U3|U3|U3|U3|U3|U3]

        VEXT    dAbove0, dAbove, dU3, #4                ;// [U3|U3|U3|U3|U3|U2|U1|U0]
        VEXT    dAbove1, dAbove, dU3, #5                ;// [U3|U3|U3|U3|U3|U3|U2|U1]
        VEXT    dAbove2, dAbove, dU3, #6                ;// [U3|U3|U3|U3|U3|U3|U3|U2]

DiagDLPredict4x4Store

        ;// Halving add of the outer taps, then rounding halving add with the
        ;// centre tap: together this is (a + 2*b + c + 2) >> 2 per lane
        VHADD   dTmp, dAbove2, dAbove0
        VRHADD  dTmp, dAbove1, dTmp

        ;// Write 4 bytes, then rotate the vector down one lane for the next row
        VST1    dTmpU32[0],[pDst],dstStep               ;// row 0 : lanes 0..3
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst],dstStep               ;// row 1 : lanes 1..4
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst],dstStep               ;// row 2 : lanes 2..5
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst]                       ;// row 3 : lanes 3..6

        B        ExitPredict4x4                         ;// done
    335 
    336 
    ;// Diagonal-down-right prediction.
    ;// The edge pixels are arranged as one run L3,L2,L1,L0,UL,U0,U1,U2,U3,
    ;// filtered with (a + 2*b + c + 2) >> 2, and row y is written from
    ;// filtered lanes (3-y)..(6-y).
    ;// Lane diagrams list lanes from high (7) to low (0).
OMX_VC_4x4_DIAG_DR

        ;// Left-column addressing: two pointers, each stepping two rows
        ADD     srcStep, leftStep, leftStep
        ADD     pSrcTmp, pSrcLeft, leftStep

        ;// Above row into the low half of D0
        VLD1    dAboveU32[0],[pSrcAbove]                ;// [ X| X| X| X|U3|U2|U1|U0]

        ;// Corner and left column into lanes 7..3 of D1
        VLD1    {dLeft[7]},[pSrcAboveLeft]              ;// UL
        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
                                                        ;// dLeft = [UL|L0|L1|L2|L3| X| X| X]

        ;// Row pointers. These are computed only after the loads because
        ;// pDst1 reuses r1 (pSrcAbove) and pDst2 reuses r4 (leftStep).
        ADD     pDst1,pDst,dstStep                      ;// row 1
        ADD     pDst2,pDst1,dstStep                     ;// row 2
        ADD     pDst3,pDst2,dstStep                     ;// row 3

        ;// Three views of the run, each shifted by one pixel
        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [U2|U1|U0|UL|L0|L1|L2|L3]
        VEXT    dAbove1,dLeft,dAbove,#4                 ;// [U3|U2|U1|U0|UL|L0|L1|L2]
        VEXT    dAbove2,dLeft,dAbove,#5                 ;// [ X|U3|U2|U1|U0|UL|L0|L1]

        ;// (a + 2*b + c + 2) >> 2 with b = dAbove1
        VHADD   dTmp, dAbove2, dAbove0
        VRHADD  dTmp, dAbove1, dTmp

        ;// Bottom row first; rotate down one lane before each row above it
        VST1    dTmpU32[0],[pDst3]                      ;// row 3 : lanes 0..3
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst2]                      ;// row 2 : lanes 1..4
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst1]                      ;// row 1 : lanes 2..5
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst]                       ;// row 0 : lanes 3..6

        B        ExitPredict4x4                         ;// done
    375 
    ;// Vertical-right prediction.
    ;// Two filtered vectors are produced:
    ;//   dTmp1 ("odd" inputs)  - lane 0 is the 3-tap value over L1,L0,UL;
    ;//                           lanes 1..4 are 2-tap averages of UL,U0..U3
    ;//   dTmp0 ("even" inputs) - 3-tap values over L2,L1,L0,UL,U0..U3
    ;// Rows 0 and 2 come from dTmp1, rows 1 and 3 from dTmp0; the upper
    ;// row of each pair is the same vector moved down one lane.
    ;// Lane diagrams list lanes from high (7) to low (0).
OMX_VC_4x4_VR

        ;// Destination addressing (independent of the loads below)
        ADD     dstep, dstStep, dstStep                 ;// 2 * dstStep
        ADD     pDstTmp, pDst, dstStep                  ;// address of row 1

        ;// Above row in lanes 0..3 of D0, corner pixel in lane 7
        VLD1    dAboveU32[0],[pSrcAbove]
        VLD1    dAbove[7],[pSrcAboveLeft]               ;// [UL| X| X| X|U3|U2|U1|U0]

        ;// L0 and L2 go to dLeft0, L1 to dLeft1
        VLD1    {dLeft0[7]},[pSrcLeft],leftStep         ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft1[7]},[pSrcLeft],leftStep         ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft0[6]},[pSrcLeft]                  ;// pSrcLeft[2*leftStep]
                                                        ;// dLeft0 = [L0|L2| X| X| X| X| X| X]
                                                        ;// dLeft1 = [L1| X| X| X| X| X| X| X]

        ;// Inputs of the "odd" filter (dOdd2 is also an input to the even set)
        VEXT    dOdd2,dAbove,dAbove,#7                  ;// [ x  x  x U3 U2 U1 U0 UL ]
        VEXT    dOdd0,dLeft1,dAbove,#7                  ;// [ x  x  x U3 U2 U1 U0 L1 ]
        VEXT    dOdd1,dLeft0,dOdd2,#7                   ;// [ x  x  x U2 U1 U0 UL L0 ]

        ;// Inputs of the "even" filter
        VEXT    dEven0,dLeft0,dOdd2,#6                  ;// [ x  x  x U1 U0 UL L0 L2 ]
        VEXT    dEven1,dLeft1,dOdd2,#7                  ;// [ x  x  x U2 U1 U0 UL L1 ]
        VEXT    dEven2,dLeft0,dAbove,#7                 ;// [ x  x  x U3 U2 U1 U0 L0 ]

        ;// (a + 2*b + c + 2) >> 2 on both sets. In lanes 1..4 of the odd set
        ;// a and c are the same pixel, so the result there is (a + b + 1) >> 1.
        VHADD   dTmp0, dEven0, dEven2
        VRHADD  dTmp0, dTmp0, dEven1                    ;// Tmp[ x x x 8 6 4 2 0 ]

        VHADD   dTmp1, dOdd0, dOdd2
        VRHADD  dTmp1, dTmp1, dOdd1                     ;// Tmp[ x x x 9 7 5 3 1 ]

        ;// Copies moved down one lane for the two upper rows
        VEXT    dTmp2,dTmp0,dTmp0,#1                    ;// Tmp[ x x x x 8 6 4 2 ]
        VEXT    dTmp3,dTmp1,dTmp1,#1                    ;// Tmp[ x x x x 9 7 5 3 ]

        VST1    dTmp3U32[0],[pDst],dstep                ;// row 0 : Tmp[9],[7],[5],[3]
        VST1    dTmp2U32[0],[pDstTmp],dstep             ;// row 1 : Tmp[8],[6],[4],[2]
        VST1    dTmp1U32[0],[pDst],dstep                ;// row 2 : Tmp[7],[5],[3],[1]
        VST1    dTmp0U32[0],[pDstTmp]                   ;// row 3 : Tmp[6],[4],[2],[0]

        B        ExitPredict4x4                         ;// done
    416 
    ;// Horizontal-down prediction.
    ;// Inputs are the left column L0..L3, the corner UL and the above
    ;// pixels U0..U2. The edge is arranged as one run
    ;// L3,L2,L1,L0,UL,U0,U1,U2; 3-tap values ((a + 2*b + c + 2) >> 2) and
    ;// 2-tap averages ((a + b + 1) >> 1) are computed over it, and the two
    ;// sets are interleaved so that each output row is 4 consecutive bytes.
    ;// Lane diagrams list lanes from high (7) to low (0).
OMX_VC_4x4_HD

        ;// Load U0..U3 only. The VEXTs below take at most lanes 0..2 of
        ;// dAbove, so a 4-byte load is sufficient and matches the other
        ;// modes that do not use the upper-right pixels. Lanes 4..7 of D0
        ;// keep whatever they held before and never reach the output.
        VLD1    dAboveU32[0],[pSrcAbove]                ;// [ X| X| X| X|U3|U2|U1|U0]

        ;// Corner and left column into lanes 7..3 of D1
        VLD1    {dLeft[7]},[pSrcAboveLeft]              ;// UL
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep

        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
                                                        ;// dLeft = [UL|L0|L1|L2|L3| X| X| X]

        ;// Three views of the run, each shifted by one pixel
        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [U2|U1|U0|UL|L0|L1|L2|L3]
        VEXT    dAbove1,dLeft,dAbove,#2                 ;// [U1|U0|UL|L0|L1|L2|L3| X]
        VEXT    dAbove2,dLeft,dAbove,#1                 ;// [U0|UL|L0|L1|L2|L3| X| X]

        ;// 3-tap values: valid in lanes 2..7 of dTmp0
        VHADD   dTmp0, dAbove0, dAbove2
        VRHADD  dTmp0, dTmp0, dAbove1                   ;// (a+2*b+c+2)>>2

        ;// 2-tap averages: valid in lanes 1..4 of dTmp1 before the shift.
        ;// The shift moves them up to lanes 4..7.
        VRHADD  dTmp1, dAbove1, dAbove0                 ;// (a+b+1)>>1
        VSHL    dTmp1U64,dTmp1U64,#24

        ;// Move the four left-side 3-tap values (lanes 2..5) up to lanes 4..7
        ;// of dTmp so that they line up with the averages for the zip
        VSHL    dTmpU64,dTmp0U64,#16

        ;// Interleave. After VZIP dTmp holds, low to high, the bytes of
        ;// row 3 followed by the bytes of row 1 (average, 3-tap, average, ...).
        ;// dTmp1 receives the interleaved low lanes, which are not used.
        VZIP    dTmp1,dTmp

        ;// Rotate the two above-side 3-tap values into lanes 0..1 of dTmp0,
        ;// then append them to dTmp shifted down by two bytes:
        ;// dTmp1 = bytes of row 2 (low word) followed by row 0 (high word)
        VEXT    dTmp0,dTmp0,dTmp0,#6
        VEXT    dTmp1,dTmp,dTmp0,#2

        ADD     pDstTmp, pDst, dstStep                  ;// address of row 1
        ADD     dstep, dstStep, dstStep                 ;// 2 * dstStep

        VST1    dTmp1U32[1],[pDst],dstep                ;// row 0
        VST1    dTmpU32[1],[pDstTmp],dstep              ;// row 1
        VST1    dTmp1U32[0],[pDst]                      ;// row 2
        VST1    dTmpU32[0],[pDstTmp]                    ;// row 3

        B        ExitPredict4x4                         ;// done
    459 
    ;// Vertical-left prediction.
    ;// Rows 0 and 2 are 2-tap averages ((a + b + 1) >> 1) of neighbouring
    ;// above pixels; rows 1 and 3 are 3-tap values ((a + 2*b + c + 2) >> 2).
    ;// Rows 2 and 3 start one pixel further right than rows 0 and 1.
    ;// Lane diagrams list lanes from high (7) to low (0).
OMX_VC_4x4_VL

        ;// Destination addressing (ADD does not affect the flags tested below)
        ADD     dstep, dstStep, dstStep                  ;// 2 * dstStep
        ADD     pDstTmp, pDst, dstStep                   ;// address of row 1

        TST     availability, #OMX_VC_UPPER_RIGHT
        BEQ     DiagVLUpperRightNotAvailable

        ;// Upper-right available: all 8 above pixels. The rotations wrap,
        ;// but the wrapped lanes (marked X) are not used by the stores.
        VLD1    dAbove0,[pSrcAbove]                      ;// [U7|U6|U5|U4|U3|U2|U1|U0]
        VEXT    dAbove1,dAbove0,dAbove0,#1               ;// [ X|U7|U6|U5|U4|U3|U2|U1]
        VEXT    dAbove2,dAbove1,dAbove1,#1               ;// [ X| X|U7|U6|U5|U4|U3|U2]

        B       DiagVLPredict4x4Store

DiagVLUpperRightNotAvailable
        ;// Upper-right missing: read only U0..U3 (into the upper half of D0)
        ;// and substitute U3 for U4..U7
        VLD1    dAboveU32[1],[pSrcAbove]                 ;// [U3|U2|U1|U0| -| -| -| -]
        VDUP    dU3, dAbove[7]                           ;// [U3|U3|U3|U3|U3|U3|U3|U3]

        VEXT    dAbove0, dAbove, dU3, #4                 ;// [U3|U3|U3|U3|U3|U2|U1|U0]
        VEXT    dAbove1, dAbove, dU3, #5                 ;// [U3|U3|U3|U3|U3|U3|U2|U1]
        VEXT    dAbove2, dAbove, dU3, #6                 ;// [U3|U3|U3|U3|U3|U3|U3|U2]

DiagVLPredict4x4Store

        ;// 3-tap values
        VHADD   dTmp3, dAbove2, dAbove0
        VRHADD  dTmp3, dAbove1, dTmp3                   ;// (a+2*b+c+2)>>2
                                                        ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]

        ;// 2-tap averages
        VRHADD  dTmp0, dAbove0, dAbove1                 ;// (a+b+1)>>1
                                                        ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]

        ;// Copies moved down one lane for rows 2 and 3
        VEXT    dTmp1,dTmp0,dTmp0,#1                    ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
        VEXT    dTmp2,dTmp3,dTmp1,#1                    ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]

        VST1    dTmp0U32[0],[pDst],dstep                ;// row 0 : Tmp[6],[4],[2],[0]
        VST1    dTmp3U32[0],[pDstTmp],dstep             ;// row 1 : Tmp[7],[5],[3],[1]
        VST1    dTmp1U32[0],[pDst]                      ;// row 2 : Tmp[8],[6],[4],[2]
        VST1    dTmp2U32[0],[pDstTmp]                   ;// row 3 : Tmp[9],[7],[5],[3]

        B        ExitPredict4x4                         ;// done
    500 
    ;// Horizontal-up prediction, followed by the exit code shared by all modes.
    ;// Only the left column is used. It is extended below L3 with copies
    ;// of L3; 2-tap averages and 3-tap values over the extended column are
    ;// interleaved, and each row starts two bytes further along than the
    ;// one above it.
    ;// Lane diagrams list lanes from high (7) to low (0).
OMX_VC_4x4_HU
        ADD     srcStep, leftStep, leftStep
        ADD     pSrcTmp, pSrcLeft, leftStep

        ;// Left column into the upper half of D1
        VLD1    {dLeft[4]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[6]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[7]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
                                                        ;// dLeft = [L3|L2|L1|L0| X| X| X| X]

        VDUP    dL3,dLeft[7]                            ;// [L3|L3|L3|L3|L3|L3|L3|L3]

        ;// Column padded with L3, at offsets 0, 1 and 2
        VEXT    dLeftHU0,dLeft,dL3,#4                   ;// [L3|L3|L3|L3|L3|L2|L1|L0]
        VEXT    dLeftHU1,dLeft,dL3,#5                   ;// [L3|L3|L3|L3|L3|L3|L2|L1]
        VEXT    dLeftHU2,dLeft,dL3,#6                   ;// [L3|L3|L3|L3|L3|L3|L3|L2]

        ;// 2-tap averages
        VRHADD  dTmp1, dLeftHU0, dLeftHU1               ;// (a+b+1)>>1
                                                        ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]

        ;// 3-tap values
        VHADD   dTmp0, dLeftHU2, dLeftHU0
        VRHADD  dTmp0, dLeftHU1, dTmp0                  ;// (a+2*b+c+2)>>2
                                                        ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]

        ;// Interleave averages and 3-tap values
        VZIP    dTmp1,dTmp0                             ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]
                                                        ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]

        ;// Rows 0..2 slide along dTmp1 two bytes at a time; row 3 is all L3
        VST1    dTmp1U32[0],[pDst],dstStep              ;// row 0 : [3|2|1|0]
        VEXT    dTmp1,dTmp1,dTmp1,#2
        VST1    dTmp1U32[0],[pDst],dstStep              ;// row 1 : [5|4|3|2]
        VEXT    dTmp1,dTmp1,dTmp1,#2
        VST1    dTmp1U32[0],[pDst],dstStep              ;// row 2 : [7|6|5|4]
        VST1    dTmp0U32[0],[pDst]                      ;// row 3 : [9|8|7|6]

        ;// HU falls through into the exit code

;// Common exit: every mode handler ends up here.
ExitPredict4x4

        MOV      return,  #OMX_Sts_NoErr                ;// r0 = status returned to the caller
        M_END                                           ;// function-end macro from armCOMM_s.h
    539 
    540         ENDIF ;// CortexA8
    541 
    542         END
    543 ;//-----------------------------------------------------------------------------------------------
    544 ;// omxVCM4P10_PredictIntra_4x4 ends
    545 ;//-----------------------------------------------------------------------------------------------
    546