Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P10_PredictIntra_4x4_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   12290
      6 ;// Date:       Wednesday, April 9, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 
     13 
     14         INCLUDE omxtypes_s.h
     15         INCLUDE armCOMM_s.h
     16 
     17 ;// Define the processor variants supported by this file
     18 
     19          M_VARIANTS CortexA8
     20 
     21 ;//-------------------------------------------------------
     22 ;// Jump table that implements a C-style switch statement in
     23 ;// assembly by the method of two levels of indexing.
     24 ;//-------------------------------------------------------
     25 ;// Entry i is the code label the dispatcher loads into pc when predMode == i.
     26     M_TABLE armVCM4P10_pSwitchTable4x4
     27     DCD  OMX_VC_4x4_VERT,     OMX_VC_4x4_HOR
     28     DCD  OMX_VC_4x4_DC,       OMX_VC_4x4_DIAG_DL
     29     DCD  OMX_VC_4x4_DIAG_DR,  OMX_VC_4x4_VR
     30     DCD  OMX_VC_4x4_HD,       OMX_VC_4x4_VL
     31     DCD  OMX_VC_4x4_HU
     32 
     33 
     34         IF CortexA8
     35 
     36 ;//--------------------------------------------
     37 ;// Scratch variable
     38 ;//--------------------------------------------
     39 return          RN 0    ;// status code; same register as pSrcLeft
     40 pTable          RN 8    ;// address of armVCM4P10_pSwitchTable4x4
     41 pc              RN 15   ;// written by the dispatcher to jump to a case label
     42 
     43 ;//--------------------------------------------
     44 ;// Declare input registers
     45 ;//--------------------------------------------
     46 pSrcLeft        RN 0    ;// input pointer: left neighbours, one byte read every leftStep bytes
     47 pSrcAbove       RN 1    ;// input pointer: row of pixels above the block
     48 pSrcAboveLeft   RN 2    ;// input pointer: above-left pixel (a single byte is read)
     49 pDst            RN 3    ;// output pointer: rows are dstStep bytes apart
     50 leftStep        RN 4    ;// input variable (loaded from stack argument LeftStep)
     51 dstStep         RN 5    ;// input variable (loaded from stack argument DstStep)
     52 predMode        RN 6    ;// input variable (loaded from stack argument PredMode)
     53 availability    RN 7    ;// input variable (loaded from stack argument Availability)
     54 pDst1           RN 1    ;// DIAG_DR only: pDst + 1*dstStep; overwrites pSrcAbove
     55 pDst2           RN 4    ;// DIAG_DR only: pDst + 2*dstStep; overwrites leftStep
     56 pDst3           RN 6    ;// DIAG_DR only: pDst + 3*dstStep; overwrites predMode
     57 ;// Temporaries used to walk two interleaved row pointers
     58 pSrcTmp         RN 9    ;// pSrcLeft + leftStep
     59 srcStep         RN 10   ;// 2 * leftStep
     60 pDstTmp         RN 11   ;// pDst + dstStep
     61 dstep           RN 12   ;// 2 * dstStep
     62 
     63 ;//-------------------
     64 ;// Neon registers
     65 ;//-------------------
     66 ;// Many names below are different typed views of the same D register; the sections reuse D0-D12 freely.
     67 ;// OMX_VC_4x4_VERT
     68 dAboveU32       DN  D0.U32
     69 
     70 ;// OMX_VC_4x4_HOR
     71 dLeftVal0       DN  D0.8
     72 dLeftVal1       DN  D1.8
     73 dLeftVal2       DN  D2.8
     74 dLeftVal3       DN  D3.8
     75 dLeftVal0U32    DN  D0.U32
     76 dLeftVal1U32    DN  D1.U32
     77 dLeftVal2U32    DN  D2.U32
     78 dLeftVal3U32    DN  D3.U32
     79 
     80 ;// OMX_VC_4x4_DC
     81 dLeftVal        DN  D0.U8
     82 dLeftValU32     DN  D0.U32
     83 dSumAboveLeftU16  DN  D1.U16
     84 dSumAboveLeftU32  DN  D1.U32
     85 dSumAboveLeftU64  DN  D1.U64
     86 dSumAboveLeftU8 DN  D1.U8
     87 dSum            DN  D0.U8       ;// replicated DC value; D0 so the shared store tail can write it
     88 ;// Left-only sums (same D1 views as above under different names)
     89 dSumLeftValU16  DN  D1.U16
     90 dSumLeftValU32  DN  D1.U32
     91 dSumLeftValU64  DN  D1.U64
     92 dSumLeftValU8   DN  D1.U8
     93 ;// Above-only sums and the constant used when neither edge is available
     94 dAboveVal       DN  D0.U8
     95 dSumAboveValU16  DN  D1.U16
     96 dSumAboveValU32  DN  D1.U32
     97 dSumAboveValU64  DN  D1.U64
     98 dSumAboveValU8   DN  D1.U8
     99 dConst128U8     DN  D0.U8
    100 
    101 
    102 ;//OMX_VC_4x4_DIAG_DL
    103 
    104 dAbove          DN  D0.U8
    105 dU7             DN  D2.U8
    106 dU3             DN  D2.U8       ;// same register as dU7; only one of the two is live on a given path
    107 dAbove0         DN  D3.U8
    108 dAbove1         DN  D4.U8
    109 dAbove2         DN  D5.U8
    110 dTmp            DN  D6.U8
    111 dTmp0           DN  D7.U8
    112 dTmp1           DN  D8.U8
    113 dTmp2            DN  D9.U8
    114 dTmp3            DN  D10.U8
    115 dTmpU32         DN  D6.U32
    116 
    117 
    118 ;//OMX_VC_4x4_DIAG_DR
    119 dLeft           DN  D1.U8
    120 dUL             DN  D2.U8       ;// not referenced by any instruction in this file
    121 
    122 ;//OMX_VC_4x4_VR
    123 dLeft0          DN  D1.U8
    124 dLeft1          DN  D2.U8
    125 dEven0          DN  D3.U8
    126 dEven1          DN  D4.U8
    127 dEven2          DN  D5.U8
    128 dOdd0           DN  D6.U8
    129 dOdd1           DN  D11.U8
    130 dOdd2           DN  D12.U8
    131 dTmp3U32        DN  D10.U32
    132 dTmp2U32        DN  D9.U32
    133 
    134 
    135 ;//OMX_VC_4x4_HD
    136 dTmp1U64        DN  D8.U64
    137 dTmp0U64        DN  D7.U64
    138 dTmpU64         DN  D6.U64
    139 dTmpU32         DN  D6.U32      ;// repeats the identical dTmpU32 definition from the DIAG_DL section
    140 dTmp1U32        DN  D8.U32
    141 
    142 ;//OMX_VC_4x4_HU
    143 dL3             DN  D2.U8
    144 dLeftHU0        DN  D3.U8
    145 dLeftHU1        DN  D4.U8
    146 dLeftHU2        DN  D5.U8
    147 dTmp0U32        DN  D7.U32
    148 
    149 
    150 
    151 
    152 ;//-----------------------------------------------------------------------------------------------
    153 ;// omxVCM4P10_PredictIntra_4x4 starts
    154 ;//-----------------------------------------------------------------------------------------------
    155 ;// Writes a 4x4 block of intra-predicted pixels to pDst using the handler selected by predMode.
    156         ;// Write function header
    157         M_START omxVCM4P10_PredictIntra_4x4, r12,d12
    158         ;// NOTE(review): r12,d12 presumably name the highest ARM/NEON registers the macro preserves - confirm in armCOMM_s.h
    159         ;// Define stack arguments
    160         M_ARG    LeftStep,     4
    161         M_ARG    DstStep,      4
    162         M_ARG    PredMode,     4
    163         M_ARG    Availability, 4
    164         ;// pSrcLeft, pSrcAbove, pSrcAboveLeft and pDst are used directly from r0-r3 without being loaded.
    165         ;// Every handler finishes at ExitPredict4x4, which sets the return value to OMX_Sts_NoErr.
    166         LDR      pTable,=armVCM4P10_pSwitchTable4x4  ;// Load index table for switch case
    167 
    168         ;// Load argument from the stack
    169         M_LDRD   predMode,availability,PredMode     ;// Arg predMode & availability loaded from stack to reg
    170         M_LDRD   leftStep,dstStep,LeftStep          ;// Arg leftStep & dstStep loaded from stack to reg
    171 
    172         ;// NOTE(review): predMode is not range-checked; an index past the table's nine entries loads an arbitrary word into pc.
    173         LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode
    174 ;// OMX_VC_4x4_HOR: output row y is filled with the left-neighbour byte pSrcLeft[y*leftStep].
    175 
    176 OMX_VC_4x4_HOR
    177         ;// Two interleaved pointers are used: one for rows 0/2, one for rows 1/3, each stepping two rows.
    178         ADD     pSrcTmp, pSrcLeft, leftStep
    179         ADD     srcStep, leftStep, leftStep
    180         ;// Load Left Edge (each byte is replicated to all lanes of its D register)
    181         VLD1    {dLeftVal0[]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
    182         VLD1    {dLeftVal1[]},[pSrcTmp],srcStep            ;//    pSrcLeft[1*leftStep]
    183         VLD1    {dLeftVal2[]},[pSrcLeft]                   ;//    pSrcLeft[2*leftStep]
    184         VLD1    {dLeftVal3[]},[pSrcTmp]                    ;//    pSrcLeft[3*leftStep]
    185 
    186         ADD     pDstTmp, pDst, dstStep
    187         ADD     dstep, dstStep, dstStep
    188         ;// Each store writes one 32-bit lane, i.e. the four pixels of one row.
    189         VST1    dLeftVal0U32[0],[pDst],dstep                ;// pDst[0*dstStep+x] :0<= x <= 3
    190         VST1    dLeftVal1U32[0],[pDstTmp],dstep             ;// pDst[1*dstStep+x] :0<= x <= 3
    191         VST1    dLeftVal2U32[0],[pDst]                      ;// pDst[2*dstStep+x] :0<= x <= 3
    192         VST1    dLeftVal3U32[0],[pDstTmp]                   ;// pDst[3*dstStep+x] :0<= x <= 3
    193 
    194         B        ExitPredict4x4                             ;// Branch to exit code
    195 ;// OMX_VC_4x4_VERT: every output row is a copy of the four bytes at pSrcAbove.
    196 OMX_VC_4x4_VERT
    197 
    198         ;// Load Upper Edge
    199         VLD1     dAboveU32[0],[pSrcAbove]                   ;// pSrcAbove[0 to 3] into the low word of D0
    200         ADD     pDstTmp, pDst, dstStep
    201         ADD     dstep, dstStep, dstStep
    202         ;// Shared store tail, also branched to by the DC paths: writes the low word of D0 to all four rows.
    203 DCPredict4x4VertStore
    204         ;// On entry pDstTmp = pDst + dstStep and dstep = 2*dstStep.
    205         VST1     dAboveU32[0],[pDst],dstep                  ;// row 0
    206         VST1     dAboveU32[0],[pDstTmp],dstep               ;// row 1
    207         VST1     dAboveU32[0],[pDst]                        ;// row 2
    208         VST1     dAboveU32[0],[pDstTmp]                     ;// row 3
    209 
    210         B        ExitPredict4x4                             ;// Branch to exit code
    211 ;// OMX_VC_4x4_DC: fills the block with one value - the rounded mean of whichever edges are available, or 0x80.
    212 OMX_VC_4x4_DC
    213         ;// Four paths, chosen by the OMX_VC_LEFT and OMX_VC_UPPER bits of availability.
    214         ;// Each path leaves the replicated value in D0 and branches to DCPredict4x4VertStore.
    215         TST     availability, #OMX_VC_LEFT
    216         BEQ     DCPredict4x4LeftNotAvailable
    217 
    218         ADD     pSrcTmp, pSrcLeft, leftStep
    219         ADD     srcStep, leftStep, leftStep
    220         ;// Load Left Edge into bytes 0..3 of D0
    221         VLD1    {dLeftVal[0]},[pSrcLeft],srcStep            ;// pSrcLeft[0*leftStep]
    222         VLD1    {dLeftVal[1]},[pSrcTmp],srcStep             ;//    pSrcLeft[1*leftStep]
    223         VLD1    {dLeftVal[2]},[pSrcLeft]                    ;//    pSrcLeft[2*leftStep]
    224         VLD1    {dLeftVal[3]},[pSrcTmp]                     ;//    pSrcLeft[3*leftStep]
    225 
    226         TST     availability, #OMX_VC_UPPER
    227         BEQ     DCPredict4x4LeftOnlyAvailable
    228 
    229         ;// Load Upper Edge also, into bytes 4..7 of D0
    230         VLD1     dLeftValU32[1],[pSrcAbove]                 ;// pSrcAbove[0 to 3]
    231         MOV      return, #OMX_Sts_NoErr                     ;// r0 (pSrcLeft) is free now; ExitPredict4x4 sets it again
    232         ;// Pairwise widening adds reduce the eight bytes to a single sum.
    233         VPADDL   dSumAboveLeftU16, dLeftVal                 ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]]
    234         VPADDL   dSumAboveLeftU32, dSumAboveLeftU16         ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]]
    235         VPADDL   dSumAboveLeftU64, dSumAboveLeftU32         ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]]
    236         VRSHR    dSumAboveLeftU64,dSumAboveLeftU64,#3       ;// Sum = (Sum + 4) >> 3
    237         ADD     pDstTmp, pDst, dstStep
    238         ADD     dstep, dstStep, dstStep
    239         VDUP     dSum,dSumAboveLeftU8[0]                    ;// replicate the mean to every byte of D0
    240 
    241         B        DCPredict4x4VertStore
    242         ;// Left edge only: bytes 4..7 of D0 were not loaded, so only the low 32-bit sum is meaningful.
    243 DCPredict4x4LeftOnlyAvailable
    244 
    245         MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
    246 
    247         VPADDL   dSumLeftValU16, dLeftVal                   ;// [ XX | pSrcLeft[2+3 | 0+1]]
    248         VPADDL   dSumLeftValU32, dSumLeftValU16             ;// [ XXXX | pSrcLeft[2+3+0+1]]
    249 
    250         VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
    251         ADD     pDstTmp, pDst, dstStep
    252         ADD     dstep, dstStep, dstStep
    253         VDUP     dSum,dSumLeftValU8[0]
    254 
    255         B        DCPredict4x4VertStore
    256         ;// Left edge unavailable: use the upper edge alone if present, otherwise the constant.
    257 DCPredict4x4LeftNotAvailable
    258 
    259         TST     availability, #OMX_VC_UPPER
    260         BEQ     DCPredict4x4NoneAvailable
    261 
    262         ;// Load Upper Edge
    263         VLD1     dAboveU32[0],[pSrcAbove]                   ;// pSrcAbove[0 to 3]
    264         MOV      return, #OMX_Sts_NoErr
    265 
    266         VPADDL   dSumAboveValU16, dAboveVal                 ;// [ XX | pSrcAbove[2+3 | 0+1]]
    267         VPADDL   dSumAboveValU32, dSumAboveValU16           ;// [ XXXX | pSrcAbove[2+3+0+1]]
    268 
    269         VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
    270         ADD     pDstTmp, pDst, dstStep
    271         ADD     dstep, dstStep, dstStep
    272         VDUP     dSum,dSumAboveValU8[0]
    273 
    274         B        DCPredict4x4VertStore
    275         ;// Neither edge available: every output byte is 0x80.
    276 DCPredict4x4NoneAvailable
    277 
    278         VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080 if(count == 0)
    279         MOV      return, #OMX_Sts_NoErr
    280 
    281         ADD     pDstTmp, pDst, dstStep
    282         ADD     dstep, dstStep, dstStep
    283         B        DCPredict4x4VertStore
    284 
    285 ;// OMX_VC_4x4_DIAG_DL: 3-tap filter (a+2*b+c+2)>>2 along the above row; row y is bytes y..y+3 of the result.
    286 ;// Lane diagrams in this section list byte 7 first and byte 0 last.
    287 OMX_VC_4x4_DIAG_DL
    288 
    289         TST     availability, #OMX_VC_UPPER_RIGHT
    290         BEQ     DiagDLUpperRightNotAvailable
    291         ;// Upper-right available: read eight bytes from pSrcAbove and pad past U7 with copies of U7.
    292         VLD1    dAbove0,[pSrcAbove]                     ;// [U7|U6|U5|U4|U3|U2|U1|U0]
    293         VDUP    dU7, dAbove0[7]                         ;// [U7|U7|U7|U7|U7|U7|U7|U7]
    294         VEXT    dAbove1, dAbove0, dU7, #1               ;// [U7|U7|U6|U5|U4|U3|U2|U1]
    295         VEXT    dAbove2, dAbove0, dU7, #2               ;// [U7|U7|U7|U6|U5|U4|U3|U2]
    296         B       DiagDLPredict4x4Store
    297         ;// Upper-right unavailable: read only four bytes and substitute U3 for U4..U7.
    298 DiagDLUpperRightNotAvailable
    299         VLD1    dAboveU32[1],[pSrcAbove]                ;// [U3|U2|U1|U0|-|-|-|-]
    300         VDUP    dU3, dAbove[7]                          ;// [U3 U3 U3 U3 U3 U3 U3 U3]
    301 
    302         VEXT    dAbove0, dAbove, dU3, #4                ;// [U3 U3 U3 U3 U3 U2 U1 U0]
    303         VEXT    dAbove1, dAbove, dU3, #5                ;// [U3 U3 U3 U3 U3 U3 U2 U1]
    304         VEXT    dAbove2, dAbove, dU3, #6                ;// [U3 U3 U3 U3 U3 U3 U3 U2]
    305 
    306 DiagDLPredict4x4Store
    307         ;// Halving add of the outer taps followed by a rounding halving add with the centre tap.
    308         VHADD   dTmp, dAbove0, dAbove2
    309         VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2
    310 
    311         ;// Store the low word, then rotate the vector down one byte for the next row.
    312         VST1    dTmpU32[0],[pDst],dstStep
    313         VEXT    dTmp,dTmp,dTmp,#1
    314         VST1    dTmpU32[0],[pDst],dstStep
    315         VEXT    dTmp,dTmp,dTmp,#1
    316         VST1    dTmpU32[0],[pDst],dstStep
    317         VEXT    dTmp,dTmp,dTmp,#1
    318         VST1    dTmpU32[0],[pDst]
    319 
    320         B        ExitPredict4x4                         ;// Branch to exit code
    321 ;// OMX_VC_4x4_DIAG_DR: 3-tap filter over the sequence L3,L2,L1,L0,UL,U0,U1,U2,U3; rows are written bottom-up.
    322 
    323 OMX_VC_4x4_DIAG_DR
    324 
    325 
    326         ;// Load U0,U1,U2,U3
    327 
    328         VLD1    dAboveU32[0],[pSrcAbove]                ;// [X|X|X|X|U3|U2|U1|U0]
    329 
    330         ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
    331         VLD1    {dLeft[7]},[pSrcAboveLeft]
    332         ADD     pSrcTmp, pSrcLeft, leftStep
    333         ADD     srcStep, leftStep, leftStep
    334         ADD     pDst1,pDst,dstStep                      ;// pDst1 is r1: pSrcAbove is overwritten (already consumed)
    335 
    336         VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
    337         VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
    338         VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
    339         VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
    340 
    341         ;// Three windows onto the combined sequence, each offset by one byte.
    342         VEXT    dAbove0,dLeft,dAbove,#3                 ;// [U2|U1|U0|UL|L0|L1|L2|L3]
    343         ADD     pDst2,pDst1,dstStep                     ;// pDst2 is r4: leftStep is overwritten (no longer needed)
    344         VEXT    dAbove1,dLeft,dAbove,#4                 ;// [U3|U2|U1|U0|UL|L0|L1|L2]
    345         ADD     pDst3,pDst2,dstStep                     ;// pDst3 is r6: predMode is overwritten (no longer needed)
    346         VEXT    dAbove2,dLeft,dAbove,#5                 ;// [ X|U3|U2|U1|U0|UL|L0|L1]
    347 
    348         VHADD   dTmp, dAbove0, dAbove2
    349         VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2
    350 
    351         ;// Row 3 takes bytes 0..3; each rotate by one byte yields the row above it.
    352         VST1    dTmpU32[0],[pDst3]                      ;// Store pTmp[0],[1],[2],[3] @ pDst3
    353         VEXT    dTmp,dTmp,dTmp,#1
    354         VST1    dTmpU32[0],[pDst2]                      ;// Store pTmp[1],[2],[3],[4] @ pDst2
    355         VEXT    dTmp,dTmp,dTmp,#1
    356         VST1    dTmpU32[0],[pDst1]                      ;// Store pTmp[2],[3],[4],[5] @ pDst1
    357         VEXT    dTmp,dTmp,dTmp,#1
    358         VST1    dTmpU32[0],[pDst]                       ;// Store pTmp[3],[4],[5],[6] @ pDst
    359 
    360         B        ExitPredict4x4                         ;// Branch to exit code
    361 ;// OMX_VC_4x4_VR: built from UL, U0..U3 and L0..L2; rows 0/2 share one filtered vector and rows 1/3 another.
    362 OMX_VC_4x4_VR
    363 
    364 
    365         ;// Load UL,U0,U1,U2,U3
    366         VLD1    dAboveU32[0],[pSrcAbove]
    367         VLD1    dAbove[7],[pSrcAboveLeft]               ;// [UL|X|X|X|U3|U2|U1|U0]
    368 
    369         ;// Load L0,L1,L2                               ;// dLeft0 = [L0|L2|X|X|X|X|X|X]
    370                                                         ;// dLeft1 = [L1| X|X|X|X|X|X|X]
    371         VLD1    {dLeft0[7]},[pSrcLeft],leftStep         ;// pSrcLeft[0*leftStep]
    372         VLD1    {dLeft1[7]},[pSrcLeft],leftStep         ;// pSrcLeft[1*leftStep]
    373         VLD1    {dLeft0[6]},[pSrcLeft]                  ;// pSrcLeft[2*leftStep]
    374 
    375         ;// dOdd0 and dOdd2 agree in bytes 1..4, so there the 3-tap filter reduces to the 2-tap average with dOdd1.
    376         VEXT    dOdd2,dAbove,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 UL ]
    377         VEXT    dEven0,dLeft0,dOdd2,#6                  ;// [ x x x U1 U0 UL L0 L2 ]
    378         VEXT    dEven1,dLeft1,dOdd2,#7                  ;// [ x x x U2 U1 U0 UL L1 ]
    379         VEXT    dEven2,dLeft0,dAbove,#7                 ;// [ x x x U3 U2 U1 U0 L0 ]
    380         VEXT    dOdd0,dLeft1,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 L1 ]
    381         VEXT    dOdd1,dLeft0,dOdd2,#7                   ;// [ x x x U2 U1 U0 UL L0 ]
    382 
    383         VHADD   dTmp1, dOdd0, dOdd2
    384         VRHADD  dTmp1, dTmp1, dOdd1                     ;// Tmp[ x x x 9 7 5 3 1 ]
    385 
    386         VHADD   dTmp0, dEven0, dEven2
    387         VRHADD  dTmp0, dTmp0, dEven1                    ;// Tmp[ x x x 8 6 4 2 0 ]
    388 
    389         ;// Rows 0 and 1 are rows 2 and 3 shifted by one byte.
    390         VEXT    dTmp3,dTmp1,dTmp1,#1                    ;// Tmp[ x x x x 9 7 5 3 ]
    391         ADD     pDstTmp, pDst, dstStep
    392         ADD     dstep, dstStep, dstStep
    393         VEXT    dTmp2,dTmp0,dTmp0,#1                    ;// Tmp[ x x x x 8 6 4 2 ]
    394 
    395         ;// Stores go to rows 0, 1, 2, 3 in that order.
    396         VST1    dTmp3U32[0],[pDst],dstep                ;// Tmp[9],[7],[5],[3]
    397         VST1    dTmp2U32[0],[pDstTmp],dstep             ;// Tmp[8],[6],[4],[2]
    398         VST1    dTmp1U32[0],[pDst],dstep                ;// Tmp[7],[5],[3],[1]
    399         VST1    dTmp0U32[0],[pDstTmp]                   ;// Tmp[6],[4],[2],[0]
    400 
    401         B        ExitPredict4x4                         ;// Branch to exit code
    402 ;// OMX_VC_4x4_HD: built from L3..L0, UL and U0..U2; 2-tap and 3-tap results are interleaved with VZIP.
    403 OMX_VC_4x4_HD
    404 
    405         ;// Only bytes 0..2 of dAbove reach the VEXTs below, so just the four bytes at pSrcAbove are read.
    406         ;// Load U0,U1,U2,U3
    407         VLD1    dAboveU32[0],[pSrcAbove]                ;// [X|X|X|X|U3|U2|U1|U0]
    408 
    409         ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]
    410         VLD1    {dLeft[7]},[pSrcAboveLeft]
    411         ADD     pSrcTmp, pSrcLeft, leftStep
    412         ADD     srcStep, leftStep, leftStep
    413 
    414         VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
    415         VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
    416         VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
    417         VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
    418 
    419         VEXT    dAbove0,dLeft,dAbove,#3                 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ]
    420         VEXT    dAbove1,dLeft,dAbove,#2                 ;// [ U1|U0|UL|L0|L1|L2|L3|X ]
    421         VEXT    dAbove2,dLeft,dAbove,#1                 ;// [ U0|UL|L0|L1|L2|L3|X|X ]
    422 
    423         VHADD   dTmp0, dAbove0, dAbove2
    424         VRHADD  dTmp0, dTmp0, dAbove1                   ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ]
    425 
    426         ;// The 64-bit shifts line the valid bytes up in the top halves so VZIP interleaves them into dTmp.
    427         VRHADD  dTmp1, dAbove1, dAbove0                 ;// (a+b+1)>>1
    428         VSHL    dTmp1U64,dTmp1U64,#24                   ;// Tmp[ 3|5| 7 |9 | X | X | X | X ]
    429 
    430 
    431         VSHL    dTmpU64,dTmp0U64,#16                    ;// Tmp[ 2|4|6|8| X | X | X | X ]
    432         VZIP    dTmp1,dTmp                              ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ]
    433         VEXT    dTmp0,dTmp0,dTmp0,#6                    ;// Tmp[  X| X| X| X| X| X| 0 | 1 ]
    434         VEXT    dTmp1,dTmp,dTmp0,#2                     ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 |7 ]
    435 
    436         ADD     pDstTmp, pDst, dstStep
    437         ADD     dstep, dstStep, dstStep
    438         ;// Stores go to rows 0, 1, 2, 3 in that order.
    439         VST1    dTmp1U32[1],[pDst],dstep                ;// Store pTmp[0|1|2|3]
    440         VST1    dTmpU32[1],[pDstTmp],dstep              ;// Store pTmp[2|3|4|5]
    441         VST1    dTmp1U32[0],[pDst]                      ;// Store pTmp[4|5|6|7]
    442         VST1    dTmpU32[0],[pDstTmp]                    ;// Store pTmp[6|7|8|9]
    443 
    444         B        ExitPredict4x4                         ;// Branch to exit code
    445 ;// OMX_VC_4x4_VL: rows 0/2 use the 2-tap average of the above row, rows 1/3 the 3-tap filter.
    446 OMX_VC_4x4_VL
    447 
    448 
    449         TST     availability, #OMX_VC_UPPER_RIGHT
    450         BEQ     DiagVLUpperRightNotAvailable
    451         ;// Upper-right available: read eight bytes from pSrcAbove.
    452         VLD1    dAbove0,[pSrcAbove]                      ;// [U7|U6|U5|U4|U3|U2|U1|U0]
    453         VEXT    dAbove1,dAbove0,dAbove0,#1               ;// [ X|U7|U6|U5|U4|U3|U2|U1]
    454         VEXT    dAbove2,dAbove1,dAbove1,#1               ;// [ X| X|U7|U6|U5|U4|U3|U2]
    455 
    456         B       DiagVLPredict4x4Store
    457         ;// Upper-right unavailable: read only four bytes and substitute U3 for U4..U7.
    458 DiagVLUpperRightNotAvailable
    459         VLD1    dAboveU32[1],[pSrcAbove]                 ;// [U3|U2|U1|U0|-|-|-|-]
    460         VDUP    dU3, dAbove[7]                           ;// [U3 U3 U3 U3 U3 U3 U3 U3]
    461 
    462         VEXT    dAbove0, dAbove, dU3, #4                 ;// [U3 U3 U3 U3 U3 U2 U1 U0]
    463         VEXT    dAbove1, dAbove, dU3, #5                 ;// [U3 U3 U3 U3 U3 U3 U2 U1]
    464         VEXT    dAbove2, dAbove, dU3, #6                 ;// [U3 U3 U3 U3 U3 U3 U3 U2]
    465 
    466 DiagVLPredict4x4Store
    467 
    468         VRHADD  dTmp0, dAbove1, dAbove0                 ;// (a+b+1)>>1
    469                                                         ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]
    470 
    471         VHADD   dTmp3, dAbove0, dAbove2
    472         VRHADD  dTmp3, dTmp3, dAbove1                   ;// (a+2*b+c+2)>>2
    473                                                         ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]
    474         ;// In the second VEXT the dTmp1 operand only supplies byte 7, which is never stored.
    475         VEXT    dTmp1,dTmp0,dTmp0,#1                    ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
    476         ADD     pDstTmp, pDst, dstStep
    477         ADD     dstep, dstStep, dstStep
    478         VEXT    dTmp2,dTmp3,dTmp1,#1                    ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]
    479         ;// Stores go to rows 0, 1, 2, 3 in that order.
    480         VST1    dTmp0U32[0],[pDst],dstep                ;// Tmp[6],[4],[2],[0]
    481         VST1    dTmp3U32[0],[pDstTmp],dstep             ;// Tmp[7],[5],[3],[1]
    482         VST1    dTmp1U32[0],[pDst]                      ;// Tmp[8],[6],[4],[2]
    483         VST1    dTmp2U32[0],[pDstTmp]                   ;// Tmp[9],[7],[5],[3]
    484 
    485         B        ExitPredict4x4                         ;// Branch to exit code
    486 ;// OMX_VC_4x4_HU: uses only the left column L0..L3, padded with L3; falls through to ExitPredict4x4.
    487 OMX_VC_4x4_HU
    488         ADD     pSrcTmp, pSrcLeft, leftStep
    489         ADD     srcStep, leftStep, leftStep
    490 
    491         ;// Load Left Edge                              ;// [L3|L2|L1|L0|X|X|X|X]
    492         VLD1    {dLeft[4]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
    493         VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
    494         VLD1    {dLeft[6]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
    495         VLD1    {dLeft[7]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
    496 
    497         VDUP    dL3,dLeft[7]                            ;// [L3|L3|L3|L3|L3|L3|L3|L3]
    498 
    499         VEXT    dLeftHU0,dLeft,dL3,#4                   ;// [L3|L3|L3|L3|L3|L2|L1|L0]
    500         VEXT    dLeftHU1,dLeft,dL3,#5                   ;// [L3|L3|L3|L3|L3|L3|L2|L1]
    501         VEXT    dLeftHU2,dLeft,dL3,#6                   ;// [L3|L3|L3|L3|L3|L3|L3|L2]
    502 
    503         VHADD   dTmp0, dLeftHU0, dLeftHU2
    504         VRHADD  dTmp0, dTmp0, dLeftHU1                  ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]
    505 
    506         VRHADD  dTmp1, dLeftHU1, dLeftHU0               ;// (a+b+1)>>1
    507                                                         ;//  Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]
    508         ;// Interleave the 2-tap (even) and 3-tap (odd) results byte by byte.
    509         VZIP    dTmp1,dTmp0                             ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]
    510                                                         ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]
    511 
    512         ;// Each row starts two bytes further along than the row above; row 3 is all L3.
    513         VST1    dTmp1U32[0],[pDst],dstStep              ;// [3|2|1|0]
    514         VEXT    dTmp1,dTmp1,dTmp1,#2
    515         VST1    dTmp1U32[0],[pDst],dstStep              ;// [5|4|3|2]
    516         VEXT    dTmp1,dTmp1,dTmp1,#2
    517         VST1    dTmp1U32[0],[pDst],dstStep              ;// [7|6|5|4]
    518         VST1    dTmp0U32[0],[pDst]                      ;// [9|8|7|6]
    519 ;// Common exit: reached by a branch from every handler, or by fall-through from OMX_VC_4x4_HU.
    520 
    521 ExitPredict4x4
    522         ;// The status is set here unconditionally, so the function always reports OMX_Sts_NoErr.
    523         MOV      return,  #OMX_Sts_NoErr
    524         M_END
    525 
    526         ENDIF ;// CortexA8
    527 
    528         END
    529 ;//-----------------------------------------------------------------------------------------------
    530 ;// omxVCM4P10_PredictIntra_4x4 ends
    531 ;//-----------------------------------------------------------------------------------------------
    532