Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   9641
      6 ;// Date:       Thursday, February 7, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 
     13         INCLUDE omxtypes_s.h
     14         INCLUDE armCOMM_s.h
     15 
     16         M_VARIANTS ARM1136JS
     17 
     18 ;//-------------------------------------------------------
     19 ;// This table implements a C-style switch statement in asm by
     20 ;// the method of two levels of indexing.
     21 ;//-------------------------------------------------------
     22 
     23     M_TABLE armVCM4P10_pIndexTable16x16          ;// Jump table: word i is the handler label used when predMode == i
     24     DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR    ;// words 0 and 1
     25     DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE  ;// words 2 and 3
     26 
     27     IF ARM1136JS
     28 
     29 ;//--------------------------------------------
     30 ;// Constants
     31 ;//--------------------------------------------
     32 BLK_SIZE        EQU 0x10            ;// 16: initial value of the row counter y
     33 MUL_CONST0      EQU 0x01010101      ;// byte * this = the byte copied into all four byte lanes
     34 MUL_CONST1      EQU 0x00060004      ;// not referenced by the code in this file
     35 MUL_CONST2      EQU 0x00070005      ;// not referenced by the code in this file
     36 MUL_CONST3      EQU 0x00030001      ;// not referenced by the code in this file
     37 MASK_CONST      EQU 0x00FF00FF      ;// keeps the low byte of each halfword
     38 
     39 ;//--------------------------------------------
     40 ;// Scratch variable
     41 ;//--------------------------------------------
     42 y               RN 12   ;// row/loop counter (VERT, HOR, DC)
     43 pc              RN 15   ;// program counter; loaded from the jump table to dispatch
     44 
     45 return          RN 0    ;// status value set before M_EXIT / M_END
     46 innerCount      RN 0    ;// not referenced by the code in this file
     47 outerCount      RN 1    ;// not referenced by the code in this file
     48 pSrcLeft2       RN 1    ;// DC: second left-column pointer (same register as pSrcAbove)
     49 pDst2           RN 2    ;// second output-row pointer (same register as pSrcAboveLeft)
     50 sum             RN 6    ;// DC: neighbour sum, then the DC value (same register as predMode)
     51 pTable          RN 9    ;// base address of the jump table
     52 temp1           RN 10   ;// PLANE loop scratch
     53 temp2           RN 12   ;// PLANE loop scratch
     54 cMul1           RN 11   ;// not referenced by the code in this file
     55 cMul2           RN 12   ;// not referenced by the code in this file
     56 count           RN 12   ;// DC: number of neighbour edges summed (0, 1 or 2)
     57 dstStepx2       RN 11   ;// 2 * dstStep (HOR: 2 * dstStep - 12)
     58 leftStepx2      RN 14   ;// DC: 2 * leftStep
     59 r0x01010101     RN 10   ;// HOR: byte-replication multiplier
     60 r0x00FF00FF     RN 11   ;// PLANE: byte mask; its top 4 bits also serve as the row counter
     61 ;// Generic scratch names for r0-r12 and r14 (there is no tVal13 alias)
     62 tVal0           RN 0
     63 tVal1           RN 1
     64 tVal2           RN 2
     65 tVal3           RN 3
     66 tVal4           RN 4
     67 tVal5           RN 5
     68 tVal6           RN 6
     69 tVal7           RN 7
     70 tVal8           RN 8
     71 tVal9           RN 9
     72 tVal10          RN 10
     73 tVal11          RN 11
     74 tVal12          RN 12
     75 tVal14          RN 14
     76 
     77 b               RN 12   ;// not referenced by this name in this file (PLANE keeps b in tVal12)
     78 c               RN 14   ;// PLANE loop: per-row increment packed as {c, c}
     79 ;// PLANE: packed halfword pairs {pN, pM} = pre-shift values for columns N and M of the current row
     80 p2p0            RN 0
     81 p3p1            RN 1
     82 p6p4            RN 2
     83 p7p5            RN 4
     84 p10p8           RN 6
     85 p11p9           RN 7
     86 p14p12          RN 8
     87 p15p13          RN 9
     88 ;// The four aliases below are not referenced by the code in this file
     89 p3210           RN 10
     90 p7654           RN 10
     91 p111098         RN 10
     92 p15141312       RN 10
     93 
     94 ;//--------------------------------------------
     95 ;// Declare input registers
     96 ;//--------------------------------------------
     97 pSrcLeft        RN 0    ;// input pointer: left neighbour column, rows leftStep bytes apart
     98 pSrcAbove       RN 1    ;// input pointer: 16 bytes of the row above
     99 pSrcAboveLeft   RN 2    ;// input pointer: above-left neighbour byte (read by PLANE only)
    100 pDst            RN 3    ;// output pointer: 16 rows of 16 bytes, rows dstStep bytes apart
    101 leftStep        RN 4    ;// input variable, loaded from stack argument LeftStep
    102 dstStep         RN 5    ;// input variable, loaded from stack argument DstStep
    103 predMode        RN 6    ;// input variable, loaded from stack argument PredMode
    104 availability    RN 7    ;// input variable, loaded from stack argument Availability
    105 
    106 ;//-----------------------------------------------------------------------------------------------
    107 ;// omxVCM4P10_PredictIntra_16x16 starts
    108 ;//-----------------------------------------------------------------------------------------------
    109 
    110         ;// Write function header
    111         M_START omxVCM4P10_PredictIntra_16x16, r11   ;// Function entry; presumably saves the callee-saved registers up to r11 - see armCOMM_s.h
    112 
    113         ;// Define stack arguments
    114         M_ARG    LeftStep,     4
    115         M_ARG    DstStep,      4
    116         M_ARG    PredMode,     4
    117         M_ARG    Availability, 4
    118 
    119         ;// M_STALL ARM1136JS=4
    120 
    121         LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// Load index table for switch case
    122 
    123         ;// Load argument from the stack
    124         M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
    125         M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
    126         M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
    127         M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
    128 ;// NOTE(review): predMode is not range-checked; the table has 4 words, so values outside 0..3 load pc from beyond it
    129         MOV      y, #BLK_SIZE                        ;// Row counter for VERT (HOR and DC set y themselves; PLANE reuses r12)
    130         LDR      pc, [pTable, predMode, LSL #2]      ;// Jump to the handler stored in table word predMode
    131 
    132 OMX_VC_16X16_VERT                                    ;// Vertical prediction: every output row is a copy of pSrcAbove[0 to 15]
    133         LDM      pSrcAbove, {tVal6,tVal7,tVal8,tVal9};// tVal 6 to 9 = pSrcAbove[0 to 15]
    134         ADD      dstStepx2, dstStep, dstStep         ;// dstStepx2 = 2 * dstStep
    135         ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep (odd rows)
    136 ;// NOTE(review): LDM/STM move whole words; presumably pSrcAbove and pDst are word aligned - confirm against the API contract
    137         ;// M_STALL ARM1136JS=2                       ;// Stall outside the loop
    138 
    139 LOOP_VERT                                            ;// Two rows per iteration: pDst writes even rows, pDst2 odd rows
    140         STM      pDst, {tVal6,tVal7,tVal8,tVal9}     ;// even row: pDst[0 to 15] = tVal 6 to 9
    141         SUBS     y, y, #2                            ;// y -= 2 (two rows done); sets Z for the BNE below
    142         ADD      pDst, pDst, dstStepx2               ;// pDst advanced by two rows
    143         STM      pDst2, {tVal6,tVal7,tVal8,tVal9}    ;// odd row: pDst2[0 to 15] = tVal 6 to 9
    144         ADD      pDst2, pDst2, dstStepx2             ;// pDst2 advanced by two rows
    145         BNE      LOOP_VERT                           ;// Loop for 8 times (y: 16 -> 0)
    146         MOV      return, #OMX_Sts_NoErr              ;// return status = no error
    147         M_EXIT
    148 
    149 
    150 OMX_VC_16X16_HOR                                     ;// Horizontal prediction: output row r is filled with the left neighbour byte of row r
    151 ;// NOTE(review): loads run one step ahead of stores: 2 before the loop + 4 per iteration x 4 = 18 loads, so rows 16 and 17 are read and discarded
    152         ;// M_STALL ARM1136JS=6
    153 
    154         LDR      r0x01010101, =MUL_CONST0            ;// Const to repeat the byte in reg 4 times
    155         MOV      y, #4                               ;// Loop count: 4 iterations of 4 rows each
    156         M_LDRB   tVal6, [pSrcLeft], +leftStep        ;// tVal6 = left byte of row 0; pSrcLeft steps to the next row
    157         ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep (odd rows)
    158         M_LDRB   tVal7, [pSrcLeft], +leftStep        ;// tVal7 = left byte of row 1
    159         ADD      dstStepx2, dstStep, dstStep         ;// dstStepx2 = 2 * dstStep
    160         SUB      dstStepx2, dstStepx2, #12           ;// minus 12: three #+4 stores have already advanced the pointer by 12
    161 
    162 LOOP_HOR                                             ;// On entry tVal6/tVal7 hold the left bytes of rows 4k and 4k+1
    163         M_LDRB   tVal8, [pSrcLeft], +leftStep        ;// tVal8 = left byte of row 4k+2
    164         MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
    165         M_LDRB   tVal9, [pSrcLeft], +leftStep        ;// tVal9 = left byte of row 4k+3
    166         MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
    167         SUBS     y, y, #1                            ;// y--; nothing below alters the flags before the BNE
    168         STR      tVal6, [pDst],  #+4                 ;// store {tVal6} at pDst[0 to 3]
    169         STR      tVal7, [pDst2], #+4                 ;// store {tVal7} at pDst2[0 to 3]
    170         STR      tVal6, [pDst],  #+4                 ;// store {tVal6} at pDst[4 to 7]
    171         STR      tVal7, [pDst2], #+4                 ;// store {tVal7} at pDst2[4 to 7]
    172         MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
    173         STR      tVal6, [pDst],  #+4                 ;// store {tVal6} at pDst[8 to 11]
    174         STR      tVal7, [pDst2], #+4                 ;// store {tVal7} at pDst2[8 to 11]
    175         MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
    176         M_STR    tVal6, [pDst], dstStepx2            ;// store {tVal6} at pDst[12 to 15]; pDst then moves to row 4k+2
    177         M_STR    tVal7, [pDst2], dstStepx2           ;// store {tVal7} at pDst2[12 to 15]; pDst2 then moves to row 4k+3
    178         STR      tVal8, [pDst],  #+4                 ;// store {tVal8} at pDst[0 to 3]
    179         STR      tVal9, [pDst2], #+4                 ;// store {tVal9} at pDst2[0 to 3]
    180         STR      tVal8, [pDst],  #+4                 ;// store {tVal8} at pDst[4 to 7]
    181         STR      tVal9, [pDst2], #+4                 ;// store {tVal9} at pDst2[4 to 7]
    182         STR      tVal8, [pDst],  #+4                 ;// store {tVal8} at pDst[8 to 11]
    183         STR      tVal9, [pDst2], #+4                 ;// store {tVal9} at pDst2[8 to 11]
    184         M_STR    tVal8, [pDst], dstStepx2            ;// store {tVal8} at pDst[12 to 15]; pDst then moves to row 4k+4
    185         M_LDRB   tVal6, [pSrcLeft], +leftStep        ;// preload tVal6 = left byte of row 4k+4 (row 16 on the last pass, unused)
    186         M_STR    tVal9, [pDst2], dstStepx2           ;// store {tVal9} at pDst2[12 to 15]; pDst2 then moves to row 4k+5
    187         M_LDRB   tVal7, [pSrcLeft], +leftStep        ;// preload tVal7 = left byte of row 4k+5 (row 17 on the last pass, unused)
    188         BNE      LOOP_HOR                            ;// Loop until y == 0 (4 iterations)
    189         MOV      return, #OMX_Sts_NoErr              ;// return status = no error
    190         M_EXIT
    191 
    192 OMX_VC_16X16_DC
    193 ;// DC prediction: fill the block with the rounded mean of the available above/left neighbours, or with 128 if neither is available
    194         ;// M_STALL ARM1136JS=2
    195 
    196         MOV      count, #0                           ;// count = number of neighbour edges summed so far
    197         TST      availability, #OMX_VC_UPPER         ;// if(availability & #OMX_VC_UPPER)
    198         BEQ      TST_LEFT                            ;// Jump to Left if not upper
    199         LDM      pSrcAbove,{tVal8,tVal9,tVal10,tVal11};// tVal 8 to 11 = pSrcAbove[0 to 15]
    200         ADD      count, count, #1                    ;// if upper inc count by 1
    201 
    202         ;// M_STALL ARM1136JS=2
    203 ;// The 16 bytes are added pairwise as two 16-bit lanes (even bytes / odd bytes), then the two lanes are added together
    204         UXTB16   tVal2, tVal8                        ;// pSrcAbove[0, 2]
    205         UXTB16   tVal6, tVal9                        ;// pSrcAbove[4, 6]
    206         UADD16   tVal2, tVal2, tVal6                 ;// pSrcAbove[0, 2] + pSrcAbove[4, 6]
    207         UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
    208         UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
    209         UADD16   tVal8, tVal8, tVal9                 ;// pSrcAbove[1, 3] + pSrcAbove[5, 7]
    210         UADD16   tVal2, tVal2, tVal8                 ;// two lane sums covering pSrcAbove[0] to pSrcAbove[7]
    211 
    212         UXTB16   tVal8, tVal10                       ;// pSrcAbove[8, 10]
    213         UXTB16   tVal9, tVal11                       ;// pSrcAbove[12, 14]
    214         UADD16   tVal8, tVal8, tVal9                 ;// pSrcAbove[8, 10] + pSrcAbove[12, 14]
    215         UXTB16   tVal10, tVal10, ROR #8              ;// pSrcAbove[9, 11]
    216         UXTB16   tVal11, tVal11, ROR #8              ;// pSrcAbove[13, 15]
    217         UADD16   tVal10, tVal10, tVal11              ;// pSrcAbove[9, 11] + pSrcAbove[13, 15]
    218         UADD16   tVal8, tVal8, tVal10                ;// two lane sums covering pSrcAbove[8] to pSrcAbove[15]
    219 
    220         UADD16   tVal2, tVal2, tVal8                 ;// two lane sums covering pSrcAbove[0] to pSrcAbove[15]
    221 
    222         ;// M_STALL ARM1136JS=1
    223 
    224         ADD      tVal2, tVal2, tVal2, LSR #16        ;// low halfword = sum(pSrcAbove[0..15]); high halfword keeps a stale lane sum
    225 
    226         ;// M_STALL ARM1136JS=1
    227 
    228         UXTH     sum, tVal2                          ;// Extract the lower half for result
    229 
    230 TST_LEFT
    231         TST      availability, #OMX_VC_LEFT          ;// if(availability & #OMX_VC_LEFT)
    232         BEQ      TST_COUNT                           ;// skip the left column if it is not available
    233         ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
    234         ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep
    235 ;// pSrcLeft walks the even rows and pSrcLeft2 the odd rows, each stepping by 2 * leftStep
    236         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left byte of row 0
    237         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left byte of row 1
    238         M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left byte of row 2
    239         M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left byte of row 3
    240         ADD      tVal7, tVal8, tVal9                 ;// tVal7 = rows 0+1 (availability in r7 is no longer needed)
    241         ADD      count, count, #1                    ;// Inc Counter if Left is available
    242         ADD      tVal6, tVal10, tVal11               ;// tVal6 = rows 2+3
    243 
    244         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left byte of row 4
    245         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left byte of row 5
    246         M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left byte of row 6
    247         M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left byte of row 7
    248         ADD      sum, tVal7, tVal6                   ;// sum = rows 0..3 (restarts sum; the upper-row total stays in tVal2)
    249         ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
    250         ADD      tVal10, tVal10, tVal11              ;// tVal10= tVal10 + tVal11
    251         ADD      tVal7, tVal8, tVal10                ;// tVal7 = rows 4..7
    252 
    253 
    254         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left byte of row 8
    255         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left byte of row 9
    256         M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left byte of row 10
    257         M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left byte of row 11
    258         ADD      sum, sum, tVal7                     ;// sum = rows 0..7
    259         ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
    260         ADD      tVal10, tVal10, tVal11              ;// tVal10= tVal10 + tVal11
    261         ADD      tVal7, tVal8, tVal10                ;// tVal7 = rows 8..11
    262 
    263 
    264         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left byte of row 12
    265         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left byte of row 13
    266         M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left byte of row 14
    267         M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left byte of row 15
    268         ADD      sum, sum, tVal7                     ;// sum = rows 0..11
    269         ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
    270         ADD      tVal10, tVal10, tVal11              ;// tVal10= tVal10 + tVal11
    271         ADD      tVal7, tVal8, tVal10                ;// tVal7 = rows 12..15
    272         ADD      sum, sum, tVal7                     ;// sum = rows 0..15 of the left column
    273 
    274 TST_COUNT
    275         CMP      count, #0                           ;// if(count == 0)
    276         MOVEQ    sum, #128                           ;// sum = 128 if(count == 0)
    277         BEQ      TST_COUNT0                          ;// if(count == 0)
    278         CMP      count, #1                           ;// EQ: one edge summed, NE: both; these flags drive every conditional below
    279         ADDEQ    sum, sum, #8                        ;// sum += 8 if(count == 1)
    280         ADDNE    sum, sum, tVal2                     ;// sum = sumleft + sumupper (tVal2 high halfword is stale)
    281         ADDNE    sum, sum, #16                       ;// sum += 16 if(count == 2)
    282 
    283         ;// M_STALL ARM1136JS=1
    284 
    285         UXTH     sum, sum                            ;// keep the low halfword; drops the stale bits added from tVal2
    286 
    287         ;// M_STALL ARM1136JS=1
    288 
    289         LSREQ    sum, sum, #4                        ;// sum >> 4 if(count == 1)
    290 
    291         ;// M_STALL ARM1136JS=1
    292 
    293         LSRNE    sum, sum, #5                        ;// sum >> 5 if(count == 2)
    294 
    295 TST_COUNT0
    296 
    297         ;// M_STALL ARM1136JS=1
    298 
    299         ORR      sum, sum, sum, LSL #8               ;// DC byte replicated into both bytes of the low halfword
    300 
    301         ;// M_STALL ARM1136JS=1
    302 
    303         ORR      tVal6, sum, sum, LSL #16            ;// sum  replicated in all bytes
    304         CPY      tVal7, tVal6                        ;// tVal7 = tVal6
    305         CPY      tVal8, tVal6                        ;// tVal8 = tVal6
    306         CPY      tVal9, tVal6                        ;// tVal9 = tVal6
    307         ADD      dstStepx2, dstStep, dstStep         ;// dstStepx2 = 2 * dstStep
    308         ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep (odd rows)
    309         MOV      y, #BLK_SIZE                        ;// Row counter (r12 was used as count above)
    310 
    311 LOOP_DC                                              ;// Two rows per iteration: pDst writes even rows, pDst2 odd rows
    312         STM      pDst, {tVal6,tVal7,tVal8,tVal9}     ;// even row: pDst[0 to 15] = tVal 6 to 9
    313         SUBS     y, y, #2                            ;// y -= 2 (two rows done); sets Z for the BNE below
    314         ADD      pDst, pDst, dstStepx2               ;// pDst advanced by two rows
    315         STM      pDst2, {tVal6,tVal7,tVal8,tVal9}    ;// odd row: pDst2[0 to 15] = tVal 6 to 9
    316         ADD      pDst2, pDst2, dstStepx2             ;// pDst2 advanced by two rows
    317         BNE      LOOP_DC                             ;// Loop for 8 times
    318 
    319         MOV      return, #OMX_Sts_NoErr              ;// return status = no error
    320         M_EXIT
    321 
    322 OMX_VC_16X16_PLANE
    323 ;// Plane prediction: pDst[y][x] = clip to 0..255 of ((a + 16 + b*(x-7) + c*(y-7)) >> 5); a, b, c are derived below
    324         ;// M_STALL ARM1136JS=3
    325         RSB      tVal14, leftStep, leftStep, LSL #4  ;// tVal14 = 15*leftStep
    326 
    327         ;// M_STALL ARM1136JS=2
    328         LDRB     tVal10, [pSrcLeft,  tVal14]         ;// tVal10 = pSrcLeft[15*leftStep]
    329         LDRB     tVal11, [pSrcAboveLeft]             ;// tVal11 = pSrcAboveLeft[0]
    330         LDRB     tVal12, [pSrcAbove, #15]            ;// tVal12 = pSrcAbove[15]
    331 
    332         ADD      tVal2,  tVal12,  tVal10             ;// tVal2  = pSrcAbove[15] + pSrcLeft[15*leftStep]
    333         SUB      tVal10, tVal10,  tVal11             ;// tVal10 = V0 = pSrcLeft[15*leftStep] - pSrcAboveLeft[0]
    334         SUB      tVal11, tVal12,  tVal11             ;// tVal11 = H0 = pSrcAbove[15] - pSrcAboveLeft[0]
    335         MOV      tVal2,  tVal2,   LSL #4             ;// tVal2  = a = 16 * (pSrcAbove[15] + pSrcLeft[15*leftStep])
    336 ;// H = 8*H0 + sum over k = 1..7 of k * (pSrcAbove[7+k] - pSrcAbove[7-k]), accumulated in tVal11
    337         MOV     tVal11, tVal11, LSL #3              ;// 8*([15]-[-1])
    338         LDRB    tVal6, [pSrcAbove, #0]
    339         LDRB    tVal7, [pSrcAbove, #14]
    340         SUB     tVal8, tVal7, tVal6
    341         RSB     tVal8, tVal8, tVal8, LSL #3         ;// 7*([14]-[0])
    342         ADD     tVal11, tVal11, tVal8
    343         LDRB    tVal6, [pSrcAbove, #1]
    344         LDRB    tVal7, [pSrcAbove, #13]
    345         SUB     tVal8, tVal7, tVal6
    346         ADD     tVal8, tVal8, tVal8
    347         ADD     tVal8, tVal8, tVal8, LSL #1         ;// 6*([13]-[1])
    348         ADD     tVal11, tVal11, tVal8
    349         LDRB    tVal6, [pSrcAbove, #2]
    350         LDRB    tVal7, [pSrcAbove, #12]
    351         SUB     tVal8, tVal7, tVal6
    352         ADD     tVal8, tVal8, tVal8, LSL #2         ;// 5*([12]-[2])
    353         ADD     tVal11, tVal11, tVal8
    354         LDRB    tVal6, [pSrcAbove, #3]
    355         LDRB    tVal7, [pSrcAbove, #11]
    356         SUB     tVal8, tVal7, tVal6
    357         ADD     tVal11, tVal11, tVal8, LSL #2       ;// + 4*([11]-[3])
    358         LDRB    tVal6, [pSrcAbove, #4]
    359         LDRB    tVal7, [pSrcAbove, #10]
    360         SUB     tVal8, tVal7, tVal6
    361         ADD     tVal8, tVal8, tVal8, LSL #1         ;// 3*([10]-[4])
    362         ADD     tVal11, tVal11, tVal8
    363         LDRB    tVal6, [pSrcAbove, #5]
    364         LDRB    tVal7, [pSrcAbove, #9]
    365         SUB     tVal8, tVal7, tVal6
    366         ADD     tVal11, tVal11, tVal8, LSL #1       ;// + 2*([9]-[5])
    367         LDRB    tVal6, [pSrcAbove, #6]
    368         LDRB    tVal7, [pSrcAbove, #8]
    369         SUB     tVal8, tVal7, tVal6                 ;// 1*([8]-[6])
    370         ADD     tVal7, tVal11, tVal8                ;// tVal7 = H
    371 
    372         ADD      tVal2,  tVal2,   #16                ;// tVal2  = a + 16
    373         MOV      tVal1,  pSrcLeft                    ;// tVal1  = pSrcLeft, walks down from row 0 (pSrcAbove in r1 is no longer needed)
    374         SUB      tVal9,  tVal14,   leftStep          ;// tVal9  = 14*leftStep
    375         ADD      tVal9,  pSrcLeft, tVal9             ;// tVal9  = pSrcLeft + 14*leftStep, walks up from row 14
    376 
    377         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[14*leftStep]
    378         M_LDRB   tVal11, [tVal1], +leftStep          ;// tVal11 = pSrcLeft[0]
    379         ADD      tVal7,  tVal7,  tVal7,  LSL #2      ;// tVal7  = 5 * H
    380         ADD      tVal7,  tVal7,  #32                 ;// tVal7  = 5 * H + 32
    381         SUB      tVal8,  tVal8,  tVal11              ;// tVal8  = pSrcLeft[14*leftStep] - pSrcLeft[0]
    382         ASR      tVal12, tVal7,  #6                  ;// tVal12 = b = (5 * H + 32) >> 6
    383 
    384         RSB      tVal8,  tVal8,  tVal8,  LSL #3      ;// tVal8  = V1 = 7* (pSrcLeft[14*leftStep]-pSrcLeft[0])
    385         ADD      tVal6,  tVal8,  tVal10, LSL #3      ;// tVal6  = V = 8*V0 + V1
    386         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[13*leftStep]
    387         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[leftStep]
    388         RSB      tVal7,  tVal12,  tVal12,  LSL #3    ;// tVal7  = 7*b
    389         SUB      tVal2,  tVal2,   tVal7              ;// tVal2  = a + 16 - 7*b
    390         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[13*leftStep] - pSrcLeft[leftStep]
    391         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[12*leftStep]
    392         ADD      tVal7,  tVal7,   tVal7              ;// tVal7  = 2 * (pSrcLeft[13*leftStep] - pSrcLeft[leftStep])
    393         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[2*leftStep]
    394         ADD      tVal7,  tVal7,   tVal7,  LSL #1     ;// tVal7  = 6 * (pSrcLeft[13*leftStep] - pSrcLeft[leftStep])
    395         ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V2
    396         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[12*leftStep] - pSrcLeft[2*leftStep]
    397         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[11*leftStep]
    398         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[3*leftStep]
    399         ADD      tVal7,  tVal7,   tVal7,  LSL #2     ;// tVal7  = 5 * (pSrcLeft[12*leftStep] - pSrcLeft[2*leftStep])
    400         ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V3
    401         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[11*leftStep] - pSrcLeft[3*leftStep]
    402         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[10*leftStep]
    403         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[4*leftStep]
    404         ADD      tVal6,  tVal6,   tVal7,  LSL #2     ;// tVal6  = V = V + V4 (4 * the row 11 / row 3 difference)
    405         SUB      dstStep, dstStep, #16               ;// dstStep -= 16: the four post-incremented stores per row already advance pDst by 16
    406         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[10*leftStep] - pSrcLeft[4*leftStep]
    407         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[9*leftStep]
    408         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[5*leftStep]
    409         ADD      tVal7,  tVal7,   tVal7,  LSL #1     ;// tVal7  = 3 * (pSrcLeft[10*leftStep] - pSrcLeft[4*leftStep])
    410         ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V5
    411         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[9*leftStep] - pSrcLeft[5*leftStep]
    412         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[8*leftStep]
    413         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[6*leftStep]
    414         ADD      tVal6,  tVal6,   tVal7,  LSL #1     ;// tVal6  = V = V + V6 (2 * the row 9 / row 5 difference)
    415 
    416         ;// M_STALL ARM1136JS=1
    417         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[8*leftStep] - pSrcLeft[6*leftStep]
    418         ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V7
    419 
    420         ;// M_STALL ARM1136JS=1
    421         ADD      tVal6,  tVal6,   tVal6,  LSL #2     ;// tVal6  = 5*V
    422         ADD      tVal6,  tVal6,   #32                ;// tVal6  = 5*V + 32
    423 
    424         ;// M_STALL ARM1136JS=1
    425         ASR      tVal14, tVal6,   #6                 ;// tVal14 = c = (5*V + 32)>>6
    426 
    427         ;// M_STALL ARM1136JS=1
    428         RSB      tVal6,  tVal14,  tVal14, LSL #3     ;// tVal6  = 7*c
    429         UXTH     tVal14, tVal14                      ;// tVal14 = Cleared the upper half word
    430         ADD      tVal10, tVal12,  tVal12             ;// tVal10 = 2*b
    431         ORR      tVal14, tVal14,  tVal14, LSL #16    ;// tVal14 = {c  ,  c}
    432         SUB      tVal6,  tVal2,   tVal6              ;// tVal6  = d = a - 7*b - 7*c + 16 (may be negative)
    433         ADD      tVal1,  tVal6,   tVal10             ;// tVal1  = pp2 = d + 2*b
    434         ADD      tVal10, tVal10,  tVal12             ;// tVal10 =3*b
    435         PKHBT    tVal0,  tVal6,   tVal1,  LSL #16    ;// tVal0  = p2p0 = {p2, p0}; PKHBT takes only d[15:0], so a negative d cannot overwrite p2 with sign bits (a plain ORR did)
    436         UXTH     tVal12, tVal12                      ;// tVal12 = Cleared the upper half word
    437         UXTH     tVal10, tVal10                      ;// tVal10 = Cleared the upper half word
    438         ORR      tVal12, tVal12,  tVal12, LSL #16    ;// tVal12 = {b  ,  b}
    439         ORR      tVal10, tVal10,  tVal10, LSL #16    ;// tVal10 = {3b , 3b}
    440         SADD16   tVal1,  tVal0,   tVal12             ;// tVal1  = p3p1   = p2p0   + {b,b}
    441         SADD16   tVal2,  tVal1,   tVal10             ;// tVal2  = p6p4   = p3p1   + {3b,3b}
    442         SADD16   tVal4,  tVal2,   tVal12             ;// tVal4  = p7p5   = p6p4   + {b,b}
    443         SADD16   tVal6,  tVal4,   tVal10             ;// tVal6  = p10p8  = p7p5   + {3b,3b}
    444         SADD16   tVal7,  tVal6,   tVal12             ;// tVal7  = p11p9  = p10p8  + {b,b}
    445         SADD16   tVal8,  tVal7,   tVal10             ;// tVal8  = p14p12 = p11p9  + {3b,3b}
    446         SADD16   tVal9,  tVal8,   tVal12             ;// tVal9  = p15p13 = p14p12 + {b,b}
    447         LDR      r0x00FF00FF,     =MASK_CONST        ;// r0x00FF00FF = 0x00FF00FF; bits 31:28 start at 0 and count the rows
    448 
    449 LOOP_PLANE                                           ;// One output row per iteration, written as four 4-byte groups
    450 ;// Per group: saturate both halfwords to 0..8191, >>5 and mask to get bytes 0..255, then add c to step the pair to the next row
    451         USAT16   temp2, #13, p3p1                    ;// odd columns of the group, clamped to 13 unsigned bits
    452         USAT16   temp1, #13, p2p0                    ;// even columns of the group, clamped to 13 unsigned bits
    453         SADD16   p3p1,   p3p1,   c                   ;// advance the pair to the next row
    454         SADD16   p2p0,   p2p0,   c
    455         AND      temp2, r0x00FF00FF, temp2, ASR #5   ;// >>5 then keep the low byte of each halfword; the counter bits hit zeros
    456         AND      temp1, r0x00FF00FF, temp1, ASR #5
    457         ORR      temp1, temp1, temp2, LSL #8         ;// interleave: bytes = {p3, p2, p1, p0}
    458         STR      temp1, [pDst], #4                   ;// store columns 0 to 3
    459 
    460         USAT16   temp2, #13, p7p5
    461         USAT16   temp1, #13, p6p4
    462         SADD16   p7p5,   p7p5,   c
    463         SADD16   p6p4,   p6p4,   c
    464         AND      temp2, r0x00FF00FF, temp2, ASR #5
    465         AND      temp1, r0x00FF00FF, temp1, ASR #5
    466         ORR      temp1, temp1, temp2, LSL #8
    467         STR      temp1, [pDst], #4                   ;// store columns 4 to 7
    468 
    469         USAT16   temp2, #13, p11p9
    470         USAT16   temp1, #13, p10p8
    471         SADD16   p11p9,  p11p9,  c
    472         SADD16   p10p8,  p10p8,  c
    473         AND      temp2, r0x00FF00FF, temp2, ASR #5
    474         AND      temp1, r0x00FF00FF, temp1, ASR #5
    475         ORR      temp1, temp1, temp2, LSL #8
    476         STR      temp1, [pDst], #4                   ;// store columns 8 to 11
    477 
    478         USAT16   temp2, #13, p15p13
    479         USAT16   temp1, #13, p14p12
    480         SADD16   p15p13, p15p13, c
    481         SADD16   p14p12, p14p12, c
    482         AND      temp2, r0x00FF00FF, temp2, ASR #5
    483         AND      temp1, r0x00FF00FF, temp1, ASR #5
    484         ORR      temp1, temp1, temp2, LSL #8
    485         STR      temp1, [pDst], #4                   ;// store columns 12 to 15
    486 
    487         ADDS     r0x00FF00FF, r0x00FF00FF, #1<<28     ;// Loop counter value in top 4 bits; carry is set on the 16th add
    488 
    489         ADD      pDst, pDst, dstStep                  ;// move to the next row (dstStep was reduced by 16 above); flags untouched
    490 
    491         BCC      LOOP_PLANE                           ;// Loop for 16 times
    492         MOV      return, #OMX_Sts_NoErr               ;// return status = no error
    493         M_END
    494 
    495         ENDIF ;// ARM1136JS
    496 
    497 
    498         END
    499 ;-----------------------------------------------------------------------------------------------
    500 ; omxVCM4P10_PredictIntra_16x16 ends
    501 ;-----------------------------------------------------------------------------------------------
    502