Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   9641
     21 ;// Date:       Thursday, February 7, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27         INCLUDE omxtypes_s.h
     28         INCLUDE armCOMM_s.h
     29 
     30         M_VARIANTS ARM1136JS
     31 
     32 ;//-------------------------------------------------------
     33 ;// This table implements a C-style switch/case in asm by
     34 ;// the method of two levels of indexing.
     35 ;//-------------------------------------------------------
     36 
     37     M_TABLE armVCM4P10_pIndexTable16x16          ;// Jump table: word entry N is the handler label for predMode == N
     38     DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR     ;// entries 0 and 1: vertical and horizontal handlers
     39     DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE   ;// entries 2 and 3: DC and plane handlers
     40 
     41     IF ARM1136JS
     42 
     43 ;//--------------------------------------------
     44 ;// Constants
     45 ;//--------------------------------------------
     46 BLK_SIZE        EQU 0x10                             ;// 16: initial row counter for the VERT and DC store loops
     47 MUL_CONST0      EQU 0x01010101                       ;// Multiplying a byte value by this replicates it into all 4 bytes (HOR)
     48 MUL_CONST1      EQU 0x00060004                       ;// Not referenced anywhere in this listing
     49 MUL_CONST2      EQU 0x00070005                       ;// Not referenced anywhere in this listing
     50 MUL_CONST3      EQU 0x00030001                       ;// Not referenced anywhere in this listing
     51 MASK_CONST      EQU 0x00FF00FF                       ;// Keeps the low byte of each halfword (PLANE pixel packing)
     52 
     53 ;//--------------------------------------------
     54 ;// Scratch variable
     55 ;//--------------------------------------------
     56 y               RN 12                                ;// Row counter for the VERT, HOR and DC store loops
     57 pc              RN 15                                ;// Written by the predMode dispatch (LDR pc, [pTable, ...])
     58 ;// NOTE: many names below share one physical register; an alias is only valid while the other users of that register are dead.
     59 return          RN 0                                 ;// Status value, set just before each exit
     60 innerCount      RN 0                                 ;// Not referenced in this listing
     61 outerCount      RN 1                                 ;// Not referenced in this listing
     62 pSrcLeft2       RN 1                                 ;// DC: second left-column pointer, starts one row below pSrcLeft
     63 pDst2           RN 2                                 ;// Second output pointer, starts one row below pDst
     64 sum             RN 6                                 ;// DC: running pixel sum (same register as tVal6)
     65 pTable          RN 9                                 ;// Address of armVCM4P10_pIndexTable16x16
     66 temp1           RN 10                                ;// PLANE store loop scratch
     67 temp2           RN 12                                ;// PLANE store loop scratch
     68 cMul1           RN 11                                ;// Not referenced in this listing
     69 cMul2           RN 12                                ;// Not referenced in this listing
     70 count           RN 12                                ;// DC: number of neighbour edges that contributed (0, 1 or 2)
     71 dstStepx2       RN 11                                ;// 2*dstStep (HOR uses 2*dstStep - 12)
     72 leftStepx2      RN 14                                ;// DC: 2*leftStep
     73 r0x01010101     RN 10                                ;// HOR: holds MUL_CONST0
     74 r0x00FF00FF     RN 11                                ;// PLANE: holds MASK_CONST; its top 4 bits also serve as the row counter
     75 ;// Generic numbered aliases: tValN is register rN (there is no tVal13).
     76 tVal0           RN 0
     77 tVal1           RN 1
     78 tVal2           RN 2
     79 tVal3           RN 3                                 ;// Not referenced in this listing (r3 is used as pDst)
     80 tVal4           RN 4
     81 tVal5           RN 5                                 ;// Not referenced in this listing (r5 is used as dstStep)
     82 tVal6           RN 6
     83 tVal7           RN 7
     84 tVal8           RN 8
     85 tVal9           RN 9
     86 tVal10          RN 10
     87 tVal11          RN 11
     88 tVal12          RN 12
     89 tVal14          RN 14
     90 
     91 b               RN 12                                ;// Not referenced by name; PLANE keeps {b, b} in tVal12 (same register)
     92 c               RN 14                                ;// PLANE: packed {c, c} row increment used in the store loop
     93 ;// PLANE: each register holds two signed 16-bit predictor values, written here as p<upper>p<lower> by column index.
     94 p2p0            RN 0
     95 p3p1            RN 1
     96 p6p4            RN 2
     97 p7p5            RN 4
     98 p10p8           RN 6
     99 p11p9           RN 7
    100 p14p12          RN 8
    101 p15p13          RN 9
    102 ;// The four aliases below are not referenced in this listing.
    103 p3210           RN 10
    104 p7654           RN 10
    105 p111098         RN 10
    106 p15141312       RN 10
    107 
    108 ;//--------------------------------------------
    109 ;// Declare input registers
    110 ;//--------------------------------------------
    111 pSrcLeft        RN 0    ;// input pointer: left neighbour column, one byte per row, rows leftStep bytes apart
    112 pSrcAbove       RN 1    ;// input pointer: the 16 neighbour bytes above the block
    113 pSrcAboveLeft   RN 2    ;// input pointer: above-left corner byte (read only by PLANE)
    114 pDst            RN 3    ;// output pointer: first row of the predicted 16x16 block
    115 leftStep        RN 4    ;// input variable: byte step between pSrcLeft rows (loaded from the stack)
    116 dstStep         RN 5    ;// input variable: byte step between pDst rows (loaded from the stack)
    117 predMode        RN 6    ;// input variable: index into armVCM4P10_pIndexTable16x16 (loaded from the stack)
    118 availability    RN 7    ;// input variable: flags tested against OMX_VC_UPPER / OMX_VC_LEFT (loaded from the stack)
    119 
    120 ;//-----------------------------------------------------------------------------------------------
    121 ;// omxVCM4P10_PredictIntra_16x16 starts
    122 ;//-----------------------------------------------------------------------------------------------
    123 
    124         ;// Write function header
    125         M_START omxVCM4P10_PredictIntra_16x16, r11   ;// Entry point; "r11" presumably names the highest register the prologue saves - TODO confirm in armCOMM_s.h
    126 ;// Loads the four stack arguments, then jumps to the VERT/HOR/DC/PLANE handler selected by predMode.
    127         ;// Define stack arguments
    128         M_ARG    LeftStep,     4
    129         M_ARG    DstStep,      4
    130         M_ARG    PredMode,     4
    131         M_ARG    Availability, 4
    132 
    133         ;// M_STALL ARM1136JS=4
    134 
    135         LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// Load index table for switch case
    136 
    137         ;// Load argument from the stack
    138         M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
    139         M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
    140         M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
    141         M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
    142 ;// NOTE(review): predMode is not range-checked; a value outside 0..3 loads pc from beyond the 4-entry table.
    143         MOV      y, #BLK_SIZE                        ;// Row counter used as-is by VERT; the other handlers reload r12
    144         LDR      pc, [pTable, predMode, LSL #2]      ;// Jump to the handler: pc = pTable[predMode]
    145 
    146 OMX_VC_16X16_VERT                                    ;// Vertical mode: every output row is a copy of pSrcAbove[0..15]
    147         LDM      pSrcAbove, {tVal6,tVal7,tVal8,tVal9};// tVal 6 to 9 = pSrcAbove[0 to 15] (word loads; assumes pSrcAbove is word-aligned - TODO confirm)
    148         ADD      dstStepx2, dstStep, dstStep         ;// dstStepx2 = 2*dstStep
    149         ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep (odd rows)
    150 
    151         ;// M_STALL ARM1136JS=2                       ;// Stall outside the loop
    152 
    153 LOOP_VERT                                            ;// Two rows per pass; y enters as BLK_SIZE (set before the dispatch)
    154         STM      pDst, {tVal6,tVal7,tVal8,tVal9}     ;// even row: pDst[0 to 15] = tVal 6 to 9
    155         SUBS     y, y, #2                            ;// y -= 2; these flags feed the BNE below
    156         ADD      pDst, pDst, dstStepx2               ;// pDst advanced by 2*dstStep
    157         STM      pDst2, {tVal6,tVal7,tVal8,tVal9}    ;// odd row: pDst2[0 to 15] = tVal 6 to 9
    158         ADD      pDst2, pDst2, dstStepx2             ;// pDst2 advanced by 2*dstStep
    159         BNE      LOOP_VERT                           ;// Loop for 8 times
    160         MOV      return, #OMX_Sts_NoErr              ;// This path always reports OMX_Sts_NoErr
    161         M_EXIT
    162 
    163 
    164 OMX_VC_16X16_HOR                                     ;// Horizontal mode: output row r is 16 copies of the left pixel of row r
    165 
    166         ;// M_STALL ARM1136JS=6
    167 
    168         LDR      r0x01010101, =MUL_CONST0            ;// Const to repeat the byte in reg 4 times
    169         MOV      y, #4                               ;// Outer Loop Count: 4 passes, 4 rows per pass
    170         M_LDRB   tVal6, [pSrcLeft], +leftStep        ;// tVal6 = left pixel of row 0
    171         ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep
    172         M_LDRB   tVal7, [pSrcLeft], +leftStep        ;// tVal7 = left pixel of row 1
    173         ADD      dstStepx2, dstStep, dstStep         ;// double dstStep
    174         SUB      dstStepx2, dstStepx2, #12           ;// minus 12: three #+4 post-increments precede each row's last store
    175 
    176 LOOP_HOR                                             ;// On entry tVal6/tVal7 hold the left pixels of rows 4k and 4k+1
    177         M_LDRB   tVal8, [pSrcLeft], +leftStep        ;// tVal8 = left pixel of row 4k+2
    178         MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
    179         M_LDRB   tVal9, [pSrcLeft], +leftStep        ;// tVal9 = left pixel of row 4k+3
    180         MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
    181         SUBS     y, y, #1                            ;// y--; flags stay live until the BNE (nothing below sets them)
    182         STR      tVal6, [pDst],  #+4                 ;// store {tVal6} at pDst[0 to 3]
    183         STR      tVal7, [pDst2], #+4                 ;// store {tVal7} at pDst2[0 to 3]
    184         STR      tVal6, [pDst],  #+4                 ;// store {tVal6} at pDst[4 to 7]
    185         STR      tVal7, [pDst2], #+4                 ;// store {tVal7} at pDst2[4 to 7]
    186         MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
    187         STR      tVal6, [pDst],  #+4                 ;// store {tVal6} at pDst[8 to 11]
    188         STR      tVal7, [pDst2], #+4                 ;// store {tVal7} at pDst2[8 to 11]
    189         MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
    190         M_STR    tVal6, [pDst], dstStepx2            ;// store {tVal6} at pDst[12 to 15]; pDst now two rows down
    191         M_STR    tVal7, [pDst2], dstStepx2           ;// store {tVal7} at pDst2[12 to 15]; pDst2 now two rows down
    192         STR      tVal8, [pDst],  #+4                 ;// store {tVal8} at pDst[0 to 3]
    193         STR      tVal9, [pDst2], #+4                 ;// store {tVal9} at pDst2[0 to 3]
    194         STR      tVal8, [pDst],  #+4                 ;// store {tVal8} at pDst[4 to 7]
    195         STR      tVal9, [pDst2], #+4                 ;// store {tVal9} at pDst2[4 to 7]
    196         STR      tVal8, [pDst],  #+4                 ;// store {tVal8} at pDst[8 to 11]
    197         STR      tVal9, [pDst2], #+4                 ;// store {tVal9} at pDst2[8 to 11]
    198         M_STR    tVal8, [pDst], dstStepx2            ;// store {tVal8} at pDst[12 to 15]
    199         LDRBNE   tVal6, [pSrcLeft], +leftStep        ;// preload next pass's first row; skipped on the last pass so row 16 is never read
    200         M_STR    tVal9, [pDst2], dstStepx2           ;// store {tVal9} at pDst2[12 to 15]
    201         LDRBNE   tVal7, [pSrcLeft], +leftStep        ;// preload next pass's second row; skipped on the last pass so row 17 is never read
    202         BNE      LOOP_HOR                            ;// Loop for 4 times
    203         MOV      return, #OMX_Sts_NoErr              ;// This path always reports OMX_Sts_NoErr
    204         M_EXIT
    205 
    206 OMX_VC_16X16_DC                                      ;// DC mode: fill the block with the rounded mean of the available neighbours (128 if none)
    207 
    208         ;// M_STALL ARM1136JS=2
    209 
    210         MOV      count, #0                           ;// count = 0
    211         TST      availability, #OMX_VC_UPPER         ;// if(availability & #OMX_VC_UPPER)
    212         BEQ      TST_LEFT                            ;// Jump to Left if not upper
    213         LDM      pSrcAbove,{tVal8,tVal9,tVal10,tVal11};// tVal 8 to 11 = pSrcAbove[0 to 15]
    214         ADD      count, count, #1                    ;// if upper inc count by 1
    215 
    216         ;// M_STALL ARM1136JS=2
    217 
    218         UXTB16   tVal2, tVal8                        ;// pSrcAbove[0, 2]
    219         UXTB16   tVal6, tVal9                        ;// pSrcAbove[4, 6]
    220         UADD16   tVal2, tVal2, tVal6                 ;// pSrcAbove[0, 2] + pSrcAbove[4, 6]
    221         UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
    222         UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
    223         UADD16   tVal8, tVal8, tVal9                 ;// pSrcAbove[1, 3] + pSrcAbove[5, 7]
    224         UADD16   tVal2, tVal2, tVal8                 ;// two halfword partial sums over pSrcAbove[0] to pSrcAbove[7]
    225 
    226         UXTB16   tVal8, tVal10                       ;// pSrcAbove[8, 10]
    227         UXTB16   tVal9, tVal11                       ;// pSrcAbove[12, 14]
    228         UADD16   tVal8, tVal8, tVal9                 ;// pSrcAbove[8, 10] + pSrcAbove[12, 14]
    229         UXTB16   tVal10, tVal10, ROR #8              ;// pSrcAbove[9, 11]
    230         UXTB16   tVal11, tVal11, ROR #8              ;// pSrcAbove[13, 15]
    231         UADD16   tVal10, tVal10, tVal11              ;// pSrcAbove[9, 11] + pSrcAbove[13, 15]
    232         UADD16   tVal8, tVal8, tVal10                ;// two halfword partial sums over pSrcAbove[8] to pSrcAbove[15]
    233 
    234         UADD16   tVal2, tVal2, tVal8                 ;// two halfword partial sums over pSrcAbove[0] to pSrcAbove[15]
    235 
    236         ;// M_STALL ARM1136JS=1
    237 
    238         ADD      tVal2, tVal2, tVal2, LSR #16        ;// low halfword = sum(pSrcAbove[0..15]); upper halfword keeps a partial sum
    239 
    240         ;// M_STALL ARM1136JS=1
    241 
    242         UXTH     sum, tVal2                          ;// Extract the lower half for result
    243 
    244 TST_LEFT
    245         TST      availability, #OMX_VC_LEFT          ;// if(availability & #OMX_VC_LEFT)
    246         BEQ      TST_COUNT                           ;// Skip the left column if it is not available
    247         ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
    248         ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep
    249 
    250         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left pixel, row 0
    251         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left pixel, row 1
    252         M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left pixel, row 2
    253         M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left pixel, row 3
    254         ADD      tVal7, tVal8, tVal9                 ;// tVal7 = tVal8 + tVal9 (overwrites availability, already tested)
    255         ADD      count, count, #1                    ;// Inc Counter if Left is available
    256         ADD      tVal6, tVal10, tVal11               ;// tVal6 = tVal10 + tVal11 (tVal6 is sum; the upper total survives in tVal2)
    257 
    258         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left pixel, row 4
    259         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left pixel, row 5
    260         M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left pixel, row 6
    261         M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left pixel, row 7
    262         ADD      sum, tVal7, tVal6                   ;// sum = left rows 0..3
    263         ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
    264         ADD      tVal10, tVal10, tVal11              ;// tVal10= tVal10 + tVal11
    265         ADD      tVal7, tVal8, tVal10                ;// tVal7 = left rows 4..7
    266 
    267 
    268         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left pixel, row 8
    269         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left pixel, row 9
    270         M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left pixel, row 10
    271         M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left pixel, row 11
    272         ADD      sum, sum, tVal7                     ;// sum = left rows 0..7
    273         ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
    274         ADD      tVal10, tVal10, tVal11              ;// tVal10= tVal10 + tVal11
    275         ADD      tVal7, tVal8, tVal10                ;// tVal7 = left rows 8..11
    276 
    277 
    278         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = left pixel, row 12
    279         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = left pixel, row 13
    280         M_LDRB   tVal10, [pSrcLeft], +leftStepx2     ;// tVal10= left pixel, row 14
    281         M_LDRB   tVal11, [pSrcLeft2],+leftStepx2     ;// tVal11= left pixel, row 15
    282         ADD      sum, sum, tVal7                     ;// sum = left rows 0..11
    283         ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
    284         ADD      tVal10, tVal10, tVal11              ;// tVal10= tVal10 + tVal11
    285         ADD      tVal7, tVal8, tVal10                ;// tVal7 = left rows 12..15
    286         ADD      sum, sum, tVal7                     ;// sum = left rows 0..15
    287 
    288 TST_COUNT
    289         CMP      count, #0                           ;// if(count == 0)
    290         MOVEQ    sum, #128                           ;// sum = 128 if(count == 0)
    291         BEQ      TST_COUNT0                          ;// if(count == 0)
    292         CMP      count, #1                           ;// if(count == 1); these flags drive every EQ/NE op down to LSRNE
    293         ADDEQ    sum, sum, #8                        ;// sum += 8 if(count == 1)
    294         ADDNE    sum, sum, tVal2                     ;// sum = sumleft + sumupper (tVal2's upper halfword is junk here)
    295         ADDNE    sum, sum, #16                       ;// sum += 16 if(count == 2)
    296 
    297         ;// M_STALL ARM1136JS=1
    298 
    299         UXTH     sum, sum                            ;// keep the low halfword only; drops the junk carried in from tVal2
    300 
    301         ;// M_STALL ARM1136JS=1
    302 
    303         LSREQ    sum, sum, #4                        ;// sum >> 4 if(count == 1)
    304 
    305         ;// M_STALL ARM1136JS=1
    306 
    307         LSRNE    sum, sum, #5                        ;// sum >> 5 if(count == 2)
    308 
    309 TST_COUNT0
    310 
    311         ;// M_STALL ARM1136JS=1
    312 
    313         ORR      sum, sum, sum, LSL #8               ;// DC byte replicated into both bytes of the low halfword
    314 
    315         ;// M_STALL ARM1136JS=1
    316 
    317         ORR      tVal6, sum, sum, LSL #16            ;// sum  replicated in all bytes
    318         CPY      tVal7, tVal6                        ;// tVal7 = tVal6
    319         CPY      tVal8, tVal6                        ;// tVal8 = tVal6
    320         CPY      tVal9, tVal6                        ;// tVal9 = tVal6
    321         ADD      dstStepx2, dstStep, dstStep         ;// double dstStep
    322         ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep (odd rows)
    323         MOV      y, #BLK_SIZE                        ;// Row counter (r12 held count until now)
    324 
    325 LOOP_DC                                              ;// Two rows per pass
    326         STM      pDst, {tVal6,tVal7,tVal8,tVal9}     ;// even row: pDst[0 to 15] = tVal 6 to 9
    327         SUBS     y, y, #2                            ;// y -= 2; these flags feed the BNE below
    328         ADD      pDst, pDst, dstStepx2               ;// pDst advanced by 2*dstStep
    329         STM      pDst2, {tVal6,tVal7,tVal8,tVal9}    ;// odd row: pDst2[0 to 15] = tVal 6 to 9
    330         ADD      pDst2, pDst2, dstStepx2             ;// pDst2 advanced by 2*dstStep
    331         BNE      LOOP_DC                             ;// Loop for 8 times
    332 
    333         MOV      return, #OMX_Sts_NoErr              ;// This path always reports OMX_Sts_NoErr
    334         M_EXIT
    335 
    336 OMX_VC_16X16_PLANE                                   ;// Plane mode: pred[x,y] = clip((a + 16 + b*(x-7) + c*(y-7)) >> 5) to 0..255
    337 
    338         ;// M_STALL ARM1136JS=3
    339         RSB      tVal14, leftStep, leftStep, LSL #4  ;// tVal14 = 15*leftStep
    340 
    341         ;// M_STALL ARM1136JS=2
    342         LDRB     tVal10, [pSrcLeft,  tVal14]         ;// tVal10 = pSrcLeft[15*leftStep]
    343         LDRB     tVal11, [pSrcAboveLeft]             ;// tVal11 = pSrcAboveLeft[0]
    344         LDRB     tVal12, [pSrcAbove, #15]            ;// tVal12 = pSrcAbove[15]
    345 
    346         ADD      tVal2,  tVal12,  tVal10             ;// tVal2  = pSrcAbove[15] + pSrcLeft[15*leftStep]
    347         SUB      tVal10, tVal10,  tVal11             ;// tVal10 = V0 = pSrcLeft[15*leftStep] - pSrcAboveLeft[0]
    348         SUB      tVal11, tVal12,  tVal11             ;// tVal11 = H0 = pSrcAbove[15] - pSrcAboveLeft[0]
    349         MOV      tVal2,  tVal2,   LSL #4             ;// tVal2  = a = 16 * (pSrcAbove[15] + pSrcLeft[15*leftStep])
    350 
    351         MOV     tVal11, tVal11, LSL #3              ;// H = 8*([15]-[-1])
    352         LDRB    tVal6, [pSrcAbove, #0]
    353         LDRB    tVal7, [pSrcAbove, #14]
    354         SUB     tVal8, tVal7, tVal6
    355         RSB     tVal8, tVal8, tVal8, LSL #3         ;// 7*([14]-[0])
    356         ADD     tVal11, tVal11, tVal8
    357         LDRB    tVal6, [pSrcAbove, #1]
    358         LDRB    tVal7, [pSrcAbove, #13]
    359         SUB     tVal8, tVal7, tVal6
    360         ADD     tVal8, tVal8, tVal8                 ;// 2*([13]-[1])
    361         ADD     tVal8, tVal8, tVal8, LSL #1         ;// 6*([13]-[1])
    362         ADD     tVal11, tVal11, tVal8
    363         LDRB    tVal6, [pSrcAbove, #2]
    364         LDRB    tVal7, [pSrcAbove, #12]
    365         SUB     tVal8, tVal7, tVal6
    366         ADD     tVal8, tVal8, tVal8, LSL #2         ;// 5*([12]-[2])
    367         ADD     tVal11, tVal11, tVal8
    368         LDRB    tVal6, [pSrcAbove, #3]
    369         LDRB    tVal7, [pSrcAbove, #11]
    370         SUB     tVal8, tVal7, tVal6
    371         ADD     tVal11, tVal11, tVal8, LSL #2       ;// + 4*([11]-[3])
    372         LDRB    tVal6, [pSrcAbove, #4]
    373         LDRB    tVal7, [pSrcAbove, #10]
    374         SUB     tVal8, tVal7, tVal6
    375         ADD     tVal8, tVal8, tVal8, LSL #1         ;// 3*([10]-[4])
    376         ADD     tVal11, tVal11, tVal8
    377         LDRB    tVal6, [pSrcAbove, #5]
    378         LDRB    tVal7, [pSrcAbove, #9]
    379         SUB     tVal8, tVal7, tVal6
    380         ADD     tVal11, tVal11, tVal8, LSL #1       ;// + 2*([9]-[5])
    381         LDRB    tVal6, [pSrcAbove, #6]
    382         LDRB    tVal7, [pSrcAbove, #8]
    383         SUB     tVal8, tVal7, tVal6                 ;// 1*([8]-[6])
    384         ADD     tVal7, tVal11, tVal8                ;// tVal7  = H (all eight weighted differences)
    385 
    386         ADD      tVal2,  tVal2,   #16                ;// tVal2  = a + 16
    387         MOV      tVal1,  pSrcLeft                    ;// tVal1  = pSrcLeft (walks down from row 0; pSrcAbove is dead)
    388         SUB      tVal9,  tVal14,   leftStep          ;// tVal9  = 14*leftStep
    389         ADD      tVal9,  pSrcLeft, tVal9             ;// tVal9  = pSrcLeft + 14*leftStep (walks up from row 14)
    390 
    391         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[14*leftStep]
    392         M_LDRB   tVal11, [tVal1], +leftStep          ;// tVal11 = pSrcLeft[0]
    393         ADD      tVal7,  tVal7,  tVal7,  LSL #2      ;// tVal7  = 5 * H
    394         ADD      tVal7,  tVal7,  #32                 ;// tVal7  = 5 * H + 32
    395         SUB      tVal8,  tVal8,  tVal11              ;// tVal8  = pSrcLeft[14*leftStep] - pSrcLeft[0]
    396         ASR      tVal12, tVal7,  #6                  ;// tVal12 = b = (5 * H + 32) >> 6
    397 
    398         RSB      tVal8,  tVal8,  tVal8,  LSL #3      ;// tVal8  = V1 = 7* (pSrcLeft[14*leftStep]-pSrcLeft[0])
    399         ADD      tVal6,  tVal8,  tVal10, LSL #3      ;// tVal6  = V = 8*V0 + V1
    400         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[13*leftStep]
    401         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[leftStep]
    402         RSB      tVal7,  tVal12,  tVal12,  LSL #3    ;// tVal7  = 7*b
    403         SUB      tVal2,  tVal2,   tVal7              ;// tVal2  = a + 16 - 7*b
    404         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[13*leftStep] - pSrcLeft[leftStep]
    405         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[12*leftStep]
    406         ADD      tVal7,  tVal7,   tVal7              ;// tVal7  = 2 * (pSrcLeft[13*leftStep] - pSrcLeft[leftStep])
    407         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[2*leftStep]
    408         ADD      tVal7,  tVal7,   tVal7,  LSL #1     ;// tVal7  = 6 * (pSrcLeft[13*leftStep] - pSrcLeft[leftStep])
    409         ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V2
    410         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[12*leftStep] - pSrcLeft[2*leftStep]
    411         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[11*leftStep]
    412         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[3*leftStep]
    413         ADD      tVal7,  tVal7,   tVal7,  LSL #2     ;// tVal7  = 5 * (pSrcLeft[12*leftStep] - pSrcLeft[2*leftStep])
    414         ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V3
    415         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[11*leftStep] - pSrcLeft[3*leftStep]
    416         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[10*leftStep]
    417         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[4*leftStep]
    418         ADD      tVal6,  tVal6,   tVal7,  LSL #2     ;// tVal6  = V = V + V4
    419         SUB      dstStep, dstStep, #16               ;// dstStep -= 16: each row's four stores already advance pDst by 16
    420         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[10*leftStep] - pSrcLeft[4*leftStep]
    421         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[9*leftStep]
    422         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[5*leftStep]
    423         ADD      tVal7,  tVal7,   tVal7,  LSL #1     ;// tVal7  = 3 * (pSrcLeft[10*leftStep] - pSrcLeft[4*leftStep])
    424         ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V5
    425         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[9*leftStep] - pSrcLeft[5*leftStep]
    426         M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = pSrcLeft[8*leftStep]
    427         M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = pSrcLeft[6*leftStep]
    428         ADD      tVal6,  tVal6,   tVal7,  LSL #1     ;// tVal6  = V = V + V6
    429 
    430         ;// M_STALL ARM1136JS=1
    431         SUB      tVal7,  tVal8,   tVal10             ;// tVal7  = pSrcLeft[8*leftStep] - pSrcLeft[6*leftStep]
    432         ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V = V + V7
    433 
    434         ;// M_STALL ARM1136JS=1
    435         ADD      tVal6,  tVal6,   tVal6,  LSL #2     ;// tVal6  = 5*V
    436         ADD      tVal6,  tVal6,   #32                ;// tVal6  = 5*V + 32
    437 
    438         ;// M_STALL ARM1136JS=1
    439         ASR      tVal14, tVal6,   #6                 ;// tVal14 = c = (5*V + 32)>>6
    440 
    441         ;// M_STALL ARM1136JS=1
    442         RSB      tVal6,  tVal14,  tVal14, LSL #3     ;// tVal6  = 7*c
    443         UXTH     tVal14, tVal14                      ;// tVal14 = Cleared the upper half word
    444         ADD      tVal10, tVal12,  tVal12             ;// tVal10 = 2*b
    445         ORR      tVal14, tVal14,  tVal14, LSL #16    ;// tVal14 = {c  ,  c}
    446         SUB      tVal6,  tVal2,   tVal6              ;// tVal6  = d = a - 7*b - 7*c + 16 (may be negative)
    447         ADD      tVal1,  tVal6,   tVal10             ;// tVal1  = pp2 = d + 2*b
    448         ADD      tVal10, tVal10,  tVal12             ;// tVal10 =3*b
    449         PKHBT    tVal0,  tVal6,   tVal1,  LSL #16    ;// tVal0  = p2p0 = {p2, p0}; PKHBT, not ORR, so a negative d cannot overwrite the p2 lane
    450         UXTH     tVal12, tVal12                      ;// tVal12 = Cleared the upper half word
    451         UXTH     tVal10, tVal10                      ;// tVal10 = Cleared the upper half word
    452         ORR      tVal12, tVal12,  tVal12, LSL #16    ;// tVal12 = {b  ,  b}
    453         ORR      tVal10, tVal10,  tVal10, LSL #16    ;// tVal10 = {3b , 3b}
    454         SADD16   tVal1,  tVal0,   tVal12             ;// tVal1  = p3p1   = p2p0   + {b,b}
    455         SADD16   tVal2,  tVal1,   tVal10             ;// tVal2  = p6p4   = p3p1   + {3b,3b}
    456         SADD16   tVal4,  tVal2,   tVal12             ;// tVal4  = p7p5   = p6p4   + {b,b}
    457         SADD16   tVal6,  tVal4,   tVal10             ;// tVal6  = p10p8  = p7p5   + {3b,3b}
    458         SADD16   tVal7,  tVal6,   tVal12             ;// tVal7  = p11p9  = p10p8  + {b,b}
    459         SADD16   tVal8,  tVal7,   tVal10             ;// tVal8  = p14p12 = p11p9  + {3b,3b}
    460         SADD16   tVal9,  tVal8,   tVal12             ;// tVal9  = p15p13 = p14p12 + {b,b}
    461         LDR      r0x00FF00FF,     =MASK_CONST        ;// r0x00FF00FF = 0x00FF00FF; top nibble 0 = row counter
    462 
    463 LOOP_PLANE                                           ;// One output row per pass; the pNpM registers hold the current row's values
    464 
    465         USAT16   temp2, #13, p3p1                    ;// clamp each halfword to 0..8191
    466         USAT16   temp1, #13, p2p0
    467         SADD16   p3p1,   p3p1,   c                   ;// step both lanes to the next row
    468         SADD16   p2p0,   p2p0,   c
    469         AND      temp2, r0x00FF00FF, temp2, ASR #5   ;// >>5 gives 0..255 per lane; mask keeps one byte per halfword
    470         AND      temp1, r0x00FF00FF, temp1, ASR #5
    471         ORR      temp1, temp1, temp2, LSL #8         ;// interleave: word bytes 0..3 = pixels 0..3
    472         STR      temp1, [pDst], #4                   ;// store pixels 0..3
    473 
    474         USAT16   temp2, #13, p7p5
    475         USAT16   temp1, #13, p6p4
    476         SADD16   p7p5,   p7p5,   c
    477         SADD16   p6p4,   p6p4,   c
    478         AND      temp2, r0x00FF00FF, temp2, ASR #5
    479         AND      temp1, r0x00FF00FF, temp1, ASR #5
    480         ORR      temp1, temp1, temp2, LSL #8
    481         STR      temp1, [pDst], #4                   ;// store pixels 4..7
    482 
    483         USAT16   temp2, #13, p11p9
    484         USAT16   temp1, #13, p10p8
    485         SADD16   p11p9,  p11p9,  c
    486         SADD16   p10p8,  p10p8,  c
    487         AND      temp2, r0x00FF00FF, temp2, ASR #5
    488         AND      temp1, r0x00FF00FF, temp1, ASR #5
    489         ORR      temp1, temp1, temp2, LSL #8
    490         STR      temp1, [pDst], #4                   ;// store pixels 8..11
    491 
    492         USAT16   temp2, #13, p15p13
    493         USAT16   temp1, #13, p14p12
    494         SADD16   p15p13, p15p13, c
    495         SADD16   p14p12, p14p12, c
    496         AND      temp2, r0x00FF00FF, temp2, ASR #5
    497         AND      temp1, r0x00FF00FF, temp1, ASR #5
    498         ORR      temp1, temp1, temp2, LSL #8
    499         STR      temp1, [pDst], #4                   ;// store pixels 12..15
    500 
    501         ADDS     r0x00FF00FF, r0x00FF00FF, #1<<28     ;// Loop counter value in top 4 bits; carry sets on the 16th add
    502 
    503         ADD      pDst, pDst, dstStep                  ;// next row (dstStep was reduced by 16 above); leaves flags intact
    504 
    505         BCC      LOOP_PLANE                           ;// Loop for 16 times
    506         MOV      return, #OMX_Sts_NoErr               ;// This path always reports OMX_Sts_NoErr
    507         M_END
    508 
    509         ENDIF ;// ARM1136JS
    510 
    511 
    512         END
    513 ;-----------------------------------------------------------------------------------------------
    514 ; omxVCM4P10_PredictIntra_16x16 ends
    515 ;-----------------------------------------------------------------------------------------------
    516