Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_PredictIntraChroma_8x8_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   9641
     21 ;// Date:       Thursday, February 7, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27 
     28         INCLUDE omxtypes_s.h
     29         INCLUDE armCOMM_s.h
     30 
     31         EXPORT armVCM4P10_pIndexTable8x8
     32 
     33 ;// Define the processor variants supported by this file
     34 
     35          M_VARIANTS ARM1136JS
     36 
     37      AREA table, DATA
     38 ;//-------------------------------------------------------
     39 ;// Jump table that implements a C-style switch statement in
     40 ;// assembly: the function loads pc from entry [predMode].
     41 ;//-------------------------------------------------------
     42 
     43     M_TABLE armVCM4P10_pIndexTable8x8
     44     DCD  OMX_VC_CHROMA_DC,     OMX_VC_CHROMA_HOR          ;// entries 0 and 1
     45     DCD  OMX_VC_CHROMA_VERT,   OMX_VC_CHROMA_PLANE        ;// entries 2 and 3
     46 
             ;// NOTE(review): the halfword table below is not referenced anywhere in the
             ;// visible part of this file, and the meaning of the ",1" argument is set by
             ;// the M_TABLE macro (defined outside this file) - confirm before changing it.
     47     M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
     48     DCW   3, 2, 1,4
     49     DCW  -3,-2,-1,0
     50     DCW   1, 2, 3,4
     51 
     52     IF ARM1136JS
     53 
     54 ;//--------------------------------------------
     55 ;// Constants
     56 ;//--------------------------------------------
     57 
     58 BLK_SIZE        EQU 0x8                 ;// loop count used before dispatch and by the plane row loop
     59 MUL_CONST0      EQU 0x01010101          ;// multiplying an 8-bit value by this copies it into all 4 bytes
     60 MASK_CONST      EQU 0x00FF00FF          ;// keeps the low byte of each halfword (plane mode packing)
     61 MUL_CONST1      EQU 0x80808080          ;// word stored by the DC case when neither neighbour is available
     62 
     63 ;//--------------------------------------------
     64 ;// Scratch variable
     65 ;//--------------------------------------------
     66 y               RN 12                   ;// written once before dispatch; NOTE(review): never read under this name in the visible code
     67 pc              RN 15
     68 return          RN 0                    ;// status value left in r0 on exit (shares r0 with pSrcLeft)
     69 pSrcLeft2       RN 1                    ;// pSrcLeft + leftStep; shares r1 with pSrcAbove
     70 pDst2           RN 2                    ;// pDst + dstStep; shares r2 with pSrcAboveLeft
     71 sum1            RN 6
     72 sum2            RN 7
     73 pTable          RN 9                    ;// address of the jump table, only live until the dispatch
     74 dstStepx2       RN 11
     75 leftStepx2      RN 14
     76 outerCount      RN 14                   ;// same register as leftStepx2, b and tVal14
     77 r0x01010101     RN 10
     78 r0x00FF00FF     RN 11                   ;// same register as dstStepx2 and tVal11
     79 
             ;// Generic temporaries: tValN is simply rN.  They overlap the argument
             ;// registers (r0-r7) and the named aliases above and below, so every write
             ;// to a tValN also destroys whatever named value lives in rN.
     80 tVal0           RN 0
     81 tVal1           RN 1
     82 tVal2           RN 2
     83 tVal3           RN 3
     84 tVal4           RN 4
     85 tVal5           RN 5
     86 tVal6           RN 6
     87 tVal7           RN 7
     88 tVal8           RN 8
     89 tVal9           RN 9
     90 tVal10          RN 10
     91 tVal11          RN 11
     92 tVal12          RN 12
     93 tVal14          RN 14
     94 
     95 b               RN 14                   ;// plane mode: horizontal gradient
     96 c               RN 12                   ;// plane mode: vertical gradient
     97 
             ;// Plane mode: packed pairs of 16-bit pixel accumulators (low halfword = lower index)
     98 p2p0            RN 0
     99 p3p1            RN 1
    100 p6p4            RN 2
    101 p7p5            RN 4
    102 
             ;// Plane mode: the same pairs after clipping and shifting down to 8 bits
    103 pp2pp0          RN 6
    104 pp3pp1          RN 7
    105 pp6pp4          RN 8
    106 pp7pp5          RN 9
    107 
             ;// Plane mode: output words; both names map to r10 and are used one after the other
    108 p3210           RN 10
    109 p7654           RN 10
    110 
    111 ;//--------------------------------------------
    112 ;// Input Arguments
    113 ;//--------------------------------------------
    114 pSrcLeft        RN 0    ;// input pointer: left neighbour column, one byte every leftStep bytes
    115 pSrcAbove       RN 1    ;// input pointer: 8 bytes of the row above
    116 pSrcAboveLeft   RN 2    ;// input pointer: above-left corner byte (only read by the plane case)
    117 pDst            RN 3    ;// output pointer: 8 rows of 8 bytes, dstStep bytes apart
    118 leftStep        RN 4    ;// input variable, loaded from stack argument LeftStep
    119 dstStep         RN 5    ;// input variable, loaded from stack argument DstStep
    120 predMode        RN 6    ;// input variable, loaded from stack argument PredMode; index into the jump table
    121 availability    RN 7    ;// input variable, loaded from stack argument Availability; tested by the DC case only
    122 
    123 ;//-----------------------------------------------------------------------------------------------
    124 ;// omxVCM4P10_PredictIntraChroma_8x8 starts
    125 ;//-----------------------------------------------------------------------------------------------
    126 
    127         ;// Write function header
                ;// Entry point: loads the four stack arguments and jumps to the handler selected
                ;// by predMode through armVCM4P10_pIndexTable8x8.  Every handler finishes by
                ;// putting OMX_Sts_NoErr in r0 and leaving through M_EXIT / M_END.
                ;// NOTE(review): the "r11" argument presumably tells M_START which registers to
                ;// preserve; the macro is defined outside this file - confirm.
    128         M_START omxVCM4P10_PredictIntraChroma_8x8, r11
    129 
    130         ;// Define stack arguments
    131         M_ARG    LeftStep,     4
    132         M_ARG    DstStep,      4
    133         M_ARG    PredMode,     4
    134         M_ARG    Availability, 4
    135 
    136         ;// M_STALL ARM1136JS=4
    137 
    138         LDR      pTable,=armVCM4P10_pIndexTable8x8   ;// Load index table for switch case
    139 
    140 
    141         ;// Load argument from the stack
    142         M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
    143         M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
    144         M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
    145         M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
    146 
    147         MOV      y, #BLK_SIZE                        ;// NOTE(review): r12 is overwritten before being read on every path in the visible code
    148         LDR      pc, [pTable, predMode, LSL #2]      ;// Jump to table[predMode]; predMode is not range-checked here
    149 
    150 OMX_VC_CHROMA_DC
                ;// DC prediction.  This section handles the case where both the upper row and
                ;// the left column are available; otherwise it branches to TST_UPPER.
                ;// Each 4x4 quadrant of the 8x8 block gets its own DC value:
                ;//   top-left     = (upsum1 + leftsum1 + 4) >> 3
                ;//   top-right    = (upsum2 + 2) >> 2
                ;//   bottom-left  = (leftsum2 + 2) >> 2
                ;//   bottom-right = (upsum2 + leftsum2 + 4) >> 3
                ;// where upsum1/2 are the sums of pSrcAbove[0..3]/[4..7] and leftsum1/2 the
                ;// sums of left samples 0..3 / 4..7.
    151         AND      availability, availability,#(OMX_VC_UPPER + OMX_VC_LEFT)
    152         CMP      availability, #(OMX_VC_UPPER + OMX_VC_LEFT) ;// are both OMX_VC_UPPER and OMX_VC_LEFT set?
    153         LDR      r0x01010101, =MUL_CONST0
    154         BNE      TST_UPPER                           ;// Jump to Upper if not both
    155         LDM      pSrcAbove,{tVal8,tVal9}             ;// tVal 8 to 9 = pSrcAbove[0 to 7]
    156 
    157         ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
    158         ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep (r1: pSrcAbove already consumed)
    159 
    160         ;// M_STALL ARM1136JS=1
    161 
    162         UXTB16   tVal7, tVal8                        ;// pSrcAbove[0, 2]
    163         UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
    164         UADD16   sum1, tVal7, tVal8                  ;// pSrcAbove[0, 2] + pSrcAbove[1, 3]
    165 
    166         UXTB16   tVal7, tVal9                        ;// pSrcAbove[4, 6]
    167         UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
    168         UADD16   sum2, tVal7, tVal9                  ;// pSrcAbove[4, 6] + pSrcAbove[5, 7]
    169         ADD      sum1, sum1, sum1, LSR #16           ;// sum(pSrcAbove[0] to pSrcAbove[3])
    170         ADD      sum2, sum2, sum2, LSR #16           ;// sum(pSrcAbove[4] to pSrcAbove[7])
    171         UXTH     sum1, sum1                          ;// upsum1 (Clear the top junk bits)
    172         UXTH     sum2, sum2                          ;// upsum2 (Clear the top junk bits)
    173 
    174         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[0]
    175         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[1]
    176         M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[2] (r4: leftStep no longer needed)
    177         M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[3]
    178         ADD      tVal2, tVal8, tVal9                 ;// tVal2 = tVal8 + tVal9
    179 
    180         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[4]
    181         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[5]
    182         ADD      tVal14, tVal4, tVal12               ;// tVal14 = tVal4 + tVal12 (overwrites leftStepx2, same register)
    183 
    184         LDRB     tVal4, [pSrcLeft]                   ;// tVal4 = pSrcLeft[6] (no post-increment: leftStepx2 is gone)
    185         LDRB     tVal12,[pSrcLeft2]                  ;// tVal12= pSrcLeft[7]
    186         ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
    187         ADD      tVal2, tVal2, tVal14                ;// leftsum1  = sum(pSrcLeft[0] to pSrcLeft[3])
    188         ADD      tVal4, tVal4, tVal12                ;// tVal4 = tVal4 + tVal12
    189         ADD      tVal14, tVal8, tVal4                ;// leftsum2  = sum(pSrcLeft[4] to pSrcLeft[7])
    190         ADD      tVal8, tVal14, #2                   ;// tVal8 = leftsum2 + 2
    191         ADD      tVal9, sum2,   #2                   ;// tVal9 = upsum2 + 2 (taken before sum2 is updated below)
    192         ADD      sum1,  sum1, tVal2                  ;// sum1 = upsum1 + leftsum1
    193         ADD      sum2,  sum2, tVal14                 ;// sum2 = upsum2 + leftsum2
    194         ADD      sum1, sum1, #4                      ;// (sum1 + 4)
    195         ADD      sum2, sum2, #4                      ;// (sum2 + 4)
    196         MOV      sum1,  sum1,  LSR #3                ;// (sum1 + 4)>>3
    197         MOV      tVal9, tVal9, LSR #2                ;// (upsum2 + 2)>>2
    198         MOV      tVal8, tVal8, LSR #2                ;// (leftsum2 + 2)>>2
    199         MOV      sum2,  sum2,  LSR #3                ;// (sum2 + 4)>>3
    200 
    201         MUL      tVal0, sum1, r0x01010101            ;// top-left DC replicated into all 4 bytes
    202         MUL      tVal1, tVal9,r0x01010101            ;// top-right DC replicated into all 4 bytes
    203         MUL      tVal8, tVal8,r0x01010101            ;// bottom-left DC replicated into all 4 bytes
    204         MUL      tVal9, sum2, r0x01010101            ;// bottom-right DC replicated into all 4 bytes
    205 
    206         M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 0 to 1
    207         M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 0 to 1
    208         M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[16 to 23] = tVal 0 to 1
    209         M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[24 to 31] = tVal 0 to 1
    210 
    211         M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[32 to 39] = tVal 8 to 9
    212         M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[40 to 47] = tVal 8 to 9
    213         M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[48 to 55] = tVal 8 to 9
    214         M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[56 to 63] = tVal 8 to 9
    215         MOV      return, #OMX_Sts_NoErr
    216         M_EXIT
    217 
    218 TST_UPPER
                ;// DC prediction when only the upper row is available (availability has already
                ;// been masked to the UPPER/LEFT bits).  The left four columns of all eight rows
                ;// get (sum(pSrcAbove[0..3]) + 2) >> 2 and the right four columns get
                ;// (sum(pSrcAbove[4..7]) + 2) >> 2.  Falls through to TST_LEFT otherwise.
    219 
    220         ;// M_STALL ARM1136JS=3
    221 
    222         CMP      availability, #OMX_VC_UPPER         ;// is OMX_VC_UPPER the only bit left set?
    223 
    224         BNE      TST_LEFT                            ;// Jump to Left if not upper
    225         LDM      pSrcAbove,{tVal8,tVal9}             ;// tVal 8 to 9 = pSrcAbove[0 to 7]
    226 
    227         ;// M_STALL ARM1136JS=3
    228 
    229         UXTB16   tVal7, tVal8                        ;// pSrcAbove[0, 2]
    230         UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
    231         UADD16   sum1,  tVal7, tVal8                 ;// pSrcAbove[0, 2] + pSrcAbove[1, 3]
    232 
    233         UXTB16   tVal7, tVal9                        ;// pSrcAbove[4, 6]
    234         UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
    235         UADD16   sum2,  tVal7, tVal9                 ;// pSrcAbove[4, 6] + pSrcAbove[5, 7]
    236 
    237         ADD      sum1, sum1, sum1, LSR #16           ;// sum(pSrcAbove[0] to pSrcAbove[3])
    238         ADD      sum2, sum2, sum2, LSR #16           ;// sum(pSrcAbove[4] to pSrcAbove[7])
    239 
    240         UXTH     sum1, sum1                          ;// upsum1 (Clear the top junk bits)
    241         UXTH     sum2, sum2                          ;// upsum2 (Clear the top junk bits)
    242 
    243         ADD      sum1, sum1, #2                      ;// sum1 + 2
    244         ADD      sum2, sum2, #2                      ;// sum2 + 2
    245 
    246         MOV      sum1, sum1, LSR #2                  ;// (sum1 + 2)>>2
    247         MOV      sum2, sum2, LSR #2                  ;// (sum2 + 2)>>2
    248 
    249         MUL      sum1, sum1,r0x01010101              ;// replicate the val in all the bytes (sum1 is tVal6)
    250         MUL      sum2, sum2,r0x01010101              ;// replicate the val in all the bytes (sum2 is tVal7)
    251 
    252         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
    253         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
    254         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
    255         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
    256         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[32 to 39] = tVal 6 to 7
    257         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[40 to 47] = tVal 6 to 7
    258         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[48 to 55] = tVal 6 to 7
    259         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[56 to 63] = tVal 6 to 7
    260         MOV      return, #OMX_Sts_NoErr
    261         M_EXIT
    262 
    263 TST_LEFT
                ;// DC prediction when only the left column is available.  Rows 0-3 are filled
                ;// with (sum of left samples 0..3 + 2) >> 2 and rows 4-7 with
                ;// (sum of left samples 4..7 + 2) >> 2.  Branches to TST_COUNT0 otherwise.
    264         ;// M_STALL ARM1136JS=3
    265 
    266         CMP      availability, #OMX_VC_LEFT          ;// is OMX_VC_LEFT the only bit left set?
    267         BNE      TST_COUNT0
    268         ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
    269         ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep
    270 
    271         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[0]
    272         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[1]
    273         M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[2] (r4: leftStep no longer needed)
    274         M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[3]
    275 
    276         ADD      tVal6, tVal8, tVal9                 ;// tVal6 = tVal8 + tVal9
    277 
    278         M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[4]
    279         ADD      tVal7, tVal4, tVal12                ;// tVal7 = tVal4 + tVal12
    280         M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[5]
    281         M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[6]
    282         M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[7]
    283 
    284         ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
    285         ADD      sum1,  tVal6, tVal7                 ;// sum1  = sum(pSrcLeft[0] to pSrcLeft[3])
    286         ADD      tVal4, tVal4, tVal12                ;// tVal4 = tVal4 + tVal12
    287         ADD      sum2,  tVal8, tVal4                 ;// sum2  = sum(pSrcLeft[4] to pSrcLeft[7])
    288 
    289         ADD      sum1, sum1, #2                      ;// sum1 + 2
    290         ADD      sum2, sum2, #2                      ;// sum2 + 2
    291 
    292         MOV      sum1, sum1, LSR #2                  ;// (sum1 + 2)>>2
    293         MOV      sum2, sum2, LSR #2                  ;// (sum2 + 2)>>2
    294 
    295         MUL      tVal6, sum1,r0x01010101             ;// replicate the val in all the bytes
    296         MUL      tVal8, sum2,r0x01010101             ;// replicate the val in all the bytes
    297 
    298         ;// M_STALL ARM1136JS=1
    299         MOV      tVal7,tVal6                         ;// right half of rows 0-3 (overwrites sum2, already consumed)
    300         MOV      tVal9,tVal8                         ;// right half of rows 4-7
    301 
    302         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
    303         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
    304         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
    305         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
    306 
    307         M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[32 to 39] = tVal 8 to 9
    308         M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[40 to 47] = tVal 8 to 9
    309         M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[48 to 55] = tVal 8 to 9
    310         M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[56 to 63] = tVal 8 to 9
    311 
    312         MOV      return, #OMX_Sts_NoErr
    313         M_EXIT                                       ;// Macro to exit midway-break frm case
    314 
    315 TST_COUNT0
                ;// DC prediction when neither the upper row nor the left column is available:
                ;// every byte of the 8x8 block is set to 0x80.
    316         LDR      sum1, =MUL_CONST1                  ;// sum1 (tVal6) = 0x80808080, left half of each row
    317 
    318         ;// M_STALL ARM1136JS=2
    319 
    320         MOV      tVal7, sum1                         ;// tVal7 = 0x80808080, right half of each row
    321 
    322         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
    323         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
    324         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
    325         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
    326         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[32 to 39] = tVal 6 to 7
    327         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[40 to 47] = tVal 6 to 7
    328         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[48 to 55] = tVal 6 to 7
    329         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[56 to 63] = tVal 6 to 7
    330 
    331         MOV      return, #OMX_Sts_NoErr
    332         M_EXIT                                       ;// Macro to exit midway-break frm case
    333 
    334 OMX_VC_CHROMA_HOR
                ;// Horizontal prediction: every output row is filled with its left neighbour.
                ;// Two pointer pairs are used so even rows go through pSrcLeft/pDst and odd rows
                ;// through pSrcLeft2/pDst2.  Each row is written as two words: the first store
                ;// advances by 4, the second by (2*dstStep - 4), i.e. two rows in total.
    335 
    336         ;// M_STALL ARM1136JS=2
    337 
    338         ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep
    339         ADD      leftStepx2, leftStep, leftStep      ;// leftStepx2 = leftStep * 2
    340         ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep
    341         ADD      dstStepx2, dstStep, dstStep         ;// double dstStep
    342         SUB      dstStepx2, dstStepx2, #4            ;// double dstStep  minus 4
    343         LDR      r0x01010101, =MUL_CONST0            ;// Const to repeat the byte in reg 4 times
    344         M_LDRB   tVal6, [pSrcLeft], +leftStepx2      ;// tVal6 = pSrcLeft[0]
    345         M_LDRB   tVal7, [pSrcLeft2],+leftStepx2      ;// tVal7 = pSrcLeft[1]
    346         M_LDRB   tVal8, [pSrcLeft], +leftStepx2      ;// tVal8 = pSrcLeft[2]
    347         M_LDRB   tVal9, [pSrcLeft2],+leftStepx2      ;// tVal9 = pSrcLeft[3]
    348         MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
    349         MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
    350         MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
    351         MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
    352         STR      tVal6, [pDst],  #+4                 ;// row 0, bytes 0 to 3 = tVal6
    353         STR      tVal7, [pDst2], #+4                 ;// row 1, bytes 0 to 3 = tVal7
    354         M_STR    tVal6, [pDst],  dstStepx2           ;// row 0, bytes 4 to 7 = tVal6; pDst  -> row 2
    355         M_STR    tVal7, [pDst2], dstStepx2           ;// row 1, bytes 4 to 7 = tVal7; pDst2 -> row 3
    356         STR      tVal8, [pDst],  #+4                 ;// row 2, bytes 0 to 3 = tVal8
    357         STR      tVal9, [pDst2], #+4                 ;// row 3, bytes 0 to 3 = tVal9
    358         M_STR    tVal8, [pDst],  dstStepx2           ;// row 2, bytes 4 to 7 = tVal8; pDst  -> row 4
    359         M_STR    tVal9, [pDst2], dstStepx2           ;// row 3, bytes 4 to 7 = tVal9; pDst2 -> row 5
    360         M_LDRB   tVal6, [pSrcLeft], +leftStepx2      ;// tVal6 = pSrcLeft[4]
    361         M_LDRB   tVal7, [pSrcLeft2],+leftStepx2      ;// tVal7 = pSrcLeft[5]
    362         M_LDRB   tVal8, [pSrcLeft], +leftStepx2      ;// tVal8 = pSrcLeft[6]
    363         M_LDRB   tVal9, [pSrcLeft2],+leftStepx2      ;// tVal9 = pSrcLeft[7]
    364         MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
    365         MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
    366         MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
    367         MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
    368         STR      tVal6, [pDst],  #+4                 ;// row 4, bytes 0 to 3 = tVal6
    369         STR      tVal7, [pDst2], #+4                 ;// row 5, bytes 0 to 3 = tVal7
    370         M_STR    tVal6, [pDst],  dstStepx2           ;// row 4, bytes 4 to 7 = tVal6; pDst  -> row 6
    371         M_STR    tVal7, [pDst2], dstStepx2           ;// row 5, bytes 4 to 7 = tVal7; pDst2 -> row 7
    372         STR      tVal8, [pDst],  #+4                 ;// row 6, bytes 0 to 3 = tVal8
    373         STR      tVal9, [pDst2], #+4                 ;// row 7, bytes 0 to 3 = tVal9
    374         M_STR    tVal8, [pDst],  dstStepx2           ;// row 6, bytes 4 to 7 = tVal8
    375         M_STR    tVal9, [pDst2], dstStepx2           ;// row 7, bytes 4 to 7 = tVal9
    376         MOV      return, #OMX_Sts_NoErr
    377         M_EXIT
    378 
    379 OMX_VC_CHROMA_VERT
                ;// Vertical prediction: the 8 bytes at pSrcAbove are copied unchanged into each
                ;// of the 8 output rows.
    380 
    381         ;// M_STALL ARM1136JS=4
    382 
    383         LDMIA    pSrcAbove, {tVal6,tVal7}            ;// tVal 6 to 7 = pSrcAbove[0 to 7]
    384         MOV      return, #OMX_Sts_NoErr              ;// set early; r0 (pSrcLeft) is not used by this case
    385 
    386         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
    387         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
    388         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
    389         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
    390         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[32 to 39] = tVal 6 to 7
    391         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[40 to 47] = tVal 6 to 7
    392         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[48 to 55] = tVal 6 to 7
    393         M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[56 to 63] = tVal 6 to 7
    394 
    395         M_EXIT                                       ;// Macro to exit midway-break frm case
    396 
    397 OMX_VC_CHROMA_PLANE
                ;// Plane prediction.  With
                ;//   a = 16 * (pSrcAbove[7] + pSrcLeft[7*leftStep])
                ;//   H = 4*(A[7]-AL) + 3*(A[6]-A[0]) + 2*(A[5]-A[1]) + (A[4]-A[2])    (A = pSrcAbove, AL = pSrcAboveLeft[0])
                ;//   V = the same weights applied to the left column
                ;//   b = (17*H + 16) >> 5,   c = (17*V + 16) >> 5
                ;// each output pixel is clip8((a + 16 + b*(x-3) + c*(y-3)) >> 5).
                ;// Row 0 is built as eight packed 16-bit accumulators; LOOP_PLANE emits one row
                ;// per iteration and then adds c to every accumulator.
                ;//
                ;// Two fixes relative to the previous revision:
                ;//  * p0 is packed with PKHBT so that a negative p0 cannot smear its sign bits
                ;//    over the p2 halfword (a plain ORR did exactly that).
                ;//  * USAT16 writes to the pp* output registers, so the accumulators keep their
                ;//    unsaturated values and clipping on one row no longer corrupts later rows.
    398 
    399         ;// M_STALL ARM1136JS=3
    400 
    401         RSB      tVal14, leftStep, leftStep, LSL #3  ;// 7*leftStep
    402         LDRB     tVal7, [pSrcAbove, #+7]             ;// pSrcAbove[7]
    403         LDRB     tVal6, [pSrcLeft, +tVal14]          ;// pSrcLeft[7*leftStep]
    404         LDRB     tVal8, [pSrcAboveLeft]              ;// pSrcAboveLeft[0]
    405         LDRB     tVal9, [pSrcAbove, #+6 ]            ;// pSrcAbove[6]
    406         LDRB     tVal10,[pSrcAbove]                  ;// pSrcAbove[0]
    407         ADD      tVal2, tVal7, tVal6                 ;// pSrcAbove[7] + pSrcLeft[7*leftStep]
    408         SUB      tVal6, tVal6, tVal8                 ;// V0 = pSrcLeft[7*leftStep] - pSrcAboveLeft[0]
    409         SUB      tVal7, tVal7, tVal8                 ;// H0 = pSrcAbove[7] - pSrcAboveLeft[0]
    410         LSL      tVal2, tVal2, #4                    ;// a = 16 * (pSrcAbove[7] + pSrcLeft[7*leftStep])
    411         ADD      tVal2, tVal2, #16                   ;// a + 16
    412         SUB      tVal9, tVal9,tVal10                 ;// pSrcAbove[6] - pSrcAbove[0]
    413         LDRB     tVal8, [pSrcAbove,#+5]              ;// pSrcAbove[5]
    414         LDRB     tVal10,[pSrcAbove,#+1]              ;// pSrcAbove[1]
    415         ADD      tVal9, tVal9, tVal9, LSL #1         ;// H1 = 3 * (pSrcAbove[6] - pSrcAbove[0])
    416         ADD      tVal7, tVal9, tVal7, LSL #2         ;// H = H1 + 4*H0
    417         SUB      tVal8, tVal8, tVal10                ;// pSrcAbove[5] - pSrcAbove[1]
    418         LDRB     tVal9, [pSrcAbove,#+4]              ;// pSrcAbove[4]
    419         LDRB     tVal10,[pSrcAbove,#+2]              ;// pSrcAbove[2]
    420         ADD      tVal7, tVal7, tVal8, LSL #1         ;// H = H + 2*(pSrcAbove[5] - pSrcAbove[1])
    421         SUB      tVal11, tVal14,leftStep             ;// 6*leftStep
    422         ADD      tVal11, pSrcLeft, tVal11            ;// pSrcLeft + 6*leftStep (walks upwards)
    423         MOV      tVal12, pSrcLeft                    ;// pSrcLeft (walks downwards)
    424         SUB      tVal9, tVal9, tVal10                ;// pSrcAbove[4] - pSrcAbove[2]
    425         ADD      tVal7, tVal7, tVal9                 ;// H = H + (pSrcAbove[4] - pSrcAbove[2])
    426         M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[6*leftStep]
    427         M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[0]
    428         ADD      tVal7, tVal7, tVal7, LSL #4         ;// 17 * H
    429         ADD      tVal7, tVal7, #16                   ;// 17 * H + 16
    430         SUB      tVal8, tVal8, tVal10                ;// pSrcLeft[6*leftStep] - pSrcLeft[0]
    431         ASR      b, tVal7, #5                        ;// b = (17 * H + 16) >> 5 (r14: 7*leftStep no longer needed)
    432         ADD      tVal8, tVal8, tVal8, LSL #1         ;// V1 = 3 * (pSrcLeft[6*leftStep] - pSrcLeft[0])
    433         ADD      tVal6, tVal8, tVal6, LSL #2         ;// V = V1 + 4*V0
    434         M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[5*leftStep]
    435         M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[leftStep]
    436         ADD      tVal7, b, b, LSL #1                 ;// 3*b
    437         SUB      tVal2, tVal2, tVal7                 ;// a + 16 - 3*b
    438         SUB      tVal7, tVal8, tVal10                ;// pSrcLeft[5*leftStep] - pSrcLeft[leftStep]
    439         M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[4*leftStep]
    440         M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[2*leftStep]
    441         ADD      tVal6, tVal6, tVal7, LSL #1         ;// V = V + 2*(pSrcLeft[5*leftStep] - pSrcLeft[leftStep])
    442         LDR      r0x00FF00FF, =MASK_CONST            ;// r0x00FF00FF = 0x00FF00FF (r11: upward pointer no longer needed)
    443         SUB      tVal7, tVal8, tVal10                ;// pSrcLeft[4*leftStep] - pSrcLeft[2*leftStep]
    444         ADD      tVal6, tVal6, tVal7                 ;// V = V + (pSrcLeft[4*leftStep] - pSrcLeft[2*leftStep])
    445         SUB      dstStep, dstStep, #4                ;// dstStep - 4 (each row is stored as 4 bytes + 4 bytes)
    446         ADD      tVal6, tVal6, tVal6, LSL #4         ;// 17*V
    447         ADD      tVal6, tVal6, #16                   ;// 17*V + 16
    448 
    449         ;// M_STALL ARM1136JS=1
    450 
    451         ASR      c, tVal6, #5                        ;// c = (17*V + 16)>>5
    452 
    453         ;// M_STALL ARM1136JS=1
    454 
    455         ADD      tVal6, c, c, LSL #1                 ;// 3*c
    456         UXTH     c, c                                ;// only in half word
    457         SUB      tVal6, tVal2, tVal6                 ;// p0 = a - 3*b - 3*c + 16 (may be negative)
    458         ORR      c, c, c, LSL #16                    ;// {c,c}
    459         ADD      tVal7, b, b                         ;// 2b
    460         ADD      tVal2, tVal6, tVal7                 ;// p2 = p0 + 2*b
    461         ADD      tVal7, tVal7, b                     ;// 3b
    462         PKHBT    p2p0,   tVal6,  tVal2,  LSL #16     ;// p2p0 = {p2[15:0], p0[15:0]}; drops p0's sign-extension bits
    463         UXTH     b, b
    464         UXTH     tVal7, tVal7
    465         ORR      b, b, b, LSL #16                    ;// {b,b}
    466         ORR      tVal7, tVal7, tVal7, LSL #16        ;// {3b,3b}
    467         SADD16   p3p1,   p2p0, b                     ;// p3p1   = p2p0 + {b,b}
    468         SADD16   p6p4,   p3p1, tVal7                 ;// p6p4   = p3p1 + {3b,3b}
    469         SADD16   p7p5,   p6p4, b                     ;// p7p5   = p6p4 + {b,b}
    470         MOV      outerCount, #BLK_SIZE               ;// Outer Loop Count (r14: b already packed above)
    471 
    472 LOOP_PLANE
    473 
    474         USAT16   pp7pp5, #13, p7p5                    ;// clip13(p7) clip13(p5); accumulator left untouched
    475         USAT16   pp6pp4, #13, p6p4                    ;// clip13(p6) clip13(p4); accumulator left untouched
    476         USAT16   pp3pp1, #13, p3p1                    ;// clip13(p3) clip13(p1); accumulator left untouched
    477         USAT16   pp2pp0, #13, p2p0                    ;// clip13(p2) clip13(p0); accumulator left untouched
    478 
    479         AND      pp7pp5, r0x00FF00FF, pp7pp5, ASR #5  ;// clip8(p7) clip8(p5)
    480         AND      pp6pp4, r0x00FF00FF, pp6pp4, ASR #5  ;// clip8(p6) clip8(p4)
    481         AND      pp3pp1, r0x00FF00FF, pp3pp1, ASR #5  ;// clip8(p3) clip8(p1)
    482         AND      pp2pp0, r0x00FF00FF, pp2pp0, ASR #5  ;// clip8(p2) clip8(p0)
    483 
    484         SUBS     outerCount, outerCount, #1           ;// outerCount-- (flags stay valid until the BNE below)
    485 
    486         ORR      p3210, pp2pp0, pp3pp1, LSL #8        ;// pack {p3,p2, p1, p0}
    487         STR      p3210, [pDst], #4                    ;// store {pDst[0] to pDst[3]}
    488 
    489         ORR      p7654, pp6pp4, pp7pp5, LSL #8        ;// pack {p7,p6, p5, p4}
    490         M_STR    p7654, [pDst], dstStep               ;// store {pDst[4] to pDst[7]}, advance to next row
    491 
    492         SADD16   p7p5,   p7p5,   c                    ;// {p7 + c}, {p5 + c}
    493         SADD16   p6p4,   p6p4,   c                    ;// {p6 + c}, {p4 + c}
    494         SADD16   p3p1,   p3p1,   c                    ;// {p3 + c}, {p1 + c}
    495         SADD16   p2p0,   p2p0,   c                    ;// {p2 + c}, {p0 + c}
    496 
    497         BNE      LOOP_PLANE                           ;// Loop for 8 times
    498         MOV      return, #OMX_Sts_NoErr
    499         M_END
    500 
    501         ENDIF ;// ARM1136JS
    502 
    503 
    504 
    505         END
    506 ;//-----------------------------------------------------------------------------------------------
    507 ;// omxVCM4P10_PredictIntraChroma_8x8 ends
    508 ;//-----------------------------------------------------------------------------------------------
    509