Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   12290
     21 ;// Date:       Wednesday, April 9, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27         INCLUDE omxtypes_s.h
     28         INCLUDE armCOMM_s.h
     29 
     30         M_VARIANTS CortexA8
     31 
     32 
     33 ;//-------------------------------------------------------
     34 ;// This table for implementing switch case of C in asm by
     35 ;// the method of two levels of indexing.
     36 ;//-------------------------------------------------------
     37 
     38     M_TABLE armVCM4P10_pIndexTable16x16              ;// Jump table: word i is the code label loaded into pc when predMode == i
     39     DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR         ;// entries 0, 1
     40     DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE       ;// entries 2, 3
     41 
     42 
     43     IF CortexA8
     44 
     45     M_TABLE armVCM4P10_MultiplierTable16x16,1        ;// Three rows of eight halfwords, read in order by the PLANE case
     46     DCW   7,  6,  5,  4,  3,  2,  1,  8              ;// row 0: weights multiplied with the above/left difference vectors (H and V sums)
     47     DCW   0,  1,  2,  3,  4,  5,  6,  7              ;// row 1: column offsets 0..7, multiplied by b
     48     DCW   8,  9, 10, 11, 12, 13, 14, 15              ;// row 2: column offsets 8..15, multiplied by b
     49 
     50 ;//--------------------------------------------
     51 ;// Constants
     52 ;//--------------------------------------------
     53 BLK_SIZE        EQU 0x10                             ;// 16: initial value of the row counter y
     54 MUL_CONST0      EQU 0x01010101                       ;// NOTE(review): MUL_CONST0..3 and MASK_CONST are not referenced in the code visible in this file
     55 MUL_CONST1      EQU 0x00060004
     56 MUL_CONST2      EQU 0x00070005
     57 MUL_CONST3      EQU 0x00030001
     58 MASK_CONST      EQU 0x00FF00FF
     59 
     60 ;//--------------------------------------------
     61 ;// Scratch variable
     62 ;//--------------------------------------------
     63 y               RN 12                                ;// row counter for LoopHor / LoopPlane
     64 pc              RN 15                                ;// program counter, written directly by the mode dispatch
     65 
     66 return          RN 0                                 ;// status code returned to the caller (same register as pSrcLeft)
     67 pTable          RN 9                                 ;// address of the mode jump table
     68 count           RN 11                                ;// DC mode: number of available neighbours (same register as pTmp2)
     69 pMultTable      RN 9                                 ;// PLANE mode: multiplier table pointer (reuses r9 after dispatch)
     70 ; ----------------------------------------------
     71 ; Neon registers
     72 ; ----------------------------------------------
     73 qAbove          QN Q0.U8                             ;// 16 bytes loaded from pSrcAbove
     74 qLeft           QN Q1.U8                             ;// left-column bytes (gathered lane by lane, or one byte replicated in HOR)
     75 qSum8           QN Q0.U16                            ;// DC: pairwise sums; same register as qAbove
     76 dSum80          DN D0.U16                            ;// low half of qSum8
     77 dSum81          DN D1.U16                            ;// high half of qSum8
     78 dSum4           DN D0.U16                            ;// dSum4 / dSum2 / dSum1 are successive reduction views of D0
     79 dSum2           DN D0.U32
     80 dSum1           DN D0.U64
     81 qOut            QN Q3.U8                             ;// row written to pDst (DC and PLANE); Q3 is D6:D7
     82 dSumLeft        DN D6.U64                            ;// DC: sum of the left column (low half of qOut)
     83 dSumAbove       DN D7.U64                            ;// DC: sum of the above row (high half of qOut)
     84 dSum            DN D8.U64                            ;// DC: rounded average
     85 dSum0           DN D8.U8[0]                          ;// byte 0 of dSum, broadcast into qOut
     86 
     87 qH              QN Q11.S32                           ;// PLANE: weighted horizontal differences
     88 qV              QN Q12.S32                           ;// PLANE: weighted vertical differences
     89 qA              QN Q11.S16                           ;// PLANE: 'a' term; reuses Q11 after b and c have been extracted
     90 qB              QN Q6.S16                            ;// PLANE: b replicated in every lane
     91 qC              QN Q7.S16                            ;// PLANE: c replicated in every lane
     92 
     93 qB0             QN Q5.S16                            ;// b * [0..7]
     94 qB1             QN Q6.S16                            ;// b * [8..15]; same register as qB
     95 dA1             DN D23.S16                           ;// high half of qA
     96 
     97 dH0             DN D22.S32
     98 dH1             DN D23.S32
     99 dV0             DN D24.S32
    100 dV1             DN D25.S32
    101 
    102 qHV             QN Q11.S64                           ;// [V-derived | H-derived] as two 64-bit lanes
    103 qHV0            QN Q11.S32
    104 qHV1            QN Q12.S64                           ;// scratch for the *5 and *7 computations
    105 
    106 dHV00           DN D22.S32
    107 dHV01           DN D23.S32
    108 
    109 dHV0            DN D22.S16[0]                        ;// low halfword of the H lane (b after scaling)
    110 dHV1            DN D23.S16[0]                        ;// low halfword of the V lane (c after scaling)
    111 dHV10           DN D24.S64
    112 dHV11           DN D25.S64
    113 
    114 qSum0           QN Q0.S16                            ;// PLANE: running row values for columns 0..7
    115 qSum1           QN Q1.S16                            ;// PLANE: running row values for columns 8..15
    116 
    117 dOut0           DN D6.U8                             ;// low half of qOut
    118 dOut1           DN D7.U8                             ;// high half of qOut
    119 
    120 dLeft0          DN D2.U8                             ;// low half of qLeft
    121 dLeft1          DN D3.U8                             ;// high half of qLeft
    122 qConst          QN Q13.S16                           ;// PLANE: a - 7*(b+c)
    123 
    124 dAbove0         DN D0.U8                             ;// low half of qAbove
    125 dAbove1         DN D1.U8                             ;// high half of qAbove
    126 
    127 dRevLeft64      DN D12.U64
    128 dRevLeft        DN D12.U8                            ;// byte-reversed dLeft1
    129 dRevAbove64     DN D5.U64
    130 dRevAbove       DN D5.U8                             ;// byte-reversed dAbove1
    131 qLeftDiff       QN Q8.S16
    132 dLeftDiff1      DN D17.S16
    133 dLeftDiff64     DN D17.S64
    134 qDiffLeft       QN Q8.S16                            ;// same register as qLeftDiff
    135 qDiffAbove      QN Q4.S16                            ;// same register as qAboveDiff
    136 dAboveDiff1     DN D9.S16
    137 dAboveDiff64    DN D9.S64
    138 qAboveDiff      QN Q4.S16
    139 
    140 dAboveLeft      DN D4.U8                             ;// only lane 0 is loaded (from pSrcAboveLeft)
    141 
    142 dDiffLeft0      DN D16.S16
    143 dDiffLeft1      DN D17.S16
    144 dDiffAbove0     DN D8.S16
    145 dDiffAbove1     DN D9.S16
    146 
    147 qLeft15minus0   QN Q7.S16                            ;// same register as qC; consumed before qC is written
    148 dLeft15minus0   DN D14.S16
    149 qAbove15minus0  QN Q3.S16                            ;// same register as qOut; consumed before qOut is written
    150 dAbove15minus0  DN D6.S16
    151 
    152 qMultiplier     QN Q10.S16                           ;// row 0 of the multiplier table
    153 qMultiplier0    QN Q10.S16                           ;// row 1 of the multiplier table (reloads Q10)
    154 qMultiplier1    QN Q12.S16                           ;// row 2 of the multiplier table (reuses Q12)
    155 dMultiplier0    DN D20.S16
    156 dMultiplier1    DN D21.S16
    157 
    158 dBPlusCMult7    DN D1.S64                            ;// 7*b + 7*c
    159 dBPlusCMult7S16 DN D1.S16
    160 
    161 qTmp            QN Q0.U8                             ;// HOR: second row register (same register as qAbove)
    162 
    163 ;//--------------------------------------------
    164 ;// Declare input registers
    165 ;//--------------------------------------------
    166 pSrcLeft        RN 0    ;// input pointer: left neighbour column, read one byte every leftStep bytes
    167 pSrcAbove       RN 1    ;// input pointer: 16 bytes of the row above
    168 pSrcAboveLeft   RN 2    ;// input pointer: single above-left byte (read in PLANE mode only)
    169 pDst            RN 3    ;// output pointer: 16 rows of 16 bytes, dstStep bytes apart
    170 leftStep        RN 4    ;// input variable, loaded from stack argument LeftStep
    171 dstStep         RN 5    ;// input variable, loaded from stack argument DstStep
    172 predMode        RN 6    ;// input variable, loaded from stack argument PredMode; indexes the jump table
    173 availability    RN 7    ;// input variable, loaded from stack argument Availability; tested against OMX_VC_LEFT / OMX_VC_UPPER
    174 
    175 pTmp            RN 8    ;// second source or destination pointer, one row ahead of the primary one
    176 step            RN 10   ;// twice leftStep or dstStep, so two pointers can interleave rows
    177 pTmp2           RN 11   ;// HOR: second destination pointer (same register as count)
    178 
    179 ;//-----------------------------------------------------------------------------------------------
    180 ;// omxVCM4P10_PredictIntra_16x16 starts
    181 ;//-----------------------------------------------------------------------------------------------
    182 
    183         ;// Write function header
    184         M_START omxVCM4P10_PredictIntra_16x16, r11, d15  ;// Entry point; presumably saves registers up to r11 / d15 - confirm in armCOMM_s.h
    185 
    186         ;// Define stack arguments (arguments five to eight; the first four arrive in r0-r3)
    187         M_ARG    LeftStep,     4
    188         M_ARG    DstStep,      4
    189         M_ARG    PredMode,     4
    190         M_ARG    Availability, 4
    191 
    192         ;// M_STALL ARM1136JS=4
    193 
    194         LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// Load index table for switch case
    195 
    196         ;// Load argument from the stack
    197         M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
    198         M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
    199         M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
    200         M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
    201 
    202         MOV      y, #BLK_SIZE                        ;// Outer Loop Count
    203         LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode. NOTE(review): predMode is not range-checked; only entries 0..3 exist
    204 
    205 OMX_VC_16X16_VERT                                    ;// Vertical mode: replicate the 16 bytes at pSrcAbove into all 16 rows
    206         VLD1    qAbove,  [pSrcAbove]                 ;// qAbove = 16 bytes read from pSrcAbove
    207         ADD     pTmp, pDst, dstStep                  ;// pTmp addresses row 1 while pDst addresses row 0
    208         ADD     step, dstStep, dstStep               ;// step = 2*dstStep: each pointer advances two rows per store
    209         VST1    qAbove, [pDst], step                 ;// pDst writes rows 0,2,4,...
    210         VST1    qAbove, [pTmp], step                 ;// pTmp writes rows 1,3,5,...
    211         VST1    qAbove, [pDst], step
    212         VST1    qAbove, [pTmp], step
    213         VST1    qAbove, [pDst], step
    214         VST1    qAbove, [pTmp], step
    215         VST1    qAbove, [pDst], step
    216         VST1    qAbove, [pTmp], step
    217         VST1    qAbove, [pDst], step
    218         VST1    qAbove, [pTmp], step
    219         VST1    qAbove, [pDst], step
    220         VST1    qAbove, [pTmp], step
    221         VST1    qAbove, [pDst], step
    222         VST1    qAbove, [pTmp], step
    223         VST1    qAbove, [pDst]                       ;// rows 14 and 15: no post-increment needed
    224         VST1    qAbove, [pTmp]
    225         MOV     return, #OMX_Sts_NoErr               ;// returnNoError
    226         M_EXIT
    227 
    228 OMX_VC_16X16_HOR                                     ;// Horizontal mode: row r is filled with the r-th left-neighbour byte
    229         ADD     pTmp, pSrcLeft, leftStep             ;// pTmp reads odd left samples, pSrcLeft the even ones
    230         ADD     leftStep, leftStep, leftStep         ;// leftStep doubled in place: each pointer skips one sample
    231         ADD     pTmp2, pDst, dstStep                 ;// pTmp2 writes odd rows, pDst the even ones
    232         ADD     dstStep, dstStep, dstStep            ;// dstStep doubled in place
    233 LoopHor                                              ;// each pass handles 8 rows (4 even/odd pairs)
    234         VLD1     {qLeft[]}, [pSrcLeft], leftStep     ;// load one byte and replicate it to all 16 lanes
    235         VLD1     {qTmp[]}, [pTmp], leftStep
    236         SUBS     y, y, #8                            ;// y starts at BLK_SIZE; flags are consumed by the BNE at the loop end
    237         VST1     qLeft, [pDst], dstStep
    238         VST1     qTmp, [pTmp2], dstStep
    239         VLD1     {qLeft[]}, [pSrcLeft], leftStep
    240         VLD1     {qTmp[]}, [pTmp], leftStep
    241         VST1     qLeft, [pDst], dstStep
    242         VST1     qTmp, [pTmp2], dstStep
    243         VLD1     {qLeft[]}, [pSrcLeft], leftStep
    244         VLD1     {qTmp[]}, [pTmp], leftStep
    245         VST1     qLeft, [pDst], dstStep
    246         VST1     qTmp, [pTmp2], dstStep
    247         VLD1     {qLeft[]}, [pSrcLeft], leftStep
    248         VLD1     {qTmp[]}, [pTmp], leftStep
    249         VST1     qLeft, [pDst], dstStep
    250         VST1     qTmp, [pTmp2], dstStep
    251 
    252         BNE      LoopHor                                  ;// Two passes of 8 rows each (y: 16 -> 8 -> 0)
    253         MOV      return, #OMX_Sts_NoErr
    254         M_EXIT
    255 
    256 OMX_VC_16X16_DC                                      ;// DC mode: fill the block with the rounded mean of the available neighbours, or 128 if none
    257         MOV      count, #0                                 ;// count = number of available neighbour edges so far
    258         TST      availability, #OMX_VC_LEFT
    259         BEQ      UpperOrNoneAvailable                      ;// Skip the left-column sum if the left neighbour is unavailable
    260 
    261         ADD     pTmp, pSrcLeft, leftStep                   ;// pTmp reads odd left samples, pSrcLeft the even ones
    262         ADD     step, leftStep, leftStep                   ;// step = 2*leftStep
    263 
    264         VLD1    {qLeft[0]}, [pSrcLeft],step                ;// gather the 16 left bytes into lanes 0..15 of qLeft
    265         VLD1    {qLeft[1]}, [pTmp],step
    266         VLD1    {qLeft[2]}, [pSrcLeft],step
    267         VLD1    {qLeft[3]}, [pTmp],step
    268         VLD1    {qLeft[4]}, [pSrcLeft],step
    269         VLD1    {qLeft[5]}, [pTmp],step
    270         VLD1    {qLeft[6]}, [pSrcLeft],step
    271         VLD1    {qLeft[7]}, [pTmp],step
    272         VLD1    {qLeft[8]}, [pSrcLeft],step
    273         VLD1    {qLeft[9]}, [pTmp],step
    274         VLD1    {qLeft[10]},[pSrcLeft],step
    275         VLD1    {qLeft[11]},[pTmp],step
    276         VLD1    {qLeft[12]},[pSrcLeft],step
    277         VLD1    {qLeft[13]},[pTmp],step
    278         VLD1    {qLeft[14]},[pSrcLeft],step
    279         VLD1    {qLeft[15]},[pTmp]
    280 
    281         VPADDL   qSum8, qLeft                              ;// 16 x u8 -> 8 x u16 pair sums
    282         ADD     count, count, #1                           ;// left edge counted
    283         VPADD    dSum4, dSum80, dSum81                     ;// 8 x u16 -> 4 x u16
    284         VPADDL   dSum2, dSum4                              ;// 4 x u16 -> 2 x u32
    285         VPADDL   dSumLeft, dSum2                           ;// 2 x u32 -> 1 x u64 = sum of the left column
    286         VRSHR    dSum, dSumLeft, #4                        ;// dSum = (sumLeft + 8) >> 4, used if only left is available
    287 
    288 UpperOrNoneAvailable
    289         TST      availability,  #OMX_VC_UPPER              ;// if(availability & #OMX_VC_UPPER)
    290         BEQ      BothOrNoneAvailable                       ;// Skip the above-row sum if the upper neighbour is unavailable
    291         VLD1     qAbove, [pSrcAbove]
    292         ADD      count, count, #1                          ;// if upper inc count by 1
    293         VPADDL   qSum8, qAbove                             ;// same reduction as for the left column
    294         VPADD    dSum4, dSum80, dSum81
    295         VPADDL   dSum2, dSum4
    296         VPADDL   dSumAbove, dSum2                          ;// sum of the above row
    297         VRSHR    dSum, dSumAbove, #4                       ;// dSum = (sumAbove + 8) >> 4; replaced below if both edges are available
    298 
    299 BothOrNoneAvailable
    300         CMP      count, #2                                  ;// check if both available
    301         BNE      NoneAvailable                              ;// zero or one edge: keep dSum as computed above
    302         VADD     dSum, dSumAbove, dSumLeft
    303         VRSHR    dSum, dSum, #5                             ;// dSum = (sumAbove + sumLeft + 16) >> 5
    304 
    305 
    306 NoneAvailable
    307         VDUP     qOut, dSum0                                ;// broadcast the DC byte; dSum is unwritten when count == 0, but qOut is replaced below in that case
    308         CMP      count, #0                                  ;// check if none available
    309         ADD      pTmp, pDst, dstStep                        ;// pTmp writes odd rows, pDst the even ones
    310         ADD      step, dstStep, dstStep                     ;// step = 2*dstStep
    311         BNE      LoopDC
    312         VMOV     qOut, #128                                 ;// no neighbours: every byte is 128
    313 LoopDC                                                      ;// label only; the 16 row stores below are fully unrolled
    314         VST1    qOut, [pDst], step
    315         VST1    qOut, [pTmp], step
    316         VST1    qOut, [pDst], step
    317         VST1    qOut, [pTmp], step
    318         VST1    qOut, [pDst], step
    319         VST1    qOut, [pTmp], step
    320         VST1    qOut, [pDst], step
    321         VST1    qOut, [pTmp], step
    322         VST1    qOut, [pDst], step
    323         VST1    qOut, [pTmp], step
    324         VST1    qOut, [pDst], step
    325         VST1    qOut, [pTmp], step
    326         VST1    qOut, [pDst], step
    327         VST1    qOut, [pTmp], step
    328         VST1    qOut, [pDst], step
    329         VST1    qOut, [pTmp], step
    330         MOV     return, #OMX_Sts_NoErr
    331         M_EXIT
    332 
    333 OMX_VC_16X16_PLANE                                          ;// Plane mode: out[y][x] = sat8((a + b*(x-7) + c*(y-7) + 16) >> 5)
    334         LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
    335         VLD1    qAbove, [pSrcAbove]                         ;// pSrcAbove[x]      :0<= x <= 15
    336         VLD1    dAboveLeft[0],[pSrcAboveLeft]               ;// only lane 0 of dAboveLeft is loaded
    337         ADD     pTmp, pSrcLeft, leftStep                    ;// pTmp reads odd left samples, pSrcLeft the even ones
    338         ADD     step, leftStep, leftStep                    ;// step = 2*leftStep
    339         VLD1    {qLeft[0]},  [pSrcLeft],step                ;// gather the 16 left bytes into lanes 0..15 of qLeft
    340         VLD1    {qLeft[1]},  [pTmp],step
    341         VLD1    {qLeft[2]},  [pSrcLeft],step
    342         VLD1    {qLeft[3]},  [pTmp],step
    343         VLD1    {qLeft[4]},  [pSrcLeft],step
    344         VLD1    {qLeft[5]},  [pTmp],step
    345         VLD1    {qLeft[6]},  [pSrcLeft],step
    346         VLD1    {qLeft[7]},  [pTmp],step
    347         VLD1    {qLeft[8]},  [pSrcLeft],step
    348         VLD1    {qLeft[9]},  [pTmp],step
    349         VLD1    {qLeft[10]}, [pSrcLeft],step
    350         VLD1    {qLeft[11]}, [pTmp],step
    351         VLD1    {qLeft[12]}, [pSrcLeft],step
    352         VLD1    {qLeft[13]}, [pTmp],step
    353         VLD1    {qLeft[14]}, [pSrcLeft],step
    354         VLD1    {qLeft[15]}, [pTmp]
    355 
    356         VREV64  dRevAbove, dAbove1                          ;// pSrcAbove[15:14:13:12:11:10:9:8]
    357         VSUBL   qAbove15minus0, dRevAbove, dAboveLeft       ;// qAbove15minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0] (other lanes unused)
    358         VSHR    dRevAbove64, dRevAbove64, #8                ;// pSrcAbove[14:13:12:11:10:9:8:X]
    359         VSUBL   qAboveDiff, dRevAbove, dAbove0              ;// lane i = pSrcAbove[14-i] - pSrcAbove[i], i = 0..6 (lane 7 is discarded)
    360 
    361         VSHL    dAboveDiff64, dAboveDiff64, #16             ;// move lanes 4..6 up one halfword, dropping lane 7
    362         VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1 ;// high half = diffs 4..6 followed by (pSrcAbove[15] - pSrcAboveLeft[0])
    363 
    364         VREV64  dRevLeft,dLeft1                             ;// pSrcLeft[15:14:13:12:11:10:9:8]
    365         VSUBL   qLeft15minus0,dRevLeft, dAboveLeft          ;// qLeft15minus0[0] = pSrcLeft[15] - pSrcAboveLeft[0] (other lanes unused)
    366         VSHR    dRevLeft64, dRevLeft64, #8                  ;// pSrcLeft[14:13:12:11:10:9:8:X]
    367         VSUBL   qLeftDiff,dRevLeft, dLeft0                  ;// lane i = pSrcLeft[14-i] - pSrcLeft[i], i = 0..6 (lane 7 is discarded)
    368 
    369         ;// Multiplier, lane 0 first = 7,6,5,4,3,2,1,8 (row 0 of the table)
    370         VLD1    qMultiplier, [pMultTable]!
    371 
    372         VSHL    dLeftDiff64, dLeftDiff64, #16               ;// same lane shuffle as for the above row
    373         VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1
    374 
    375         VMULL   qH,dDiffAbove0, dMultiplier0                ;// qH / qV: four 32-bit partial sums each of H and V
    376         VMULL   qV,dDiffLeft0,  dMultiplier0
    377         VMLAL   qH,dDiffAbove1, dMultiplier1
    378         VMLAL   qV,dDiffLeft1,  dMultiplier1
    379 
    380         VPADD   dHV00,dH1,dH0                               ;// reduce H partial sums to two lanes
    381         VPADD   dHV01,dV1,dV0                               ;// reduce V partial sums to two lanes
    382         VPADDL  qHV, qHV0                                   ;// qHV = [V | H] as 64-bit lanes
    383         VSHL    qHV1,qHV,#2
    384         VADD    qHV,qHV,qHV1                                ;// qHV = 5*[V | H]
    385 
    386         ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]
    387         VRSHR   qHV,qHV,#6
    388 
    389         ;// HV1 = [c*7|b*7]
    390         VSHL    qHV1,qHV,#3
    391         VSUB    qHV1,qHV1,qHV
    392 
    393         ;// Multiplier0 = [0|1|2|...|7] (row 1 of the table)
    394         VLD1    qMultiplier0, [pMultTable]!
    395         VDUP    qB, dHV0                                    ;// b in every lane
    396         VDUP    qC, dHV1                                    ;// c in every lane
    397 
    398         VADDL   qA,dAbove1,dLeft1                           ;// lane 7 = pSrcAbove[15] + pSrcLeft[15]; overwrites qHV (b, c already copied)
    399         VSHL    qA,qA, #4                                   ;// a = 16 * (pSrcAbove[15] + pSrcLeft[15])
    400         VDUP    qA,dA1[3]                                   ;// broadcast lane 7 (a) to every lane
    401         VADD    dBPlusCMult7, dHV10, dHV11                  ;// 7*b + 7*c
    402 
    403         ;// Multiplier1 = [8|9|10|...|15]
    404         VLD1    qMultiplier1, [pMultTable]
    405         ;// Const = a - 7*(b+c)
    406         VDUP    qConst, dBPlusCMult7S16[0]
    407         VSUB    qConst, qA, qConst
    408 
    409         ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
    410         VMUL    qB0,qB,qMultiplier0
    411 
    412         ;// B1 = [8*b|9*b|10*b|11*b|....|15*b] (written over qB)
    413         VMUL    qB1,qB,qMultiplier1
    414 
    415         VADD    qSum0, qB0, qConst                          ;// row 0, columns 0..7:  b*x + a - 7*(b+c)
    416         VADD    qSum1, qB1, qConst                          ;// row 0, columns 8..15
    417 
    418         ;// Loops for 16 times (y counts down from BLK_SIZE, one row per pass)
    419 LoopPlane
    420         ;// (b*x + c*y + C)>>5
    421         VQRSHRUN dOut0, qSum0,#5                            ;// rounding shift right by 5, saturated to unsigned 8 bits
    422         VQRSHRUN dOut1, qSum1,#5
    423         SUBS     y, y, #1
    424         VST1     qOut,[pDst],dstStep
    425         VADD     qSum0,qSum0,qC                             ;// next row: add c to every column
    426         VADD     qSum1,qSum1,qC
    427         BNE      LoopPlane
    428 
    429         MOV      return, #OMX_Sts_NoErr
    430 
    431         M_END
    432 
    433         ENDIF ;// CortexA8
    434 
    435         END
    436 ;-----------------------------------------------------------------------------------------------
    437 ; omxVCM4P10_PredictIntra_16x16 ends
    438 ;-----------------------------------------------------------------------------------------------
    439