;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe

    IF CortexA8

;//-----------------------------------------------------------------------
;// armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
;//
;// Vertical half-pel interpolation of a 4x4 luma block using the H.264
;// 6-tap filter (1, -5, 20, 20, -5, 1). For each of the 4 output rows,
;// with source rows a..f (sliding window over 9 loaded rows a..i):
;//     Acc = (a + f) - 5*(b + e) + 20*(c + d)
;// then VQRSHRUN narrows with rounding and unsigned saturation:
;//     result = clip8((Acc + 16) >> 5)
;//
;// In:   r0 = pSrc     - first of the 9 source rows read by this filter
;//       r1 = srcStep  - source row stride in bytes
;//       r2 = pDst, r3 = dstStep - declared but not written here;
;//             presumably the caller stores d0/d2/d4/d6 - TODO confirm
;// Out:  d0, d2, d4, d6 = the 4 interpolated rows as 8-bit pixels
;//       (only the low 4 bytes of each D register are meaningful for 4x4)
;//
;// NOTE(review): d30 (dCoeff5 = 5) and d31 (dCoeff20 = 20) are read but
;// never written in this function - the caller must preload them; verify
;// against the calling wrapper. "_unsafe" = no argument validation.
;//-----------------------------------------------------------------------

        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

Temp            RN 12

;// Declare Neon registers
;// Filter coefficients - expected to be preloaded by the caller (see note)
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

;// Nine consecutive source rows a..i (only 9 rows are needed to produce
;// 4 vertically filtered rows with a 6-tap filter: 4 + 6 - 1 = 9)
dSrc0           DN 7.U8
dSrc1           DN 8.U8
dSrc2           DN 9.U8
dSrc3           DN 10.U8
dSrc4           DN 11.U8
dSrc5           DN 12.U8
dSrc6           DN 13.U8
dSrc7           DN 14.U8
dSrc8           DN 15.U8

;// Widened partial sums; dSumBE0/dSumCD0 are the low halves of the Q regs
qSumBE01        QN 8.S16
qSumCD01        QN 9.S16
dSumBE0         DN 16.S16
dSumCD0         DN 18.S16

;// One 16-bit accumulator per output row
qAcc01          QN 0.S16
qAcc23          QN 1.S16
qAcc45          QN 2.S16
qAcc67          QN 3.S16

;// dRes0..3 alias the low halves of qAcc01..qAcc67 (D0/D2/D4/D6)
dRes0           DN 0.S16
dRes1           DN 2.S16
dRes2           DN 4.S16
dRes3           DN 6.S16

;// Final narrowed 8-bit results - same D registers as dRes0..3
dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8


;// Scratch registers holding the 20*(c+d) term while the accumulator
;// pipeline is busy (VMUL+VADD used instead of VMLA to avoid stalls)
dTmp0           DN 20.S16
dTmp1           DN 21.S16
dTmp2           DN 22.S16
dTmp3           DN 23.S16


        ;// Two interleaved load streams: pSrc walks rows 0..4,
        ;// Temp walks rows 5..8 (pSrc has already advanced one row,
        ;// so Temp = original pSrc + 5*srcStep)
        VLD1        dSrc0, [pSrc], srcStep     ;// [a0 a1 a2 a3 .. ]
        ADD         Temp, pSrc, srcStep, LSL #2
        VLD1        dSrc1, [pSrc], srcStep     ;// [b0 b1 b2 b3 .. ]
        ;// One cycle stall
        VLD1        dSrc5, [Temp], srcStep     ;// [f0 f1 f2 f3 .. ]
        ;// One cycle stall
        VLD1        dSrc2, [pSrc], srcStep     ;// [c0 c1 c2 c3 .. ]
        VADDL       qAcc01, dSrc0, dSrc5       ;// Acc = a+f
        VLD1        dSrc3, [pSrc], srcStep     ;// [d0 d1 d2 d3 .. ]
        ;// One cycle stall
        VLD1        dSrc6, [Temp], srcStep ;// TeRi

        VLD1        dSrc4, [pSrc], srcStep     ;// [e0 e1 e2 e3 .. ]
        VLD1        dSrc7, [Temp], srcStep ;// TeRi
        VADDL       qSumBE01, dSrc1, dSrc4     ;// b+e
        VADDL       qSumCD01, dSrc2, dSrc3     ;// c+d
        VLD1        dSrc8, [Temp], srcStep ;// TeRi
        VMLS        dRes0, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
;        VMLA        dRes0, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp0, dSumCD0, dCoeff20   ;// Tmp = 20*(c+d), folded into Acc below

;        VLD1        dSrc6, [Temp], srcStep
        ;// Row 1: filter window slides down one row (b..g)
        VADDL       qSumBE01, dSrc2, dSrc5     ;// b+e
        VADDL       qSumCD01, dSrc3, dSrc4     ;// c+d
        VADDL       qAcc23, dSrc1, dSrc6       ;// Acc = a+f
        VMLS        dRes1, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
;        VMLA        dRes1, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp1, dSumCD0, dCoeff20   ;// Tmp = 20*(c+d), folded into Acc below

;        VLD1        dSrc7, [Temp], srcStep
        ;// Row 2: window c..h
        VADDL       qSumBE01, dSrc3, dSrc6     ;// b+e
        VADDL       qSumCD01, dSrc4, dSrc5     ;// c+d
        VADDL       qAcc45, dSrc2, dSrc7       ;// Acc = a+f
        VMLS        dRes2, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
;        VMLA        dRes2, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp2, dSumCD0, dCoeff20   ;// Tmp = 20*(c+d), folded into Acc below

;        VLD1        dSrc8, [Temp], srcStep     ;// [i0 i1 i2 i3 .. ]
        ;// Row 3: window d..i; this one keeps the single-instruction VMLA
        ;// since no further work needs to overlap with it
        VADDL       qSumBE01, dSrc4, dSrc7     ;// b+e
        VADDL       qAcc67, dSrc3, dSrc8       ;// Acc = a+f
        VADDL       qSumCD01, dSrc5, dSrc6     ;// c+d
        VMLS        dRes3, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
        VADD        dRes0, dRes0, dTmp0        ;// complete row 0: Acc += 20*(c+d)
        VADD        dRes1, dRes1, dTmp1        ;// complete row 1
        VADD        dRes2, dRes2, dTmp2        ;// complete row 2
        VMLA        dRes3, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
;        VMUL        dTmp3, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
;        VADD        dRes3, dRes3, dTmp3

        ;// Narrow each row: clip8((Acc + 16) >> 5), rounding + unsigned saturation
        VQRSHRUN    dAcc0, qAcc01, #5
        VQRSHRUN    dAcc1, qAcc23, #5
        VQRSHRUN    dAcc2, qAcc45, #5
        VQRSHRUN    dAcc3, qAcc67, #5

        M_END

    ENDIF



    END
    135