;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe

    IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

Temp            RN 12

;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

dSrc0           DN 7.U8
dSrc1           DN 8.U8
dSrc2           DN 9.U8
dSrc3           DN 10.U8
dSrc4           DN 11.U8
dSrc5           DN 12.U8
dSrc6           DN 13.U8
dSrc7           DN 14.U8
dSrc8           DN 15.U8

qSumBE01        QN 8.S16
qSumCD01        QN 9.S16
dSumBE0         DN 16.S16
dSumCD0         DN 18.S16

qAcc01          QN 0.S16
qAcc23          QN 1.S16
qAcc45          QN 2.S16
qAcc67          QN 3.S16

dRes0           DN 0.S16
dRes1           DN 2.S16
dRes2           DN 4.S16
dRes3           DN 6.S16

dAcc0           DN 0.U8
dAcc1           DN 2.U8
dAcc2           DN 4.U8
dAcc3           DN 6.U8


dTmp0           DN 20.S16
dTmp1           DN 21.S16
dTmp2           DN 22.S16
dTmp3           DN 23.S16


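        ;// This '_unsafe' helper computes the H.264 6-tap (1,-5,20,20,-5,1)
        ;// vertical half-pel filter for a 4x4 block: for each output row,
        ;// Acc = (a+f) - 5*(b+e) + 20*(c+d) over six consecutive source rows,
        ;// so nine source rows (dSrc0..dSrc8) are loaded in total.
        ;// dCoeff5/dCoeff20 are assumed to be preloaded with 5 and 20 by the
        ;// caller; the filtered rows are left in dAcc0..dAcc3 for the caller
        ;// to store (pDst/dstStep are not referenced in this routine).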
        VLD1        dSrc0, [pSrc], srcStep     ;// [a0 a1 a2 a3 .. ]
        ADD         Temp, pSrc, srcStep, LSL #2
        VLD1        dSrc1, [pSrc], srcStep     ;// [b0 b1 b2 b3 .. ]
        ;// One cycle stall
        VLD1        dSrc5, [Temp], srcStep
        ;// One cycle stall
        VLD1        dSrc2, [pSrc], srcStep     ;// [c0 c1 c2 c3 .. ]
        VADDL       qAcc01, dSrc0, dSrc5       ;// Acc = a+f
        VLD1        dSrc3, [pSrc], srcStep
        ;// One cycle stall
        VLD1        dSrc6, [Temp], srcStep ;// TeRi

        VLD1        dSrc4, [pSrc], srcStep
        VLD1        dSrc7, [Temp], srcStep ;// TeRi
        VADDL       qSumBE01, dSrc1, dSrc4     ;// b+e
        VADDL       qSumCD01, dSrc2, dSrc3     ;// c+d
        VLD1        dSrc8, [Temp], srcStep ;// TeRi
        VMLS        dRes0, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
;        VMLA        dRes0, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp0, dSumCD0, dCoeff20   ;// Tmp = 20*(c+d), added to Acc below

;        VLD1        dSrc6, [Temp], srcStep
        VADDL       qSumBE01, dSrc2, dSrc5     ;// b+e
        VADDL       qSumCD01, dSrc3, dSrc4     ;// c+d
        VADDL       qAcc23, dSrc1, dSrc6       ;// Acc = a+f
        VMLS        dRes1, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
;        VMLA        dRes1, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp1, dSumCD0, dCoeff20   ;// Tmp = 20*(c+d), added to Acc below

;        VLD1        dSrc7, [Temp], srcStep
        VADDL       qSumBE01, dSrc3, dSrc6     ;// b+e
        VADDL       qSumCD01, dSrc4, dSrc5     ;// c+d
        VADDL       qAcc45, dSrc2, dSrc7       ;// Acc = a+f
        VMLS        dRes2, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
;        VMLA        dRes2, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
        VMUL        dTmp2, dSumCD0, dCoeff20   ;// Tmp = 20*(c+d), added to Acc below

;        VLD1        dSrc8, [Temp], srcStep     ;// [i0 i1 i2 i3 .. ]
        VADDL       qSumBE01, dSrc4, dSrc7     ;// b+e
        VADDL       qAcc67, dSrc3, dSrc8       ;// Acc = a+f
        VADDL       qSumCD01, dSrc5, dSrc6     ;// c+d
        VMLS        dRes3, dSumBE0, dCoeff5    ;// Acc -= 5*(b+e)
        VADD        dRes0, dRes0, dTmp0
        VADD        dRes1, dRes1, dTmp1
        VADD        dRes2, dRes2, dTmp2
        VMLA        dRes3, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
;        VMUL        dTmp3, dSumCD0, dCoeff20   ;// Acc += 20*(c+d)
;        VADD        dRes3, dRes3, dTmp3

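        ;// VQRSHRUN narrows each 16-bit accumulator with a rounding shift and
        ;// unsigned saturation, i.e. dAccN = clip8((AccN + 16) >> 5), which
        ;// completes the (a - 5b + 20c + 20d - 5e + f + 16) >> 5 interpolation.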
        VQRSHRUN    dAcc0, qAcc01, #5
        VQRSHRUN    dAcc1, qAcc23, #5
        VQRSHRUN    dAcc2, qAcc45, #5
        VQRSHRUN    dAcc3, qAcc67, #5

        M_END

    ENDIF



    END
