;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        EXPORT armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe

DEBUG_ON    SETL {FALSE}

    IF CortexA8

        M_START armVCM4P10_InterpolateLuma_HalfHor4x4_unsafe, r11

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare Neon registers
dCoeff5         DN 30.S16
dCoeff20        DN 31.S16

qSrcA01         QN 11.U8
qSrcB01         QN 12.U8
qSrcC01         QN 13.U8
qSrcD01         QN 14.U8

dSrcA0          DN 22.U8
dSrcA1          DN 23.U8
dSrcB0          DN 24.U8
dSrcB1          DN 25.U8
dSrcC0          DN 26.U8
dSrcC1          DN 27.U8
dSrcD0          DN 28.U8
dSrcD1          DN 29.U8

dSrcb           DN 12.U8
dSrce           DN 13.U8
dSrcf           DN 10.U8

dSrc0c          DN 14.U8
dSrc1c          DN 16.U8
dSrc2c          DN 18.U8
dSrc3c          DN 20.U8

dSrc0d          DN 15.U8
dSrc1d          DN 17.U8
dSrc2d          DN 19.U8
dSrc3d          DN 21.U8

qTemp01         QN 4.S16
qTemp23         QN 6.S16
dTemp0          DN 8.S16
dTemp2          DN 12.S16

qRes01          QN 11.S16
qRes23          QN 12.S16
qRes45          QN 13.S16
qRes67          QN 14.S16

dRes0           DN 22.S16
dRes2           DN 24.S16
dRes4           DN 26.S16
dRes6           DN 28.S16

dAcc0           DN 22.U8
dAcc2           DN 24.U8
dAcc4           DN 26.U8
dAcc6           DN 28.U8

dResult0        DN 22.U32
dResult2        DN 24.U32
dResult4        DN 26.U32
dResult6        DN 28.U32

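;// Half-pel horizontal 6-tap filter: for each of the four rows A..D loaded
;// below, six consecutive samples a,b,c,d,e,f produce
;//     out = Sat( (a - 5*b + 20*c + 20*d - 5*e + f + 16) >> 5 )
;// built up as Acc = (a+f) + 20*(c+d) - 5*(b+e), then rounded, shifted and
;// saturated by VQRSHRUN. dCoeff5 and dCoeff20 are assumed to hold the
;// constants 5 and 20 on entry (they are not loaded in this file, so the
;// caller of this _unsafe routine presumably sets them up).
;// Note that several register names above alias the same physical registers
;// (e.g. dRes0/dAcc0/dResult0 all map to d22, and dTemp0/dTemp2 are the low
;// halves of qTemp01/qTemp23); the instruction scheduling relies on this reuse.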
        VLD1        qSrcA01, [pSrc], srcStep    ;// Load A register [a0 a1 a2 a3 ..]
        ;// One cycle stall
        VEXT        dSrcf, dSrcA0, dSrcA1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcA0, dSrcA1, #1   ;// [b0 b1 b2 b3 ..]
;        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
        VEXT        dSrc0c, dSrcA0, dSrcA1, #2
        VEXT        dSrc0d, dSrcA0, dSrcA1, #3
        VEXT        dSrce, dSrcA0, dSrcA1, #4
        VADDL       qRes01, dSrcA0, dSrcf       ;// Acc=a+f
        VADDL       qTemp01, dSrc0c, dSrc0d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        VLD1        qSrcB01, [pSrc], srcStep    ;// Load B register [a0 a1 a2 a3 ..]
;        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
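;// The reference VMLS is kept below as a comment; it is split into a VMUL here
;// and a VSUB a few instructions later (the lines tagged "TeRi"), presumably as
;// a Cortex-A8 scheduling workaround. The arithmetic is unchanged: Acc -= 5*(b+e).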
        VMLA        dRes0, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
;        VMLS        dRes0, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// 5*(b+e) for the VSUB below  TeRi

        VEXT        dSrcf, dSrcB0, dSrcB1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcB0, dSrcB1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc1c, dSrcB0, dSrcB1, #2
        VEXT        dSrc1d, dSrcB0, dSrcB1, #3
        VEXT        dSrce, dSrcB0, dSrcB1, #4
        VADDL       qRes23, dSrcB0, dSrcf       ;// Acc=a+f

        VSUB        dRes0, dRes0, dTemp0        ;// Acc -= 5*(b+e)  TeRi

        VADDL       qTemp01, dSrc1c, dSrc1d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        VLD1        qSrcC01, [pSrc], srcStep    ;// Load C register [a0 a1 a2 a3 ..]
;        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]

        VMLA        dRes2, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
;        VMLS        dRes2, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// 5*(b+e) for the VSUB below  TeRi

        VEXT        dSrcf, dSrcC0, dSrcC1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcC0, dSrcC1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc2c, dSrcC0, dSrcC1, #2
        VEXT        dSrc2d, dSrcC0, dSrcC1, #3
        VEXT        dSrce, dSrcC0, dSrcC1, #4
        VADDL       qRes45, dSrcC0, dSrcf       ;// Acc=a+f

        VSUB        dRes2, dRes2, dTemp0        ;// Acc -= 5*(b+e)  TeRi

        VADDL       qTemp01, dSrc2c, dSrc2d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e

        VLD1        qSrcD01, [pSrc], srcStep    ;// Load D register [a0 a1 a2 a3 ..]

        VMLA        dRes4, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
;        VMLS        dRes4, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)
        VMUL        dTemp0, dTemp2, dCoeff5     ;// 5*(b+e) for the VSUB below  TeRi

        VEXT        dSrcf, dSrcD0, dSrcD1, #5   ;// [f0 f1 f2 f3 ..]
        VEXT        dSrcb, dSrcD0, dSrcD1, #1   ;// [b0 b1 b2 b3 ..]
        VEXT        dSrc3c, dSrcD0, dSrcD1, #2
        VEXT        dSrc3d, dSrcD0, dSrcD1, #3
        VEXT        dSrce, dSrcD0, dSrcD1, #4
        VADDL       qRes67, dSrcD0, dSrcf       ;// Acc=a+f

        VSUB        dRes4, dRes4, dTemp0        ;// Acc -= 5*(b+e)  TeRi

        VADDL       qTemp01, dSrc3c, dSrc3d     ;// c+d
        VADDL       qTemp23, dSrcb, dSrce       ;// b+e
        VMLA        dRes6, dTemp0, dCoeff20     ;// Acc += 20*(c+d)
        VMLS        dRes6, dTemp2, dCoeff5      ;// Acc -= 5*(b+e)

        VQRSHRUN    dAcc0, qRes01, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc2, qRes23, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc4, qRes45, #5           ;// Acc = Sat ((Acc + 16) / 32)
        VQRSHRUN    dAcc6, qRes67, #5           ;// Acc = Sat ((Acc + 16) / 32)
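
;// The four filtered rows are returned in dAcc0, dAcc2, dAcc4 and dAcc6;
;// pDst and dstStep are declared above but never written here, so the stores
;// are presumably left to the calling function.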

        M_END

    ENDIF


    END