;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS

        EXPORT armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe



    IF ARM1136JS

        ;// Function header

        ;// Function:
        ;//     armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe
        ;//
        ;// Implements half-pel vertical interpolation for a 4x4 block. Input and
        ;// output must be word (4-byte) aligned, since rows are loaded and stored
        ;// as whole words.
        ;//
        ;// Registers used as input for this function
        ;// r0 (pSrc) and r2 (pDst) are the source and destination pointers;
        ;// r1 (srcStep) and r3 (dstStep) are the corresponding step sizes
        ;//
        ;// Registers preserved for top level function
        ;// r0,r1,r2,r3,r4,r5,r6,r14
        ;//
        ;// Registers modified by the function
        ;// r7,r8,r9,r10,r11,r12
        ;//
        ;// Output registers
        ;// None. The function preserves r0-r3.
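        ;//
        ;// Overview of the arithmetic (added explanatory comment): each output
        ;// sample is the H.264 6-tap luma filter (1, -5, 20, 20, -5, 1) applied
        ;// down a column, as the Acc0/Acc2 comments in the loop below show.
        ;// Illustrative scalar sketch, where rows y-2 .. y+3 correspond to the
        ;// rows labelled A .. F in the register comments:
        ;//
        ;//   for (y = 0; y < 4; y++)
        ;//       for (x = 0; x < 4; x++) {
        ;//           sum = src[y-2][x] - 5*src[y-1][x] + 20*src[y][x]
        ;//               + 20*src[y+1][x] - 5*src[y+2][x] + src[y+3][x];
        ;//           dst[y][x] = clip(0, 255, (sum + 16) >> 5);
        ;//       }
        ;//
        ;// To stay unsigned, the code biases every packed halfword by 255 during
        ;// the UXTAB16 accumulations and strips the accumulated 16*255 bias with
        ;// a single UQSUB16 at the end.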
        M_START armVCM4P10_InterpolateLuma_HalfVer4x4_unsafe, r6

;// Declare input registers
pSrc            RN 0
srcStep         RN 1
pDst            RN 2
dstStep         RN 3

;// Declare inner loop registers
ValA            RN 5
ValA0           RN 4
ValA1           RN 5
ValAF0          RN 4
ValAF1          RN 5

ValB            RN 11

ValC            RN 5
ValC0           RN 4
ValC1           RN 5
ValCD0          RN 12
ValCD1          RN 14
ValCF0          RN 4
ValCF1          RN 5

ValD            RN 10

ValE            RN 7
ValE0           RN 6
ValE1           RN 7
ValEB0          RN 10
ValEB1          RN 11
ValED0          RN 6
ValED1          RN 7

ValF            RN 10

ValG            RN 14
ValG0           RN 12
ValG1           RN 14
ValGB0          RN 12
ValGB1          RN 14

Acc0            RN 4
Acc1            RN 5
Acc2            RN 6
Acc3            RN 7

Temp            RN 7
Height          RN 3
Step            RN 6

Counter         RN 8
r0x00ff00ff     RN 9                                        ;// [0 255 0 255] where 255 is the offset
r0x0fe00fe0     RN 10                                       ;// [0 (16*255 - 16) 0 (16*255 - 16)]


        LDR         r0x00ff00ff, =0x00ff00ff                ;// [0 255 0 255]; 255 is an offset to avoid negative intermediate results
        MOV         Counter, #2

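        ;// Each iteration of TwoRowsLoop produces two output rows, so two
        ;// iterations (Counter = 2) cover the whole 4x4 block.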
TwoRowsLoop
        M_LDR       ValC, [pSrc], srcStep                   ;// Load  [c3 c2 c1 c0]
        M_LDR       ValD, [pSrc], srcStep                   ;// Load  [d3 d2 d1 d0]
        M_LDR       ValE, [pSrc], srcStep                   ;// Load  [e3 e2 e1 e0]
        SUB         pSrc, pSrc, srcStep, LSL #2             ;// Rewind pSrc to row B
        LDR         ValB, [pSrc]                            ;// Load  [b3 b2 b1 b0]
        UXTAB16     ValC0, r0x00ff00ff, ValC                ;// [0 c2 0 c0] + [0 255 0 255]
        UXTAB16     ValC1, r0x00ff00ff, ValC, ROR #8        ;// [0 c3 0 c1] + [0 255 0 255]

        UXTAB16     ValE0, r0x00ff00ff, ValE                ;// [0 e2 0 e0] + [0 255 0 255]
        UXTAB16     ValE1, r0x00ff00ff, ValE, ROR #8        ;// [0 e3 0 e1] + [0 255 0 255]
        UXTAB16     ValCD0, ValC0, ValD                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16     ValCD1, ValC1, ValD, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 d3 0 d1]
        UXTAB16     ValEB0, ValE0, ValB                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 b2 0 b0]
        RSB         ValCD0, ValEB0, ValCD0, LSL #2          ;// 4*(Off+C+D) - (Off+B+E)
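        ;// Algebra of the RSB above: (Off+C+D) << 2 minus (Off+B+E) equals
        ;// 3*Off + 4*(C+D) - (B+E); the ADD ... LSL #2 further down multiplies
        ;// this by 5, giving 15*Off + 20*(C+D) - 5*(B+E). Adding (Off+A+F)
        ;// later brings the bias to exactly 16*Off.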

        LDR         ValD, [pSrc, srcStep, LSL #1]           ;// Reload [d3 d2 d1 d0]
        UXTAB16     ValEB1, ValE1, ValB, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 b3 0 b1]
        RSB         ValCD1, ValEB1, ValCD1, LSL #2          ;// 4*(Off+C+D) - (Off+B+E)
        ;// One cycle stall
        UXTAB16     ValED0, ValE0, ValD                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 d2 0 d0]
        UXTAB16     ValED1, ValE1, ValD, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 d3 0 d1]

        LDR         ValF, [pSrc, srcStep, LSL #2]           ;// Load  [f3 f2 f1 f0]
        M_LDR       ValB, [pSrc], srcStep                   ;// Reload [b3 b2 b1 b0]
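        ;// ValD and ValB are reloaded above because their registers alias
        ;// ValEB0/ValEB1 (see the RN declarations) and were overwritten by the
        ;// UXTAB16 results.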
        ADD         ValCD0, ValCD0, ValCD0, LSL #2          ;// 5 * [4*(Off+C+D) - (Off+B+E)]
        ADD         ValCD1, ValCD1, ValCD1, LSL #2
        UXTAB16     ValCF1, ValC1, ValF, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 f3 0 f1]
        UXTAB16     ValCF0, ValC0, ValF                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 f2 0 f0]
        RSB         ValED1, ValCF1, ValED1, LSL #2

        SUB         ValA, pSrc, srcStep, LSL #1
        LDR         ValA, [ValA]                            ;// Load  [a3 a2 a1 a0]
        RSB         ValED0, ValCF0, ValED0, LSL #2          ;// 4*(Off+E+D) - (Off+C+F)
        ADD         ValED1, ValED1, ValED1, LSL #2
        ADD         ValED0, ValED0, ValED0, LSL #2          ;// 5 * [4*(Off+E+D) - (Off+C+F)]
        UXTAB16     ValA0, r0x00ff00ff, ValA                ;// [0 a2 0 a0] + [0 255 0 255]
        UXTAB16     ValA1, r0x00ff00ff, ValA, ROR #8        ;// [0 a3 0 a1] + [0 255 0 255]
        UXTAB16     ValAF0, ValA0, ValF                     ;// [0 a2 0 a0] + [0 255 0 255] + [0 f2 0 f0]
        UXTAB16     ValAF1, ValA1, ValF, ROR #8             ;// [0 a3 0 a1] + [0 255 0 255] + [0 f3 0 f1]

        LDR         r0x0fe00fe0, =0x0fe00fe0                ;// [0 (16*255 - 16) 0 (16*255 - 16)], offset removal plus +16 rounding constant
        ADD         Acc1, ValCD1, ValAF1

        LDR         ValG, [pSrc, srcStep, LSL #2]           ;// Load  [g3 g2 g1 g0]
        ADD         Acc0, ValCD0, ValAF0                    ;// Acc0 = 16*Off + (A+F) + 20*(C+D) - 5*(B+E)
        UQSUB16     Acc1, Acc1, r0x0fe00fe0                 ;// Acc1 -= (16*Off - 16)
        UQSUB16     Acc0, Acc0, r0x0fe00fe0
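        ;// 0x0fe0 per halfword is 16*255 - 16: the UQSUB16 removes the 16*255
        ;// offset, folds in the +16 rounding term, and, because the subtraction
        ;// saturates at zero, also performs the lower clip in one instruction.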
        UXTAB16     ValG0, r0x00ff00ff, ValG                ;// [0 g2 0 g0] + [0 255 0 255]
        UXTAB16     ValG1, r0x00ff00ff, ValG, ROR #8        ;// [0 g3 0 g1] + [0 255 0 255]
        UXTAB16     ValGB0, ValG0, ValB                     ;// [0 g2 0 g0] + [0 255 0 255] + [0 b2 0 b0]
        UXTAB16     ValGB1, ValG1, ValB, ROR #8             ;// [0 g3 0 g1] + [0 255 0 255] + [0 b3 0 b1]
        ADD         Acc2, ValED0, ValGB0                    ;// Acc2 = 16*Off + (B+G) + 20*(D+E) - 5*(C+F)
        ADD         Acc3, ValED1, ValGB1
        UQSUB16     Acc3, Acc3, r0x0fe00fe0                 ;// Acc3 -= (16*Off - 16)
        UQSUB16     Acc2, Acc2, r0x0fe00fe0
        USAT16      Acc1, #13, Acc1                         ;// Saturate to 8+5 = 13 bits
        USAT16      Acc0, #13, Acc0
        USAT16      Acc3, #13, Acc3
        USAT16      Acc2, #13, Acc2
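        ;// Each halfword now holds (filtered + 16) saturated to 13 bits, so the
        ;// LSR #5 in the ANDs below leaves a value of at most 255 per lane; the
        ;// masks isolate the byte lanes and the ORR ... LSL #8 interleaves the
        ;// even and odd results back into packed [p3 p2 p1 p0] words.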
        AND         Acc1, r0x00ff00ff, Acc1, LSR #5         ;// [0 a3 0 a1]
        AND         Acc0, r0x00ff00ff, Acc0, LSR #5         ;// [0 a2 0 a0]
        ORR         Acc0, Acc0, Acc1, LSL #8                ;// [a3 a2 a1 a0]
        AND         Acc3, r0x00ff00ff, Acc3, LSR #5         ;// [0 b3 0 b1]
        AND         Acc2, r0x00ff00ff, Acc2, LSR #5         ;// [0 b2 0 b0]

        M_STR       Acc0, [pDst], dstStep                   ;// Store result & adjust pointer
        ORR         Acc2, Acc2, Acc3, LSL #8                ;// [b3 b2 b1 b0]
        M_STR       Acc2, [pDst], dstStep                   ;// Store result & adjust pointer
        ADD         pSrc, pSrc, srcStep, LSL #1             ;// Advance pSrc two rows for the next iteration

        SUBS        Counter, Counter, #1
        BGT         TwoRowsLoop
End
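        ;// Rewind pDst and pSrc by the four rows advanced in the loop so that
        ;// r0-r3 return to the caller unchanged, as the function header states.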
        SUB     pDst, pDst, dstStep, LSL #2
        SUB     pSrc, pSrc, srcStep, LSL #2

        M_END

    ENDIF

    END