Home | History | Annotate | Download | only in src
      1 ;//
      2 ;// Copyright (C) 2007-2008 ARM Limited
      3 ;//
      4 ;// Licensed under the Apache License, Version 2.0 (the "License");
      5 ;// you may not use this file except in compliance with the License.
      6 ;// You may obtain a copy of the License at
      7 ;//
      8 ;//      http://www.apache.org/licenses/LICENSE-2.0
      9 ;//
     10 ;// Unless required by applicable law or agreed to in writing, software
     11 ;// distributed under the License is distributed on an "AS IS" BASIS,
     12 ;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 ;// See the License for the specific language governing permissions and
     14 ;// limitations under the License.
     15 ;//
     16 ;//
     17 ;//
     18 ;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe_s.s
     19 ;// OpenMAX DL: v1.0.2
     20 ;// Revision:   9641
     21 ;// Date:       Thursday, February 7, 2008
     22 ;//
     23 ;//
     24 ;//
     25 ;//
     26 
     27         INCLUDE omxtypes_s.h
     28         INCLUDE armCOMM_s.h
     29 
     30         EXPORT armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
     31 
     32         M_VARIANTS ARM1136JS
     33 
     34 
     35 
     36 
     37 
     38     IF ARM1136JS
     39 
     40         M_ALLOC8 ppDstArgs, 8                               ;// Stack slot for the caller's pDst/dstStep pair (written by M_STRD below)
     41         M_ALLOC4 ppSrc, 4                                   ;// Stack slot for pSrc at the top of the current 4-column group
     42         M_ALLOC4 ppDst, 4                                   ;// Stack slot for pDst at the top of the current 4-column group
     43         M_ALLOC4 pCounter, 4                                ;// Stack slot for the horizontal-pass row counter (its register r8 is reused as ValCA)
     44 
     45         ;// Function header
     46         ;// Function:
     47         ;//     armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe
     48         ;//
     49         ;// Implements diagonal interpolation for a block of size 4x4 in two passes: a vertical
     50         ;// 6-tap (1,-5,20,20,-5,1) pass into the r8 buffer, then a horizontal 6-tap pass that rounds, clamps to 0..255 and stores 4 bytes per row at pDst. Input and output should be aligned.
     51         ;//
     52         ;// Registers used as input for this function
     53         ;// r0 = pSrc, r1 = srcStep, r2 = pDst, r3 = dstStep, r8 = intermediate-buf pointer (the code writes 4 rows of 24 bytes into it)
     54         ;//
     55         ;// Registers preserved for top level function
     56         ;// r2,r3 are restored by the code below. r4,r5,r6,r14 are used as scratch here, so they are presumably saved/restored by M_START/M_END (armCOMM_s.h, not visible here) - TODO confirm
     57         ;//
     58         ;// Registers modified by the function
     59         ;// r7,r8,r9,r10,r11,r12
     60         ;//
     61         ;// Output registers
     62         ;// None. NOTE(review): on exit r0 points at the intermediate buffer and r1 = 24, i.e. they do not hold their entry values - confirm callers do not rely on r0/r1.
     63 
     64         M_START armVCM4P10_InterpolateLuma_HalfDiagVerHor4x4_unsafe, r6     ;// NOTE(review): the trailing r6 presumably selects the register range the macro saves - confirm in armCOMM_s.h
     65 
     66 ;// Declare input registers
     67 pSrc            RN 0
     68 srcStep         RN 1
     69 pDst            RN 2
     70 dstStep         RN 3
     71 
     72 ;// Declare vertical-pass registers. Many names alias the same physical register;
     73 ValA            RN 5                                        ;// each alias is only valid between its own definition and last use below.
     74 ValA0           RN 4
     75 ValA1           RN 5
     76 ValAF0          RN 4
     77 ValAF1          RN 5
     78 
     79 ValB            RN 11
     80 
     81 ValC            RN 5
     82 ValC0           RN 4
     83 ValC1           RN 5
     84 ValCD0          RN 12
     85 ValCD1          RN 14
     86 ValCF0          RN 4
     87 ValCF1          RN 5
     88 
     89 ValD            RN 10
     90 
     91 ValE            RN 7
     92 ValE0           RN 6
     93 ValE1           RN 7
     94 ValEB0          RN 10
     95 ValEB1          RN 11
     96 ValED0          RN 6
     97 ValED1          RN 7
     98 
     99 ValF            RN 10
    100 
    101 ValG            RN 14
    102 ValG0           RN 12
    103 ValG1           RN 14
    104 ValGB0          RN 12
    105 ValGB1          RN 14
    106 
    107 Acc0            RN 4
    108 Acc1            RN 5
    109 Acc2            RN 6
    110 Acc3            RN 7
    111 
    112 Temp            RN 7
    113 Step            RN 6
    114 
    115 pInterBuf       RN 8
    116 Counter         RN 8
    117 r0x00ff00ff     RN 9                                        ;// [0 255 0 255] where 255 is offset
    118 r0x0001fc00     RN 10                                       ;// (255*16*32) - 512: accumulated offset minus the rounding term, removed in the horizontal pass
    119 
    120 
    121 ;// Declare horizontal-pass registers (these reuse r8-r14 from the vertical pass)
    122 ValCA           RN 8
    123 ValDB           RN 9
    124 ValGE           RN 10
    125 ValHF           RN 11
    126 r0x00140001     RN 12
    127 r0x0014fffb     RN 14
    128 
    129 r0x00000200     RN 12
    130 r0x000000ff     RN 12
    131 
    132         M_STRD      pDst, dstStep, ppDstArgs                ;// Save caller's pDst/dstStep; r2/r3 are retargeted to the intermediate buffer
    133         MOV         pDst, pInterBuf                         ;// Vertical pass writes to the intermediate buffer (copy before r8 becomes Counter)
    134         MOV         dstStep, #24                            ;// Intermediate row stride: 12 columns * 2 bytes
    135 
    136         ;// Set up counter of format, [0]  [0]  [1 (height)]  [8 (width)]
    137         MOV         Counter, #1
    138         MOV         Temp, #8
    139         ADD         Counter, Temp, Counter, LSL #8        ;// [0 0 H W]
    140 
    141         LDR         r0x00ff00ff, =0x00ff00ff                ;// [0 255 0 255] 255 is offset to avoid negative results
    142 WidthLoop
    143         M_STR       pSrc, ppSrc                             ;// Remember the top of this 4-column group (source)
    144         M_STR       pDst, ppDst                             ;// Remember the top of this 4-column group (intermediate buffer)
    145 HeightLoop
    146 TwoRowsLoop
    147         M_LDR       ValC, [pSrc], srcStep                   ;// Load  [c3 c2 c1 c0]
    148         M_LDR       ValD, [pSrc], srcStep                   ;// Load  [d3 d2 d1 d0]
    149         M_LDR       ValE, [pSrc], srcStep                   ;// Load  [e3 e2 e1 e0]
    150         SUB         pSrc, pSrc, srcStep, LSL #2             ;// Advanced 3 rows above; stepping back 4 leaves pSrc on row b
    151         UXTAB16     ValC0, r0x00ff00ff, ValC                ;// [0 c2 0 c0] + [0 255 0 255]
    152         UXTAB16     ValC1, r0x00ff00ff, ValC, ROR #8        ;// [0 c3 0 c1] + [0 255 0 255]
    153         LDR         ValB, [pSrc]                            ;// Load  [b3 b2 b1 b0]
    154         UXTAB16     ValE0, r0x00ff00ff, ValE                ;// [0 e2 0 e0] + [0 255 0 255]
    155         UXTAB16     ValE1, r0x00ff00ff, ValE, ROR #8        ;// [0 e3 0 e1] + [0 255 0 255]
    156         UXTAB16     ValCD0, ValC0, ValD                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 d2 0 d0]
    157         UXTAB16     ValCD1, ValC1, ValD, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 d3 0 d1]
    158         UXTAB16     ValEB0, ValE0, ValB                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 b2 0 b0]
    159         RSB         ValCD0, ValEB0, ValCD0, LSL #2          ;// 4*(Off+C+D) - (Off+B+E)
    160 
    161         LDR         ValD, [pSrc, srcStep, LSL #1]                       ;// Reload [d3 d2 d1 d0] (r10 was overwritten by ValEB0)
    162         UXTAB16     ValEB1, ValE1, ValB, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 b3 0 b1]
    163         RSB         ValCD1, ValEB1, ValCD1, LSL #2          ;// 4*(Off+C+D) - (Off+B+E), odd columns
    164 
    165         UXTAB16     ValED0, ValE0, ValD                     ;// [0 e2 0 e0] + [0 255 0 255] + [0 d2 0 d0]
    166         UXTAB16     ValED1, ValE1, ValD, ROR #8             ;// [0 e3 0 e1] + [0 255 0 255] + [0 d3 0 d1]
    167         LDR         ValF, [pSrc, srcStep, LSL #2]           ;// Load  [f3 f2 f1 f0]
    168         M_LDR       ValB, [pSrc], srcStep                   ;// Reload [b3 b2 b1 b0]; pSrc moves on to row c
    169         ADD         ValCD0, ValCD0, ValCD0, LSL #2          ;// 5 * [4*(Off+C+D) - (Off+B+E)]
    170         ADD         ValCD1, ValCD1, ValCD1, LSL #2          ;// Same scaling by 5, odd columns
    171         UXTAB16     ValCF1, ValC1, ValF, ROR #8             ;// [0 c3 0 c1] + [0 255 0 255] + [0 f3 0 f1]
    172         UXTAB16     ValCF0, ValC0, ValF                     ;// [0 c2 0 c0] + [0 255 0 255] + [0 f2 0 f0]
    173         RSB         ValED1, ValCF1, ValED1, LSL #2          ;// 4*(Off+E+D) - (Off+C+F), odd columns
    174 
    175         SUB         ValA, pSrc, srcStep, LSL #1             ;// Address of row a: two rows above row c
    176         LDR         ValA, [ValA]                            ;// Load  [a3 a2 a1 a0]
    177         RSB         ValED0, ValCF0, ValED0, LSL #2          ;// 4*(Off+E+D) - (Off+C+F)
    178         ADD         ValED1, ValED1, ValED1, LSL #2          ;// 5 * [4*(Off+E+D) - (Off+C+F)], odd columns
    179         ADD         ValED0, ValED0, ValED0, LSL #2          ;// 5 * [4*(Off+E+D) - (Off+C+F)]
    180         UXTAB16     ValA0, r0x00ff00ff, ValA                ;// [0 a2 0 a0] + [0 255 0 255]
    181         UXTAB16     ValA1, r0x00ff00ff, ValA, ROR #8        ;// [0 a3 0 a1] + [0 255 0 255]
    182         UXTAB16     ValAF0, ValA0, ValF                     ;// [0 a2 0 a0] + [0 255 0 255] + [0 f2 0 f0]
    183         UXTAB16     ValAF1, ValA1, ValF, ROR #8             ;// [0 a3 0 a1] + [0 255 0 255] + [0 f3 0 f1]
    184         ADD         Acc1, ValCD1, ValAF1                    ;// Acc1 = 16*Off + (A+F) + 20*(C+D) - 5*(B+E), odd columns
    185 
    186         LDR         ValG, [pSrc, srcStep, LSL #2]           ;// Load  [g3 g2 g1 g0]
    187         ADD         Acc0, ValCD0, ValAF0                    ;// Acc0 = 16*Off + (A+F) + 20*(C+D) - 5*(B+E)
    188         STR         Acc1, [pDst, #4]                        ;// Store odd columns [col3 col1] as halfwords
    189         M_STR       Acc0, [pDst], dstStep                   ;// Store even columns [col2 col0] & advance to next intermediate row
    190         UXTAB16     ValG0, r0x00ff00ff, ValG                ;// [0 g2 0 g0] + [0 255 0 255]
    191         UXTAB16     ValG1, r0x00ff00ff, ValG, ROR #8        ;// [0 g3 0 g1] + [0 255 0 255]
    192         UXTAB16     ValGB0, ValG0, ValB                     ;// [0 g2 0 g0] + [0 255 0 255] + [0 b2 0 b0]
    193         UXTAB16     ValGB1, ValG1, ValB, ROR #8             ;// [0 g3 0 g1] + [0 255 0 255] + [0 b3 0 b1]
    194         ADD         Acc2, ValED0, ValGB0                    ;// Acc2 = 16*Off + (B+G) + 20*(D+E) - 5*(C+F)
    195         ADD         Acc3, ValED1, ValGB1                    ;// Acc3 = same as Acc2, odd columns
    196 
    197         STR         Acc3, [pDst, #4]                        ;// Store odd columns [col3 col1] of the second row
    198         M_STR       Acc2, [pDst], dstStep                   ;// Store even columns [col2 col0] & advance to next intermediate row
    199 
    200         SUBS        Counter, Counter, #1 << 8               ;// Height field -= 1; body runs twice, 2 rows each => 4 rows
    201         ADD         pSrc, pSrc, srcStep, LSL #1             ;// Move down two source rows (no S suffix: flags kept for BPL)
    202         BPL         HeightLoop
    203 
    204         M_LDR       pSrc, ppSrc                             ;// Back to the top of this column group
    205         M_LDR       pDst, ppDst
    206         ADDS        Counter, Counter, #(1 << 8)-4           ;// Undo the height borrow and take 4 off width (8 -> 4 -> 0 -> -4): 3 passes = 12 columns
    207         ADD         pSrc, pSrc, #4                          ;// Next 4 source columns (bytes)
    208         ADD         pDst, pDst, #8                          ;// Next 4 intermediate columns (halfwords)
    209         ADD         Counter, Counter, #1<<8                 ;// Re-arm height field to 1; flags from ADDS above still drive BPL
    210         BPL         WidthLoop
    211 
    212         ;//
    213         ;// Horizontal interpolation using multiplication
    214         ;//
    215 
    216         SUB         pSrc, pDst, #24                         ;// pDst is 3*8 bytes past the buffer start here => pSrc = start of intermediate buffer
    217         MOV         srcStep, #24                            ;// Intermediate row stride in bytes
    218         M_LDRD      pDst, dstStep, ppDstArgs                ;// Restore the caller's pDst/dstStep saved at entry
    219 
    220         MOV         Counter, #4                             ;// 4 output rows
    221         LDR         r0x0014fffb, =0x0014fffb                ;// Halfword pair [20, -5]
    222         LDR         r0x00140001, =0x00140001                ;// Halfword pair [20, 1]
    223 
    224 HeightLoop1
    225         M_STR       Counter, pCounter                       ;// Spill the row counter: r8 is needed as ValCA
    226 
    227 
    228         LDR         ValCA, [pSrc], #4                   ;// Load  [0 c 0 a]
    229         LDR         ValDB, [pSrc], #4                   ;// Load  [0 d 0 b]
    230         LDR         ValGE, [pSrc], #4                   ;// Load  [0 g 0 e]
    231         LDR         ValHF, [pSrc], #4                   ;// Load  [0 h 0 f]
    232 
    233         ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
    234         ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
    235         ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
    236         ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i     (i is loaded by the LDRH below)
    237         SMUAD       Acc0, ValCA, r0x00140001            ;// Acc0  = 20*c + a
    238         SMUAD       Acc1, ValDB, r0x00140001            ;// Acc1  = 20*d + b
    239         SMUADX      Acc2, ValGE, r0x0014fffb            ;// Acc2  = -5*g + 20*e (X form crosses the halfwords)
    240         SMUAD       Acc3, ValGE, r0x0014fffb            ;// Acc3  = 20*g - 5*e
    241 
    242         SMLAD       Acc0, ValDB, r0x0014fffb, Acc0      ;// Acc0 += [0 d 0 b] * [0 20 0 -5]
    243         SMLADX      Acc1, ValGE, r0x00140001, Acc1      ;// Acc1 += g + 20*e (crossed)
    244         SMLADX      Acc2, ValHF, r0x00140001, Acc2      ;// Acc2 += h + 20*f (crossed)
    245         SMLADX      Acc3, ValHF, r0x0014fffb, Acc3      ;// Acc3 += -5*h + 20*f (crossed)
    246 
    247         SMLABB      Acc0, ValGE, r0x0014fffb, Acc0      ;// Acc0 += [0 g 0 e] * [0 0 0 -5]
    248         SMLATB      Acc1, ValCA, r0x0014fffb, Acc1      ;// Acc1 += -5*c (top half of ValCA)
    249         SMLATB      Acc2, ValCA, r0x00140001, Acc2      ;// Acc2 += c (top half of ValCA)
    250         SMLATB      Acc3, ValDB, r0x00140001, Acc3      ;// Acc3 += d (top half of ValDB)
    251 
    252         LDRH        ValCA, [pSrc], #8                   ;// Load sample i into r8; 8 = srcStep - 16 moves pSrc to the next row
    253         SMLABB      Acc0, ValHF, r0x00140001, Acc0      ;// Acc0 += [0 h 0 f] * [0 0 0 1]
    254         SMLABB      Acc1, ValHF, r0x0014fffb, Acc1      ;// Acc1 += [0 h 0 f] * [0 0 0 -5]
    255         SMLATB      Acc2, ValDB, r0x0014fffb, Acc2      ;// Acc2 += -5*d (top half of ValDB)
    256         SMLABB      Acc3, ValCA, r0x00140001, Acc3      ;// Acc3 += i (r8 now holds i, not [c a])
    257 
    258         LDR         r0x0001fc00, =0x0001fc00            ;// (0xff * 16 * 32) - 512; r10 is free now that ValGE is dead
    259         SUB         Acc0, Acc0, r0x0001fc00             ;// Remove the accumulated offset, leaving +512 for rounding
    260         SUB         Acc1, Acc1, r0x0001fc00
    261         SUB         Acc2, Acc2, r0x0001fc00
    262         SUB         Acc3, Acc3, r0x0001fc00
    263 
    264         USAT        Acc0, #18, Acc0                     ;// Clamp to 0 .. 2^18-1 so the shift below yields 0..255
    265         USAT        Acc1, #18, Acc1
    266         USAT        Acc2, #18, Acc2
    267         USAT        Acc3, #18, Acc3
    268 
    269         MOV         Acc0, Acc0, LSR #10                 ;// Divide by 1024 (32 * 32 filter gain)
    270         MOV         Acc1, Acc1, LSR #10
    271         MOV         Acc2, Acc2, LSR #10
    272         MOV         Acc3, Acc3, LSR #10
    273 
    274         M_LDR       Counter, pCounter                   ;// Reload the row counter spilled at the top of the loop
    275         ORR         Acc0, Acc0, Acc1, LSL #8            ;// Pack pixels 0 and 1
    276         ORR         Acc2, Acc2, Acc3, LSL #8            ;// Pack pixels 2 and 3
    277         SUBS        Counter, Counter, #1                ;// Flags set here are consumed by BGT below
    278         ORR         Acc0, Acc0, Acc2, LSL #16           ;// Acc0 = [p3 p2 p1 p0]
    279         M_STR       Acc0, [pDst], dstStep               ;// Store 4 output pixels & advance to next destination row
    280         BGT         HeightLoop1
    281 End
    282         SUB         pDst, pDst, dstStep, LSL #2         ;// Rewind pDst by the 4 rows just written
    283         SUB         pSrc, pSrc, srcStep, LSL #2         ;// Rewind pSrc by 4 rows of 24 bytes (back to the intermediate buffer start)
    284 
    285         M_END
    286 
    287     ENDIF
    288 
    289     END
    290 
    291