Home | History | Annotate | Download | only in src
      1 ;//
      2 ;//
      3 ;// File Name:  armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe_s.s
      4 ;// OpenMAX DL: v1.0.2
      5 ;// Revision:   9641
      6 ;// Date:       Thursday, February 7, 2008
      7 ;//
      8 ;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
      9 ;//
     10 ;//
     11 ;//
     12 
     13         INCLUDE omxtypes_s.h
     14         INCLUDE armCOMM_s.h
     15 
     16         EXPORT armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
     17 
     18         M_VARIANTS ARM1136JS
     19 
     20 
     21 
     22     IF ARM1136JS
     23 
     24 
     25         M_ALLOC8 ppDstArgs, 8                           ;// Saved pDst/dstStep pair (written by M_STRD at function entry)
     26         M_ALLOC8 pTempResult1, 8                        ;// Acc0/Acc1 of the previous row pair (pass 1)
     27         M_ALLOC8 pTempResult2, 8                        ;// Acc2/Acc3 of the previous row pair (pass 1)
     28         M_ALLOC4 ppSrc, 4                               ;// Not referenced in this function body
     29         M_ALLOC4 ppDst, 4                               ;// Not referenced in this function body
     30         M_ALLOC4 pDstStep, 4                            ;// Pass-1 destination stride; r3 is also used as Temp1
     31         M_ALLOC4 pSrcStep, 4                            ;// Source stride; r1 is also used as Temp2/ValI
     32         M_ALLOC4 pCounter, 4                            ;// Pass-2 loop counter; r11 is also used as ValHF
     33 
     34         ;// Function header
     35         ;// Function:
     36         ;//     armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe
     37         ;//
     38         ;// Implements diagonal interpolation for a block of size 4x4. Input and output should
     39         ;// be aligned.
                ;//
                ;// The work is done in two passes:
                ;//   Pass 1: the 6-tap filter (1,-5,20,20,-5,1) is applied along each of 10
                ;//           consecutive source rows (9 bytes are read per row). Two rows are
                ;//           processed per iteration as packed halfwords; every result carries
                ;//           a bias of 16*255. 5 x 16 bytes are written to the intermediate buffer.
                ;//   Pass 2: the same filter is applied down each column of the intermediate
                ;//           buffer. The bias is removed, 512 is added for rounding, the value is
                ;//           saturated and shifted right by 10, and one byte per pixel is stored.
     40         ;//
     41         ;// Registers used as input for this function
     42         ;// r0 = pSrc, r1 = srcStep, r2 = pDst, r3 = dstStep, r8 = pInterBuf (intermediate-buf pointer)
     43         ;//
     44         ;// Registers preserved for top level function
     45         ;// r0,r1,r2,r3,r4,r5,r6,r14
                ;// NOTE(review): as written, r2 and r3 hold their entry values at M_END, but r0 holds
                ;// the start address of the intermediate buffer and r1 holds 16. Confirm what callers
                ;// rely on before changing anything here.
     46         ;//
     47         ;// Registers modified by the function
     48         ;// r7,r8,r9,r10,r11,r12
     49         ;//
     50         ;// Output registers
     51         ;// None. Function will preserve r0-r3 (see the NOTE(review) above regarding r0/r1)
     52 
     53         M_START armVCM4P10_InterpolateLuma_HalfDiagHorVer4x4_unsafe, r6    ;// presumably saves registers up to r6 - TODO confirm in armCOMM_s.h
     54 
     55 ;// Declare input registers
     56 pSrc            RN 0
     57 srcStep         RN 1
     58 pDst            RN 2
     59 dstStep         RN 3
     60 
     61 ;// Declare inner loop registers
                ;// Note: many names below share a physical register; each name is only
                ;// valid between the instruction that writes it and its last read.
     62 Acc0            RN 4
     63 Acc1            RN 5
     64 Acc2            RN 6
     65 Acc3            RN 7
     66 
     67 ValA            RN 4                                    ;// same register as Acc0
     68 ValB            RN 5                                    ;// same register as Acc1
     69 ValC            RN 6                                    ;// same register as Acc2
     70 ValD            RN 7                                    ;// same register as Acc3
     71 ValE            RN 8
     72 ValF            RN 9
     73 ValG            RN 12
     74 ValH            RN 14
     75 ValI            RN 1                                    ;// same register as srcStep
     76 
     77 Temp1           RN 3                                    ;// same register as dstStep
     78 Temp2           RN 1                                    ;// same register as srcStep
     79 Temp3           RN 12
     80 Temp4           RN 7
     81 Temp5           RN 5
     82 r0x0fe00fe0     RN 3                                    ;// [0 (16*255 - 16) 0 (16*255 - 16)]
     83 r0x00ff00ff     RN 10                                   ;// [0 255 0 255] where 255 is offset
     84 Counter         RN 11
     85 pInterBuf       RN 8
     86 
     87 ValCA           RN 8
     88 ValDB           RN 9
     89 ValGE           RN 10
     90 ValHF           RN 11                                   ;// same register as Counter
     91 r0x00140001     RN 12
     92 r0x0014fffb     RN 14
     93 
     94 r0x0001fc00     RN 11                                   ;// same register as ValHF/Counter
     95 
     96 Accx            RN 8
     97 Accy            RN 9
     98 Temp6           RN 14
     99 
    100         M_STRD      pDst, dstStep, ppDstArgs            ;// Save the caller's pDst/dstStep; r2/r3 are reused in pass 1
    101 
    102         MOV         pDst, pInterBuf                     ;// Pass 1 writes into the intermediate buffer
    103         MOV         dstStep, #16                        ;// 16 bytes (4 words) per intermediate-buffer row
    104 
    105         ;// Counter = 4: the BPL loop below runs for Counter = 4,3,2,1,0 (5 passes, 2 source rows each)
    106         MOV         Counter, #4
    107         M_STR       dstStep, pDstStep                   ;// Spill: r3 is overwritten as Temp1 inside the loop
    108         M_STR       srcStep, pSrcStep                   ;// Spill: r1 is overwritten as Temp2/ValI inside the loop
    109         LDR         r0x00ff00ff, =0x00ff00ff               ;// [0 255 0 255] 255 is offset to avoid negative results
    110 
    111 HeightLoop
    112 NextTwoRowsLoop
    113         LDR     ValD, [pSrc, srcStep]                   ;// Load row 1 [d1 c1 b1 a1]
    114         LDR     ValA, [pSrc], #4                        ;// Load row 0 [d0 c0 b0 a0]
    115         LDR     ValH, [pSrc, srcStep]                   ;// Load  [h1 g1 f1 e1]
    116         LDR     ValE, [pSrc], #4                        ;// Load  [h0 g0 f0 e0]
    117         LDRB    Temp2, [pSrc, srcStep]                  ;// Load row 1 byte i1 (single byte only)
    118         LDRB    Temp1, [pSrc], #-8                      ;// Load row 0 byte i0, then rewind pSrc to the row start
    119 
    120         PKHBT   ValB, ValA, ValD, LSL #16               ;// [b1 a1 b0 a0]
    121         PKHTB   ValD, ValD, ValA, ASR #16               ;// [d1 c1 d0 c0]
    122         UXTAB16 ValA, r0x00ff00ff, ValB                 ;// [00 a1 00 a0] + [0 255 0 255]
    123         UXTAB16 ValC, r0x00ff00ff, ValD                 ;// [00 c1 00 c0] + [0 255 0 255]
    124         PKHBT   ValI, Temp1, Temp2, LSL #16             ;// [00 i1 00 i0]
    125         PKHBT   ValF, ValE, ValH, LSL #16               ;// [f1 e1 f0 e0]
    126         PKHTB   ValH, ValH, ValE, ASR #16               ;// [h1 g1 h0 g0]
    127         UXTAB16 ValE, r0x00ff00ff, ValF                 ;// [00 e1 00 e0] + [0 255 0 255]
    128 
    129         ;// Calculate Acc0 (both rows at once, one result per halfword)
    130         ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f   (+ 16*255 bias from the three 255 offsets)
    131         UXTAB16 Temp1, ValC, ValD, ROR #8               ;// Temp1 = c + d + 255
    132         UXTAB16 Temp3, ValE, ValB, ROR #8               ;// Temp3 = e + b + 255
    133         RSB     Temp1, Temp3, Temp1, LSL #2             ;// Temp1 = 4*(c + d) - (e + b) + 3*255
    134         UXTAB16 Acc0, ValA, ValF, ROR #8                ;// Acc0 = a + f + 255 (overwrites ValA, same register)
    135         ADD     Temp1, Temp1, Temp1, LSL #2             ;// Temp1 *= 5
    136         ADD     Acc0, Acc0, Temp1
    137 
    138         ;// Calculate Acc1
    139         ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
    140         UXTAB16 Temp1, ValE, ValD, ROR #8               ;// Temp1 = e + d + 255
    141         UXTAB16 Temp3, ValC, ValF, ROR #8               ;// Temp3 = c + f + 255
    142         RSB     Temp1, Temp3, Temp1, LSL #2             ;// Temp1 = 4*(d + e) - (c + f) + 3*255
    143         UXTAB16 ValG, r0x00ff00ff, ValH                 ;// [00 g1 00 g0] + [0 255 0 255]
    144         ADD     Temp1, Temp1, Temp1, LSL #2             ;// Temp1 *= 5
    145         UXTAB16 Acc1, ValG, ValB, ROR #8                ;// Acc1 = g + b + 255 (overwrites ValB, same register)
    146         ADD     Acc1, Acc1, Temp1
    147 
    148         UXTAB16 Acc2, ValC, ValH, ROR #8                ;// Acc2 = c + h + 255 (overwrites ValC, same register)
    149         ADD     ValI, r0x00ff00ff, ValI                 ;// [00 i1 00 i0] + [0 255 0 255]
    150 
    151         ;// Calculate Acc2
    152         ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
    153         UXTAB16 Temp1, ValG, ValD, ROR #8               ;// Temp1 = g + d + 255
    154         UXTAB16 Acc3, ValI, ValD, ROR #8                ;// Acc3 = i + d + 255 (overwrites ValD, same register)
    155         UXTAB16 Temp2, ValE, ValF, ROR #8               ;// Temp2 = e + f + 255 (overwrites ValI, same register)
    156 
    157         RSB     Temp1, Temp1, Temp2, LSL #2             ;// Temp1 = 4*(e + f) - (g + d) + 3*255
    158         UXTAB16 Temp2, ValG, ValF, ROR #8               ;// Temp2 = g + f + 255 (used for Acc3 below)
    159         ADD     Temp1, Temp1, Temp1, LSL #2             ;// Temp1 *= 5
    160         ADD     Acc2, Acc2, Temp1
    161 
    162         ;// Calculate Acc3
    163         ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
    164         UXTAB16 Temp1, ValE, ValH, ROR #8               ;// Temp1 = e + h + 255
    165         RSB     Temp1, Temp1, Temp2, LSL #2             ;// Temp1 = 4*(f + g) - (e + h) + 3*255
    166         ADD     Temp1, Temp1, Temp1, LSL #2             ;// Temp1 *= 5
    167         ADD     Acc3, Acc3, Temp1
    168 
    169         M_LDR   dstStep, pDstStep                       ;// Restore r3 (was Temp1)
    170         M_LDR   srcStep, pSrcStep                       ;// Restore r1 (was Temp2/ValI)
    171 
    172         ;// If Counter is even, park Acc0-Acc3 in the stack temporaries.
    173         ;// If Counter is odd, interleave Acc0-Acc3 with the parked values and store two intermediate-buffer rows.
    174         ANDS        Temp3, Counter, #1
    175         BEQ         NoProcessing
    176 
    177         ;// Packing previous and current Acc0-Acc3 values
                ;// Row at pDst gets [current low half | previous low half] per column,
                ;// row at pDst + dstStep gets [current high half | previous high half].
    178         M_LDRD      Accx, Accy, pTempResult1
    179         PKHBT       Temp6, Accx, Acc0, LSL #16          ;//[0 a2 0 a0] = [0 a3 0 a2] [0 a1 0 a0]
    180         PKHTB       Acc0, Acc0, Accx, ASR #16           ;//[0 a3 0 a1] = [0 a1 0 a0] [0 a3 0 a2]
    181         STR         Acc0, [pDst, dstStep]
    182         STR         Temp6, [pDst], #4
    183         PKHBT       Temp6, Accy, Acc1, LSL #16          ;//[0 b2 0 b0] = [0 b3 0 b2] [0 b1 0 b0]
    184         PKHTB       Acc1, Acc1, Accy, ASR #16            ;//[0 b3 0 b1] = [0 b1 0 b0] [0 b3 0 b2]
    185         M_LDRD      Accx, Accy, pTempResult2
    186         STR         Acc1, [pDst, dstStep]
    187         STR         Temp6, [pDst], #4
    188 
    189         PKHBT       Temp6, Accx, Acc2, LSL #16          ;//[0 c2 0 c0] = [0 c3 0 c2] [0 c1 0 c0]
    190         PKHTB       Acc2, Acc2, Accx, ASR #16            ;//[0 c3 0 c1] = [0 c1 0 c0] [0 c3 0 c2]
    191         STR         Acc2, [pDst, dstStep]
    192         STR         Temp6, [pDst], #4
    193         PKHBT       Temp6, Accy, Acc3, LSL #16          ;//[0 d2 0 d0] = [0 d3 0 d2] [0 d1 0 d0]
    194         PKHTB       Acc3, Acc3, Accy, ASR #16            ;//[0 d3 0 d1] = [0 d1 0 d0] [0 d3 0 d2]
    195         STR         Acc3, [pDst, dstStep]
    196         STR         Temp6, [pDst], #-12                 ;// Rewind pDst to the start of this buffer row
    197         ADD         pDst, pDst, dstStep, LSL #1         ;// Skip the two buffer rows just written
    198         B           AfterStore
    199 
    200 NoProcessing
    201         M_STRD      Acc0, Acc1, pTempResult1
    202         M_STRD      Acc2, Acc3, pTempResult2
    203 AfterStore
    204         SUBS        Counter, Counter, #1                ;// 5 passes (Counter 4..0) x 2 rows = 10 source rows
    205         ADD         pSrc, pSrc, srcStep, LSL #1         ;// Next row pair; ADD leaves the SUBS flags intact for BPL
    206         BPL         HeightLoop
    207 
                ;// The last pass (Counter == 0) took the NoProcessing path, so Acc0-Acc3 still hold
                ;// the final row pair; store them unpacked as the fifth buffer row.
    208         STR         Acc0, [pDst], #4                    ;//[0 a1 0 a0]
    209         STR         Acc1, [pDst], #4
    210         STR         Acc2, [pDst], #4
    211         STR         Acc3, [pDst], #-12                  ;// Rewind pDst to the start of the fifth buffer row
    212 
    213         ;//
    214         ;// Second pass: 6-tap filter down each column of the intermediate buffer, using multiply-accumulate
    215         ;//
    216 
    217         SUB         pSrc, pDst, dstStep, LSL #2         ;// pSrc = start of the intermediate buffer (4 rows back)
    218         MOV         srcStep, #16                        ;// Intermediate-buffer row stride
    219         M_LDRD      pDst, dstStep, ppDstArgs            ;// Restore the caller's pDst/dstStep
    220 
    221         MOV         Counter, #4                         ;// One iteration per output column
    222         LDR         r0x0014fffb, =0x0014fffb            ;// Halfwords [20, -5]
    223         LDR         r0x00140001, =0x00140001            ;// Halfwords [20, 1]
    224 
    225 HeightLoop1
    226         M_STR       Counter, pCounter                   ;// Spill: r11 is overwritten as ValHF below
    227 
    228         M_LDR       ValCA, [pSrc], srcStep               ;// Load  [0 c 0 a]
    229         M_LDR       ValDB, [pSrc], srcStep               ;// Load  [0 d 0 b]
    230         M_LDR       ValGE, [pSrc], srcStep               ;// Load  [0 g 0 e]
    231         M_LDR       ValHF, [pSrc], srcStep               ;// Load  [0 h 0 f]
    232 
    233 
    234         ;// Acc0 = a - 5*b + 20*c + 20*d - 5*e + f
    235         ;// Acc1 = b - 5*c + 20*d + 20*e - 5*f + g
    236         ;// Acc2 = c - 5*d + 20*e + 20*f - 5*g + h
    237         ;// Acc3 = d - 5*e + 20*f + 20*g - 5*h + i
    238 
    239         SMUAD       Acc0, ValCA, r0x00140001            ;// Acc0  = [0 c 0 a] * [0 20 0 1]
    240         SMUAD       Acc1, ValDB, r0x00140001            ;// Acc1  = [0 d 0 b] * [0 20 0 1]
    241         SMUADX      Acc2, ValGE, r0x0014fffb            ;// Acc2  = 20*e - 5*g (X swaps the constant's halfwords)
    242         SMUAD       Acc3, ValGE, r0x0014fffb            ;// Acc3  = [0 g 0 e] * [0 20 0 -5]
    243 
    244         SMLAD       Acc0, ValDB, r0x0014fffb, Acc0      ;// Acc0 += [0 d 0 b] * [0 20 0 -5]
    245         SMLADX      Acc1, ValGE, r0x00140001, Acc1      ;// Acc1 += 20*e + g (X swaps the constant's halfwords)
    246         SMLADX      Acc2, ValHF, r0x00140001, Acc2      ;// Acc2 += 20*f + h (X swaps the constant's halfwords)
    247         SMLADX      Acc3, ValHF, r0x0014fffb, Acc3      ;// Acc3 += 20*f - 5*h (X swaps the constant's halfwords)
    248 
    249         SMLABB      Acc0, ValGE, r0x0014fffb, Acc0      ;// Acc0 += [0 g 0 e] * [0 0 0 -5]
    250         SMLATB      Acc1, ValCA, r0x0014fffb, Acc1      ;// Acc1 += c * -5 (top halfword of ValCA)
    251         SMLATB      Acc2, ValCA, r0x00140001, Acc2      ;// Acc2 += c * 1 (top halfword of ValCA)
    252         SMLATB      Acc3, ValDB, r0x00140001, Acc3      ;// Acc3 += d * 1 (top halfword of ValDB)
    253 
    254         LDRH        ValCA, [pSrc], #4                   ;// ValCA = i (halfword at the fifth buffer row); pSrc += 4 selects the next column
    255         SMLABB      Acc0, ValHF, r0x00140001, Acc0      ;// Acc0 += [0 h 0 f] * [0 0 0 1]
    256         SMLABB      Acc1, ValHF, r0x0014fffb, Acc1      ;// Acc1 += [0 h 0 f] * [0 0 0 -5]
    257         SMLATB      Acc2, ValDB, r0x0014fffb, Acc2      ;// Acc2 += d * -5 (top halfword of ValDB)
    258         SMLABB      Acc3, ValCA, r0x00140001, Acc3      ;// Acc3 += i * 1 (ValCA now holds i)
    259 
    260         LDR         r0x0001fc00, =0x0001fc00            ;// (0xff * 16 * 32) - 512; overwrites ValHF (same register)
    261         SUB         Acc0, Acc0, r0x0001fc00             ;// Remove the accumulated bias, leaving +512 for rounding
    262         SUB         Acc1, Acc1, r0x0001fc00
    263         SUB         Acc2, Acc2, r0x0001fc00
    264         SUB         Acc3, Acc3, r0x0001fc00
    265 
    266         USAT        Acc0, #18, Acc0                     ;// Clamp to 18 unsigned bits so that >> 10 yields 0..255
    267         USAT        Acc1, #18, Acc1
    268         USAT        Acc2, #18, Acc2
    269         USAT        Acc3, #18, Acc3
    270 
    271         MOV         Acc0, Acc0, LSR #10
    272         M_STRB      Acc0, [pDst], dstStep               ;// Output row 0 of this column
    273         MOV         Acc1, Acc1, LSR #10
    274         M_STRB      Acc1, [pDst], dstStep               ;// Output row 1
    275         MOV         Acc2, Acc2, LSR #10
    276         M_STRB      Acc2, [pDst], dstStep               ;// Output row 2
    277         MOV         Acc3, Acc3, LSR #10
    278         M_STRB      Acc3, [pDst], dstStep               ;// Output row 3
    279 
    280 
    281         M_LDR       Counter, pCounter                   ;// Reload the loop counter spilled at the top of the loop
    282         SUB         pDst, pDst, dstStep, LSL #2         ;// Back to the top of the output column
    283         SUB         pSrc, pSrc, srcStep, LSL #2         ;// Back to the first buffer row (already advanced by 4 bytes)
    284         ADD         pDst, pDst, #1                      ;// Next output column
    285         SUBS        Counter, Counter, #1
    286         BGT         HeightLoop1
    287 End
    288         SUB         pDst, pDst, #4                      ;// Undo the four column steps: pDst is back at its entry value
    289         SUB         pSrc, pSrc, #16                     ;// pSrc is back at the start of the intermediate buffer
    290 
    291         M_END
    292 
    293     ENDIF
    294 
    295     END
    296 
    297