Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
     18 #define END(f) .fnend; .size f, .-f;
     19 
     20 .eabi_attribute 25,1 @Tag_ABI_align8_preserved
     21 .arm
     22 
     23 .macro lanepair dst, src, xr0, xr1, yr0, yr1, zr0, zr1
     24 
     25             vmov.s32    r6, r7, \src
     26 
     27             add         r6, r6, r3
     28             add         r7, r7, r3
     29 
     30             vld1.u8     d16, [r6], r4
     31             vld1.u8     d17, [r7], r4
     32 
     33             vld1.u8     d18, [r6], r5
     34             vld1.u8     d19, [r7], r5
     35 
     36             vdup.u8     d6, \yr0
     37             vdup.u8     d7, \yr1
     38             /* Y interpolate, front, lanes 0 and 1 -> q12 and q13 */
     39             vshll.u8    q12, d16, #8
     40             vshll.u8    q13, d17, #8
     41             vmlsl.u8    q12, d16, d6
     42             vmlsl.u8    q13, d17, d7
     43             vmlal.u8    q12, d18, d6
     44             vmlal.u8    q13, d19, d7
     45 
     46             vld1.u8     d18, [r6]
     47             vld1.u8     d19, [r7]
     48 
     49             sub         r6, r6, r4
     50             sub         r7, r7, r4
     51 
     52             vld1.u8     d16, [r6]
     53             vld1.u8     d17, [r7]
     54 
     55             /* Y interpolate, rear, lanes 0 and 1 -> q14 and q15 */
     56             vshll.u8    q14, d16, #8
     57             vshll.u8    q15, d17, #8
     58             vmlsl.u8    q14, d16, d6
     59             vmlsl.u8    q15, d17, d7
     60             vmlal.u8    q14, d18, d6
     61             vmlal.u8    q15, d19, d7
     62 
     63             /* Z interpolate, lane 0 q12/q14 -> q10 */
     64             vshll.u16   q8, d24, #8
     65             vshll.u16   q9, d25, #8
     66             vmlsl.u16   q8, d24, \zr0
     67             vmlsl.u16   q9, d25, \zr0
     68             vmlal.u16   q8, d28, \zr0
     69             vmlal.u16   q9, d29, \zr0
     70             vrshrn.u32  d20, q8, #8
     71             vrshrn.u32  d21, q9, #8
     72 
     73             /* Z interpolate, lane 1 q13/q15 -> q11 */
     74             vshll.u16   q8, d26, #8
     75             vshll.u16   q9, d27, #8
     76             vmlsl.u16   q8, d26, \zr1
     77             vmlsl.u16   q9, d27, \zr1
     78             vmlal.u16   q8, d30, \zr1
     79             vmlal.u16   q9, d31, \zr1
     80             vrshrn.u32  d22, q8, #8
     81             vrshrn.u32  d23, q9, #8
     82 
     83             /* X interpolate, lanes 0 and 1 q10,q11 -> q14 */
     84             vshll.u16   q8, d20, #8
     85             vshll.u16   q9, d22, #8
     86             vmlsl.u16   q8, d20, \xr0
     87             vmlsl.u16   q9, d22, \xr1
     88             vmlal.u16   q8, d21, \xr0
     89             vmlal.u16   q9, d23, \xr1
     90             vshrn.u32   d28, q8, #8
     91             vshrn.u32   d29, q9, #8
     92 
     93             /* pack lanes 0-1 -> d12 */
     94             vqrshrn.u16  \dst, q14, #8
     95 .endm
     96 
     97 /* void rsdIntrinsic3DLUT_K(
     98  *          void *dst,          // r0
     99  *          void const *in,     // r1
    100  *          size_t count,       // r2
    101  *          void const *lut,    // r3
    102  *          int32_t pitchy,     // [sp]
    103  *          int32_t pitchz,     // [sp+#4]
    104  *          int dimx,           // [sp+#8]
    105  *          int dimy,           // [sp+#12]
    106  *          int dimz);          // [sp+#16]
    107  */
    108 ENTRY(rsdIntrinsic3DLUT_K)
    109             push        {r4,r5,r6,r7}
    110             ldr         r4, [sp, #16]
    111             ldr         r5, [sp, #20]
    112             ldr         r6, [sp, #24]
    113             ldr         r7, [sp, #28]
    114             ldr         r12, [sp, #32]
    115             vpush       {d8-d15}
    116 
    117             vmov.u8     d8, #1
    118             vmov.u16    d8[0], r6
    119             vmov.u16    d8[1], r7
    120             vmov.u16    d8[2], r12
    121             vmov.s32    d9, r4, r5
    122 
    123             subs        r2, #8
    124             bge         2f
    125             cmp         r2, #-8
    126             ble         9f
    127             b           4f
    128 
    129             .align 6
    130 1:          vst4.u8     {d12,d13,d14,d15}, [r0]!
    131 /* r0  = dst
    132  * r1  = src
    133  * r2  = count
    134  * r3  = lut
    135  * r4  = pitchy
    136  * r5  = pitchz
    137  * r6 = offset0
    138  * r7 = offset1
    139  */
    140 2:          vld4.u8     {d0,d2,d4,d6}, [r1]!
    141 3:          vmov        d10, d6
    142 /* q0,q1,q2,q5 source data
    143  * q4 dimensions and pitches
    144  * q3, scratch register for scalar access
    145  */
    146             vmov        q3, q4
    147             vmovl.u8    q0, d0
    148             vmovl.u8    q1, d2
    149             vmovl.u8    q2, d4
    150             vmul.u16    q0, q0, d6[0]
    151             vmul.u16    q1, q1, d6[1]
    152             vmul.u16    q2, q2, d6[2]
    153 
    154 /* vrsra.u16 below would be more accurate, but this can result in a dim.0 case
    155  * where we try to read from the limit of the array and the limit +1 to
    156  * interpolate, even though the fractional component is zero.  Strictly this is
    157  * correct, except for the llegal access problem.
    158  */
    159             vsra.u16    q0, q0, #8
    160             vsra.u16    q1, q1, #8
    161             vsra.u16    q2, q2, #8
    162 
    163             vshr.u16    q12, q0, #8
    164             vshr.u16    q13, q1, #8
    165             vshr.u16    q14, q2, #8
    166 
    167             vbic.u16    q0, #0xff00
    168             vmovn.u16   d2, q1
    169             vbic.u16    q2, #0xff00
    170 
    171 /* q0,d2,q2 fractional offset
    172  * q12,q13,q14 integer offset
    173  */
    174 
    175             vshll.u16   q6, d24, #2
    176             vshll.u16   q7, d25, #2
    177             vmovl.u16   q8, d26
    178             vmovl.u16   q9, d27
    179             vmovl.u16   q10, d28
    180             vmovl.u16   q11, d29
    181             vmla.s32    q6, q8,  d9[0]
    182             vmla.s32    q7, q9,  d9[0]
    183             vmla.s32    q6, q10, d9[1]
    184             vmla.s32    q7, q11, d9[1]
    185 
    186 /* q6,q7 list of table offsets */
    187 
    188         /* lanes 0 and 1 */
    189             lanepair dst=d12, src=d12, xr0=d0[0], xr1=d0[1], yr0=d2[0], yr1=d2[1], zr0=d4[0], zr1=d4[1]
    190 
    191         /* lanes 2 and 3 */
    192             lanepair dst=d13, src=d13, xr0=d0[2], xr1=d0[3], yr0=d2[2], yr1=d2[3], zr0=d4[2], zr1=d4[3]
    193 
    194         /* lanes 4 and 5 */
    195             lanepair dst=d14, src=d14, xr0=d1[0], xr1=d1[1], yr0=d2[4], yr1=d2[5], zr0=d5[0], zr1=d5[1]
    196 
    197         /* lanes 6 and 7 */
    198             lanepair dst=d15, src=d15, xr0=d1[2], xr1=d1[3], yr0=d2[6], yr1=d2[7], zr0=d5[2], zr1=d5[3]
    199 
    200             vuzp.u8     d12, d13
    201             vuzp.u8     d14, d15
    202             vuzp.u8     d12, d14
    203             vuzp.u8     d13, d15
    204 
    205             subs        r2, r2, #8
    206             vmov.u8     d15, d10
    207 
    208             bge         1b
    209 
    210             cmp         r2, #-8
    211             blt         1f
    212 
    213             vst4.u8     {d12,d13,d14,d15}, [r0]!
    214 
    215             beq         9f
    216 
    217             /* fill the vector with a safe value */
    218 4:          vld1.u32    {d0[]}, [r1]
    219             vmov        d2, d0
    220             vmov        d4, d0
    221             vmov        d6, d0
    222             tst         r2, #4
    223             beq         2f
    224             vld1.u32    {d0}, [r1]!
    225             vld1.u32    {d2}, [r1]!
    226 2:          tst         r2, #2
    227             beq         2f
    228             vld1.u32    {d4}, [r1]!
    229 2:          tst         r2, #1
    230             beq         2f
    231             vld1.u32    {d6[0]}, [r1]!
    232 2:          vuzp.8      d0, d2
    233             vuzp.8      d4, d6
    234             vuzp.8      d0, d4
    235             vuzp.8      d2, d6
    236             b           3b
    237 
    238 1:          vzip.8      d12, d14
    239             vzip.8      d13, d15
    240             vzip.8      d12, d13
    241             vzip.8      d14, d15
    242             tst         r2, #4
    243             beq         2f
    244             vst1.u32    {d12,d13}, [r0]!
    245 2:          tst         r2, #2
    246             beq         2f
    247             vst1.u32    {d14}, [r0]!
    248 2:          tst         r2, #1
    249             beq         9f
    250             vst1.u32    {d15[0]}, [r0]!
    251 
    252 9:          mov         r0, #0
    253             vpop        {d8-d15}
    254             pop         {r4,r5,r6,r7}
    255             bx lr
    256 END(rsdIntrinsic3DLUT_K)
    257