Home | History | Annotate | Download | only in cpu_ref
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
      17 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
      18 #define END(f) .size f, .-f;
         /* The two helpers above bracket a function body.
          *
          * ENTRY(f) switches to the .text section, aligns the location counter,
          * exports f as a global symbol, marks it as a function symbol and then
          * defines the label f itself.
          * NOTE(review): on this target .align presumably takes a power of two,
          * which would make ".align 4" a 16-byte alignment -- confirm against the
          * assembler in use.
          *
          * END(f) records the size of f as the distance from the label f to the
          * current location, so it must directly follow the last instruction of f.
          */
     19 
     20 
      21 .macro lanepair dst, src0, src1, xr0, xr1, yr0, yr1, zr0, zr1
         /* Interpolate two pixels ("lane 0" and "lane 1") through the table and
          * narrow both results to bytes in dst.
          *
          * Arguments:
          *   dst        destination operand.  For v20.16b and v21.16b the result is
          *              written with uqrshrn2 (upper half of the register); for any
          *              other operand it is written with uqrshrn (lower half).
          *   src0/src1  32-bit vector elements holding each lane's table offset.
          *              They are moved to x6/x7 with smov (sign-extending) and
          *              added to x3.
          *   xr0/xr1    halfword vector elements used as the X weights.
          *   yr0/yr1    byte vector elements used as the Y weights.
          *   zr0/zr1    halfword vector elements used as the Z weights.
          *
          * Registers read but not passed as arguments:
          *   x3         base address added to the offsets
          *   x4, x5     64-bit post-index increments between the loads
          *              (rsdIntrinsic3DLUT_K passes pitchy and pitchz here)
          *   v5         must be all zero: zip1 against it puts every loaded byte
          *              in the top half of a halfword, i.e. value << 8
          *
          * Overwrites x6, x7 and v8-v19 in addition to dst.
          *
          * Every blend below has the form (a << 8) - a*w + b*w.
          */
      22 
                     /* x6/x7 = sign-extended table offset of lane 0 / lane 1 */
      23             smov        x6, \src0
      24             smov        x7, \src1
      25 
                     /* offsets -> addresses */
      26             add         x6, x6, x3
      27             add         x7, x7, x3
      28 
                     /* v16/v17 = 8 bytes (two adjacent 4-byte entries) at the
                      * starting address; then advance by x4 */
      29             ld1         {v16.2s}, [x6], x4
      30             ld1         {v17.2s}, [x7], x4
      31 
                     /* v18/v19 = 8 bytes at start + x4; then advance by x5 */
      32             ld1         {v18.2s}, [x6], x5
      33             ld1         {v19.2s}, [x7], x5
      34 
                     /* broadcast each lane's Y weight to all eight bytes */
      35             dup         v8.8b, \yr0
      36             dup         v9.8b, \yr1
      37             /* Y interpolate, front, lanes 0 and 1 -> v12 and v13 */
                     /* v12/v13 = (v16/v17 << 8) - v16/v17 * y + v18/v19 * y */
      38             zip1        v12.16b, v5.16b, v16.16b
      39             zip1        v13.16b, v5.16b, v17.16b
      40             umlsl       v12.8h, v16.8b, v8.8b
      41             umlsl       v13.8h, v17.8b, v9.8b
      42             umlal       v12.8h, v18.8b, v8.8b
      43             umlal       v13.8h, v19.8b, v9.8b
      44 
                     /* v18/v19 = 8 bytes at start + x4 + x5 */
      45             ld1         {v18.2s}, [x6]
      46             ld1         {v19.2s}, [x7]
      47 
                     /* step back by x4: x6/x7 = start + x5 */
      48             sub         x6, x6, x4
      49             sub         x7, x7, x4
      50 
                     /* v16/v17 = 8 bytes at start + x5 */
      51             ld1         {v16.2s}, [x6]
      52             ld1         {v17.2s}, [x7]
      53 
      54             /* Y interpolate, rear, lanes 0 and 1 -> v14 and v15 */
                     /* same blend as above, reusing the Y weights in v8/v9 */
      55             zip1        v14.16b, v5.16b, v16.16b
      56             zip1        v15.16b, v5.16b, v17.16b
      57             umlsl       v14.8h, v16.8b, v8.8b
      58             umlsl       v15.8h, v17.8b, v9.8b
      59             umlal       v14.8h, v18.8b, v8.8b
      60             umlal       v15.8h, v19.8b, v9.8b
      61 
      62             /* Z interpolate, lane 0 v12/v14 -> v10 */
                     /* (front << 8) - front * z + rear * z in 32 bits (v8 = low
                      * four halfwords, v9 = high four), then a rounding narrow
                      * by 8 back to eight halfwords.  v8/v9 are free for reuse
                      * because the Y weights are no longer needed. */
      63             ushll       v8.4s, v12.4h, #8
      64             ushll2      v9.4s, v12.8h, #8
      65             umlsl       v8.4s, v12.4h, \zr0
      66             umlsl2      v9.4s, v12.8h, \zr0
      67             umlal       v8.4s, v14.4h, \zr0
      68             umlal2      v9.4s, v14.8h, \zr0
      69             rshrn       v10.4h, v8.4s, #8
      70             rshrn2      v10.8h, v9.4s, #8
      71 
      72             /* Z interpolate, lane 1 v13/v15 -> v11 */
      73             ushll       v8.4s, v13.4h, #8
      74             ushll2      v9.4s, v13.8h, #8
      75             umlsl       v8.4s, v13.4h, \zr1
      76             umlsl2      v9.4s, v13.8h, \zr1
      77             umlal       v8.4s, v15.4h, \zr1
      78             umlal2      v9.4s, v15.8h, \zr1
      79             rshrn       v11.4h, v8.4s, #8
      80             rshrn2      v11.8h, v9.4s, #8
      81 
      82             /* X interpolate, lanes 0 and 1 v10,v11 -> v14 */
                     /* The low four halfwords of v10/v11 derive from the first
                      * 4-byte entry of each load and the high four from the
                      * second.  Blend low against high with the X weight:
                      * v8 = lane 0, v9 = lane 1.  shrn truncates here; rounding
                      * is applied by the final uqrshrn.  Lane 0 ends up in the
                      * low half of v14 and lane 1 in the high half. */
      83             ushll       v8.4s, v10.4h, #8
      84             ushll       v9.4s, v11.4h, #8
      85             umlsl       v8.4s, v10.4h, \xr0
      86             umlsl       v9.4s, v11.4h, \xr1
      87             umlal2      v8.4s, v10.8h, \xr0
      88             umlal2      v9.4s, v11.8h, \xr1
      89             shrn        v14.4h, v8.4s, #8
      90             shrn2       v14.8h, v9.4s, #8
      91 
      92             /* pack lanes 0-1 -> dst (upper half for the .16b destinations) */
                     /* ';' below is a statement separator, so each of these
                      * lines carries two directives. */
      93 .ifc \dst, v20.16b
      94             uqrshrn2    \dst, v14.8h, #8
      95 .else ; .ifc \dst, v21.16b
      96             uqrshrn2    \dst, v14.8h, #8
      97 .else
      98             uqrshrn     \dst, v14.8h, #8
      99 .endif ; .endif
     100 .endm
    101 
    102 /* void rsdIntrinsic3DLUT_K(
    103  *          void *dst,          // x0
    104  *          void const *in,     // x1
    105  *          size_t count,       // x2
    106  *          void const *lut,    // x3
    107  *          int32_t pitchy,     // w4
    108  *          int32_t pitchz,     // w5
    109  *          int dimx,           // w6
    110  *          int dimy,           // w7
    111  *          int dimz);          // [sp]
    112  */
    113 ENTRY(rsdIntrinsic3DLUT_K)
    114             ldr         w8, [sp]
    115             stp         d8, d9, [sp, #-64]!
    116             stp         d10, d11, [sp, #16]
    117             stp         d12, d13, [sp, #32]
    118             stp         d14, d15, [sp, #48]
    119             movi        v4.8b, #1
    120             ins         v4.h[0], w6
    121             ins         v4.h[1], w7
    122             ins         v4.h[2], w8
    123             ins         v4.s[2], w4
    124             ins         v4.s[3], w5
    125             movi        v5.16b, #0
    126 
    127             subs        x2, x2, #8
    128             bge         2f
    129             cmp         x2, #-8
    130             ble         9f
    131             b           4f
    132 
    133             .align 6
    134 1:          st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
    135 /* x0  = dst
    136  * x1  = src
    137  * x2  = count
    138  * x3  = lut
    139  * x4  = pitchy
    140  * x5  = pitchz
    141  * x6 = offset0
    142  * x7 = offset1
    143  */
    144 2:          ld4         {v0.8b-v3.8b}, [x1], #32
    145 /* v0,v1,v2,v3 source data
    146  * v4 dimensions and pitches
    147  */
    148 3:          uxtl        v0.8h, v0.8b
    149             uxtl        v1.8h, v1.8b
    150             uxtl        v2.8h, v2.8b
    151             mul         v0.8h, v0.8h, v4.h[0]
    152             mul         v1.8h, v1.8h, v4.h[1]
    153             mul         v2.8h, v2.8h, v4.h[2]
    154 
    155 /* ursra below would be more accurate, but this can result in a dim.0 case
    156  * where we try to read from the limit of the array and the limit +1 to
    157  * interpolate, even though the fractional component is zero.  Strictly this is
    158  * correct, except for the llegal access problem.
    159  */
    160             usra        v0.8h, v0.8h, #8
    161             usra        v1.8h, v1.8h, #8
    162             usra        v2.8h, v2.8h, #8
    163 
    164             ushr        v12.8h, v0.8h, #8
    165             ushr        v13.8h, v1.8h, #8
    166             ushr        v14.8h, v2.8h, #8
    167             bic         v0.8h, #0xff, LSL #8
    168             xtn         v1.8b, v1.8h
    169             bic         v2.8h, #0xff, LSL #8
    170 
    171 /* v0.8h,v1.8b,v2.hb fractional offset
    172  * v12.8h,v13.8h,v14.8h integer offset
    173  */
    174 
    175             ushll       v6.4s, v12.4h, #2
    176             ushll2      v7.4s, v12.8h, #2
    177             uxtl        v8.4s, v13.4h
    178             uxtl2       v9.4s, v13.8h
    179             uxtl        v10.4s, v14.4h
    180             uxtl2       v11.4s, v14.8h
    181             mla         v6.4s, v8.4s,  v4.s[2]
    182             mla         v7.4s, v9.4s,  v4.s[2]
    183             mla         v6.4s, v10.4s, v4.s[3]
    184             mla         v7.4s, v11.4s, v4.s[3]
    185 
    186 /* v6,v7 list of table offsets */
    187 
    188         /* lanes 0 and 1 */
    189             lanepair    dst=v20.8b,  src0=v6.s[0], src1=v6.s[1], xr0=v0.h[0], xr1=v0.h[1], yr0=v1.b[0], yr1=v1.b[1], zr0=v2.h[0], zr1=v2.h[1]
    190 
    191         /* lanes 2 and 3 */
    192             lanepair    dst=v20.16b, src0=v6.s[2], src1=v6.s[3], xr0=v0.h[2], xr1=v0.h[3], yr0=v1.b[2], yr1=v1.b[3], zr0=v2.h[2], zr1=v2.h[3]
    193 
    194         /* lanes 4 and 5 */
    195             lanepair    dst=v21.8b,  src0=v7.s[0], src1=v7.s[1], xr0=v0.h[4], xr1=v0.h[5], yr0=v1.b[4], yr1=v1.b[5], zr0=v2.h[4], zr1=v2.h[5]
    196 
    197         /* lanes 6 and 7 */
    198             lanepair    dst=v21.16b, src0=v7.s[2], src1=v7.s[3], xr0=v0.h[6], xr1=v0.h[7], yr0=v1.b[6], yr1=v1.b[7], zr0=v2.h[6], zr1=v2.h[7]
    199 
    200             uzp1        v6.16b, v20.16b, v21.16b
    201             uzp2        v7.16b, v20.16b, v21.16b
    202             uzp1        v20.16b, v6.16b, v7.16b
    203             uzp2        v22.16b, v6.16b, v7.16b
    204             mov         v21.d[0], v20.d[1]
    205 
    206             subs        x2, x2, #8
    207             mov         v23.8b, v3.8b
    208 
    209             bge         1b
    210 
    211             cmp         x2, #-8
    212             blt         1f
    213 
    214             st4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x0], #32
    215             beq         9f
    216 
    217             /* fill the vector  with a safe value */
    218 4:          ld4r        {v0.8b-v3.8b}, [x1]
    219             tbz         x2, #2, 2f
    220             ld4         {v0.b-v3.b}[0], [x1], #4
    221             ld4         {v0.b-v3.b}[1], [x1], #4
    222             ld4         {v0.b-v3.b}[2], [x1], #4
    223             ld4         {v0.b-v3.b}[3], [x1], #4
    224 2:          tbz         x2, #1, 2f
    225             ld4         {v0.b-v3.b}[4], [x1], #4
    226             ld4         {v0.b-v3.b}[5], [x1], #4
    227 2:          tbz         x2, #0, 2f
    228             ld4         {v0.b-v3.b}[6], [x1], #4
    229 2:          b           3b
    230 
    231 1:          tst         x2, #4
    232             beq         2f
    233             st4         {v20.b-v23.b}[0], [x0], #4
    234             st4         {v20.b-v23.b}[1], [x0], #4
    235             st4         {v20.b-v23.b}[2], [x0], #4
    236             st4         {v20.b-v23.b}[3], [x0], #4
    237 2:          tst         x2, #2
    238             beq         2f
    239             st4         {v20.b-v23.b}[4], [x0], #4
    240             st4         {v20.b-v23.b}[5], [x0], #4
    241 2:          tst         x2, #1
    242             beq         9f
    243             st4         {v20.b-v23.b}[6], [x0], #4
    244 
    245 9:          ldp         d14, d15, [sp, #48]
    246             ldp         d12, d13, [sp, #32]
    247             ldp         d10, d11, [sp, #16]
    248             ldp         d8, d9, [sp], #64
    249             ret
    250 END(rsdIntrinsic3DLUT_K)
    251