/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into v8 and v9 respectively.  U and V are in v10 and v11.
 * Working constants are pre-loaded into v24-v31, and v3 and v7 are pre-loaded
 * with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
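/* For reference, a sketch of the per-pixel arithmetic that the macro below
 * vectorises, reconstructed from the constants used in this file (the exact
 * coefficients are read back from the code rather than quoted from a spec):
 *
 *      R = clamp((149 * (Y - 16) + 204 * (V - 128) + ((V - 128) >> 1)) / 128)
 *      G = clamp((149 * (Y - 16) -  50 * (U - 128) - 104 * (V - 128)) / 128)
 *      B = clamp((149 * (Y - 16) + 254 * (U - 128) +   4 * (U - 128)) / 128)
 *
 * which is roughly R = 1.164*(Y-16) + 1.598*(V-128) and so on, i.e. the
 * familiar video-range YCbCr-to-RGB coefficients quantised to steps of 1/128.
 * The -16 and -128 offsets are folded into the bias constants held in v29-v31
 * rather than being subtracted from the samples up front.
 */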
.macro yuvkern, regu=v10, regv=v11
        /* v0   out R_lo / even R_lo accumulator
         * v1   out G_lo / even G_lo accumulator
         * v2   out B_lo / even B_lo accumulator
         * v3   out A_lo / const 0xff
         * v4   out R_hi / even R_hi accumulator
         * v5   out G_hi / even G_hi accumulator
         * v6   out B_hi / even B_hi accumulator
         * v7   out A_hi / const 0xff
         * v8   even Y   / G_lo luma tmp
         * v9   odd Y    / G_hi luma tmp
         * \regu in U
         * \regv in V
         * v12  R_lo luma tmp
         * v13  B_lo luma tmp
         * v14  R_hi luma tmp
         * v15  B_hi luma tmp
         * v16  odd R_lo accumulator
         * v17  odd G_lo accumulator
         * v18  odd B_lo accumulator
         * v19  multiplier extra bits low
         * v20  odd R_hi accumulator
         * v21  odd G_hi accumulator
         * v22  odd B_hi accumulator
         * v23  multiplier extra bits high
         * v24  constant 149
         * v25  constant 50
         * v26  constant 104
         * v27  constant 204
         * v28  constant 254
         * v29  constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
         * v30  constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
         * v31  constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
         */
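
        /* Scaling note, derived from the arithmetic below: the red and blue
         * sums are halved once by uhadd so they stay within 16 bits, which is
         * why their bias constants (v29, v31) are pre-shifted right by one
         * and their results are narrowed with a shift of #6, while the green
         * sum is never halved, so v30 is unshifted and green is narrowed with
         * a shift of #7.  The saturating uqsub steps clamp underflow at zero,
         * and the final uqrshrn clamps the result to the 0-255 output range.
         */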

        umull       v1.8h,  v8.8b,  v24.8b      // g0 = y0 * 149
        umull       v17.8h, v9.8b,  v24.8b      // g1 = y1 * 149
        umull2      v5.8h,  v8.16b, v24.16b     // g0_hi = y0_hi * 149
        umull2      v21.8h, v9.16b, v24.16b     // g1_hi = y1_hi * 149

        umull       v8.8h, \regu\().8b, v25.8b     // g2 = u * 50 + v * 104
        umlal       v8.8h, \regv\().8b, v26.8b
        umull2      v9.8h, \regu\().16b, v25.16b   // g2_hi = u_hi * 50 + v_hi * 104
        umlal2      v9.8h, \regv\().16b, v26.16b

        ushr        v19.16b, \regv\().16b, #1
        uaddw       v0.8h,  v1.8h,  v19.8b      // r0 = g0 + (v >> 1)
        uaddw       v16.8h, v17.8h, v19.8b      // r1 = g1 + (v >> 1)

        uaddw2      v4.8h,  v5.8h,  v19.16b     // r0_hi = g0_hi + (v_hi >> 1)
        uaddw2      v20.8h, v21.8h, v19.16b     // r1_hi = g1_hi + (v_hi >> 1)

        ushll       v19.8h, \regu\().8b,  #2
        ushll2      v23.8h, \regu\().16b, #2
        add         v2.8h,  v1.8h,  v19.8h      // b0 = g0 + (u << 2)
        add         v18.8h, v17.8h, v19.8h      // b1 = g1 + (u << 2)

        add         v6.8h,  v5.8h,  v23.8h      // b0_hi = g0_hi + (u_hi << 2)
        add         v22.8h, v21.8h, v23.8h      // b1_hi = g1_hi + (u_hi << 2)

        umull       v12.8h, \regv\().8b, v27.8b    // r2 = v * 204
        umull       v13.8h, \regu\().8b, v28.8b    // b2 = u * 254

        umull2      v14.8h, \regv\().16b, v27.16b  // r2_hi = v_hi * 204
        umull2      v15.8h, \regu\().16b, v28.16b  // b2_hi = u_hi * 254

        uhadd       v0.8h,  v0.8h,  v12.8h      // r0 = (r0 + r2) >> 1
        uhadd       v16.8h, v16.8h, v12.8h      // r1 = (r1 + r2) >> 1
        uqadd       v1.8h,  v1.8h,  v30.8h      // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v17.8h, v17.8h, v30.8h      // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h,  v2.8h,  v13.8h      // b0 = (b0 + b2) >> 1
        uhadd       v18.8h, v18.8h, v13.8h      // b1 = (b1 + b2) >> 1

        uhadd       v4.8h,  v4.8h,  v14.8h      // r0_hi = (r0_hi + r2_hi) >> 1
        uhadd       v20.8h, v20.8h, v14.8h      // r1_hi = (r1_hi + r2_hi) >> 1
        uqadd       v5.8h,  v5.8h,  v30.8h      // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v21.8h, v21.8h, v30.8h      // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v6.8h,  v6.8h,  v15.8h      // b0_hi = (b0_hi + b2_hi) >> 1
        uhadd       v22.8h, v22.8h, v15.8h      // b1_hi = (b1_hi + b2_hi) >> 1

        uqsub       v0.8h,  v0.8h,  v29.8h      // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v16.8h, v16.8h, v29.8h      // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h,  v1.8h,  v8.8h       // g0 = satu16(g0 - g2)
        uqsub       v17.8h, v17.8h, v8.8h       // g1 = satu16(g1 - g2)
        uqsub       v2.8h,  v2.8h,  v31.8h      // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v18.8h, v18.8h, v31.8h      // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqsub       v4.8h,  v4.8h,  v29.8h      // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v20.8h, v20.8h, v29.8h      // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v5.8h,  v5.8h,  v9.8h       // g0_hi = satu16(g0_hi - g2_hi)
        uqsub       v21.8h, v21.8h, v9.8h       // g1_hi = satu16(g1_hi - g2_hi)
        uqsub       v6.8h,  v6.8h,  v31.8h      // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v22.8h, v22.8h, v31.8h      // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqrshrn     v0.8b,  v0.8h,  #6
        uqrshrn     v16.8b, v16.8h, #6
        uqrshrn     v1.8b,  v1.8h,  #7
        uqrshrn     v17.8b, v17.8h, #7
        uqrshrn     v2.8b,  v2.8h,  #6
        uqrshrn     v18.8b, v18.8h, #6

        uqrshrn     v4.8b,  v4.8h,  #6
        uqrshrn     v20.8b, v20.8h, #6
        uqrshrn     v5.8b,  v5.8h,  #7
        uqrshrn     v21.8b, v21.8h, #7
        uqrshrn     v6.8b,  v6.8h,  #6
        uqrshrn     v22.8b, v22.8h, #6

        zip1        v0.16b, v0.16b, v16.16b
        zip1        v1.16b, v1.16b, v17.16b
        zip1        v2.16b, v2.16b, v18.16b

        zip1        v4.16b, v4.16b, v20.16b
        zip1        v5.16b, v5.16b, v21.16b
        zip1        v6.16b, v6.16b, v22.16b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
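/* Register interface assumed by wrap_line (matching how the entry points at
 * the bottom of this file set things up): x0 = output RGBA pointer, x1 = Y
 * pointer, x3 = U pointer (or the interleaved chroma pointer), x4 = V pointer
 * when the chroma planes are separate, and x2 = number of pixels to convert.
 * The main loop handles 32 pixels per iteration; anything left over falls
 * through to the tail code.
 */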
.macro wrap_line kernel, interleaved=0, swapuv=0
        movi        v24.16b, #149
        movi        v25.16b, #50
        movi        v26.16b, #104
        movi        v27.16b, #204
        movi        v28.16b, #254
        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v29.8h, w5
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v30.8h, w5
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v31.8h, w5

        movi        v3.16b, #0xff
        movi        v7.16b, #0xff

        subs        x2, x2, #32
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.16b,v9.16b}, [x1], #32
  .if \interleaved
        ld2         {v10.16b,v11.16b}, [x3], #32
  .else
        ld1         {v10.16b}, [x3], #16
        ld1         {v11.16b}, [x4], #16
  .endif

  .if \swapuv
        \kernel regu=v11, regv=v10
  .else
        \kernel
  .endif

        subs        x2, x2, #32

        st4         {v0.16b - v3.16b}, [x0], #64
        st4         {v4.16b - v7.16b}, [x0], #64

        bhs         1b

2:      adds        x2, x2, #32
        beq         2f

        /* To handle the tail portion of the data (something less than 32
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
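        /* Worked example: with 13 pixels left, bits 3, 2 and 0 of x2 are set,
         * so the code below loads 8 Y bytes into v8.d[1], 4 Y bytes into
         * v8.s[1] and 1 Y byte into v8.b[1] (plus the matching chroma
         * fragments), and the store sequence at the end of the macro tests
         * the same bits to write 8 + 4 + 1 pixels back out from the
         * corresponding positions.
         */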
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v10.8b, #0
        movi        v11.8b, #0

        tbz         x2, #4, 1f
        ld1         {v9.16b}, [x1], #16
  .if \interleaved
        ld1         {v11.16b}, [x3], #16
  .else
        ld1         {v10.d}[1], [x3], #8
        ld1         {v11.d}[1], [x4], #8
  .endif
1:      tbz         x2, #3, 1f
        ld1         {v8.d}[1], [x1], #8
  .if \interleaved
        ld1         {v10.d}[1], [x3], #8
  .else
        ld1         {v10.s}[1], [x3], #4
        ld1         {v11.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v10.s}[1], [x3], #4
  .else
        ld1         {v10.h}[1], [x3], #2
        ld1         {v11.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v10.h}[1], [x3], #2
  .else
        ld1         {v10.b}[1], [x3], #1
        ld1         {v11.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v10.h}[0], [x3], #2
  .else
        ld1         {v10.b}[0], [x3], #1
        ld1         {v11.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
1:      mov         v12.16b, v8.16b
        uzp1        v8.16b, v12.16b, v9.16b
        uzp2        v9.16b, v12.16b, v9.16b
  .if \interleaved
        mov         v12.16b, v10.16b
        uzp1        v10.16b, v12.16b, v11.16b
        uzp2        v11.16b, v12.16b, v11.16b
  .endif

  .if \swapuv
        \kernel regu=v11, regv=v10
  .else
        \kernel
  .endif

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
        zip1        v16.16b, v0.16b, v2.16b
        zip2        v18.16b, v0.16b, v2.16b
        zip1        v17.16b, v1.16b, v3.16b
        zip2        v19.16b, v1.16b, v3.16b
        zip1        v0.16b, v16.16b, v17.16b
        zip2        v1.16b, v16.16b, v17.16b
        zip1        v2.16b, v18.16b, v19.16b
        zip2        v3.16b, v18.16b, v19.16b

        /* Luckily v4-v7 don't need to be unzipped because they form a
         * complete set of four registers and can be stored using st4. */

        tbz         x2, #4, 1f
        st4         {v4.16b - v7.16b}, [x0], #64
1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4
2:
.endm


/*  void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 */
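/* A rough sketch of how a caller might drive this routine for a whole planar
 * image with separate half-width U and V planes; the stride handling and the
 * vertical subsampling shown here are illustrative assumptions, not something
 * defined in this file:
 *
 *      for (size_t row = 0; row < height; row++) {
 *          rsdIntrinsicYuv2_K(out + row * outStride,
 *                             yin + row * yStride,
 *                             uin + (row >> 1) * uStride,
 *                             vin + (row >> 1) * vStride,
 *                             xstart, xend);
 *      }
 */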
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1              // x6 = xstart / 2 (column in the half-width chroma planes)
        add         x0, x0, x4, LSL #2      // out += xstart * 4 (RGBA bytes)
        add         x1, x1, x4              // yin += xstart
        add         x4, x3, x6              // x4 = vin + xstart / 2
        add         x3, x2, x6              // x3 = uin + xstart / 2
        sub         x2, x5, x6, LSL #1      // x2 = xend - 2 * (xstart / 2) = pixel count

        /* v8-v15 are callee-saved (low 64 bits), so spill them before use. */
        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/*  void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
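/* Here the chroma plane is a single buffer of interleaved byte pairs, and the
 * wrapper is instantiated with swapuv=1, so the first byte of each pair is
 * taken as V and the second as U (a VU, NV21-style ordering, going by the
 * code alone).
 */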
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1              // x5 = xstart rounded down to an even pixel
        add         x0, x0, x5, LSL #2      // out += xstart * 4 (RGBA bytes)
        add         x1, x1, x5              // yin += xstart
        add         x3, x2, x5              // x3 = uvin + xstart (one interleaved chroma byte per pixel)
        sub         x2, x4, x5              // x2 = pixel count

        /* v8-v15 are callee-saved (low 64 bits), so spill them before use. */
        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/*  void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
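/* As above, but instantiated with swapuv=0, so the first byte of each chroma
 * pair is taken as U and the second as V (a UV, NV12-style ordering).
 */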
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1              // x5 = xstart rounded down to an even pixel
        add         x0, x0, x5, LSL #2      // out += xstart * 4 (RGBA bytes)
        add         x1, x1, x5              // yin += xstart
        add         x3, x2, x5              // x3 = uvin + xstart (one interleaved chroma byte per pixel)
        sub         x2, x4, x5              // x2 = pixel count

        /* v8-v15 are callee-saved (low 64 bits), so spill them before use. */
        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)
    378