/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts with the even and odd
 * bytes split into the low parts of v8 and v9 respectively.  U and V are in
 * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
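/* For reference only (nothing here is assembled): up to least-significant-bit
 * differences from the halving adds and the intermediate saturations, the
 * macro below evaluates the common fixed-point YCbCr-to-RGB expansion sketched
 * in C underneath.  The kernel works with all coefficients pre-divided by two
 * (149, 204.5, 50, 104, 258) so that every 8-bit by 8-bit product fits in
 * 16 bits; clamp_u8 and yuv_to_rgba_ref are illustrative names only.
 *
 *     #include <stdint.h>
 *
 *     static uint8_t clamp_u8(int x) { return x < 0 ? 0 : x > 255 ? 255 : (uint8_t)x; }
 *
 *     static void yuv_to_rgba_ref(uint8_t y, uint8_t u, uint8_t v, uint8_t rgba[4]) {
 *         int c = y - 16, d = u - 128, e = v - 128;
 *         rgba[0] = clamp_u8((298 * c + 409 * e + 128) >> 8);            // r
 *         rgba[1] = clamp_u8((298 * c - 100 * d - 208 * e + 128) >> 8);  // g
 *         rgba[2] = clamp_u8((298 * c + 516 * d + 128) >> 8);            // b
 *         rgba[3] = 0xff;                                                // a
 *     }
 */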
.macro yuvkern
        movi        v7.8b, #149

        umull       v1.8h, v8.8b, v7.8b        // g0 = y0 * 149
        umull       v5.8h, v9.8b, v7.8b        // g1 = y1 * 149

        movi        v7.8b, #50
        movi        v10.8b, #104
        umull       v8.8h, v16.8b, v7.8b       // g2 = u * 50 + v * 104
        umlal       v8.8h, v17.8b, v10.8b

        ushr        v7.8b, v17.8b, #1
        uaddw       v0.8h, v1.8h, v7.8b        // r0 = y0 * 149 + (v >> 1)
        uaddw       v4.8h, v5.8h, v7.8b        // r1 = y1 * 149 + (v >> 1)

        ushll       v7.8h, v16.8b, #2
        add         v2.8h, v1.8h, v7.8h        // b0 = y0 * 149 + (u << 2)
        add         v6.8h, v5.8h, v7.8h        // b1 = y1 * 149 + (u << 2)

        movi        v7.16b, #204
        movi        v10.8b, #254
        umull       v11.8h, v17.8b, v7.8b      // r2 = v * 204
        umull       v12.8h, v16.8b, v10.8b     // b2 = u * 254

        uhadd       v0.8h, v0.8h, v11.8h       // r0 = (r0 + r2) >> 1
        uhadd       v4.8h, v4.8h, v11.8h       // r1 = (r1 + r2) >> 1
        uqadd       v1.8h, v1.8h, v14.8h       // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uqadd       v5.8h, v5.8h, v14.8h       // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        uhadd       v2.8h, v2.8h, v12.8h       // b0 = (b0 + b2) >> 1
        uhadd       v6.8h, v6.8h, v12.8h       // b1 = (b1 + b2) >> 1

        uqsub       v0.8h, v0.8h, v13.8h       // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v4.8h, v4.8h, v13.8h       // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        uqsub       v1.8h, v1.8h, v8.8h        // g0 = satu16(g0 - g2)
        uqsub       v5.8h, v5.8h, v8.8h        // g1 = satu16(g1 - g2)
        uqsub       v2.8h, v2.8h, v15.8h       // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        uqsub       v6.8h, v6.8h, v15.8h       // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        uqrshrn     v0.8b, v0.8h, #6           // r0 = satu8((r0 + 32) >> 6)
        uqrshrn     v4.8b, v4.8h, #6           // r1 = satu8((r1 + 32) >> 6)
        uqrshrn     v1.8b, v1.8h, #7           // g0 = satu8((g0 + 64) >> 7)
        uqrshrn     v5.8b, v5.8h, #7           // g1 = satu8((g1 + 64) >> 7)
        uqrshrn     v2.8b, v2.8h, #6           // b0 = satu8((b0 + 32) >> 6)
        uqrshrn     v6.8b, v6.8h, #6           // b1 = satu8((b1 + 32) >> 6)

        zip1        v0.16b, v0.16b, v4.16b     // interleave even and odd pixels: r
        zip1        v1.16b, v1.16b, v5.16b     // interleave even and odd pixels: g
        zip1        v2.16b, v2.16b, v6.16b     // interleave even and odd pixels: b
.endm

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
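/* In outline, each expansion of this macro behaves like the pseudo-C below
 * (an illustration only; "count" stands for the pixel count passed in x2):
 *
 *     while (count >= 16) {
 *         load 16 Y bytes plus the matching 8 U and 8 V samples;
 *         kernel();                        // v0-v3 = r, g, b, a planes
 *         store 16 RGBA pixels (64 bytes);
 *         count -= 16;
 *     }
 *     if (count > 0) {                     // 1..15 pixels left over
 *         load the remainder in 8/4/2/1-pixel chunks into zeroed registers;
 *         kernel();
 *         store the remainder in matching 8/4/2/1-pixel chunks;
 *     }
 */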
.macro wrap_line kernel, interleaved=0, swapuv=0

        mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        dup         v13.8h, w5                 // r bias, subtracted in the kernel
        mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        dup         v14.8h, w5                 // g bias, added in the kernel
        mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        dup         v15.8h, w5                 // b bias, subtracted in the kernel

        movi        v3.16b, #0xff

        subs        x2, x2, #16
        bhs         1f
        b           2f

        .align 4
1:      ld2         {v8.8b,v9.8b}, [x1], #16
//      prfm PLDL1STRM, [x1, #256]
  .if \interleaved
    .if \swapuv
        ld2         {v17.8b,v18.8b}, [x3], #16
        mov         v16.8b, v18.8b
    .else
        ld2         {v16.8b,v17.8b}, [x3], #16
    .endif
//      prfm PLDL1STRM, [x3, #256]
  .else
        ld1         {v16.8b}, [x3], #8
        ld1         {v17.8b}, [x4], #8
//      prfm PLDL1STRM, [x3, #128]
//      prfm PLDL1STRM, [x4, #128]
  .endif

        \kernel

        subs        x2, x2, #16

        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64

        bhs         1b

2:      adds        x2, x2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into the working registers.
         * It doesn't matter where they end up in the register, because the
         * same process stores them back out from the same positions, and
         * interaction between neighbouring pixels is confined to even/odd
         * pairs, which the chunked loads never split.
         */
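        /* For example, with 11 pixels left (11 = 8 + 2 + 1) the bit tests
         * below perform an 8-pixel, a 2-pixel and a 1-pixel load and skip
         * the 4-pixel step.  Every chunk starts at an even pixel index, so
         * an even/odd pair sharing a U,V sample is never split across two
         * chunks.
         */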
        movi        v8.8b, #0
        movi        v9.8b, #0
        movi        v16.8b, #0
        movi        v17.8b, #0

        tbz         x2, #3, 1f                 // 8-pixel chunk?
        ld1         {v9.8b}, [x1], #8
  .if \interleaved
        ld1         {v17.8b}, [x3], #8
  .else
        ld1         {v16.s}[1], [x3], #4
        ld1         {v17.s}[1], [x4], #4
  .endif
1:      tbz         x2, #2, 1f                 // 4-pixel chunk?
        ld1         {v8.s}[1], [x1], #4
  .if \interleaved
        ld1         {v16.s}[1], [x3], #4
  .else
        ld1         {v16.h}[1], [x3], #2
        ld1         {v17.h}[1], [x4], #2
  .endif
1:      tbz         x2, #1, 1f                 // 2-pixel chunk?
        ld1         {v8.h}[1], [x1], #2
  .if \interleaved
        ld1         {v16.h}[1], [x3], #2
  .else
        ld1         {v16.b}[1], [x3], #1
        ld1         {v17.b}[1], [x4], #1
  .endif
1:      tbz         x2, #0, 1f                 // final single pixel?
        ld1         {v8.b}[1], [x1], #1
  .if \interleaved
        ld1         {v16.h}[0], [x3], #2
  .else
        ld1         {v16.b}[0], [x3], #1
        ld1         {v17.b}[0], [x4], #1
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
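        /* Treating the two 8-byte halves as one 16-byte stream, the unzip
         * below sends even-numbered bytes to the first register and
         * odd-numbered bytes to the second (e.g. {a,b,c,d,...} becomes
         * {a,c,...} and {b,d,...}), which is the same even/odd split that
         * ld2 produces in the main loop.
         */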
1:      mov         v12.8b, v8.8b              // scratch copy; v12 is rewritten by the kernel
        uzp1        v8.8b, v12.8b, v9.8b       // even-numbered y samples
        uzp2        v9.8b, v12.8b, v9.8b       // odd-numbered y samples
  .if \interleaved
        mov         v12.8b, v16.8b
    .if \swapuv
        uzp2        v16.8b, v12.8b, v17.8b     // u is the second byte of each pair
        uzp1        v17.8b, v12.8b, v17.8b     // v is the first byte of each pair
    .else
        uzp1        v16.8b, v12.8b, v17.8b     // u is the first byte of each pair
        uzp2        v17.8b, v12.8b, v17.8b     // v is the second byte of each pair
    .endif
  .endif

        \kernel

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
        zip1        v4.16b, v0.16b, v2.16b     // r and b for pixels 0-7
        zip2        v6.16b, v0.16b, v2.16b     // r and b for pixels 8-15
        zip1        v5.16b, v1.16b, v3.16b     // g and a for pixels 0-7
        zip2        v7.16b, v1.16b, v3.16b     // g and a for pixels 8-15
        zip1        v0.16b, v4.16b, v5.16b     // rgba for pixels 0-3
        zip2        v1.16b, v4.16b, v5.16b     // rgba for pixels 4-7
        zip1        v2.16b, v6.16b, v7.16b     // rgba for pixels 8-11
        zip2        v3.16b, v6.16b, v7.16b     // rgba for pixels 12-15
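        /* The bit-tested stores below mirror the loads in the tail above: the
         * same bits of x2 select the same chunk sizes, so each result is
         * stored from the register positions its Y sample was loaded into.
         */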

1:      tbz         x2, #3, 1f
        st1         {v2.16b,v3.16b}, [x0], #32 // 8 pixels
1:      tbz         x2, #2, 1f
        st1         {v1.16b}, [x0], #16        // 4 pixels
1:      tbz         x2, #1, 1f
        st1         {v0.d}[1], [x0], #8        // 2 pixels
1:      tbz         x2, #0, 2f
        st1         {v0.s}[1], [x0], #4        // 1 pixel
2:
.endm


/*  void rsdIntrinsicYuv2_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uin,    // x2
 *          void const *vin,    // x3
 *          size_t xstart,      // x4
 *          size_t xend);       // x5
 */
ENTRY(rsdIntrinsicYuv2_K)
        lsr         x6, x4, #1                 // x6 = xstart / 2 (chroma is subsampled by two)
        add         x0, x0, x4, LSL #2         // out += xstart * 4 (RGBA)
        add         x1, x1, x4                 // yin += xstart
        add         x4, x3, x6                 // x4 = vin + xstart / 2
        add         x3, x2, x6                 // x3 = uin + xstart / 2
        sub         x2, x5, x6, LSL #1         // x2 = pixel count (xend - xstart, treated as even)

        sub         x6, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]     // v8-v15 are callee saved (low 64 bits)
        st1         {v12.1d - v15.1d}, [x6]

        wrap_line yuvkern, 0

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv2_K)

/*  void rsdIntrinsicYuv_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
ENTRY(rsdIntrinsicYuv_K)
        bic         x5, x3, #1                 // xstart rounded down to even
        add         x0, x0, x5, LSL #2         // out += xstart * 4 (RGBA)
        add         x1, x1, x5                 // yin += xstart
        add         x3, x2, x5                 // uvin += xstart (one interleaved pair per two pixels)
        sub         x2, x4, x5                 // x2 = pixel count

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]     // v8-v15 are callee saved (low 64 bits)
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuv_K)

/*  void rsdIntrinsicYuvR_K(
 *          void *out,          // x0
 *          void const *yin,    // x1
 *          void const *uvin,   // x2
 *          size_t xstart,      // x3
 *          size_t xend);       // x4
 */
ENTRY(rsdIntrinsicYuvR_K)
        bic         x5, x3, #1                 // xstart rounded down to even
        add         x0, x0, x5, LSL #2         // out += xstart * 4 (RGBA)
        add         x1, x1, x5                 // yin += xstart
        add         x3, x2, x5                 // uvin += xstart (one interleaved pair per two pixels)
        sub         x2, x4, x5                 // x2 = pixel count

        sub         x5, sp, #32
        sub         sp, sp, #64
        st1         {v8.1d - v11.1d}, [sp]     // v8-v15 are callee saved (low 64 bits)
        st1         {v12.1d - v15.1d}, [x5]

        wrap_line yuvkern, 1

        ld1         {v8.1d - v11.1d}, [sp], #32
        ld1         {v12.1d - v15.1d}, [sp], #32
        ret
END(rsdIntrinsicYuvR_K)
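
/* For reference, a sketch of the C-side declarations these entry points are
 * written against (reconstructed from the register comments above; the exact
 * declarations live in the C++ side of the CPU reference implementation and
 * are an assumption here, with rsdIntrinsicYuv_K taking VU-interleaved chroma
 * and rsdIntrinsicYuvR_K taking UV-interleaved chroma):
 *
 *     #include <stddef.h>
 *
 *     extern "C" void rsdIntrinsicYuv_K(void *out, void const *yin,
 *                                       void const *uvin, size_t xstart, size_t xend);
 *     extern "C" void rsdIntrinsicYuvR_K(void *out, void const *yin,
 *                                        void const *uvin, size_t xstart, size_t xend);
 *     extern "C" void rsdIntrinsicYuv2_K(void *out, void const *yin,
 *                                        void const *uin, void const *vin,
 *                                        size_t xstart, size_t xend);
 */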
    293