/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Perform the actual YuvToRGB conversion in a macro, from register to
 * register.  This macro will be called from within several different wrapper
 * variants for different data layouts.  Y data starts in q8, but with the even
 * and odd bytes split into d16 and d17 respectively.  U and V are in d20
 * and d21.  Working constants are pre-loaded into q13-q15, and q3 is
 * pre-loaded with a constant 0xff alpha channel.
 *
 * The complicated arithmetic is the result of refactoring the original
 * equations to avoid 16-bit overflow without losing any precision.
 */
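/* For reference, the per-pixel arithmetic below corresponds (after the
 * refactoring mentioned above) roughly to the usual BT.601 limited-range
 * conversion, R = 1.164(Y-16) + 1.596(V-128), G = 1.164(Y-16) - 0.391(U-128)
 * - 0.813(V-128), B = 1.164(Y-16) + 2.018(U-128), with the coefficients
 * scaled by 128.  A plain C sketch of one pixel follows (illustrative only,
 * not part of the original source; clamp8() is a hypothetical helper that
 * stands in for the saturating narrows at the end of the kernel):
 *
 *     #include <stdint.h>
 *
 *     static inline uint8_t clamp8(int v) {
 *         return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
 *     }
 *
 *     static void yuv2rgb_ref(uint8_t y, uint8_t u, uint8_t v, uint8_t rgb[3]) {
 *         int r = ((y * 149 + (v >> 1) + v * 204) >> 1)
 *                 - ((16 * 149 + (128 >> 1) + 128 * 204) >> 1);
 *         int g = (y * 149 + (-16 * 149 + 128 * 50 + 128 * 104))
 *                 - (u * 50 + v * 104);
 *         int b = ((y * 149 + (u << 2) + u * 254) >> 1)
 *                 - ((16 * 149 + (128 << 2) + 128 * 254) >> 1);
 *         rgb[0] = clamp8((r + 32) >> 6);    // models vqrshrn.u16 #6
 *         rgb[1] = clamp8((g + 64) >> 7);    // models vqrshrn.u16 #7
 *         rgb[2] = clamp8((b + 32) >> 6);    // models vqrshrn.u16 #6
 *     }
 */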
.macro yuvkern
        vmov.i8     d15, #149

        vmull.u8    q1, d16, d15        // g0 = y0 * 149
        vmull.u8    q5, d17, d15        // g1 = y1 * 149

        vmov.i8     d14, #50
        vmov.i8     d15, #104
        vmull.u8    q8, d20, d14        // g2 = u * 50 + v * 104
        vmlal.u8    q8, d21, d15

        vshr.u8     d14, d21, #1
        vaddw.u8    q0, q1, d14         // r0 = y0 * 149 + (v >> 1)
        vaddw.u8    q4, q5, d14         // r1 = y1 * 149 + (v >> 1)

        vshll.u8    q7, d20, #2
        vadd.u16    q2, q1, q7          // b0 = y0 * 149 + (u << 2)
        vadd.u16    q6, q5, q7          // b1 = y1 * 149 + (u << 2)

        vmov.i8     d14, #204
        vmov.i8     d15, #254
        vmull.u8    q11, d21, d14       // r2 = v * 204
        vmull.u8    q12, d20, d15       // b2 = u * 254

        vhadd.u16   q0, q11             // r0 = (r0 + r2) >> 1
        vhadd.u16   q4, q11             // r1 = (r1 + r2) >> 1
        vqadd.u16   q1, q14             // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vqadd.u16   q5, q14             // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vhadd.u16   q2, q12             // b0 = (b0 + b2) >> 1
        vhadd.u16   q6, q12             // b1 = (b1 + b2) >> 1

        vqsub.u16   q0, q13             // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vqsub.u16   q4, q13             // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vqsub.u16   q1, q8              // g0 = satu16(g0 - g2)
        vqsub.u16   q5, q8              // g1 = satu16(g1 - g2)
        vqsub.u16   q2, q15             // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
        vqsub.u16   q6, q15             // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)

        vqrshrn.u16 d0, q0, #6
        vqrshrn.u16 d1, q1, #7
        vqrshrn.u16 d2, q4, #6
        vqrshrn.u16 d3, q5, #7
        vqrshrn.u16 d4, q2, #6
        vqrshrn.u16 d5, q6, #6

        vzip.u8     q0, q1
        vzip.u8     d4, d5
.endm
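/* At the end of the kernel the two vzips leave the 16 converted pixels laid
 * out channel-by-channel: d0,d1 hold R0-R15, d2,d3 hold G0-G15, d4,d5 hold
 * B0-B15, and q3 (d6,d7) still holds the constant 0xff alpha.  This is the
 * layout consumed by the vst4.u8 stores in the full-width path of the
 * wrappers below.
 */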

/* Define the wrapper code which will load and store the data, iterate the
 * correct number of times, and safely handle the remainder at the end of the
 * loop.  Some sections of code are switched out depending on the data packing
 * being handled.
 */
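/* For orientation, the control flow generated by this macro is roughly the
 * following C sketch (illustrative only; convert16() is a hypothetical
 * stand-in for one run of the kernel, and the planar two-pointer chroma case
 * is shown):
 *
 *     void wrap_line_ref(uint8_t *out, const uint8_t *yin,
 *                        const uint8_t *uin, const uint8_t *vin, size_t n) {
 *         while (n >= 16) {           // main loop: whole 16-pixel groups
 *             convert16(out, yin, uin, vin);
 *             out += 16 * 4;          // 16 RGBA pixels written
 *             yin += 16;              // 16 luma bytes consumed
 *             uin += 8;               // chroma is half-rate horizontally
 *             vin += 8;
 *             n -= 16;
 *         }
 *         if (n) {
 *             // Remainder: gather the leftover data as 8/4/2/1-pixel pieces
 *             // into one partial vector, run the kernel once more, and store
 *             // exactly n pixels back out (see the tail handling below).
 *         }
 *     }
 */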
.macro wrap_line kernel, interleaved=0, swapuv=0

        movw        r5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
        vdup.i16    q13, r5
        movw        r5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
        vdup.i16    q14, r5
        movw        r5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
        vdup.i16    q15, r5

        vmov.i8     q3, #0xff

        subs        r2, #16
        bhs         1f
        b           2f

        .align 4
1:      vld2.u8     {d16,d17}, [r1]!
        pld         [r1, #256]
  .if \interleaved
        vld2.u8     {d20,d21}, [r3]!
    .if \swapuv
        vswp        d20, d21
    .endif
        pld         [r3, #256]
  .else
        vld1.u8     d20, [r3]!
        vld1.u8     d21, [r4]!
        pld         [r3, #128]
        pld         [r4, #128]
  .endif

        \kernel

        subs        r2, #16

        vst4.u8     {d0,d2,d4,d6}, [r0]!
        vst4.u8     {d1,d3,d5,d7}, [r0]!

        bhs         1b

2:      adds        r2, #16
        beq         2f

        /* To handle the tail portion of the data (something less than 16
         * bytes) load small power-of-two chunks into working registers.  It
         * doesn't matter where they end up in the register; the same process
         * will store them back out using the same positions and the
         * interaction between neighbouring pixels is constrained to odd
         * boundaries where the load operations don't interfere.
         */
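        /* For example, a remainder of 13 pixels is gathered as an 8-pixel
         * chunk, then a 4-pixel chunk, then a single pixel (13 = 8 + 4 + 1),
         * selected by the tst #8/#4/#2/#1 tests below.
         */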
        vmov.i8     q8, #0
        vmov.i8     q10, #0

        tst         r2, #8
        beq         1f
        vld1.u8     d17, [r1]!
  .if \interleaved
        vld1.u8     d21, [r3]!
  .else
        vld1.u32    d20[1], [r3]!
        vld1.u32    d21[1], [r4]!
  .endif

1:      tst         r2, #4
        beq         1f
        vld1.u32    d16[1], [r1]!
  .if \interleaved
        vld1.u32    d20[1], [r3]!
  .else
        vld1.u16    d20[1], [r3]!
        vld1.u16    d21[1], [r4]!
  .endif
1:      tst         r2, #2
        beq         1f
        vld1.u16    d16[1], [r1]!
  .if \interleaved
        vld1.u16    d20[1], [r3]!
  .else
        vld1.u8     d20[1], [r3]!
        vld1.u8     d21[1], [r4]!
  .endif
1:      tst         r2, #1
        beq         1f
        vld1.u8     d16[1], [r1]!
  .if \interleaved
        vld1.u16    d20[0], [r3]!
  .else
        vld1.u8     d20[0], [r3]!
        vld1.u8     d21[0], [r4]!
  .endif

        /* One small impediment in the process above is that some of the load
         * operations can't perform byte-wise structure deinterleaving at the
         * same time as loading only part of a register.  So the data is loaded
         * linearly and unpacked manually at this point if necessary.
         */
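        /* For example, vuzp.8 d16, d17 separates the linearly-loaded Y bytes
         * into even-indexed bytes (d16) and odd-indexed bytes (d17), matching
         * the layout that vld2.u8 produces in the main loop above.
         */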
1:      vuzp.8      d16, d17
  .if \interleaved
        vuzp.8      d20, d21
    .if \swapuv
        vswp        d20, d21
    .endif
  .endif

        \kernel

        /* As above but with the output; structured stores for partial vectors
         * aren't available, so the data is re-packed first and stored linearly.
         */
        vzip.8  q0, q2
        vzip.8  q1, q3
        vzip.8  q0, q1
        vzip.8  q2, q3

1:      tst         r2, #8
        beq         1f
        vst1.u8     {d4,d5,d6,d7}, [r0]!

1:      tst         r2, #4
        beq         1f
        vst1.u8     {d2,d3}, [r0]!
1:      tst         r2, #2
        beq         1f
        vst1.u8     d1, [r0]!
1:      tst         r2, #1
        beq         2f
        vst1.u32    d0[1], [r0]!
2:
.endm


/*  void rsdIntrinsicYuv2_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uin,    // r2
 *          void const *vin,    // r3
 *          size_t xstart,      // [sp]
 *          size_t xend);       // [sp+#4]
 */
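/* The prologue below is equivalent to the following pointer arithmetic
 * (C sketch for illustration; xstart/xend are in pixels and the chroma
 * planes are assumed to be horizontally subsampled by two, as the shifts
 * imply):
 *
 *     out = (uint8_t *)out + xstart * 4;          // r0: RGBA output
 *     yin = (const uint8_t *)yin + xstart;        // r1
 *     uin = (const uint8_t *)uin + xstart / 2;    // r3 after shuffling
 *     vin = (const uint8_t *)vin + xstart / 2;    // r4 after shuffling
 *     size_t count = xend - xstart;               // r2: pixels to convert
 */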
ENTRY(rsdIntrinsicYuv2_K)
        push        {r4,r5}
        ldr         r5, [sp, #8]
        mov         r4, r3
        mov         r3, r2
        ldr         r2, [sp, #12]

        add         r0, r5, LSL #2
        add         r1, r5
        add         r3, r5, LSR #1
        add         r4, r5, LSR #1
        sub         r2, r5

        vpush       {d8-d15}

        wrap_line yuvkern, 0

        vpop        {d8-d15}
        pop         {r4,r5}
        bx lr
END(rsdIntrinsicYuv2_K)

/*  void rsdIntrinsicYuv_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uvin,   // r2
 *          size_t xstart,      // r3
 *          size_t xend);       // [sp]
 */
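/* This variant reads a single interleaved chroma plane.  It expands wrap_line
 * with interleaved=1 and swapuv=1, so after the vld2 deinterleave the first
 * byte of each pair is treated as V and the second as U (VU ordering, as in
 * Android's NV21 layout).  xstart is rounded down to an even pixel so that
 * the chroma pointer stays aligned to a whole U/V pair.
 */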
ENTRY(rsdIntrinsicYuv_K)
        push        {r4,r5}
        bic         r4, r3, #1
        add         r3, r2, r4
        ldr         r2, [sp, #8]

        add         r0, r4, LSL #2
        add         r1, r4
        sub         r2, r4

        vpush       {d8-d15}

        wrap_line yuvkern, 1, 1

        vpop        {d8-d15}
        pop         {r4,r5}
        bx lr
END(rsdIntrinsicYuv_K)

/*  void rsdIntrinsicYuvR_K(
 *          void *out,          // r0
 *          void const *yin,    // r1
 *          void const *uvin,   // r2
 *          size_t xstart,      // r3
 *          size_t xend);       // [sp]
 */
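/* Identical to rsdIntrinsicYuv_K except that wrap_line is expanded without
 * the U/V swap, so the interleaved chroma plane is read with U first and V
 * second (NV12-style ordering).
 */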
ENTRY(rsdIntrinsicYuvR_K)
        push        {r4,r5}
        bic         r4, r3, #1
        add         r3, r2, r4
        ldr         r2, [sp, #8]

        add         r0, r4, LSL #2
        add         r1, r4
        sub         r2, r4

        vpush       {d8-d15}

        wrap_line yuvkern, 1

        vpop        {d8-d15}
        pop         {r4,r5}
        bx lr
END(rsdIntrinsicYuvR_K)
    299