      1 /*
      2  * Copyright © 2009 Nokia Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21  * DEALINGS IN THE SOFTWARE.
     22  *
     23  * Author:  Siarhei Siamashka (siarhei.siamashka (at) nokia.com)
     24  */
     25 
     26 /*
     27  * This file contains implementations of NEON optimized pixel processing
     28  * functions. There is no full and detailed tutorial, but some functions
     29  * (those which are exposing some new or interesting features) are
     30  * extensively commented and can be used as examples.
     31  *
     32  * You may want to have a look at the comments for the following functions:
     33  *  - pixman_composite_over_8888_0565_asm_neon
     34  *  - pixman_composite_over_n_8_0565_asm_neon
     35  */
     36 
     37 /* Prevent the stack from becoming executable for no reason... */
     38 #if defined(__linux__) && defined(__ELF__)
     39 .section .note.GNU-stack,"",%progbits
     40 #endif
     41 
     42     .text
     43     .fpu neon
     44     .arch armv7a
     45     .object_arch armv4
     46     .eabi_attribute 10, 0 /* suppress Tag_FP_arch */
     47     .eabi_attribute 12, 0 /* suppress Tag_Advanced_SIMD_arch */
     48     .arm
     49     .altmacro
     50     .p2align 2
     51 
     52 #include "pixman-private.h"
     53 #include "pixman-arm-neon-asm.h"
     54 
     55 /* Global configuration options and preferences */
     56 
     57 /*
     58  * The code can optionally make use of unaligned memory accesses to improve
     59  * performance of handling leading/trailing pixels for each scanline.
     60  * The configuration variable RESPECT_STRICT_ALIGNMENT can be set to 0,
     61  * for example on Linux, if unaligned memory accesses are not configured
     62  * to generate exceptions.
     63  */
     64 .set RESPECT_STRICT_ALIGNMENT, 1
     65 
     66 /*
     67  * Set default prefetch type. There is a choice between the following options:
     68  *
     69  * PREFETCH_TYPE_NONE (may be useful for the ARM cores where PLD is set to work
     70  * as NOP to workaround some HW bugs or for whatever other reason)
     71  *
     72  * PREFETCH_TYPE_SIMPLE (may be useful for simple single-issue ARM cores where
     73  * advanced prefetch introduces heavy overhead)
     74  *
     75  * PREFETCH_TYPE_ADVANCED (useful for superscalar cores such as ARM Cortex-A8
     76  * which can run ARM and NEON instructions simultaneously so that extra ARM
     77  * instructions do not add (many) extra cycles, but improve prefetch efficiency)
     78  *
     79  * Note: some types of function can't support advanced prefetch and fall
     80  *       back to the simple one (those which handle 24bpp pixels)
     81  */
     82 .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
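
/*
 * For example, on a simple single-issue core one would presumably pick the
 * simple variant instead (an illustrative, untested setting):
 *
 *     .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_SIMPLE
 */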
     83 
     84 /* Prefetch distance in pixels for simple prefetch */
     85 .set PREFETCH_DISTANCE_SIMPLE, 64
     86 
     87 /*
     88  * Implementation of pixman_composite_over_8888_0565_asm_neon
     89  *
     90  * This function takes a8r8g8b8 source buffer, r5g6b5 destination buffer and
     91  * performs OVER compositing operation. Function fast_composite_over_8888_0565
     92  * from pixman-fast-path.c does the same in C and can be used as a reference.
     93  *
     94  * First we need to have some NEON assembly code which can do the actual
     95  * operation on the pixels and provide it to the template macro.
     96  *
     97  * The template macro conveniently takes care of emitting all the necessary
     98  * code for memory reading and writing (including quite tricky cases of
     99  * handling unaligned leading/trailing pixels), so we only need to deal with
    100  * the data in NEON registers.
    101  *
    102  * In general, the recommended NEON register allocation is the following:
    103  * d0,  d1,  d2,  d3  - contain loaded source pixel data
    104  * d4,  d5,  d6,  d7  - contain loaded destination pixels (if they are needed)
    105  * d24, d25, d26, d27 - contain loaded mask pixel data (if a mask is used)
    106  * d28, d29, d30, d31 - place for storing the result (destination pixels)
    107  *
    108  * As can be seen above, four 64-bit NEON registers are used for keeping
    109  * intermediate pixel data and up to 8 pixels can be processed in one step
    110  * for 32bpp formats (16 pixels for 16bpp, 32 pixels for 8bpp).
    111  *
    112  * This particular function uses the following registers allocation:
    113  * d0,  d1,  d2,  d3  - contain loaded source pixel data
    114  * d4,  d5            - contain loaded destination pixels (they are needed)
    115  * d28, d29           - place for storing the result (destination pixels)
    116  */
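
/*
 * As a rough per-pixel model of the whole operation, here is the same
 * computation in plain C. This is only an illustrative sketch (loosely
 * following fast_composite_over_8888_0565 from pixman-fast-path.c; the
 * function names here are ours, not pixman's). mul_div255() models the
 * vmull + vrshr + vraddhn rounding sequence used in the macros below.
 *
 *   #include <stdint.h>
 *
 *   static uint32_t mul_div255 (uint32_t c, uint32_t a)
 *   {
 *       uint32_t t = c * a + 0x80;
 *       return (t + (t >> 8)) >> 8;  // rounding approximation of (c * a) / 255
 *   }
 *
 *   static uint16_t over_8888_0565_pixel (uint32_t src, uint16_t dst)
 *   {
 *       uint32_t ia = 255 - (src >> 24);       // inverted source alpha
 *       uint32_t sr = (src >> 16) & 0xff;      // source is premultiplied
 *       uint32_t sg = (src >> 8) & 0xff;
 *       uint32_t sb = src & 0xff;
 *       // expand r5g6b5 to 8 bits per channel, replicating high bits
 *       uint32_t dr = (dst >> 8) & 0xf8;  dr |= dr >> 5;
 *       uint32_t dg = (dst >> 3) & 0xfc;  dg |= dg >> 6;
 *       uint32_t db = (dst << 3) & 0xf8;  db |= db >> 5;
 *       // OVER: result = src + dst * (1 - src.alpha), saturated
 *       uint32_t r = sr + mul_div255 (dr, ia);  if (r > 255) r = 255;
 *       uint32_t g = sg + mul_div255 (dg, ia);  if (g > 255) g = 255;
 *       uint32_t b = sb + mul_div255 (db, ia);  if (b > 255) b = 255;
 *       return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
 *   }
 */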
    117 
    118 /*
    119  * Step one. We need some code that does the arithmetic on the pixel data.
    120  * This is implemented as a pair of macros: '*_head' and '*_tail'. When used
    121  * back-to-back, they take pixel data from {d0, d1, d2, d3} and {d4, d5},
    122  * perform all the needed calculations and write the result to {d28, d29}.
    123  * The rationale for having two macros and not just one will be explained
    124  * later. In practice, any single monolithic function which does the work can
    125  * be split into two parts in any arbitrary way without affecting correctness.
    126  *
    127  * There is one special trick here too. The common template macro can
    128  * optionally make our life a bit easier by deinterleaving the R, G, B, A
    129  * color components for 32bpp pixel formats (this feature is used in the
    130  * 'pixman_composite_over_8888_0565_asm_neon' function). It means that
    131  * instead of having 8 packed pixels in {d0, d1, d2, d3} registers, we
    132  * actually use d0 register for blue channel (a vector of eight 8-bit
    133  * values), d1 register for green, d2 for red and d3 for alpha. This
    134  * simple conversion can also be done with a few NEON instructions:
    135  *
    136  * Packed to planar conversion:
    137  *  vuzp.8 d0, d1
    138  *  vuzp.8 d2, d3
    139  *  vuzp.8 d1, d3
    140  *  vuzp.8 d0, d2
    141  *
    142  * Planar to packed conversion:
    143  *  vzip.8 d0, d2
    144  *  vzip.8 d1, d3
    145  *  vzip.8 d2, d3
    146  *  vzip.8 d0, d1
    147  *
    148  * But pixels can also be loaded directly in planar format using the VLD4.8
    149  * NEON instruction. It is 1 cycle slower than VLD1.32, so this is not
    150  * always desirable; that's why deinterleaving is optional.
    151  *
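 * In C terms (assuming little-endian a8r8g8b8, where the bytes of a pixel
 * appear in memory as b, g, r, a), the deinterleaving done by VLD4.8
 * amounts to the following illustrative sketch:
 *
 *   #include <stdint.h>
 *
 *   void deinterleave_argb (const uint8_t s[32], uint8_t b[8],
 *                           uint8_t g[8], uint8_t r[8], uint8_t a[8])
 *   {
 *       for (int i = 0; i < 8; i++)
 *       {
 *           b[i] = s[4 * i + 0];   // -> d0 (blue plane)
 *           g[i] = s[4 * i + 1];   // -> d1 (green plane)
 *           r[i] = s[4 * i + 2];   // -> d2 (red plane)
 *           a[i] = s[4 * i + 3];   // -> d3 (alpha plane)
 *       }
 *   }
 *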
    152  * But anyway, here is the code:
    153  */
    154 .macro pixman_composite_over_8888_0565_process_pixblock_head
    155     /* convert 8 r5g6b5 pixels from {d4, d5} to planar 8-bit format
    156        and put data into d6 - red, d7 - green, d30 - blue */
    157     vshrn.u16   d6, q2, #8
    158     vshrn.u16   d7, q2, #3
    159     vsli.u16    q2, q2, #5
    160     vsri.u8     d6, d6, #5
    161     vmvn.8      d3, d3      /* invert source alpha */
    162     vsri.u8     d7, d7, #6
    163     vshrn.u16   d30, q2, #2
    164     /* now do alpha blending, storing results in 8-bit planar format
    165        into d16 - red, d19 - green, d18 - blue */
    166     vmull.u8    q10, d3, d6
    167     vmull.u8    q11, d3, d7
    168     vmull.u8    q12, d3, d30
    169     vrshr.u16   q13, q10, #8
    170     vrshr.u16   q3, q11, #8
    171     vrshr.u16   q15, q12, #8
    172     vraddhn.u16 d20, q10, q13
    173     vraddhn.u16 d23, q11, q3
    174     vraddhn.u16 d22, q12, q15
    175 .endm
    176 
    177 .macro pixman_composite_over_8888_0565_process_pixblock_tail
    178     /* ... continue alpha blending */
    179     vqadd.u8    d16, d2, d20
    180     vqadd.u8    q9, q0, q11
    181     /* convert the result to r5g6b5 and store it into {d28, d29} */
    182     vshll.u8    q14, d16, #8
    183     vshll.u8    q8, d19, #8
    184     vshll.u8    q9, d18, #8
    185     vsri.u16    q14, q8, #5
    186     vsri.u16    q14, q9, #11
    187 .endm
    188 
    189 /*
    190  * OK, now we have almost everything that we need. Using the above two
    191  * macros, the work can already be done correctly. But we want to optimize
    192  * it a bit. ARM Cortex-A8 is an in-order core, and benefits really
    193  * a lot from good code scheduling and software pipelining.
    194  *
    195  * Let's construct some code, which will run in the core main loop.
    196  * Some pseudo-code of the main loop will look like this:
    197  *   head
    198  *   while (...) {
    199  *     tail
    200  *     head
    201  *   }
    202  *   tail
    203  *
    204  * It may look a bit weird, but this setup allows instruction latencies to
    205  * be hidden better and also utilizes the dual-issue capability more
    206  * efficiently (load-store and ALU instructions can be paired).
    207  *
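 * In C terms the same software-pipelined structure looks roughly like the
 * following toy model (purely illustrative, not pixman code): head() starts
 * the work on a block, tail() finishes it, and the loop overlaps them.
 *
 *   static int head (int x) { return x * 2; }       // first half of the work
 *   static int tail (int x) { return x + 1; }       // second half of the work
 *
 *   void process (const int *in, int *out, int n)   // n >= 1 blocks
 *   {
 *       int pending = head (in[0]);
 *       for (int i = 1; i < n; i++)
 *       {
 *           out[i - 1] = tail (pending);             // finish block i-1
 *           pending    = head (in[i]);               // start block i
 *       }
 *       out[n - 1] = tail (pending);                 // finish the last block
 *   }
 *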
    208  * So what we need now is a '*_tail_head' macro, which will be used
    209  * in the core main loop. A trivial straightforward implementation
    210  * of this macro would look like this:
    211  *
    212  *   pixman_composite_over_8888_0565_process_pixblock_tail
    213  *   vst1.16     {d28, d29}, [DST_W, :128]!
    214  *   vld1.16     {d4, d5}, [DST_R, :128]!
    215  *   vld4.32     {d0, d1, d2, d3}, [SRC]!
    216  *   pixman_composite_over_8888_0565_process_pixblock_head
    217  *   cache_preload 8, 8
    218  *
    219  * Now it also contains some VLD/VST instructions. We simply can't move from
    220  * processing one block of pixels to the next with arithmetic alone.
    221  * The previously processed data needs to be written to memory and new
    222  * data needs to be fetched. Fortunately, this main loop does not deal
    223  * with partial leading/trailing pixels and can load/store a full block
    224  * of pixels in a bulk. Additionally, destination buffer is already
    225  * 16 bytes aligned here (which is good for performance).
    226  *
    227  * New things here are DST_R, DST_W, SRC and MASK identifiers. These
    228  * are the aliases for ARM registers which are used as pointers for
    229  * accessing data. We maintain separate pointers for reading and writing
    230  * destination buffer (DST_R and DST_W).
    231  *
    232  * Another new thing is the 'cache_preload' macro. It is used to prefetch
    233  * data into the CPU L2 cache and improves performance when dealing with
    234  * images which are far larger than the cache size. It takes one argument
    235  * (actually two, but they need to be the same here) - the number of pixels
    236  * in a block. Looking into 'pixman-arm-neon-asm.h' can provide some
    237  * details about this macro. Moreover, if good performance is needed,
    238  * the code from this macro needs to be copied into the '*_tail_head' macro
    239  * and mixed with the rest of the code for optimal instruction scheduling.
    240  * We are actually doing it below.
    241  *
    242  * Now after all the explanations, here is the optimized code.
    243  * Different instruction streams (originating from '*_head', '*_tail'
    244  * and 'cache_preload' macro) use different indentation levels for
    245  * better readability. Actually taking the code from one of these
    246  * indentation levels and ignoring a few VLD/VST instructions would
    247  * result in exactly the code from '*_head', '*_tail' or 'cache_preload'
    248  * macro!
    249  */
    250 
    251 #if 1
    252 
    253 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    254         vqadd.u8    d16, d2, d20
    255     vld1.16     {d4, d5}, [DST_R, :128]!
    256         vqadd.u8    q9, q0, q11
    257     vshrn.u16   d6, q2, #8
    258     fetch_src_pixblock
    259     vshrn.u16   d7, q2, #3
    260     vsli.u16    q2, q2, #5
    261         vshll.u8    q14, d16, #8
    262                                     PF add PF_X, PF_X, #8
    263         vshll.u8    q8, d19, #8
    264                                     PF tst PF_CTL, #0xF
    265     vsri.u8     d6, d6, #5
    266                                     PF addne PF_X, PF_X, #8
    267     vmvn.8      d3, d3
    268                                     PF subne PF_CTL, PF_CTL, #1
    269     vsri.u8     d7, d7, #6
    270     vshrn.u16   d30, q2, #2
    271     vmull.u8    q10, d3, d6
    272                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    273     vmull.u8    q11, d3, d7
    274     vmull.u8    q12, d3, d30
    275                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    276         vsri.u16    q14, q8, #5
    277                                     PF cmp PF_X, ORIG_W
    278         vshll.u8    q9, d18, #8
    279     vrshr.u16   q13, q10, #8
    280                                     PF subge PF_X, PF_X, ORIG_W
    281     vrshr.u16   q3, q11, #8
    282     vrshr.u16   q15, q12, #8
    283                                     PF subges PF_CTL, PF_CTL, #0x10
    284         vsri.u16    q14, q9, #11
    285                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    286     vraddhn.u16 d20, q10, q13
    287     vraddhn.u16 d23, q11, q3
    288                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    289     vraddhn.u16 d22, q12, q15
    290         vst1.16     {d28, d29}, [DST_W, :128]!
    291 .endm
    292 
    293 #else
    294 
    295 /* If we did not care much about the performance, we would just use this... */
    296 .macro pixman_composite_over_8888_0565_process_pixblock_tail_head
    297     pixman_composite_over_8888_0565_process_pixblock_tail
    298     vst1.16     {d28, d29}, [DST_W, :128]!
    299     vld1.16     {d4, d5}, [DST_R, :128]!
    300     fetch_src_pixblock
    301     pixman_composite_over_8888_0565_process_pixblock_head
    302     cache_preload 8, 8
    303 .endm
    304 
    305 #endif
    306 
    307 /*
    308  * And now the final part. We use the 'generate_composite_function' macro
    309  * to put all the stuff together. We specify the name of the function
    310  * which we want to get, the number of bits per pixel for the source, mask
    311  * and destination (0 if unused, like the mask in this case). Next come
    312  * some bit flags:
    313  *   FLAG_DST_READWRITE      - tells that the destination buffer is both read
    314  *                             and written, for write-only buffer we would use
    315  *                             FLAG_DST_WRITEONLY flag instead
    316  *   FLAG_DEINTERLEAVE_32BPP - tells that we prefer to work with planar data
    317  *                             and separate color channels for 32bpp format.
    318  * The next things are:
    319  *  - the number of pixels processed per iteration (8 in this case, because
    320  *    that's the maximum that can fit into four 64-bit NEON registers).
    321  *  - prefetch distance, measured in pixel blocks. In this case it is 5
    322  *    blocks of 8 pixels each. That would be 40 pixels, or up to 160 bytes. Optimal
    323  *    prefetch distance can be selected by running some benchmarks.
    324  *
    325  * After that we specify some macros. Here these are 'default_init' and
    326  * 'default_cleanup', which are empty (but it is possible to have custom
    327  * init/cleanup macros which save/restore some extra NEON registers
    328  * like d8-d15, or do anything else), followed by
    329  * 'pixman_composite_over_8888_0565_process_pixblock_head',
    330  * 'pixman_composite_over_8888_0565_process_pixblock_tail' and
    331  * 'pixman_composite_over_8888_0565_process_pixblock_tail_head'
    332  * which we implemented above.
    333  *
    334  * The last part is the NEON registers allocation scheme.
    335  */
    336 generate_composite_function \
    337     pixman_composite_over_8888_0565_asm_neon, 32, 0, 16, \
    338     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    339     8, /* number of pixels, processed in a single block */ \
    340     5, /* prefetch distance */ \
    341     default_init, \
    342     default_cleanup, \
    343     pixman_composite_over_8888_0565_process_pixblock_head, \
    344     pixman_composite_over_8888_0565_process_pixblock_tail, \
    345     pixman_composite_over_8888_0565_process_pixblock_tail_head, \
    346     28, /* dst_w_basereg */ \
    347     4,  /* dst_r_basereg */ \
    348     0,  /* src_basereg   */ \
    349     24  /* mask_basereg  */
    350 
    351 /******************************************************************************/
    352 
    353 .macro pixman_composite_over_n_0565_process_pixblock_head
    354     /* convert 8 r5g6b5 pixels from {d4, d5} to planar 8-bit format
    355        and put data into d6 - red, d7 - green, d30 - blue */
    356     vshrn.u16   d6, q2, #8
    357     vshrn.u16   d7, q2, #3
    358     vsli.u16    q2, q2, #5
    359     vsri.u8     d6, d6, #5
    360     vsri.u8     d7, d7, #6
    361     vshrn.u16   d30, q2, #2
    362     /* now do alpha blending, storing results in 8-bit planar format
    363        into d16 - red, d19 - green, d18 - blue */
    364     vmull.u8    q10, d3, d6
    365     vmull.u8    q11, d3, d7
    366     vmull.u8    q12, d3, d30
    367     vrshr.u16   q13, q10, #8
    368     vrshr.u16   q3, q11, #8
    369     vrshr.u16   q15, q12, #8
    370     vraddhn.u16 d20, q10, q13
    371     vraddhn.u16 d23, q11, q3
    372     vraddhn.u16 d22, q12, q15
    373 .endm
    374 
    375 .macro pixman_composite_over_n_0565_process_pixblock_tail
    376     /* ... continue alpha blending */
    377     vqadd.u8    d16, d2, d20
    378     vqadd.u8    q9, q0, q11
    379     /* convert the result to r5g6b5 and store it into {d28, d29} */
    380     vshll.u8    q14, d16, #8
    381     vshll.u8    q8, d19, #8
    382     vshll.u8    q9, d18, #8
    383     vsri.u16    q14, q8, #5
    384     vsri.u16    q14, q9, #11
    385 .endm
    386 
    387 /* TODO: expand macros and do better instruction scheduling */
    388 .macro pixman_composite_over_n_0565_process_pixblock_tail_head
    389     pixman_composite_over_n_0565_process_pixblock_tail
    390     vld1.16     {d4, d5}, [DST_R, :128]!
    391     vst1.16     {d28, d29}, [DST_W, :128]!
    392     pixman_composite_over_n_0565_process_pixblock_head
    393     cache_preload 8, 8
    394 .endm
    395 
    396 .macro pixman_composite_over_n_0565_init
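    /* fetch the solid source color from the stack and splat its
       b, g, r, a components (one byte each) across d0, d1, d2, d3 */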
    397     add         DUMMY, sp, #ARGS_STACK_OFFSET
    398     vld1.32     {d3[0]}, [DUMMY]
    399     vdup.8      d0, d3[0]
    400     vdup.8      d1, d3[1]
    401     vdup.8      d2, d3[2]
    402     vdup.8      d3, d3[3]
    403     vmvn.8      d3, d3      /* invert source alpha */
    404 .endm
    405 
    406 generate_composite_function \
    407     pixman_composite_over_n_0565_asm_neon, 0, 0, 16, \
    408     FLAG_DST_READWRITE, \
    409     8, /* number of pixels, processed in a single block */ \
    410     5, /* prefetch distance */ \
    411     pixman_composite_over_n_0565_init, \
    412     default_cleanup, \
    413     pixman_composite_over_n_0565_process_pixblock_head, \
    414     pixman_composite_over_n_0565_process_pixblock_tail, \
    415     pixman_composite_over_n_0565_process_pixblock_tail_head, \
    416     28, /* dst_w_basereg */ \
    417     4,  /* dst_r_basereg */ \
    418     0,  /* src_basereg   */ \
    419     24  /* mask_basereg  */
    420 
    421 /******************************************************************************/
    422 
    423 .macro pixman_composite_src_8888_0565_process_pixblock_head
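    /* widen planar green (d1), red (d2) and blue (d0) into the high byte
       of 16-bit lanes; the tail shift-inserts them into r5g6b5 layout */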
    424     vshll.u8    q8, d1, #8
    425     vshll.u8    q14, d2, #8
    426     vshll.u8    q9, d0, #8
    427 .endm
    428 
    429 .macro pixman_composite_src_8888_0565_process_pixblock_tail
    430     vsri.u16    q14, q8, #5
    431     vsri.u16    q14, q9, #11
    432 .endm
    433 
    434 .macro pixman_composite_src_8888_0565_process_pixblock_tail_head
    435         vsri.u16    q14, q8, #5
    436                                     PF add PF_X, PF_X, #8
    437                                     PF tst PF_CTL, #0xF
    438     fetch_src_pixblock
    439                                     PF addne PF_X, PF_X, #8
    440                                     PF subne PF_CTL, PF_CTL, #1
    441         vsri.u16    q14, q9, #11
    442                                     PF cmp PF_X, ORIG_W
    443                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    444     vshll.u8    q8, d1, #8
    445         vst1.16     {d28, d29}, [DST_W, :128]!
    446                                     PF subge PF_X, PF_X, ORIG_W
    447                                     PF subges PF_CTL, PF_CTL, #0x10
    448     vshll.u8    q14, d2, #8
    449                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    450     vshll.u8    q9, d0, #8
    451 .endm
    452 
    453 generate_composite_function \
    454     pixman_composite_src_8888_0565_asm_neon, 32, 0, 16, \
    455     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    456     8, /* number of pixels, processed in a single block */ \
    457     10, /* prefetch distance */ \
    458     default_init, \
    459     default_cleanup, \
    460     pixman_composite_src_8888_0565_process_pixblock_head, \
    461     pixman_composite_src_8888_0565_process_pixblock_tail, \
    462     pixman_composite_src_8888_0565_process_pixblock_tail_head
    463 
    464 /******************************************************************************/
    465 
    466 .macro pixman_composite_src_0565_8888_process_pixblock_head
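    /* convert 8 r5g6b5 pixels from {d0, d1} to planar 8-bit data in
       d28 (blue), d29 (green), d30 (red) and opaque alpha in d31,
       replicating the high bits of each channel into the low bits */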
    467     vshrn.u16   d30, q0, #8
    468     vshrn.u16   d29, q0, #3
    469     vsli.u16    q0, q0, #5
    470     vmov.u8     d31, #255
    471     vsri.u8     d30, d30, #5
    472     vsri.u8     d29, d29, #6
    473     vshrn.u16   d28, q0, #2
    474 .endm
    475 
    476 .macro pixman_composite_src_0565_8888_process_pixblock_tail
    477 .endm
    478 
    479 /* TODO: expand macros and do better instruction scheduling */
    480 .macro pixman_composite_src_0565_8888_process_pixblock_tail_head
    481     pixman_composite_src_0565_8888_process_pixblock_tail
    482     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
    483     fetch_src_pixblock
    484     pixman_composite_src_0565_8888_process_pixblock_head
    485     cache_preload 8, 8
    486 .endm
    487 
    488 generate_composite_function \
    489     pixman_composite_src_0565_8888_asm_neon, 16, 0, 32, \
    490     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    491     8, /* number of pixels, processed in a single block */ \
    492     10, /* prefetch distance */ \
    493     default_init, \
    494     default_cleanup, \
    495     pixman_composite_src_0565_8888_process_pixblock_head, \
    496     pixman_composite_src_0565_8888_process_pixblock_tail, \
    497     pixman_composite_src_0565_8888_process_pixblock_tail_head
    498 
    499 /******************************************************************************/
    500 
    501 .macro pixman_composite_add_8_8_process_pixblock_head
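    /* saturating add of 32 source bytes {d0-d3} to 32 destination
       bytes {d4-d7}, with the results going to {d28-d31} */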
    502     vqadd.u8    q14, q0, q2
    503     vqadd.u8    q15, q1, q3
    504 .endm
    505 
    506 .macro pixman_composite_add_8_8_process_pixblock_tail
    507 .endm
    508 
    509 .macro pixman_composite_add_8_8_process_pixblock_tail_head
    510     fetch_src_pixblock
    511                                     PF add PF_X, PF_X, #32
    512                                     PF tst PF_CTL, #0xF
    513     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
    514                                     PF addne PF_X, PF_X, #32
    515                                     PF subne PF_CTL, PF_CTL, #1
    516         vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
    517                                     PF cmp PF_X, ORIG_W
    518                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    519                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    520                                     PF subge PF_X, PF_X, ORIG_W
    521                                     PF subges PF_CTL, PF_CTL, #0x10
    522     vqadd.u8    q14, q0, q2
    523                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    524                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    525     vqadd.u8    q15, q1, q3
    526 .endm
    527 
    528 generate_composite_function \
    529     pixman_composite_add_8_8_asm_neon, 8, 0, 8, \
    530     FLAG_DST_READWRITE, \
    531     32, /* number of pixels, processed in a single block */ \
    532     10, /* prefetch distance */ \
    533     default_init, \
    534     default_cleanup, \
    535     pixman_composite_add_8_8_process_pixblock_head, \
    536     pixman_composite_add_8_8_process_pixblock_tail, \
    537     pixman_composite_add_8_8_process_pixblock_tail_head
    538 
    539 /******************************************************************************/
    540 
    541 .macro pixman_composite_add_8888_8888_process_pixblock_tail_head
    542     fetch_src_pixblock
    543                                     PF add PF_X, PF_X, #8
    544                                     PF tst PF_CTL, #0xF
    545     vld1.32     {d4, d5, d6, d7}, [DST_R, :128]!
    546                                     PF addne PF_X, PF_X, #8
    547                                     PF subne PF_CTL, PF_CTL, #1
    548         vst1.32     {d28, d29, d30, d31}, [DST_W, :128]!
    549                                     PF cmp PF_X, ORIG_W
    550                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    551                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    552                                     PF subge PF_X, PF_X, ORIG_W
    553                                     PF subges PF_CTL, PF_CTL, #0x10
    554     vqadd.u8    q14, q0, q2
    555                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    556                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    557     vqadd.u8    q15, q1, q3
    558 .endm
    559 
    560 generate_composite_function \
    561     pixman_composite_add_8888_8888_asm_neon, 32, 0, 32, \
    562     FLAG_DST_READWRITE, \
    563     8, /* number of pixels, processed in a single block */ \
    564     10, /* prefetch distance */ \
    565     default_init, \
    566     default_cleanup, \
    567     pixman_composite_add_8_8_process_pixblock_head, \
    568     pixman_composite_add_8_8_process_pixblock_tail, \
    569     pixman_composite_add_8888_8888_process_pixblock_tail_head
    570 
    571 generate_composite_function_single_scanline \
    572     pixman_composite_scanline_add_asm_neon, 32, 0, 32, \
    573     FLAG_DST_READWRITE, \
    574     8, /* number of pixels, processed in a single block */ \
    575     default_init, \
    576     default_cleanup, \
    577     pixman_composite_add_8_8_process_pixblock_head, \
    578     pixman_composite_add_8_8_process_pixblock_tail, \
    579     pixman_composite_add_8888_8888_process_pixblock_tail_head
    580 
    581 /******************************************************************************/
    582 
    583 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_head
    584     vmvn.8      d24, d3  /* get inverted alpha */
    585     /* do alpha blending */
    586     vmull.u8    q8, d24, d4
    587     vmull.u8    q9, d24, d5
    588     vmull.u8    q10, d24, d6
    589     vmull.u8    q11, d24, d7
    590 .endm
    591 
    592 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    593     vrshr.u16   q14, q8, #8
    594     vrshr.u16   q15, q9, #8
    595     vrshr.u16   q12, q10, #8
    596     vrshr.u16   q13, q11, #8
    597     vraddhn.u16 d28, q14, q8
    598     vraddhn.u16 d29, q15, q9
    599     vraddhn.u16 d30, q12, q10
    600     vraddhn.u16 d31, q13, q11
    601 .endm
    602 
    603 .macro pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    604     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    605         vrshr.u16   q14, q8, #8
    606                                     PF add PF_X, PF_X, #8
    607                                     PF tst PF_CTL, #0xF
    608         vrshr.u16   q15, q9, #8
    609         vrshr.u16   q12, q10, #8
    610         vrshr.u16   q13, q11, #8
    611                                     PF addne PF_X, PF_X, #8
    612                                     PF subne PF_CTL, PF_CTL, #1
    613         vraddhn.u16 d28, q14, q8
    614         vraddhn.u16 d29, q15, q9
    615                                     PF cmp PF_X, ORIG_W
    616         vraddhn.u16 d30, q12, q10
    617         vraddhn.u16 d31, q13, q11
    618     fetch_src_pixblock
    619                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    620     vmvn.8      d22, d3
    621                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    622         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    623                                     PF subge PF_X, PF_X, ORIG_W
    624     vmull.u8    q8, d22, d4
    625                                     PF subges PF_CTL, PF_CTL, #0x10
    626     vmull.u8    q9, d22, d5
    627                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    628     vmull.u8    q10, d22, d6
    629                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    630     vmull.u8    q11, d22, d7
    631 .endm
    632 
    633 generate_composite_function_single_scanline \
    634     pixman_composite_scanline_out_reverse_asm_neon, 32, 0, 32, \
    635     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    636     8, /* number of pixels, processed in a single block */ \
    637     default_init, \
    638     default_cleanup, \
    639     pixman_composite_out_reverse_8888_8888_process_pixblock_head, \
    640     pixman_composite_out_reverse_8888_8888_process_pixblock_tail, \
    641     pixman_composite_out_reverse_8888_8888_process_pixblock_tail_head
    642 
    643 /******************************************************************************/
    644 
    645 .macro pixman_composite_over_8888_8888_process_pixblock_head
    646     pixman_composite_out_reverse_8888_8888_process_pixblock_head
    647 .endm
    648 
    649 .macro pixman_composite_over_8888_8888_process_pixblock_tail
    650     pixman_composite_out_reverse_8888_8888_process_pixblock_tail
    651     vqadd.u8    q14, q0, q14
    652     vqadd.u8    q15, q1, q15
    653 .endm
    654 
    655 .macro pixman_composite_over_8888_8888_process_pixblock_tail_head
    656     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    657         vrshr.u16   q14, q8, #8
    658                                     PF add PF_X, PF_X, #8
    659                                     PF tst PF_CTL, #0xF
    660         vrshr.u16   q15, q9, #8
    661         vrshr.u16   q12, q10, #8
    662         vrshr.u16   q13, q11, #8
    663                                     PF addne PF_X, PF_X, #8
    664                                     PF subne PF_CTL, PF_CTL, #1
    665         vraddhn.u16 d28, q14, q8
    666         vraddhn.u16 d29, q15, q9
    667                                     PF cmp PF_X, ORIG_W
    668         vraddhn.u16 d30, q12, q10
    669         vraddhn.u16 d31, q13, q11
    670         vqadd.u8    q14, q0, q14
    671         vqadd.u8    q15, q1, q15
    672     fetch_src_pixblock
    673                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    674     vmvn.8      d22, d3
    675                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    676         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    677                                     PF subge PF_X, PF_X, ORIG_W
    678     vmull.u8    q8, d22, d4
    679                                     PF subges PF_CTL, PF_CTL, #0x10
    680     vmull.u8    q9, d22, d5
    681                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    682     vmull.u8    q10, d22, d6
    683                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    684     vmull.u8    q11, d22, d7
    685 .endm
    686 
    687 generate_composite_function \
    688     pixman_composite_over_8888_8888_asm_neon, 32, 0, 32, \
    689     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    690     8, /* number of pixels, processed in a single block */ \
    691     5, /* prefetch distance */ \
    692     default_init, \
    693     default_cleanup, \
    694     pixman_composite_over_8888_8888_process_pixblock_head, \
    695     pixman_composite_over_8888_8888_process_pixblock_tail, \
    696     pixman_composite_over_8888_8888_process_pixblock_tail_head
    697 
    698 generate_composite_function_single_scanline \
    699     pixman_composite_scanline_over_asm_neon, 32, 0, 32, \
    700     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    701     8, /* number of pixels, processed in a single block */ \
    702     default_init, \
    703     default_cleanup, \
    704     pixman_composite_over_8888_8888_process_pixblock_head, \
    705     pixman_composite_over_8888_8888_process_pixblock_tail, \
    706     pixman_composite_over_8888_8888_process_pixblock_tail_head
    707 
    708 /******************************************************************************/
    709 
    710 .macro pixman_composite_over_n_8888_process_pixblock_head
    711     /* deinterleaved source pixels in {d0, d1, d2, d3} */
    712     /* inverted alpha in {d24} */
    713     /* destination pixels in {d4, d5, d6, d7} */
    714     vmull.u8    q8, d24, d4
    715     vmull.u8    q9, d24, d5
    716     vmull.u8    q10, d24, d6
    717     vmull.u8    q11, d24, d7
    718 .endm
    719 
    720 .macro pixman_composite_over_n_8888_process_pixblock_tail
    721     vrshr.u16   q14, q8, #8
    722     vrshr.u16   q15, q9, #8
    723     vrshr.u16   q2, q10, #8
    724     vrshr.u16   q3, q11, #8
    725     vraddhn.u16 d28, q14, q8
    726     vraddhn.u16 d29, q15, q9
    727     vraddhn.u16 d30, q2, q10
    728     vraddhn.u16 d31, q3, q11
    729     vqadd.u8    q14, q0, q14
    730     vqadd.u8    q15, q1, q15
    731 .endm
    732 
    733 .macro pixman_composite_over_n_8888_process_pixblock_tail_head
    734         vrshr.u16   q14, q8, #8
    735         vrshr.u16   q15, q9, #8
    736         vrshr.u16   q2, q10, #8
    737         vrshr.u16   q3, q11, #8
    738         vraddhn.u16 d28, q14, q8
    739         vraddhn.u16 d29, q15, q9
    740         vraddhn.u16 d30, q2, q10
    741         vraddhn.u16 d31, q3, q11
    742     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
    743         vqadd.u8    q14, q0, q14
    744                                     PF add PF_X, PF_X, #8
    745                                     PF tst PF_CTL, #0x0F
    746                                     PF addne PF_X, PF_X, #8
    747                                     PF subne PF_CTL, PF_CTL, #1
    748         vqadd.u8    q15, q1, q15
    749                                     PF cmp PF_X, ORIG_W
    750     vmull.u8    q8, d24, d4
    751                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    752     vmull.u8    q9, d24, d5
    753                                     PF subge PF_X, PF_X, ORIG_W
    754     vmull.u8    q10, d24, d6
    755                                     PF subges PF_CTL, PF_CTL, #0x10
    756     vmull.u8    q11, d24, d7
    757                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    758         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    759 .endm
    760 
    761 .macro pixman_composite_over_n_8888_init
    762     add         DUMMY, sp, #ARGS_STACK_OFFSET
    763     vld1.32     {d3[0]}, [DUMMY]
    764     vdup.8      d0, d3[0]
    765     vdup.8      d1, d3[1]
    766     vdup.8      d2, d3[2]
    767     vdup.8      d3, d3[3]
    768     vmvn.8      d24, d3  /* get inverted alpha */
    769 .endm
    770 
    771 generate_composite_function \
    772     pixman_composite_over_n_8888_asm_neon, 0, 0, 32, \
    773     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    774     8, /* number of pixels, processed in a single block */ \
    775     5, /* prefetch distance */ \
    776     pixman_composite_over_n_8888_init, \
    777     default_cleanup, \
    778     pixman_composite_over_8888_8888_process_pixblock_head, \
    779     pixman_composite_over_8888_8888_process_pixblock_tail, \
    780     pixman_composite_over_n_8888_process_pixblock_tail_head
    781 
    782 /******************************************************************************/
    783 
    784 .macro pixman_composite_over_reverse_n_8888_process_pixblock_tail_head
    785         vrshr.u16   q14, q8, #8
    786                                     PF add PF_X, PF_X, #8
    787                                     PF tst PF_CTL, #0xF
    788         vrshr.u16   q15, q9, #8
    789         vrshr.u16   q12, q10, #8
    790         vrshr.u16   q13, q11, #8
    791                                     PF addne PF_X, PF_X, #8
    792                                     PF subne PF_CTL, PF_CTL, #1
    793         vraddhn.u16 d28, q14, q8
    794         vraddhn.u16 d29, q15, q9
    795                                     PF cmp PF_X, ORIG_W
    796         vraddhn.u16 d30, q12, q10
    797         vraddhn.u16 d31, q13, q11
    798         vqadd.u8    q14, q0, q14
    799         vqadd.u8    q15, q1, q15
    800     vld4.8      {d0, d1, d2, d3}, [DST_R, :128]!
    801     vmvn.8      d22, d3
    802                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    803         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
    804                                     PF subge PF_X, PF_X, ORIG_W
    805     vmull.u8    q8, d22, d4
    806                                     PF subges PF_CTL, PF_CTL, #0x10
    807     vmull.u8    q9, d22, d5
    808     vmull.u8    q10, d22, d6
    809                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    810     vmull.u8    q11, d22, d7
    811 .endm
    812 
    813 .macro pixman_composite_over_reverse_n_8888_init
    814     add         DUMMY, sp, #ARGS_STACK_OFFSET
    815     vld1.32     {d7[0]}, [DUMMY]
    816     vdup.8      d4, d7[0]
    817     vdup.8      d5, d7[1]
    818     vdup.8      d6, d7[2]
    819     vdup.8      d7, d7[3]
    820 .endm
    821 
    822 generate_composite_function \
    823     pixman_composite_over_reverse_n_8888_asm_neon, 0, 0, 32, \
    824     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    825     8, /* number of pixels, processed in a single block */ \
    826     5, /* prefetch distance */ \
    827     pixman_composite_over_reverse_n_8888_init, \
    828     default_cleanup, \
    829     pixman_composite_over_8888_8888_process_pixblock_head, \
    830     pixman_composite_over_8888_8888_process_pixblock_tail, \
    831     pixman_composite_over_reverse_n_8888_process_pixblock_tail_head, \
    832     28, /* dst_w_basereg */ \
    833     0,  /* dst_r_basereg */ \
    834     4,  /* src_basereg   */ \
    835     24  /* mask_basereg  */
    836 
    837 /******************************************************************************/
    838 
    839 .macro pixman_composite_over_8888_8_0565_process_pixblock_head
    840     vmull.u8    q0,  d24, d8    /* IN for SRC pixels (part1) */
    841     vmull.u8    q1,  d24, d9
    842     vmull.u8    q6,  d24, d10
    843     vmull.u8    q7,  d24, d11
    844         vshrn.u16   d6,  q2, #8 /* convert DST_R data to 32-bpp (part1) */
    845         vshrn.u16   d7,  q2, #3
    846         vsli.u16    q2,  q2, #5
    847     vrshr.u16   q8,  q0,  #8    /* IN for SRC pixels (part2) */
    848     vrshr.u16   q9,  q1,  #8
    849     vrshr.u16   q10, q6,  #8
    850     vrshr.u16   q11, q7,  #8
    851     vraddhn.u16 d0,  q0,  q8
    852     vraddhn.u16 d1,  q1,  q9
    853     vraddhn.u16 d2,  q6,  q10
    854     vraddhn.u16 d3,  q7,  q11
    855         vsri.u8     d6,  d6, #5 /* convert DST_R data to 32-bpp (part2) */
    856         vsri.u8     d7,  d7, #6
    857     vmvn.8      d3,  d3
    858         vshrn.u16   d30, q2, #2
    859     vmull.u8    q8,  d3, d6     /* now do alpha blending */
    860     vmull.u8    q9,  d3, d7
    861     vmull.u8    q10, d3, d30
    862 .endm
    863 
    864 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail
    865     /* 3 cycle bubble (after vmull.u8) */
    866     vrshr.u16   q13, q8,  #8
    867     vrshr.u16   q11, q9,  #8
    868     vrshr.u16   q15, q10, #8
    869     vraddhn.u16 d16, q8,  q13
    870     vraddhn.u16 d27, q9,  q11
    871     vraddhn.u16 d26, q10, q15
    872     vqadd.u8    d16, d2,  d16
    873     /* 1 cycle bubble */
    874     vqadd.u8    q9,  q0,  q13
    875     vshll.u8    q14, d16, #8    /* convert to 16bpp */
    876     vshll.u8    q8,  d19, #8
    877     vshll.u8    q9,  d18, #8
    878     vsri.u16    q14, q8,  #5
    879     /* 1 cycle bubble */
    880     vsri.u16    q14, q9,  #11
    881 .endm
    882 
    883 .macro pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    884     vld1.16     {d4, d5}, [DST_R, :128]!
    885     vshrn.u16   d6,  q2,  #8
    886     fetch_mask_pixblock
    887     vshrn.u16   d7,  q2,  #3
    888     fetch_src_pixblock
    889     vmull.u8    q6,  d24, d10
    890         vrshr.u16   q13, q8,  #8
    891         vrshr.u16   q11, q9,  #8
    892         vrshr.u16   q15, q10, #8
    893         vraddhn.u16 d16, q8,  q13
    894         vraddhn.u16 d27, q9,  q11
    895         vraddhn.u16 d26, q10, q15
    896         vqadd.u8    d16, d2,  d16
    897     vmull.u8    q1,  d24, d9
    898         vqadd.u8    q9,  q0,  q13
    899         vshll.u8    q14, d16, #8
    900     vmull.u8    q0,  d24, d8
    901         vshll.u8    q8,  d19, #8
    902         vshll.u8    q9,  d18, #8
    903         vsri.u16    q14, q8,  #5
    904     vmull.u8    q7,  d24, d11
    905         vsri.u16    q14, q9,  #11
    906 
    907     cache_preload 8, 8
    908 
    909     vsli.u16    q2,  q2,  #5
    910     vrshr.u16   q8,  q0,  #8
    911     vrshr.u16   q9,  q1,  #8
    912     vrshr.u16   q10, q6,  #8
    913     vrshr.u16   q11, q7,  #8
    914     vraddhn.u16 d0,  q0,  q8
    915     vraddhn.u16 d1,  q1,  q9
    916     vraddhn.u16 d2,  q6,  q10
    917     vraddhn.u16 d3,  q7,  q11
    918     vsri.u8     d6,  d6,  #5
    919     vsri.u8     d7,  d7,  #6
    920     vmvn.8      d3,  d3
    921     vshrn.u16   d30, q2,  #2
    922     vst1.16     {d28, d29}, [DST_W, :128]!
    923     vmull.u8    q8,  d3,  d6
    924     vmull.u8    q9,  d3,  d7
    925     vmull.u8    q10, d3,  d30
    926 .endm
    927 
    928 generate_composite_function \
    929     pixman_composite_over_8888_8_0565_asm_neon, 32, 8, 16, \
    930     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    931     8, /* number of pixels, processed in a single block */ \
    932     5, /* prefetch distance */ \
    933     default_init_need_all_regs, \
    934     default_cleanup_need_all_regs, \
    935     pixman_composite_over_8888_8_0565_process_pixblock_head, \
    936     pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    937     pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
    938     28, /* dst_w_basereg */ \
    939     4,  /* dst_r_basereg */ \
    940     8,  /* src_basereg   */ \
    941     24  /* mask_basereg  */
    942 
    943 /******************************************************************************/
    944 
    945 /*
    946  * This function needs a special initialization of the solid source.
    947  * Solid source pixel data is fetched from the stack at ARGS_STACK_OFFSET,
    948  * split into color components and replicated into the d8-d11
    949  * registers. Additionally, this function needs all the NEON registers,
    950  * so it has to save the d8-d15 registers, which are callee saved according
    951  * to the ABI. These registers are restored in the 'cleanup' macro. All the
    952  * other NEON registers are caller saved, so they can be clobbered freely
    953  * without introducing any problems.
    954  */
    955 .macro pixman_composite_over_n_8_0565_init
    956     add         DUMMY, sp, #ARGS_STACK_OFFSET
    957     vpush       {d8-d15}
    958     vld1.32     {d11[0]}, [DUMMY]
    959     vdup.8      d8, d11[0]
    960     vdup.8      d9, d11[1]
    961     vdup.8      d10, d11[2]
    962     vdup.8      d11, d11[3]
    963 .endm
    964 
    965 .macro pixman_composite_over_n_8_0565_cleanup
    966     vpop        {d8-d15}
    967 .endm
    968 
    969 generate_composite_function \
    970     pixman_composite_over_n_8_0565_asm_neon, 0, 8, 16, \
    971     FLAG_DST_READWRITE, \
    972     8, /* number of pixels, processed in a single block */ \
    973     5, /* prefetch distance */ \
    974     pixman_composite_over_n_8_0565_init, \
    975     pixman_composite_over_n_8_0565_cleanup, \
    976     pixman_composite_over_8888_8_0565_process_pixblock_head, \
    977     pixman_composite_over_8888_8_0565_process_pixblock_tail, \
    978     pixman_composite_over_8888_8_0565_process_pixblock_tail_head
    979 
    980 /******************************************************************************/
    981 
    982 .macro pixman_composite_over_8888_n_0565_init
    983     add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    984     vpush       {d8-d15}
    985     vld1.32     {d24[0]}, [DUMMY]
    986     vdup.8      d24, d24[3]
    987 .endm
    988 
    989 .macro pixman_composite_over_8888_n_0565_cleanup
    990     vpop        {d8-d15}
    991 .endm
    992 
    993 generate_composite_function \
    994     pixman_composite_over_8888_n_0565_asm_neon, 32, 0, 16, \
    995     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    996     8, /* number of pixels, processed in a single block */ \
    997     5, /* prefetch distance */ \
    998     pixman_composite_over_8888_n_0565_init, \
    999     pixman_composite_over_8888_n_0565_cleanup, \
   1000     pixman_composite_over_8888_8_0565_process_pixblock_head, \
   1001     pixman_composite_over_8888_8_0565_process_pixblock_tail, \
   1002     pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
   1003     28, /* dst_w_basereg */ \
   1004     4,  /* dst_r_basereg */ \
   1005     8,  /* src_basereg   */ \
   1006     24  /* mask_basereg  */
   1007 
   1008 /******************************************************************************/
   1009 
   1010 .macro pixman_composite_src_0565_0565_process_pixblock_head
   1011 .endm
   1012 
   1013 .macro pixman_composite_src_0565_0565_process_pixblock_tail
   1014 .endm
   1015 
   1016 .macro pixman_composite_src_0565_0565_process_pixblock_tail_head
   1017     vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
   1018     fetch_src_pixblock
   1019     cache_preload 16, 16
   1020 .endm
   1021 
   1022 generate_composite_function \
   1023     pixman_composite_src_0565_0565_asm_neon, 16, 0, 16, \
   1024     FLAG_DST_WRITEONLY, \
   1025     16, /* number of pixels, processed in a single block */ \
   1026     10, /* prefetch distance */ \
   1027     default_init, \
   1028     default_cleanup, \
   1029     pixman_composite_src_0565_0565_process_pixblock_head, \
   1030     pixman_composite_src_0565_0565_process_pixblock_tail, \
   1031     pixman_composite_src_0565_0565_process_pixblock_tail_head, \
   1032     0, /* dst_w_basereg */ \
   1033     0, /* dst_r_basereg */ \
   1034     0, /* src_basereg   */ \
   1035     0  /* mask_basereg  */
   1036 
   1037 /******************************************************************************/
   1038 
   1039 .macro pixman_composite_src_n_8_process_pixblock_head
   1040 .endm
   1041 
   1042 .macro pixman_composite_src_n_8_process_pixblock_tail
   1043 .endm
   1044 
   1045 .macro pixman_composite_src_n_8_process_pixblock_tail_head
   1046     vst1.8  {d0, d1, d2, d3}, [DST_W, :128]!
   1047 .endm
   1048 
   1049 .macro pixman_composite_src_n_8_init
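    /* load the 32-bit solid value from the stack, then replicate its low
       byte across all of {d0-d3} with doubling shift-inserts
       (8 -> 16 -> 32 -> 64 bits) and plain register copies */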
   1050     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1051     vld1.32     {d0[0]}, [DUMMY]
   1052     vsli.u64    d0, d0, #8
   1053     vsli.u64    d0, d0, #16
   1054     vsli.u64    d0, d0, #32
   1055     vorr        d1, d0, d0
   1056     vorr        q1, q0, q0
   1057 .endm
   1058 
   1059 .macro pixman_composite_src_n_8_cleanup
   1060 .endm
   1061 
   1062 generate_composite_function \
   1063     pixman_composite_src_n_8_asm_neon, 0, 0, 8, \
   1064     FLAG_DST_WRITEONLY, \
   1065     32, /* number of pixels, processed in a single block */ \
   1066     0,  /* prefetch distance */ \
   1067     pixman_composite_src_n_8_init, \
   1068     pixman_composite_src_n_8_cleanup, \
   1069     pixman_composite_src_n_8_process_pixblock_head, \
   1070     pixman_composite_src_n_8_process_pixblock_tail, \
   1071     pixman_composite_src_n_8_process_pixblock_tail_head, \
   1072     0, /* dst_w_basereg */ \
   1073     0, /* dst_r_basereg */ \
   1074     0, /* src_basereg   */ \
   1075     0  /* mask_basereg  */
   1076 
   1077 /******************************************************************************/
   1078 
   1079 .macro pixman_composite_src_n_0565_process_pixblock_head
   1080 .endm
   1081 
   1082 .macro pixman_composite_src_n_0565_process_pixblock_tail
   1083 .endm
   1084 
   1085 .macro pixman_composite_src_n_0565_process_pixblock_tail_head
   1086     vst1.16 {d0, d1, d2, d3}, [DST_W, :128]!
   1087 .endm
   1088 
   1089 .macro pixman_composite_src_n_0565_init
   1090     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1091     vld1.32     {d0[0]}, [DUMMY]
   1092     vsli.u64    d0, d0, #16
   1093     vsli.u64    d0, d0, #32
   1094     vorr        d1, d0, d0
   1095     vorr        q1, q0, q0
   1096 .endm
   1097 
   1098 .macro pixman_composite_src_n_0565_cleanup
   1099 .endm
   1100 
   1101 generate_composite_function \
   1102     pixman_composite_src_n_0565_asm_neon, 0, 0, 16, \
   1103     FLAG_DST_WRITEONLY, \
   1104     16, /* number of pixels, processed in a single block */ \
   1105     0,  /* prefetch distance */ \
   1106     pixman_composite_src_n_0565_init, \
   1107     pixman_composite_src_n_0565_cleanup, \
   1108     pixman_composite_src_n_0565_process_pixblock_head, \
   1109     pixman_composite_src_n_0565_process_pixblock_tail, \
   1110     pixman_composite_src_n_0565_process_pixblock_tail_head, \
   1111     0, /* dst_w_basereg */ \
   1112     0, /* dst_r_basereg */ \
   1113     0, /* src_basereg   */ \
   1114     0  /* mask_basereg  */
   1115 
   1116 /******************************************************************************/
   1117 
   1118 .macro pixman_composite_src_n_8888_process_pixblock_head
   1119 .endm
   1120 
   1121 .macro pixman_composite_src_n_8888_process_pixblock_tail
   1122 .endm
   1123 
   1124 .macro pixman_composite_src_n_8888_process_pixblock_tail_head
   1125     vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
   1126 .endm
   1127 
   1128 .macro pixman_composite_src_n_8888_init
   1129     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1130     vld1.32     {d0[0]}, [DUMMY]
   1131     vsli.u64    d0, d0, #32
   1132     vorr        d1, d0, d0
   1133     vorr        q1, q0, q0
   1134 .endm
   1135 
   1136 .macro pixman_composite_src_n_8888_cleanup
   1137 .endm
   1138 
   1139 generate_composite_function \
   1140     pixman_composite_src_n_8888_asm_neon, 0, 0, 32, \
   1141     FLAG_DST_WRITEONLY, \
   1142     8, /* number of pixels, processed in a single block */ \
   1143     0, /* prefetch distance */ \
   1144     pixman_composite_src_n_8888_init, \
   1145     pixman_composite_src_n_8888_cleanup, \
   1146     pixman_composite_src_n_8888_process_pixblock_head, \
   1147     pixman_composite_src_n_8888_process_pixblock_tail, \
   1148     pixman_composite_src_n_8888_process_pixblock_tail_head, \
   1149     0, /* dst_w_basereg */ \
   1150     0, /* dst_r_basereg */ \
   1151     0, /* src_basereg   */ \
   1152     0  /* mask_basereg  */
   1153 
   1154 /******************************************************************************/
   1155 
   1156 .macro pixman_composite_src_8888_8888_process_pixblock_head
   1157 .endm
   1158 
   1159 .macro pixman_composite_src_8888_8888_process_pixblock_tail
   1160 .endm
   1161 
   1162 .macro pixman_composite_src_8888_8888_process_pixblock_tail_head
   1163     vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
   1164     fetch_src_pixblock
   1165     cache_preload 8, 8
   1166 .endm
   1167 
   1168 generate_composite_function \
   1169     pixman_composite_src_8888_8888_asm_neon, 32, 0, 32, \
   1170     FLAG_DST_WRITEONLY, \
   1171     8, /* number of pixels, processed in a single block */ \
   1172     10, /* prefetch distance */ \
   1173     default_init, \
   1174     default_cleanup, \
   1175     pixman_composite_src_8888_8888_process_pixblock_head, \
   1176     pixman_composite_src_8888_8888_process_pixblock_tail, \
   1177     pixman_composite_src_8888_8888_process_pixblock_tail_head, \
   1178     0, /* dst_w_basereg */ \
   1179     0, /* dst_r_basereg */ \
   1180     0, /* src_basereg   */ \
   1181     0  /* mask_basereg  */
   1182 
   1183 /******************************************************************************/
   1184 
   1185 .macro pixman_composite_src_x888_8888_process_pixblock_head
   1186     vorr     q0, q0, q2
   1187     vorr     q1, q1, q2
   1188 .endm
   1189 
   1190 .macro pixman_composite_src_x888_8888_process_pixblock_tail
   1191 .endm
   1192 
   1193 .macro pixman_composite_src_x888_8888_process_pixblock_tail_head
   1194     vst1.32 {d0, d1, d2, d3}, [DST_W, :128]!
   1195     fetch_src_pixblock
   1196     vorr     q0, q0, q2
   1197     vorr     q1, q1, q2
   1198     cache_preload 8, 8
   1199 .endm
   1200 
   1201 .macro pixman_composite_src_x888_8888_init
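    /* build the constant 0xff000000 in each 32-bit lane of q2; the head
       ORs it into every pixel to force the undefined x8 byte to opaque */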
   1202     vmov.u8  q2, #0xFF
   1203     vshl.u32 q2, q2, #24
   1204 .endm
   1205 
   1206 generate_composite_function \
   1207     pixman_composite_src_x888_8888_asm_neon, 32, 0, 32, \
   1208     FLAG_DST_WRITEONLY, \
   1209     8, /* number of pixels, processed in a single block */ \
   1210     10, /* prefetch distance */ \
   1211     pixman_composite_src_x888_8888_init, \
   1212     default_cleanup, \
   1213     pixman_composite_src_x888_8888_process_pixblock_head, \
   1214     pixman_composite_src_x888_8888_process_pixblock_tail, \
   1215     pixman_composite_src_x888_8888_process_pixblock_tail_head, \
   1216     0, /* dst_w_basereg */ \
   1217     0, /* dst_r_basereg */ \
   1218     0, /* src_basereg   */ \
   1219     0  /* mask_basereg  */
   1220 
   1221 /******************************************************************************/
   1222 
   1223 .macro pixman_composite_src_n_8_8888_process_pixblock_head
   1224     /* expecting solid source in {d0, d1, d2, d3} */
   1225     /* mask is in d24 (d25, d26, d27 are unused) */
   1226 
   1227     /* 'in' operation: multiply color components by the mask values */
   1228     vmull.u8    q8, d24, d0
   1229     vmull.u8    q9, d24, d1
   1230     vmull.u8    q10, d24, d2
   1231     vmull.u8    q11, d24, d3
   1232     vrsra.u16   q8, q8, #8
   1233     vrsra.u16   q9, q9, #8
   1234     vrsra.u16   q10, q10, #8
   1235     vrsra.u16   q11, q11, #8
   1236 .endm
   1237 
   1238 .macro pixman_composite_src_n_8_8888_process_pixblock_tail
   1239     vrshrn.u16  d28, q8, #8
   1240     vrshrn.u16  d29, q9, #8
   1241     vrshrn.u16  d30, q10, #8
   1242     vrshrn.u16  d31, q11, #8
   1243 .endm
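
/*
 * The multiply performed by the head/tail pair above is, per channel,
 * roughly the following C (illustrative names, not code from the pixman
 * sources):
 *
 *     uint16_t t = s * m;              // vmull.u8
 *     t += (t + 128) >> 8;             // vrsra.u16 #8
 *     uint8_t  d = (t + 128) >> 8;     // vrshrn.u16 #8 (tail)
 *
 * which rounds s * m / 255 to the nearest integer, i.e. an exact
 * division by 255 for 8-bit operands.
 */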
   1244 
   1245 .macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
   1246     fetch_mask_pixblock
   1247                                     PF add PF_X, PF_X, #8
   1248         vrshrn.u16  d28, q8, #8
   1249                                     PF tst PF_CTL, #0x0F
   1250         vrshrn.u16  d29, q9, #8
   1251                                     PF addne PF_X, PF_X, #8
   1252         vrshrn.u16  d30, q10, #8
   1253                                     PF subne PF_CTL, PF_CTL, #1
   1254         vrshrn.u16  d31, q11, #8
   1255                                     PF cmp PF_X, ORIG_W
   1256     vmull.u8    q8, d24, d0
   1257                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
   1258     vmull.u8    q9, d24, d1
   1259                                     PF subge PF_X, PF_X, ORIG_W
   1260     vmull.u8    q10, d24, d2
   1261                                     PF subges PF_CTL, PF_CTL, #0x10
   1262     vmull.u8    q11, d24, d3
   1263                                     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
   1264         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1265     vrsra.u16   q8, q8, #8
   1266     vrsra.u16   q9, q9, #8
   1267     vrsra.u16   q10, q10, #8
   1268     vrsra.u16   q11, q11, #8
   1269 .endm
   1270 
   1271 .macro pixman_composite_src_n_8_8888_init
   1272     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1273     vld1.32     {d3[0]}, [DUMMY]
   1274     vdup.8      d0, d3[0]
   1275     vdup.8      d1, d3[1]
   1276     vdup.8      d2, d3[2]
   1277     vdup.8      d3, d3[3]
   1278 .endm
   1279 
   1280 .macro pixman_composite_src_n_8_8888_cleanup
   1281 .endm
   1282 
   1283 generate_composite_function \
   1284     pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
   1285     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   1287     5, /* prefetch distance */ \
   1288     pixman_composite_src_n_8_8888_init, \
   1289     pixman_composite_src_n_8_8888_cleanup, \
   1290     pixman_composite_src_n_8_8888_process_pixblock_head, \
   1291     pixman_composite_src_n_8_8888_process_pixblock_tail, \
    pixman_composite_src_n_8_8888_process_pixblock_tail_head
   1293 
   1294 /******************************************************************************/
   1295 
   1296 .macro pixman_composite_src_n_8_8_process_pixblock_head
   1297     vmull.u8    q0, d24, d16
   1298     vmull.u8    q1, d25, d16
   1299     vmull.u8    q2, d26, d16
   1300     vmull.u8    q3, d27, d16
   1301     vrsra.u16   q0, q0,  #8
   1302     vrsra.u16   q1, q1,  #8
   1303     vrsra.u16   q2, q2,  #8
   1304     vrsra.u16   q3, q3,  #8
   1305 .endm
   1306 
   1307 .macro pixman_composite_src_n_8_8_process_pixblock_tail
   1308     vrshrn.u16  d28, q0, #8
   1309     vrshrn.u16  d29, q1, #8
   1310     vrshrn.u16  d30, q2, #8
   1311     vrshrn.u16  d31, q3, #8
   1312 .endm
   1313 
   1314 .macro pixman_composite_src_n_8_8_process_pixblock_tail_head
   1315     fetch_mask_pixblock
   1316                                     PF add PF_X, PF_X, #8
   1317         vrshrn.u16  d28, q0, #8
   1318                                     PF tst PF_CTL, #0x0F
   1319         vrshrn.u16  d29, q1, #8
   1320                                     PF addne PF_X, PF_X, #8
   1321         vrshrn.u16  d30, q2, #8
   1322                                     PF subne PF_CTL, PF_CTL, #1
   1323         vrshrn.u16  d31, q3, #8
   1324                                     PF cmp PF_X, ORIG_W
   1325     vmull.u8    q0,  d24, d16
   1326                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
   1327     vmull.u8    q1,  d25, d16
   1328                                     PF subge PF_X, PF_X, ORIG_W
   1329     vmull.u8    q2,  d26, d16
   1330                                     PF subges PF_CTL, PF_CTL, #0x10
   1331     vmull.u8    q3,  d27, d16
   1332                                     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
   1333         vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1334     vrsra.u16   q0, q0,  #8
   1335     vrsra.u16   q1, q1,  #8
   1336     vrsra.u16   q2, q2,  #8
   1337     vrsra.u16   q3, q3,  #8
   1338 .endm
   1339 
   1340 .macro pixman_composite_src_n_8_8_init
   1341     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1342     vld1.32     {d16[0]}, [DUMMY]
   1343     vdup.8      d16, d16[3]
   1344 .endm
   1345 
   1346 .macro pixman_composite_src_n_8_8_cleanup
   1347 .endm
   1348 
   1349 generate_composite_function \
   1350     pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
   1351     FLAG_DST_WRITEONLY, \
    32, /* number of pixels processed in a single block */ \
   1353     5, /* prefetch distance */ \
   1354     pixman_composite_src_n_8_8_init, \
   1355     pixman_composite_src_n_8_8_cleanup, \
   1356     pixman_composite_src_n_8_8_process_pixblock_head, \
   1357     pixman_composite_src_n_8_8_process_pixblock_tail, \
   1358     pixman_composite_src_n_8_8_process_pixblock_tail_head
   1359 
   1360 /******************************************************************************/
   1361 
   1362 .macro pixman_composite_over_n_8_8888_process_pixblock_head
   1363     /* expecting deinterleaved source data in {d8, d9, d10, d11} */
   1364     /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
   1365     /* and destination data in {d4, d5, d6, d7} */
   1366     /* mask is in d24 (d25, d26, d27 are unused) */
   1367 
   1368     /* in */
   1369     vmull.u8    q6, d24, d8
   1370     vmull.u8    q7, d24, d9
   1371     vmull.u8    q8, d24, d10
   1372     vmull.u8    q9, d24, d11
   1373     vrshr.u16   q10, q6, #8
   1374     vrshr.u16   q11, q7, #8
   1375     vrshr.u16   q12, q8, #8
   1376     vrshr.u16   q13, q9, #8
   1377     vraddhn.u16 d0, q6, q10
   1378     vraddhn.u16 d1, q7, q11
   1379     vraddhn.u16 d2, q8, q12
   1380     vraddhn.u16 d3, q9, q13
   1381     vmvn.8      d25, d3  /* get inverted alpha */
   1382     /* source:      d0 - blue, d1 - green, d2 - red, d3 - alpha */
   1383     /* destination: d4 - blue, d5 - green, d6 - red, d7 - alpha */
   1384     /* now do alpha blending */
   1385     vmull.u8    q8, d25, d4
   1386     vmull.u8    q9, d25, d5
   1387     vmull.u8    q10, d25, d6
   1388     vmull.u8    q11, d25, d7
   1389 .endm
   1390 
   1391 .macro pixman_composite_over_n_8_8888_process_pixblock_tail
   1392     vrshr.u16   q14, q8, #8
   1393     vrshr.u16   q15, q9, #8
   1394     vrshr.u16   q6, q10, #8
   1395     vrshr.u16   q7, q11, #8
   1396     vraddhn.u16 d28, q14, q8
   1397     vraddhn.u16 d29, q15, q9
   1398     vraddhn.u16 d30, q6, q10
   1399     vraddhn.u16 d31, q7, q11
   1400     vqadd.u8    q14, q0, q14
   1401     vqadd.u8    q15, q1, q15
   1402 .endm
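
/*
 * A rough per-channel C model of the head/tail pair above (div255() and
 * satadd() are illustrative helpers, not pixman functions: div255(x)
 * rounds x / 255 to nearest, satadd() is a saturating 8-bit add):
 *
 *     s = div255(src * mask);                      // 'in'
 *     d = satadd(s, div255(dst * (255 - s_a)));    // 'over'
 *
 * where s_a is the alpha channel of the masked source s.
 */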
   1403 
   1404 .macro pixman_composite_over_n_8_8888_process_pixblock_tail_head
   1405         vrshr.u16   q14, q8, #8
   1406     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1407         vrshr.u16   q15, q9, #8
   1408     fetch_mask_pixblock
   1409         vrshr.u16   q6, q10, #8
   1410                                     PF add PF_X, PF_X, #8
   1411         vrshr.u16   q7, q11, #8
   1412                                     PF tst PF_CTL, #0x0F
   1413         vraddhn.u16 d28, q14, q8
   1414                                     PF addne PF_X, PF_X, #8
   1415         vraddhn.u16 d29, q15, q9
   1416                                     PF subne PF_CTL, PF_CTL, #1
   1417         vraddhn.u16 d30, q6, q10
   1418                                     PF cmp PF_X, ORIG_W
   1419         vraddhn.u16 d31, q7, q11
   1420                                     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
   1421     vmull.u8    q6, d24, d8
   1422                                     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
   1423     vmull.u8    q7, d24, d9
   1424                                     PF subge PF_X, PF_X, ORIG_W
   1425     vmull.u8    q8, d24, d10
   1426                                     PF subges PF_CTL, PF_CTL, #0x10
   1427     vmull.u8    q9, d24, d11
   1428                                     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
   1429         vqadd.u8    q14, q0, q14
   1430                                     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
   1431         vqadd.u8    q15, q1, q15
   1432     vrshr.u16   q10, q6, #8
   1433     vrshr.u16   q11, q7, #8
   1434     vrshr.u16   q12, q8, #8
   1435     vrshr.u16   q13, q9, #8
   1436     vraddhn.u16 d0, q6, q10
   1437     vraddhn.u16 d1, q7, q11
   1438     vraddhn.u16 d2, q8, q12
   1439     vraddhn.u16 d3, q9, q13
   1440         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1441     vmvn.8      d25, d3
   1442     vmull.u8    q8, d25, d4
   1443     vmull.u8    q9, d25, d5
   1444     vmull.u8    q10, d25, d6
   1445     vmull.u8    q11, d25, d7
   1446 .endm
   1447 
   1448 .macro pixman_composite_over_n_8_8888_init
   1449     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1450     vpush       {d8-d15}
   1451     vld1.32     {d11[0]}, [DUMMY]
   1452     vdup.8      d8, d11[0]
   1453     vdup.8      d9, d11[1]
   1454     vdup.8      d10, d11[2]
   1455     vdup.8      d11, d11[3]
   1456 .endm
   1457 
   1458 .macro pixman_composite_over_n_8_8888_cleanup
   1459     vpop        {d8-d15}
   1460 .endm
   1461 
   1462 generate_composite_function \
   1463     pixman_composite_over_n_8_8888_asm_neon, 0, 8, 32, \
   1464     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   1466     5, /* prefetch distance */ \
   1467     pixman_composite_over_n_8_8888_init, \
   1468     pixman_composite_over_n_8_8888_cleanup, \
   1469     pixman_composite_over_n_8_8888_process_pixblock_head, \
   1470     pixman_composite_over_n_8_8888_process_pixblock_tail, \
   1471     pixman_composite_over_n_8_8888_process_pixblock_tail_head
   1472 
   1473 /******************************************************************************/
   1474 
   1475 .macro pixman_composite_over_n_8_8_process_pixblock_head
   1476     vmull.u8    q0,  d24, d8
   1477     vmull.u8    q1,  d25, d8
   1478     vmull.u8    q6,  d26, d8
   1479     vmull.u8    q7,  d27, d8
   1480     vrshr.u16   q10, q0,  #8
   1481     vrshr.u16   q11, q1,  #8
   1482     vrshr.u16   q12, q6,  #8
   1483     vrshr.u16   q13, q7,  #8
   1484     vraddhn.u16 d0,  q0,  q10
   1485     vraddhn.u16 d1,  q1,  q11
   1486     vraddhn.u16 d2,  q6,  q12
   1487     vraddhn.u16 d3,  q7,  q13
   1488     vmvn.8      q12, q0
   1489     vmvn.8      q13, q1
   1490     vmull.u8    q8,  d24, d4
   1491     vmull.u8    q9,  d25, d5
   1492     vmull.u8    q10, d26, d6
   1493     vmull.u8    q11, d27, d7
   1494 .endm
   1495 
   1496 .macro pixman_composite_over_n_8_8_process_pixblock_tail
   1497     vrshr.u16   q14, q8,  #8
   1498     vrshr.u16   q15, q9,  #8
   1499     vrshr.u16   q12, q10, #8
   1500     vrshr.u16   q13, q11, #8
   1501     vraddhn.u16 d28, q14, q8
   1502     vraddhn.u16 d29, q15, q9
   1503     vraddhn.u16 d30, q12, q10
   1504     vraddhn.u16 d31, q13, q11
   1505     vqadd.u8    q14, q0,  q14
   1506     vqadd.u8    q15, q1,  q15
   1507 .endm
   1508 
/* TODO: expand macros and do better instruction scheduling */
   1510 .macro pixman_composite_over_n_8_8_process_pixblock_tail_head
   1511     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1512     pixman_composite_over_n_8_8_process_pixblock_tail
   1513     fetch_mask_pixblock
   1514     cache_preload 32, 32
   1515     vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1516     pixman_composite_over_n_8_8_process_pixblock_head
   1517 .endm
   1518 
   1519 .macro pixman_composite_over_n_8_8_init
   1520     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1521     vpush       {d8-d15}
   1522     vld1.32     {d8[0]}, [DUMMY]
   1523     vdup.8      d8, d8[3]
   1524 .endm
   1525 
   1526 .macro pixman_composite_over_n_8_8_cleanup
   1527     vpop        {d8-d15}
   1528 .endm
   1529 
   1530 generate_composite_function \
   1531     pixman_composite_over_n_8_8_asm_neon, 0, 8, 8, \
   1532     FLAG_DST_READWRITE, \
    32, /* number of pixels processed in a single block */ \
   1534     5, /* prefetch distance */ \
   1535     pixman_composite_over_n_8_8_init, \
   1536     pixman_composite_over_n_8_8_cleanup, \
   1537     pixman_composite_over_n_8_8_process_pixblock_head, \
   1538     pixman_composite_over_n_8_8_process_pixblock_tail, \
   1539     pixman_composite_over_n_8_8_process_pixblock_tail_head
   1540 
   1541 /******************************************************************************/
   1542 
   1543 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_head
   1544     /*
   1545      * 'combine_mask_ca' replacement
   1546      *
   1547      * input:  solid src (n) in {d8,  d9,  d10, d11}
   1548      *         dest in          {d4,  d5,  d6,  d7 }
   1549      *         mask in          {d24, d25, d26, d27}
   1550      * output: updated src in   {d0,  d1,  d2,  d3 }
   1551      *         updated mask in  {d24, d25, d26, d3 }
   1552      */
   1553     vmull.u8    q0,  d24, d8
   1554     vmull.u8    q1,  d25, d9
   1555     vmull.u8    q6,  d26, d10
   1556     vmull.u8    q7,  d27, d11
   1557     vmull.u8    q9,  d11, d25
   1558     vmull.u8    q12, d11, d24
   1559     vmull.u8    q13, d11, d26
   1560     vrshr.u16   q8,  q0,  #8
   1561     vrshr.u16   q10, q1,  #8
   1562     vrshr.u16   q11, q6,  #8
   1563     vraddhn.u16 d0,  q0,  q8
   1564     vraddhn.u16 d1,  q1,  q10
   1565     vraddhn.u16 d2,  q6,  q11
   1566     vrshr.u16   q11, q12, #8
   1567     vrshr.u16   q8,  q9,  #8
   1568     vrshr.u16   q6,  q13, #8
   1569     vrshr.u16   q10, q7,  #8
   1570     vraddhn.u16 d24, q12, q11
   1571     vraddhn.u16 d25, q9,  q8
   1572     vraddhn.u16 d26, q13, q6
   1573     vraddhn.u16 d3,  q7,  q10
   1574     /*
   1575      * 'combine_over_ca' replacement
   1576      *
   1577      * output: updated dest in {d28, d29, d30, d31}
   1578      */
   1579     vmvn.8      q12, q12
   1580     vmvn.8      d26, d26
   1581     vmull.u8    q8,  d24, d4
   1582     vmull.u8    q9,  d25, d5
   1583     vmvn.8      d27, d3
   1584     vmull.u8    q10, d26, d6
   1585     vmull.u8    q11, d27, d7
   1586 .endm
   1587 
   1588 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail
   1589     /* ... continue 'combine_over_ca' replacement */
   1590     vrshr.u16   q14, q8,  #8
   1591     vrshr.u16   q15, q9,  #8
   1592     vrshr.u16   q6,  q10, #8
   1593     vrshr.u16   q7,  q11, #8
   1594     vraddhn.u16 d28, q14, q8
   1595     vraddhn.u16 d29, q15, q9
   1596     vraddhn.u16 d30, q6,  q10
   1597     vraddhn.u16 d31, q7,  q11
   1598     vqadd.u8    q14, q0,  q14
   1599     vqadd.u8    q15, q1,  q15
   1600 .endm
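
/*
 * Roughly, per channel c (using the illustrative div255()/satadd()
 * helpers from the sketches above):
 *
 *     s'[c] = div255(src[c] * mask[c]);    // component-alpha 'in'
 *     m'[c] = div255(mask[c] * src_a);
 *     d[c]  = satadd(s'[c], div255(dst[c] * (255 - m'[c])));
 *
 * The alpha channels of s' and m' coincide (both equal
 * div255(src_a * mask_a)), which is why a single register (d3) can be
 * shared between the updated src and the updated mask in the head above.
 */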
   1601 
   1602 .macro pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
   1603         vrshr.u16   q14, q8, #8
   1604         vrshr.u16   q15, q9, #8
   1605     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1606         vrshr.u16   q6, q10, #8
   1607         vrshr.u16   q7, q11, #8
   1608         vraddhn.u16 d28, q14, q8
   1609         vraddhn.u16 d29, q15, q9
   1610         vraddhn.u16 d30, q6, q10
   1611         vraddhn.u16 d31, q7, q11
   1612     fetch_mask_pixblock
   1613         vqadd.u8    q14, q0, q14
   1614         vqadd.u8    q15, q1, q15
   1615     cache_preload 8, 8
   1616     pixman_composite_over_n_8888_8888_ca_process_pixblock_head
   1617     vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1618 .endm
   1619 
   1620 .macro pixman_composite_over_n_8888_8888_ca_init
   1621     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1622     vpush       {d8-d15}
   1623     vld1.32     {d11[0]}, [DUMMY]
   1624     vdup.8      d8, d11[0]
   1625     vdup.8      d9, d11[1]
   1626     vdup.8      d10, d11[2]
   1627     vdup.8      d11, d11[3]
   1628 .endm
   1629 
   1630 .macro pixman_composite_over_n_8888_8888_ca_cleanup
   1631     vpop        {d8-d15}
   1632 .endm
   1633 
   1634 generate_composite_function \
   1635     pixman_composite_over_n_8888_8888_ca_asm_neon, 0, 32, 32, \
   1636     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   1638     5, /* prefetch distance */ \
   1639     pixman_composite_over_n_8888_8888_ca_init, \
   1640     pixman_composite_over_n_8888_8888_ca_cleanup, \
   1641     pixman_composite_over_n_8888_8888_ca_process_pixblock_head, \
   1642     pixman_composite_over_n_8888_8888_ca_process_pixblock_tail, \
   1643     pixman_composite_over_n_8888_8888_ca_process_pixblock_tail_head
   1644 
   1645 /******************************************************************************/
   1646 
   1647 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_head
   1648     /*
   1649      * 'combine_mask_ca' replacement
   1650      *
   1651      * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
   1652      *         mask in          {d24, d25, d26}       [B, G, R]
   1653      * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
   1654      *         updated mask in  {d24, d25, d26}       [B, G, R]
   1655      */
   1656     vmull.u8    q0,  d24, d8
   1657     vmull.u8    q1,  d25, d9
   1658     vmull.u8    q6,  d26, d10
   1659     vmull.u8    q9,  d11, d25
   1660     vmull.u8    q12, d11, d24
   1661     vmull.u8    q13, d11, d26
   1662     vrshr.u16   q8,  q0,  #8
   1663     vrshr.u16   q10, q1,  #8
   1664     vrshr.u16   q11, q6,  #8
   1665     vraddhn.u16 d0,  q0,  q8
   1666     vraddhn.u16 d1,  q1,  q10
   1667     vraddhn.u16 d2,  q6,  q11
   1668     vrshr.u16   q11, q12, #8
   1669     vrshr.u16   q8,  q9,  #8
   1670     vrshr.u16   q6,  q13, #8
   1671     vraddhn.u16 d24, q12, q11
   1672     vraddhn.u16 d25, q9,  q8
   1673     /*
     * convert 8 r5g6b5 pixels from {d4, d5} to planar 8-bit format
     * and put the data into d16 - blue, d17 - green, d18 - red
   1676      */
   1677        vshrn.u16   d17, q2,  #3
   1678        vshrn.u16   d18, q2,  #8
   1679     vraddhn.u16 d26, q13, q6
   1680        vsli.u16    q2,  q2,  #5
   1681        vsri.u8     d18, d18, #5
   1682        vsri.u8     d17, d17, #6
   1683     /*
   1684      * 'combine_over_ca' replacement
   1685      *
   1686      * output: updated dest in d16 - blue, d17 - green, d18 - red
   1687      */
   1688     vmvn.8      q12, q12
   1689        vshrn.u16   d16, q2,  #2
   1690     vmvn.8      d26, d26
   1691     vmull.u8    q6,  d16, d24
   1692     vmull.u8    q7,  d17, d25
   1693     vmull.u8    q11, d18, d26
   1694 .endm
   1695 
   1696 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail
   1697     /* ... continue 'combine_over_ca' replacement */
   1698     vrshr.u16   q10, q6,  #8
   1699     vrshr.u16   q14, q7,  #8
   1700     vrshr.u16   q15, q11, #8
   1701     vraddhn.u16 d16, q10, q6
   1702     vraddhn.u16 d17, q14, q7
   1703     vraddhn.u16 d18, q15, q11
   1704     vqadd.u8    q8,  q0,  q8
   1705     vqadd.u8    d18, d2,  d18
   1706     /*
   1707      * convert the results in d16, d17, d18 to r5g6b5 and store
   1708      * them into {d28, d29}
   1709      */
   1710     vshll.u8    q14, d18, #8
   1711     vshll.u8    q10, d17, #8
   1712     vshll.u8    q15, d16, #8
   1713     vsri.u16    q14, q10, #5
   1714     vsri.u16    q14, q15, #11
   1715 .endm
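
/*
 * The r5g6b5 <-> 8-bit conversions above use the usual bit replication
 * tricks; in C terms roughly (illustrative names, not code from the
 * pixman sources):
 *
 *     r8 = (r5 << 3) | (r5 >> 2);    // expand, replicating top bits
 *     g8 = (g6 << 2) | (g6 >> 4);
 *     b8 = (b5 << 3) | (b5 >> 2);
 *
 *     rgb565 = ((r8 >> 3) << 11) | ((g8 >> 2) << 5) | (b8 >> 3);  // pack
 *
 * implemented with vshrn/vsli/vsri (unpack) and vshll/vsri (pack) on
 * 8 pixels at a time.
 */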
   1716 
   1717 .macro pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
   1718     fetch_mask_pixblock
   1719         vrshr.u16   q10, q6, #8
   1720         vrshr.u16   q14, q7, #8
   1721     vld1.16     {d4, d5}, [DST_R, :128]!
   1722         vrshr.u16   q15, q11, #8
   1723         vraddhn.u16 d16, q10, q6
   1724         vraddhn.u16 d17, q14, q7
   1725         vraddhn.u16 d22, q15, q11
   1726             /* process_pixblock_head */
   1727             /*
   1728              * 'combine_mask_ca' replacement
   1729              *
   1730              * input:  solid src (n) in {d8,  d9,  d10, d11}  [B, G, R, A]
   1731              *         mask in          {d24, d25, d26}       [B, G, R]
   1732              * output: updated src in   {d0,  d1,  d2 }       [B, G, R]
   1733              *         updated mask in  {d24, d25, d26}       [B, G, R]
   1734              */
   1735             vmull.u8    q6,  d26, d10
   1736         vqadd.u8    q8,  q0, q8
   1737             vmull.u8    q0,  d24, d8
   1738         vqadd.u8    d22, d2, d22
   1739             vmull.u8    q1,  d25, d9
   1740         /*
   1741          * convert the result in d16, d17, d22 to r5g6b5 and store
   1742          * it into {d28, d29}
   1743          */
   1744         vshll.u8    q14, d22, #8
   1745         vshll.u8    q10, d17, #8
   1746         vshll.u8    q15, d16, #8
   1747             vmull.u8    q9,  d11, d25
   1748         vsri.u16    q14, q10, #5
   1749             vmull.u8    q12, d11, d24
   1750             vmull.u8    q13, d11, d26
   1751         vsri.u16    q14, q15, #11
   1752     cache_preload 8, 8
   1753             vrshr.u16   q8,  q0,  #8
   1754             vrshr.u16   q10, q1,  #8
   1755             vrshr.u16   q11, q6,  #8
   1756             vraddhn.u16 d0,  q0,  q8
   1757             vraddhn.u16 d1,  q1,  q10
   1758             vraddhn.u16 d2,  q6,  q11
   1759             vrshr.u16   q11, q12, #8
   1760             vrshr.u16   q8,  q9,  #8
   1761             vrshr.u16   q6,  q13, #8
   1762             vraddhn.u16 d24, q12, q11
   1763             vraddhn.u16 d25, q9,  q8
   1764                 /*
                 * convert 8 r5g6b5 pixels from {d4, d5} to planar
                 * 8-bit format and put the data into d16 - blue,
                 * d17 - green, d18 - red
   1768                  */
   1769                 vshrn.u16   d17, q2,  #3
   1770                 vshrn.u16   d18, q2,  #8
   1771             vraddhn.u16 d26, q13, q6
   1772                 vsli.u16    q2,  q2,  #5
   1773                 vsri.u8     d17, d17, #6
   1774                 vsri.u8     d18, d18, #5
   1775             /*
   1776              * 'combine_over_ca' replacement
   1777              *
   1778              * output: updated dest in d16 - blue, d17 - green, d18 - red
   1779              */
   1780             vmvn.8      q12, q12
   1781                 vshrn.u16   d16, q2,  #2
   1782             vmvn.8      d26, d26
   1783             vmull.u8    q7,  d17, d25
   1784             vmull.u8    q6,  d16, d24
   1785             vmull.u8    q11, d18, d26
   1786     vst1.16     {d28, d29}, [DST_W, :128]!
   1787 .endm
   1788 
   1789 .macro pixman_composite_over_n_8888_0565_ca_init
   1790     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1791     vpush       {d8-d15}
   1792     vld1.32     {d11[0]}, [DUMMY]
   1793     vdup.8      d8, d11[0]
   1794     vdup.8      d9, d11[1]
   1795     vdup.8      d10, d11[2]
   1796     vdup.8      d11, d11[3]
   1797 .endm
   1798 
   1799 .macro pixman_composite_over_n_8888_0565_ca_cleanup
   1800     vpop        {d8-d15}
   1801 .endm
   1802 
   1803 generate_composite_function \
   1804     pixman_composite_over_n_8888_0565_ca_asm_neon, 0, 32, 16, \
   1805     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   1807     5, /* prefetch distance */ \
   1808     pixman_composite_over_n_8888_0565_ca_init, \
   1809     pixman_composite_over_n_8888_0565_ca_cleanup, \
   1810     pixman_composite_over_n_8888_0565_ca_process_pixblock_head, \
   1811     pixman_composite_over_n_8888_0565_ca_process_pixblock_tail, \
   1812     pixman_composite_over_n_8888_0565_ca_process_pixblock_tail_head
   1813 
   1814 /******************************************************************************/
   1815 
   1816 .macro pixman_composite_in_n_8_process_pixblock_head
   1817     /* expecting source data in {d0, d1, d2, d3} */
   1818     /* and destination data in {d4, d5, d6, d7} */
   1819     vmull.u8    q8,  d4,  d3
   1820     vmull.u8    q9,  d5,  d3
   1821     vmull.u8    q10, d6,  d3
   1822     vmull.u8    q11, d7,  d3
   1823 .endm
   1824 
   1825 .macro pixman_composite_in_n_8_process_pixblock_tail
   1826     vrshr.u16   q14, q8,  #8
   1827     vrshr.u16   q15, q9,  #8
   1828     vrshr.u16   q12, q10, #8
   1829     vrshr.u16   q13, q11, #8
   1830     vraddhn.u16 d28, q8,  q14
   1831     vraddhn.u16 d29, q9,  q15
   1832     vraddhn.u16 d30, q10, q12
   1833     vraddhn.u16 d31, q11, q13
   1834 .endm
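
/*
 * The 'in' operation above reduces, per destination byte, to (using the
 * illustrative div255() helper from the sketches above):
 *
 *     d = div255(dst * src_a);
 *
 * with src_a taken from the solid source color by the init macro
 * (vdup.8 d3, d3[3]).
 */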
   1835 
   1836 .macro pixman_composite_in_n_8_process_pixblock_tail_head
   1837     pixman_composite_in_n_8_process_pixblock_tail
   1838     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1839     cache_preload 32, 32
   1840     pixman_composite_in_n_8_process_pixblock_head
   1841     vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1842 .endm
   1843 
   1844 .macro pixman_composite_in_n_8_init
   1845     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1846     vld1.32     {d3[0]}, [DUMMY]
   1847     vdup.8      d3, d3[3]
   1848 .endm
   1849 
   1850 .macro pixman_composite_in_n_8_cleanup
   1851 .endm
   1852 
   1853 generate_composite_function \
   1854     pixman_composite_in_n_8_asm_neon, 0, 0, 8, \
   1855     FLAG_DST_READWRITE, \
    32, /* number of pixels processed in a single block */ \
   1857     5, /* prefetch distance */ \
   1858     pixman_composite_in_n_8_init, \
   1859     pixman_composite_in_n_8_cleanup, \
   1860     pixman_composite_in_n_8_process_pixblock_head, \
   1861     pixman_composite_in_n_8_process_pixblock_tail, \
   1862     pixman_composite_in_n_8_process_pixblock_tail_head, \
   1863     28, /* dst_w_basereg */ \
   1864     4,  /* dst_r_basereg */ \
   1865     0,  /* src_basereg   */ \
   1866     24  /* mask_basereg  */

/******************************************************************************/

   1868 .macro pixman_composite_add_n_8_8_process_pixblock_head
   1869     /* expecting source data in {d8, d9, d10, d11} */
   1870     /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
   1871     /* and destination data in {d4, d5, d6, d7} */
   1872     /* mask is in d24, d25, d26, d27 */
   1873     vmull.u8    q0, d24, d11
   1874     vmull.u8    q1, d25, d11
   1875     vmull.u8    q6, d26, d11
   1876     vmull.u8    q7, d27, d11
   1877     vrshr.u16   q10, q0, #8
   1878     vrshr.u16   q11, q1, #8
   1879     vrshr.u16   q12, q6, #8
   1880     vrshr.u16   q13, q7, #8
   1881     vraddhn.u16 d0, q0, q10
   1882     vraddhn.u16 d1, q1, q11
   1883     vraddhn.u16 d2, q6, q12
   1884     vraddhn.u16 d3, q7, q13
   1885     vqadd.u8    q14, q0, q2
   1886     vqadd.u8    q15, q1, q3
   1887 .endm
   1888 
   1889 .macro pixman_composite_add_n_8_8_process_pixblock_tail
   1890 .endm
   1891 
/* TODO: expand macros and do better instruction scheduling */
   1893 .macro pixman_composite_add_n_8_8_process_pixblock_tail_head
   1894     pixman_composite_add_n_8_8_process_pixblock_tail
   1895     vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1896     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1897     fetch_mask_pixblock
   1898     cache_preload 32, 32
   1899     pixman_composite_add_n_8_8_process_pixblock_head
   1900 .endm
   1901 
   1902 .macro pixman_composite_add_n_8_8_init
   1903     add         DUMMY, sp, #ARGS_STACK_OFFSET
   1904     vpush       {d8-d15}
   1905     vld1.32     {d11[0]}, [DUMMY]
   1906     vdup.8      d11, d11[3]
   1907 .endm
   1908 
   1909 .macro pixman_composite_add_n_8_8_cleanup
   1910     vpop        {d8-d15}
   1911 .endm
   1912 
   1913 generate_composite_function \
   1914     pixman_composite_add_n_8_8_asm_neon, 0, 8, 8, \
   1915     FLAG_DST_READWRITE, \
    32, /* number of pixels processed in a single block */ \
   1917     5, /* prefetch distance */ \
   1918     pixman_composite_add_n_8_8_init, \
   1919     pixman_composite_add_n_8_8_cleanup, \
   1920     pixman_composite_add_n_8_8_process_pixblock_head, \
   1921     pixman_composite_add_n_8_8_process_pixblock_tail, \
   1922     pixman_composite_add_n_8_8_process_pixblock_tail_head
   1923 
   1924 /******************************************************************************/
   1925 
   1926 .macro pixman_composite_add_8_8_8_process_pixblock_head
   1927     /* expecting source data in {d0, d1, d2, d3} */
   1928     /* destination data in {d4, d5, d6, d7} */
   1929     /* mask in {d24, d25, d26, d27} */
   1930     vmull.u8    q8, d24, d0
   1931     vmull.u8    q9, d25, d1
   1932     vmull.u8    q10, d26, d2
   1933     vmull.u8    q11, d27, d3
   1934     vrshr.u16   q0, q8, #8
   1935     vrshr.u16   q1, q9, #8
   1936     vrshr.u16   q12, q10, #8
   1937     vrshr.u16   q13, q11, #8
   1938     vraddhn.u16 d0, q0, q8
   1939     vraddhn.u16 d1, q1, q9
   1940     vraddhn.u16 d2, q12, q10
   1941     vraddhn.u16 d3, q13, q11
   1942     vqadd.u8    q14, q0, q2
   1943     vqadd.u8    q15, q1, q3
   1944 .endm
   1945 
   1946 .macro pixman_composite_add_8_8_8_process_pixblock_tail
   1947 .endm
   1948 
/* TODO: expand macros and do better instruction scheduling */
   1950 .macro pixman_composite_add_8_8_8_process_pixblock_tail_head
   1951     pixman_composite_add_8_8_8_process_pixblock_tail
   1952     vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
   1953     vld1.8      {d4, d5, d6, d7}, [DST_R, :128]!
   1954     fetch_mask_pixblock
   1955     fetch_src_pixblock
   1956     cache_preload 32, 32
   1957     pixman_composite_add_8_8_8_process_pixblock_head
   1958 .endm
   1959 
   1960 .macro pixman_composite_add_8_8_8_init
   1961 .endm
   1962 
   1963 .macro pixman_composite_add_8_8_8_cleanup
   1964 .endm
   1965 
   1966 generate_composite_function \
   1967     pixman_composite_add_8_8_8_asm_neon, 8, 8, 8, \
   1968     FLAG_DST_READWRITE, \
    32, /* number of pixels processed in a single block */ \
   1970     5, /* prefetch distance */ \
   1971     pixman_composite_add_8_8_8_init, \
   1972     pixman_composite_add_8_8_8_cleanup, \
   1973     pixman_composite_add_8_8_8_process_pixblock_head, \
   1974     pixman_composite_add_8_8_8_process_pixblock_tail, \
   1975     pixman_composite_add_8_8_8_process_pixblock_tail_head
   1976 
   1977 /******************************************************************************/
   1978 
   1979 .macro pixman_composite_add_8888_8888_8888_process_pixblock_head
   1980     /* expecting source data in {d0, d1, d2, d3} */
   1981     /* destination data in {d4, d5, d6, d7} */
   1982     /* mask in {d24, d25, d26, d27} */
   1983     vmull.u8    q8,  d27, d0
   1984     vmull.u8    q9,  d27, d1
   1985     vmull.u8    q10, d27, d2
   1986     vmull.u8    q11, d27, d3
   1987     /* 1 cycle bubble */
   1988     vrsra.u16   q8,  q8,  #8
   1989     vrsra.u16   q9,  q9,  #8
   1990     vrsra.u16   q10, q10, #8
   1991     vrsra.u16   q11, q11, #8
   1992 .endm
   1993 
   1994 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail
   1995     /* 2 cycle bubble */
   1996     vrshrn.u16  d28, q8,  #8
   1997     vrshrn.u16  d29, q9,  #8
   1998     vrshrn.u16  d30, q10, #8
   1999     vrshrn.u16  d31, q11, #8
   2000     vqadd.u8    q14, q2,  q14
   2001     /* 1 cycle bubble */
   2002     vqadd.u8    q15, q3,  q15
   2003 .endm
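
/*
 * Per channel, the head/tail pair above computes (using the illustrative
 * div255()/satadd() helpers from the sketches above):
 *
 *     d = satadd(dst, div255(src * mask_a));
 *
 * i.e. the source is scaled by the mask alpha (d27) and then added to
 * the destination with unsigned 8-bit saturation (vqadd.u8).
 */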
   2004 
   2005 .macro pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
   2006     fetch_src_pixblock
   2007         vrshrn.u16  d28, q8,  #8
   2008     fetch_mask_pixblock
   2009         vrshrn.u16  d29, q9,  #8
   2010     vmull.u8    q8,  d27, d0
   2011         vrshrn.u16  d30, q10, #8
   2012     vmull.u8    q9,  d27, d1
   2013         vrshrn.u16  d31, q11, #8
   2014     vmull.u8    q10, d27, d2
   2015         vqadd.u8    q14, q2,  q14
   2016     vmull.u8    q11, d27, d3
   2017         vqadd.u8    q15, q3,  q15
   2018     vrsra.u16   q8,  q8,  #8
   2019     vld4.8      {d4, d5, d6, d7}, [DST_R, :128]!
   2020     vrsra.u16   q9,  q9,  #8
   2021         vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
   2022     vrsra.u16   q10, q10, #8
   2023 
   2024     cache_preload 8, 8
   2025 
   2026     vrsra.u16   q11, q11, #8
   2027 .endm
   2028 
   2029 generate_composite_function \
   2030     pixman_composite_add_8888_8888_8888_asm_neon, 32, 32, 32, \
   2031     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2033     10, /* prefetch distance */ \
   2034     default_init, \
   2035     default_cleanup, \
   2036     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
   2037     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
   2038     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
   2039 
   2040 generate_composite_function_single_scanline \
   2041     pixman_composite_scanline_add_mask_asm_neon, 32, 32, 32, \
   2042     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2044     default_init, \
   2045     default_cleanup, \
   2046     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
   2047     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
   2048     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head
   2049 
   2050 /******************************************************************************/
   2051 
   2052 generate_composite_function \
   2053     pixman_composite_add_8888_8_8888_asm_neon, 32, 8, 32, \
   2054     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2056     5, /* prefetch distance */ \
   2057     default_init, \
   2058     default_cleanup, \
   2059     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
   2060     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
   2061     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
   2062     28, /* dst_w_basereg */ \
   2063     4,  /* dst_r_basereg */ \
   2064     0,  /* src_basereg   */ \
   2065     27  /* mask_basereg  */
   2066 
   2067 /******************************************************************************/
   2068 
   2069 .macro pixman_composite_add_n_8_8888_init
   2070     add         DUMMY, sp, #ARGS_STACK_OFFSET
   2071     vld1.32     {d3[0]}, [DUMMY]
   2072     vdup.8      d0, d3[0]
   2073     vdup.8      d1, d3[1]
   2074     vdup.8      d2, d3[2]
   2075     vdup.8      d3, d3[3]
   2076 .endm
   2077 
   2078 .macro pixman_composite_add_n_8_8888_cleanup
   2079 .endm
   2080 
   2081 generate_composite_function \
   2082     pixman_composite_add_n_8_8888_asm_neon, 0, 8, 32, \
   2083     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2085     5, /* prefetch distance */ \
   2086     pixman_composite_add_n_8_8888_init, \
   2087     pixman_composite_add_n_8_8888_cleanup, \
   2088     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
   2089     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
   2090     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
   2091     28, /* dst_w_basereg */ \
   2092     4,  /* dst_r_basereg */ \
   2093     0,  /* src_basereg   */ \
   2094     27  /* mask_basereg  */
   2095 
   2096 /******************************************************************************/
   2097 
   2098 .macro pixman_composite_add_8888_n_8888_init
   2099     add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
   2100     vld1.32     {d27[0]}, [DUMMY]
   2101     vdup.8      d27, d27[3]
   2102 .endm
   2103 
   2104 .macro pixman_composite_add_8888_n_8888_cleanup
   2105 .endm
   2106 
   2107 generate_composite_function \
   2108     pixman_composite_add_8888_n_8888_asm_neon, 32, 0, 32, \
   2109     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2111     5, /* prefetch distance */ \
   2112     pixman_composite_add_8888_n_8888_init, \
   2113     pixman_composite_add_8888_n_8888_cleanup, \
   2114     pixman_composite_add_8888_8888_8888_process_pixblock_head, \
   2115     pixman_composite_add_8888_8888_8888_process_pixblock_tail, \
   2116     pixman_composite_add_8888_8888_8888_process_pixblock_tail_head, \
   2117     28, /* dst_w_basereg */ \
   2118     4,  /* dst_r_basereg */ \
   2119     0,  /* src_basereg   */ \
   2120     27  /* mask_basereg  */
   2121 
   2122 /******************************************************************************/
   2123 
   2124 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
   2125     /* expecting source data in {d0, d1, d2, d3} */
   2126     /* destination data in {d4, d5, d6, d7} */
   2127     /* solid mask is in d15 */
   2128 
   2129     /* 'in' */
   2130     vmull.u8    q8, d15, d3
   2131     vmull.u8    q6, d15, d2
   2132     vmull.u8    q5, d15, d1
   2133     vmull.u8    q4, d15, d0
   2134     vrshr.u16   q13, q8, #8
   2135     vrshr.u16   q12, q6, #8
   2136     vrshr.u16   q11, q5, #8
   2137     vrshr.u16   q10, q4, #8
   2138     vraddhn.u16 d3, q8, q13
   2139     vraddhn.u16 d2, q6, q12
   2140     vraddhn.u16 d1, q5, q11
   2141     vraddhn.u16 d0, q4, q10
   2142     vmvn.8      d24, d3  /* get inverted alpha */
   2143     /* now do alpha blending */
   2144     vmull.u8    q8, d24, d4
   2145     vmull.u8    q9, d24, d5
   2146     vmull.u8    q10, d24, d6
   2147     vmull.u8    q11, d24, d7
   2148 .endm
   2149 
   2150 .macro pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
   2151     vrshr.u16   q14, q8, #8
   2152     vrshr.u16   q15, q9, #8
   2153     vrshr.u16   q12, q10, #8
   2154     vrshr.u16   q13, q11, #8
   2155     vraddhn.u16 d28, q14, q8
   2156     vraddhn.u16 d29, q15, q9
   2157     vraddhn.u16 d30, q12, q10
   2158     vraddhn.u16 d31, q13, q11
   2159 .endm
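
/*
 * Roughly (using the illustrative div255() helper from the sketches
 * above):
 *
 *     s = div255(src * mask_a);           // 'in'
 *     d = div255(dst * (255 - s_a));      // 'out reverse'
 *
 * so the destination is scaled by the inverse of the masked source
 * alpha.
 */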
   2160 
/* TODO: expand macros and do better instruction scheduling */
   2162 .macro pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head
   2163     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
   2164     pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
   2165     fetch_src_pixblock
   2166     cache_preload 8, 8
   2167     fetch_mask_pixblock
   2168     pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
   2169     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
   2170 .endm
   2171 
   2172 generate_composite_function_single_scanline \
   2173     pixman_composite_scanline_out_reverse_mask_asm_neon, 32, 32, 32, \
   2174     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2176     default_init_need_all_regs, \
   2177     default_cleanup_need_all_regs, \
   2178     pixman_composite_out_reverse_8888_n_8888_process_pixblock_head, \
   2179     pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail, \
    pixman_composite_out_reverse_8888_8888_8888_process_pixblock_tail_head, \
   2181     28, /* dst_w_basereg */ \
   2182     4,  /* dst_r_basereg */ \
   2183     0,  /* src_basereg   */ \
   2184     12  /* mask_basereg  */
   2185 
   2186 /******************************************************************************/
   2187 
   2188 .macro pixman_composite_over_8888_n_8888_process_pixblock_head
   2189     pixman_composite_out_reverse_8888_n_8888_process_pixblock_head
   2190 .endm
   2191 
   2192 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail
   2193     pixman_composite_out_reverse_8888_n_8888_process_pixblock_tail
   2194     vqadd.u8    q14, q0, q14
   2195     vqadd.u8    q15, q1, q15
   2196 .endm
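
/*
 * As the two macros above show, OVER is just OUT_REVERSE followed by a
 * saturating add of the masked source:
 *
 *     d = satadd(s, div255(dst * (255 - s_a)))
 *
 * (same illustrative helpers as in the sketches above).
 */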
   2197 
/* TODO: expand macros and do better instruction scheduling */
   2199 .macro pixman_composite_over_8888_n_8888_process_pixblock_tail_head
   2200     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
   2201     pixman_composite_over_8888_n_8888_process_pixblock_tail
   2202     fetch_src_pixblock
   2203     cache_preload 8, 8
   2204     pixman_composite_over_8888_n_8888_process_pixblock_head
   2205     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
   2206 .endm
   2207 
   2208 .macro pixman_composite_over_8888_n_8888_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
   2210     vpush       {d8-d15}
   2211     vld1.32     {d15[0]}, [DUMMY]
   2212     vdup.8      d15, d15[3]
   2213 .endm
   2214 
   2215 .macro pixman_composite_over_8888_n_8888_cleanup
   2216     vpop        {d8-d15}
   2217 .endm
   2218 
   2219 generate_composite_function \
   2220     pixman_composite_over_8888_n_8888_asm_neon, 32, 0, 32, \
   2221     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2223     5, /* prefetch distance */ \
   2224     pixman_composite_over_8888_n_8888_init, \
   2225     pixman_composite_over_8888_n_8888_cleanup, \
   2226     pixman_composite_over_8888_n_8888_process_pixblock_head, \
   2227     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
   2228     pixman_composite_over_8888_n_8888_process_pixblock_tail_head
   2229 
   2230 /******************************************************************************/
   2231 
/* TODO: expand macros and do better instruction scheduling */
   2233 .macro pixman_composite_over_8888_8888_8888_process_pixblock_tail_head
   2234     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
   2235     pixman_composite_over_8888_n_8888_process_pixblock_tail
   2236     fetch_src_pixblock
   2237     cache_preload 8, 8
   2238     fetch_mask_pixblock
   2239     pixman_composite_over_8888_n_8888_process_pixblock_head
   2240     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
   2241 .endm
   2242 
   2243 generate_composite_function \
   2244     pixman_composite_over_8888_8888_8888_asm_neon, 32, 32, 32, \
   2245     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2247     5, /* prefetch distance */ \
   2248     default_init_need_all_regs, \
   2249     default_cleanup_need_all_regs, \
   2250     pixman_composite_over_8888_n_8888_process_pixblock_head, \
   2251     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
   2253     28, /* dst_w_basereg */ \
   2254     4,  /* dst_r_basereg */ \
   2255     0,  /* src_basereg   */ \
   2256     12  /* mask_basereg  */
   2257 
   2258 generate_composite_function_single_scanline \
   2259     pixman_composite_scanline_over_mask_asm_neon, 32, 32, 32, \
   2260     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2262     default_init_need_all_regs, \
   2263     default_cleanup_need_all_regs, \
   2264     pixman_composite_over_8888_n_8888_process_pixblock_head, \
   2265     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8888_8888_process_pixblock_tail_head, \
   2267     28, /* dst_w_basereg */ \
   2268     4,  /* dst_r_basereg */ \
   2269     0,  /* src_basereg   */ \
   2270     12  /* mask_basereg  */
   2271 
   2272 /******************************************************************************/
   2273 
/* TODO: expand macros and do better instruction scheduling */
   2275 .macro pixman_composite_over_8888_8_8888_process_pixblock_tail_head
   2276     vld4.8     {d4, d5, d6, d7}, [DST_R, :128]!
   2277     pixman_composite_over_8888_n_8888_process_pixblock_tail
   2278     fetch_src_pixblock
   2279     cache_preload 8, 8
   2280     fetch_mask_pixblock
   2281     pixman_composite_over_8888_n_8888_process_pixblock_head
   2282     vst4.8     {d28, d29, d30, d31}, [DST_W, :128]!
   2283 .endm
   2284 
   2285 generate_composite_function \
   2286     pixman_composite_over_8888_8_8888_asm_neon, 32, 8, 32, \
   2287     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2289     5, /* prefetch distance */ \
   2290     default_init_need_all_regs, \
   2291     default_cleanup_need_all_regs, \
   2292     pixman_composite_over_8888_n_8888_process_pixblock_head, \
   2293     pixman_composite_over_8888_n_8888_process_pixblock_tail, \
    pixman_composite_over_8888_8_8888_process_pixblock_tail_head, \
   2295     28, /* dst_w_basereg */ \
   2296     4,  /* dst_r_basereg */ \
   2297     0,  /* src_basereg   */ \
   2298     15  /* mask_basereg  */
   2299 
   2300 /******************************************************************************/
   2301 
   2302 .macro pixman_composite_src_0888_0888_process_pixblock_head
   2303 .endm
   2304 
   2305 .macro pixman_composite_src_0888_0888_process_pixblock_tail
   2306 .endm
   2307 
   2308 .macro pixman_composite_src_0888_0888_process_pixblock_tail_head
   2309     vst3.8 {d0, d1, d2}, [DST_W]!
   2310     fetch_src_pixblock
   2311     cache_preload 8, 8
   2312 .endm
   2313 
   2314 generate_composite_function \
   2315     pixman_composite_src_0888_0888_asm_neon, 24, 0, 24, \
   2316     FLAG_DST_WRITEONLY, \
    8, /* number of pixels processed in a single block */ \
   2318     10, /* prefetch distance */ \
   2319     default_init, \
   2320     default_cleanup, \
   2321     pixman_composite_src_0888_0888_process_pixblock_head, \
   2322     pixman_composite_src_0888_0888_process_pixblock_tail, \
   2323     pixman_composite_src_0888_0888_process_pixblock_tail_head, \
   2324     0, /* dst_w_basereg */ \
   2325     0, /* dst_r_basereg */ \
   2326     0, /* src_basereg   */ \
   2327     0  /* mask_basereg  */
   2328 
   2329 /******************************************************************************/
   2330 
   2331 .macro pixman_composite_src_0888_8888_rev_process_pixblock_head
   2332     vswp   d0, d2
   2333 .endm
   2334 
   2335 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail
   2336 .endm
   2337 
   2338 .macro pixman_composite_src_0888_8888_rev_process_pixblock_tail_head
   2339     vst4.8 {d0, d1, d2, d3}, [DST_W]!
   2340     fetch_src_pixblock
   2341     vswp   d0, d2
   2342     cache_preload 8, 8
   2343 .endm
   2344 
   2345 .macro pixman_composite_src_0888_8888_rev_init
   2346     veor   d3, d3, d3
   2347 .endm
   2348 
   2349 generate_composite_function \
   2350     pixman_composite_src_0888_8888_rev_asm_neon, 24, 0, 32, \
   2351     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2353     10, /* prefetch distance */ \
   2354     pixman_composite_src_0888_8888_rev_init, \
   2355     default_cleanup, \
   2356     pixman_composite_src_0888_8888_rev_process_pixblock_head, \
   2357     pixman_composite_src_0888_8888_rev_process_pixblock_tail, \
   2358     pixman_composite_src_0888_8888_rev_process_pixblock_tail_head, \
   2359     0, /* dst_w_basereg */ \
   2360     0, /* dst_r_basereg */ \
   2361     0, /* src_basereg   */ \
   2362     0  /* mask_basereg  */
   2363 
   2364 /******************************************************************************/
   2365 
   2366 .macro pixman_composite_src_0888_0565_rev_process_pixblock_head
   2367     vshll.u8    q8, d1, #8
   2368     vshll.u8    q9, d2, #8
   2369 .endm
   2370 
   2371 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail
   2372     vshll.u8    q14, d0, #8
   2373     vsri.u16    q14, q8, #5
   2374     vsri.u16    q14, q9, #11
   2375 .endm
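
/*
 * The pack above is the plain truncating 888 -> 565 conversion; in C
 * terms roughly (illustrative names, not code from the pixman sources):
 *
 *     rgb565 = ((c0 >> 3) << 11) | ((c1 >> 2) << 5) | (c2 >> 3);
 *
 * done for 8 pixels at a time: vshll.u8 #8 moves each channel to the
 * top of a 16-bit lane, then vsri.u16 #5 and #11 insert the middle and
 * low fields.
 */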
   2376 
   2377 .macro pixman_composite_src_0888_0565_rev_process_pixblock_tail_head
   2378         vshll.u8    q14, d0, #8
   2379     fetch_src_pixblock
   2380         vsri.u16    q14, q8, #5
   2381         vsri.u16    q14, q9, #11
   2382     vshll.u8    q8, d1, #8
   2383         vst1.16 {d28, d29}, [DST_W, :128]!
   2384     vshll.u8    q9, d2, #8
   2385 .endm
   2386 
   2387 generate_composite_function \
   2388     pixman_composite_src_0888_0565_rev_asm_neon, 24, 0, 16, \
   2389     FLAG_DST_WRITEONLY, \
    8, /* number of pixels processed in a single block */ \
   2391     10, /* prefetch distance */ \
   2392     default_init, \
   2393     default_cleanup, \
   2394     pixman_composite_src_0888_0565_rev_process_pixblock_head, \
   2395     pixman_composite_src_0888_0565_rev_process_pixblock_tail, \
   2396     pixman_composite_src_0888_0565_rev_process_pixblock_tail_head, \
   2397     28, /* dst_w_basereg */ \
   2398     0, /* dst_r_basereg */ \
   2399     0, /* src_basereg   */ \
   2400     0  /* mask_basereg  */
   2401 
   2402 /******************************************************************************/
   2403 
   2404 .macro pixman_composite_src_pixbuf_8888_process_pixblock_head
   2405     vmull.u8    q8, d3, d0
   2406     vmull.u8    q9, d3, d1
   2407     vmull.u8    q10, d3, d2
   2408 .endm
   2409 
   2410 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail
   2411     vrshr.u16   q11, q8, #8
   2412     vswp        d3, d31
   2413     vrshr.u16   q12, q9, #8
   2414     vrshr.u16   q13, q10, #8
   2415     vraddhn.u16 d30, q11, q8
   2416     vraddhn.u16 d29, q12, q9
   2417     vraddhn.u16 d28, q13, q10
   2418 .endm
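
/*
 * A pixbuf source stores non-premultiplied alpha, so each color channel
 * is premultiplied on the way through; roughly (illustrative names, not
 * code from the pixman sources):
 *
 *     d.rgb = div255(src.rgb * src.a);
 *     d.a   = src.a;                    // vswp d3, d31
 *
 * The rpixbuf variant below is identical except that the red and blue
 * output channels are swapped.
 */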
   2419 
   2420 .macro pixman_composite_src_pixbuf_8888_process_pixblock_tail_head
   2421         vrshr.u16   q11, q8, #8
   2422         vswp        d3, d31
   2423         vrshr.u16   q12, q9, #8
   2424         vrshr.u16   q13, q10, #8
   2425     fetch_src_pixblock
   2426         vraddhn.u16 d30, q11, q8
   2427                                     PF add PF_X, PF_X, #8
   2428                                     PF tst PF_CTL, #0xF
   2429                                     PF addne PF_X, PF_X, #8
   2430                                     PF subne PF_CTL, PF_CTL, #1
   2431         vraddhn.u16 d29, q12, q9
   2432         vraddhn.u16 d28, q13, q10
   2433     vmull.u8    q8, d3, d0
   2434     vmull.u8    q9, d3, d1
   2435     vmull.u8    q10, d3, d2
   2436         vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
   2437                                     PF cmp PF_X, ORIG_W
   2438                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   2439                                     PF subge PF_X, PF_X, ORIG_W
   2440                                     PF subges PF_CTL, PF_CTL, #0x10
   2441                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   2442 .endm
   2443 
   2444 generate_composite_function \
   2445     pixman_composite_src_pixbuf_8888_asm_neon, 32, 0, 32, \
   2446     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2448     10, /* prefetch distance */ \
   2449     default_init, \
   2450     default_cleanup, \
   2451     pixman_composite_src_pixbuf_8888_process_pixblock_head, \
   2452     pixman_composite_src_pixbuf_8888_process_pixblock_tail, \
   2453     pixman_composite_src_pixbuf_8888_process_pixblock_tail_head, \
   2454     28, /* dst_w_basereg */ \
   2455     0, /* dst_r_basereg */ \
   2456     0, /* src_basereg   */ \
   2457     0  /* mask_basereg  */
   2458 
   2459 /******************************************************************************/
   2460 
   2461 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_head
   2462     vmull.u8    q8, d3, d0
   2463     vmull.u8    q9, d3, d1
   2464     vmull.u8    q10, d3, d2
   2465 .endm
   2466 
   2467 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail
   2468     vrshr.u16   q11, q8, #8
   2469     vswp        d3, d31
   2470     vrshr.u16   q12, q9, #8
   2471     vrshr.u16   q13, q10, #8
   2472     vraddhn.u16 d28, q11, q8
   2473     vraddhn.u16 d29, q12, q9
   2474     vraddhn.u16 d30, q13, q10
   2475 .endm
   2476 
   2477 .macro pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head
   2478         vrshr.u16   q11, q8, #8
   2479         vswp        d3, d31
   2480         vrshr.u16   q12, q9, #8
   2481         vrshr.u16   q13, q10, #8
   2482     fetch_src_pixblock
   2483         vraddhn.u16 d28, q11, q8
   2484                                     PF add PF_X, PF_X, #8
   2485                                     PF tst PF_CTL, #0xF
   2486                                     PF addne PF_X, PF_X, #8
   2487                                     PF subne PF_CTL, PF_CTL, #1
   2488         vraddhn.u16 d29, q12, q9
   2489         vraddhn.u16 d30, q13, q10
   2490     vmull.u8    q8, d3, d0
   2491     vmull.u8    q9, d3, d1
   2492     vmull.u8    q10, d3, d2
   2493         vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
   2494                                     PF cmp PF_X, ORIG_W
   2495                                     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
   2496                                     PF subge PF_X, PF_X, ORIG_W
   2497                                     PF subges PF_CTL, PF_CTL, #0x10
   2498                                     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
   2499 .endm
   2500 
   2501 generate_composite_function \
   2502     pixman_composite_src_rpixbuf_8888_asm_neon, 32, 0, 32, \
   2503     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2505     10, /* prefetch distance */ \
   2506     default_init, \
   2507     default_cleanup, \
   2508     pixman_composite_src_rpixbuf_8888_process_pixblock_head, \
   2509     pixman_composite_src_rpixbuf_8888_process_pixblock_tail, \
   2510     pixman_composite_src_rpixbuf_8888_process_pixblock_tail_head, \
   2511     28, /* dst_w_basereg */ \
   2512     0, /* dst_r_basereg */ \
   2513     0, /* src_basereg   */ \
   2514     0  /* mask_basereg  */
   2515 
   2516 /******************************************************************************/
   2517 
   2518 .macro pixman_composite_over_0565_8_0565_process_pixblock_head
   2519     /* mask is in d15 */
   2520     convert_0565_to_x888 q4, d2, d1, d0
   2521     convert_0565_to_x888 q5, d6, d5, d4
   2522     /* source pixel data is in      {d0, d1, d2, XX} */
   2523     /* destination pixel data is in {d4, d5, d6, XX} */
   2524     vmvn.8      d7,  d15
   2525     vmull.u8    q6,  d15, d2
   2526     vmull.u8    q5,  d15, d1
   2527     vmull.u8    q4,  d15, d0
   2528     vmull.u8    q8,  d7,  d4
   2529     vmull.u8    q9,  d7,  d5
   2530     vmull.u8    q13, d7,  d6
   2531     vrshr.u16   q12, q6,  #8
   2532     vrshr.u16   q11, q5,  #8
   2533     vrshr.u16   q10, q4,  #8
   2534     vraddhn.u16 d2,  q6,  q12
   2535     vraddhn.u16 d1,  q5,  q11
   2536     vraddhn.u16 d0,  q4,  q10
   2537 .endm
   2538 
   2539 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail
   2540     vrshr.u16   q14, q8,  #8
   2541     vrshr.u16   q15, q9,  #8
   2542     vrshr.u16   q12, q13, #8
   2543     vraddhn.u16 d28, q14, q8
   2544     vraddhn.u16 d29, q15, q9
   2545     vraddhn.u16 d30, q12, q13
   2546     vqadd.u8    q0,  q0,  q14
   2547     vqadd.u8    q1,  q1,  q15
   2548     /* 32bpp result is in {d0, d1, d2, XX} */
   2549     convert_8888_to_0565 d2, d1, d0, q14, q15, q3
   2550 .endm
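
/*
 * This operation works in planar 8-bit form between the two r5g6b5
 * endpoints; roughly (using the illustrative helpers from the sketches
 * above):
 *
 *     unpack src and dst from r5g6b5 to 8-bit planes
 *     s[c] = div255(src[c] * mask);
 *     d[c] = satadd(s[c], div255(dst[c] * (255 - mask)));
 *     repack d to r5g6b5
 *
 * Because an r5g6b5 source has an implicit alpha of 255, the masked
 * source alpha is just the mask itself, hence the inverted mask
 * (vmvn.8 d7, d15) in the head above.
 */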
   2551 
/* TODO: expand macros and do better instruction scheduling */
   2553 .macro pixman_composite_over_0565_8_0565_process_pixblock_tail_head
   2554     fetch_mask_pixblock
   2555     pixman_composite_over_0565_8_0565_process_pixblock_tail
   2556     fetch_src_pixblock
   2557     vld1.16    {d10, d11}, [DST_R, :128]!
   2558     cache_preload 8, 8
   2559     pixman_composite_over_0565_8_0565_process_pixblock_head
   2560     vst1.16    {d28, d29}, [DST_W, :128]!
   2561 .endm
   2562 
   2563 generate_composite_function \
   2564     pixman_composite_over_0565_8_0565_asm_neon, 16, 8, 16, \
   2565     FLAG_DST_READWRITE, \
    8, /* number of pixels processed in a single block */ \
   2567     5, /* prefetch distance */ \
   2568     default_init_need_all_regs, \
   2569     default_cleanup_need_all_regs, \
   2570     pixman_composite_over_0565_8_0565_process_pixblock_head, \
   2571     pixman_composite_over_0565_8_0565_process_pixblock_tail, \
   2572     pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
   2573     28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
   2575     8,  /* src_basereg   */ \
   2576     15  /* mask_basereg  */
   2577 
   2578 /******************************************************************************/
   2579 
.macro pixman_composite_over_0565_n_0565_init
    add         DUMMY, sp, #(ARGS_STACK_OFFSET + 8)
    vpush       {d8-d15}
    vld1.32     {d15[0]}, [DUMMY]  /* load the solid 32bpp mask value */
    vdup.8      d15, d15[3]        /* replicate its alpha byte into d15 */
.endm
   2586 
   2587 .macro pixman_composite_over_0565_n_0565_cleanup
   2588     vpop        {d8-d15}
   2589 .endm
   2590 
   2591 generate_composite_function \
   2592     pixman_composite_over_0565_n_0565_asm_neon, 16, 0, 16, \
   2593     FLAG_DST_READWRITE, \
    8, /* number of pixels processed in a single block */ \
   2595     5, /* prefetch distance */ \
   2596     pixman_composite_over_0565_n_0565_init, \
   2597     pixman_composite_over_0565_n_0565_cleanup, \
   2598     pixman_composite_over_0565_8_0565_process_pixblock_head, \
   2599     pixman_composite_over_0565_8_0565_process_pixblock_tail, \
   2600     pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
   2601     28, /* dst_w_basereg */ \
   2602     10, /* dst_r_basereg */ \
   2603     8,  /* src_basereg   */ \
   2604     15  /* mask_basereg  */
   2605 
   2606 /******************************************************************************/
   2607 
   2608 .macro pixman_composite_add_0565_8_0565_process_pixblock_head
   2609     /* mask is in d15 */
   2610     convert_0565_to_x888 q4, d2, d1, d0
   2611     convert_0565_to_x888 q5, d6, d5, d4
   2612     /* source pixel data is in      {d0, d1, d2, XX} */
   2613     /* destination pixel data is in {d4, d5, d6, XX} */
   2614     vmull.u8    q6,  d15, d2
   2615     vmull.u8    q5,  d15, d1
   2616     vmull.u8    q4,  d15, d0
   2617     vrshr.u16   q12, q6,  #8
   2618     vrshr.u16   q11, q5,  #8
   2619     vrshr.u16   q10, q4,  #8
   2620     vraddhn.u16 d2,  q6,  q12
   2621     vraddhn.u16 d1,  q5,  q11
   2622     vraddhn.u16 d0,  q4,  q10
   2623 .endm
   2624 
   2625 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail
   2626     vqadd.u8    q0,  q0,  q2
   2627     vqadd.u8    q1,  q1,  q3
   2628     /* 32bpp result is in {d0, d1, d2, XX} */
   2629     convert_8888_to_0565 d2, d1, d0, q14, q15, q3
   2630 .endm
   2631 
/* TODO: expand macros and do better instruction scheduling */
   2633 .macro pixman_composite_add_0565_8_0565_process_pixblock_tail_head
   2634     fetch_mask_pixblock
   2635     pixman_composite_add_0565_8_0565_process_pixblock_tail
   2636     fetch_src_pixblock
   2637     vld1.16    {d10, d11}, [DST_R, :128]!
   2638     cache_preload 8, 8
   2639     pixman_composite_add_0565_8_0565_process_pixblock_head
   2640     vst1.16    {d28, d29}, [DST_W, :128]!
   2641 .endm
   2642 
   2643 generate_composite_function \
   2644     pixman_composite_add_0565_8_0565_asm_neon, 16, 8, 16, \
   2645     FLAG_DST_READWRITE, \
    8, /* number of pixels processed in a single block */ \
   2647     5, /* prefetch distance */ \
   2648     default_init_need_all_regs, \
   2649     default_cleanup_need_all_regs, \
   2650     pixman_composite_add_0565_8_0565_process_pixblock_head, \
   2651     pixman_composite_add_0565_8_0565_process_pixblock_tail, \
   2652     pixman_composite_add_0565_8_0565_process_pixblock_tail_head, \
   2653     28, /* dst_w_basereg */ \
   2654     10, /* dst_r_basereg */ \
   2655     8,  /* src_basereg   */ \
   2656     15  /* mask_basereg  */
   2657 
   2658 /******************************************************************************/
   2659 
   2660 .macro pixman_composite_out_reverse_8_0565_process_pixblock_head
   2661     /* mask is in d15 */
   2662     convert_0565_to_x888 q5, d6, d5, d4
   2663     /* destination pixel data is in {d4, d5, d6, xx} */
   2664     vmvn.8      d24, d15 /* get inverted alpha */
   2665     /* now do alpha blending */
   2666     vmull.u8    q8, d24, d4
   2667     vmull.u8    q9, d24, d5
   2668     vmull.u8    q10, d24, d6
   2669 .endm
   2670 
   2671 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail
   2672     vrshr.u16   q14, q8, #8
   2673     vrshr.u16   q15, q9, #8
   2674     vrshr.u16   q12, q10, #8
   2675     vraddhn.u16 d0, q14, q8
   2676     vraddhn.u16 d1, q15, q9
   2677     vraddhn.u16 d2, q12, q10
   2678     /* 32bpp result is in {d0, d1, d2, XX} */
   2679     convert_8888_to_0565 d2, d1, d0, q14, q15, q3
   2680 .endm
   2681 
/* TODO: expand macros and do better instruction scheduling */
   2683 .macro pixman_composite_out_reverse_8_0565_process_pixblock_tail_head
   2684     fetch_src_pixblock
   2685     pixman_composite_out_reverse_8_0565_process_pixblock_tail
   2686     vld1.16    {d10, d11}, [DST_R, :128]!
   2687     cache_preload 8, 8
   2688     pixman_composite_out_reverse_8_0565_process_pixblock_head
   2689     vst1.16    {d28, d29}, [DST_W, :128]!
   2690 .endm
   2691 
   2692 generate_composite_function \
   2693     pixman_composite_out_reverse_8_0565_asm_neon, 8, 0, 16, \
   2694     FLAG_DST_READWRITE, \
    8, /* number of pixels processed in a single block */ \
   2696     5, /* prefetch distance */ \
   2697     default_init_need_all_regs, \
   2698     default_cleanup_need_all_regs, \
   2699     pixman_composite_out_reverse_8_0565_process_pixblock_head, \
   2700     pixman_composite_out_reverse_8_0565_process_pixblock_tail, \
   2701     pixman_composite_out_reverse_8_0565_process_pixblock_tail_head, \
   2702     28, /* dst_w_basereg */ \
   2703     10, /* dst_r_basereg */ \
   2704     15, /* src_basereg   */ \
   2705     0   /* mask_basereg  */
   2706 
   2707 /******************************************************************************/
   2708 
   2709 .macro pixman_composite_out_reverse_8_8888_process_pixblock_head
   2710     /* src is in d0 */
   2711     /* destination pixel data is in {d4, d5, d6, d7} */
   2712     vmvn.8      d1, d0 /* get inverted alpha */
   2713     /* now do alpha blending */
   2714     vmull.u8    q8, d1, d4
   2715     vmull.u8    q9, d1, d5
   2716     vmull.u8    q10, d1, d6
   2717     vmull.u8    q11, d1, d7
   2718 .endm
   2719 
   2720 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail
   2721     vrshr.u16   q14, q8, #8
   2722     vrshr.u16   q15, q9, #8
   2723     vrshr.u16   q12, q10, #8
   2724     vrshr.u16   q13, q11, #8
   2725     vraddhn.u16 d28, q14, q8
   2726     vraddhn.u16 d29, q15, q9
   2727     vraddhn.u16 d30, q12, q10
   2728     vraddhn.u16 d31, q13, q11
   2729     /* 32bpp result is in {d28, d29, d30, d31} */
   2730 .endm
   2731 
/* TODO: expand macros and do better instruction scheduling */
   2733 .macro pixman_composite_out_reverse_8_8888_process_pixblock_tail_head
   2734     fetch_src_pixblock
   2735     pixman_composite_out_reverse_8_8888_process_pixblock_tail
   2736     vld4.8    {d4, d5, d6, d7}, [DST_R, :128]!
   2737     cache_preload 8, 8
   2738     pixman_composite_out_reverse_8_8888_process_pixblock_head
   2739     vst4.8    {d28, d29, d30, d31}, [DST_W, :128]!
   2740 .endm
   2741 
   2742 generate_composite_function \
   2743     pixman_composite_out_reverse_8_8888_asm_neon, 8, 0, 32, \
   2744     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2746     5, /* prefetch distance */ \
   2747     default_init, \
   2748     default_cleanup, \
   2749     pixman_composite_out_reverse_8_8888_process_pixblock_head, \
   2750     pixman_composite_out_reverse_8_8888_process_pixblock_tail, \
   2751     pixman_composite_out_reverse_8_8888_process_pixblock_tail_head, \
   2752     28, /* dst_w_basereg */ \
   2753     4, /* dst_r_basereg */ \
   2754     0, /* src_basereg   */ \
   2755     0   /* mask_basereg  */
   2756 
   2757 /******************************************************************************/
   2758 
   2759 generate_composite_function_nearest_scanline \
   2760     pixman_scaled_nearest_scanline_8888_8888_OVER_asm_neon, 32, 0, 32, \
   2761     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2763     default_init, \
   2764     default_cleanup, \
   2765     pixman_composite_over_8888_8888_process_pixblock_head, \
   2766     pixman_composite_over_8888_8888_process_pixblock_tail, \
   2767     pixman_composite_over_8888_8888_process_pixblock_tail_head
   2768 
   2769 generate_composite_function_nearest_scanline \
   2770     pixman_scaled_nearest_scanline_8888_0565_OVER_asm_neon, 32, 0, 16, \
   2771     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2773     default_init, \
   2774     default_cleanup, \
   2775     pixman_composite_over_8888_0565_process_pixblock_head, \
   2776     pixman_composite_over_8888_0565_process_pixblock_tail, \
   2777     pixman_composite_over_8888_0565_process_pixblock_tail_head, \
   2778     28, /* dst_w_basereg */ \
   2779     4,  /* dst_r_basereg */ \
   2780     0,  /* src_basereg   */ \
   2781     24  /* mask_basereg  */
   2782 
   2783 generate_composite_function_nearest_scanline \
   2784     pixman_scaled_nearest_scanline_8888_0565_SRC_asm_neon, 32, 0, 16, \
   2785     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2787     default_init, \
   2788     default_cleanup, \
   2789     pixman_composite_src_8888_0565_process_pixblock_head, \
   2790     pixman_composite_src_8888_0565_process_pixblock_tail, \
   2791     pixman_composite_src_8888_0565_process_pixblock_tail_head
   2792 
   2793 generate_composite_function_nearest_scanline \
   2794     pixman_scaled_nearest_scanline_0565_8888_SRC_asm_neon, 16, 0, 32, \
   2795     FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2797     default_init, \
   2798     default_cleanup, \
   2799     pixman_composite_src_0565_8888_process_pixblock_head, \
   2800     pixman_composite_src_0565_8888_process_pixblock_tail, \
   2801     pixman_composite_src_0565_8888_process_pixblock_tail_head
   2802 
   2803 generate_composite_function_nearest_scanline \
   2804     pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_neon, 32, 8, 16, \
   2805     FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
    8, /* number of pixels processed in a single block */ \
   2807     default_init_need_all_regs, \
   2808     default_cleanup_need_all_regs, \
   2809     pixman_composite_over_8888_8_0565_process_pixblock_head, \
   2810     pixman_composite_over_8888_8_0565_process_pixblock_tail, \
   2811     pixman_composite_over_8888_8_0565_process_pixblock_tail_head, \
   2812     28, /* dst_w_basereg */ \
   2813     4,  /* dst_r_basereg */ \
   2814     8,  /* src_basereg   */ \
   2815     24  /* mask_basereg  */
   2816 
   2817 generate_composite_function_nearest_scanline \
   2818     pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_neon, 16, 8, 16, \
   2819     FLAG_DST_READWRITE, \
    8, /* number of pixels processed in a single block */ \
   2821     default_init_need_all_regs, \
   2822     default_cleanup_need_all_regs, \
   2823     pixman_composite_over_0565_8_0565_process_pixblock_head, \
   2824     pixman_composite_over_0565_8_0565_process_pixblock_tail, \
   2825     pixman_composite_over_0565_8_0565_process_pixblock_tail_head, \
   2826     28, /* dst_w_basereg */ \
    10, /* dst_r_basereg */ \
   2828     8,  /* src_basereg   */ \
   2829     15  /* mask_basereg  */
   2830 
   2831 /******************************************************************************/
   2832 
   2833 /* Supplementary macro for setting function attributes */
   2834 .macro pixman_asm_function fname
   2835     .func fname
   2836     .global fname
   2837 #ifdef __ELF__
   2838     .hidden fname
   2839     .type fname, %function
   2840 #endif
   2841 fname:
   2842 .endm
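
/*
 * Used by the bilinear scanline template below, which opens each generated
 * function with "pixman_asm_function fname" and closes it with ".endfunc".
 */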
   2843 
/*
 * Bilinear scaling support code, which provides pixel fetching, color
 * format conversion, and interpolation as separate macros that can be used
 * as the basic building blocks for constructing bilinear scanline
 * functions.
 */
   2849 
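/*
 * A rough C model of what these building blocks compute for one channel
 * of an output pixel (a sketch for reference only; BITS stands for
 * BILINEAR_INTERPOLATION_BITS from pixman-private.h, and the vertical
 * weights are assumed to satisfy wt + wb == 1 << BITS):
 *
 *     static inline uint8_t
 *     bilinear_channel (uint8_t tl, uint8_t tr,  // adjacent top row pixels
 *                       uint8_t bl, uint8_t br,  // adjacent bottom row pixels
 *                       unsigned wt, unsigned wb,
 *                       unsigned d)  // top BITS bits of the x fraction
 *     {
 *         unsigned left  = tl * wt + bl * wb;  // vmull.u8 + vmlal.u8
 *         unsigned right = tr * wt + br * wb;
 *         // vshll.u16 + vmlsl.u16 + vmlal.u16, narrowed by vshrn.u32
 *         return (uint8_t)
 *             ((left * ((1u << BITS) - d) + right * d) >> (2 * BITS));
 *     }
 */
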
.macro bilinear_load_8888 reg1, reg2, tmp
    mov       TMP1, X, asr #16          /* integer part of the 16.16 'X' */
    add       X, X, UX                  /* advance 'X' to the next pixel */
    add       TMP1, TOP, TMP1, asl #2   /* 4 bytes per a8r8g8b8 pixel */
    vld1.32   {reg1}, [TMP1], STRIDE    /* two adjacent pixels, top row */
    vld1.32   {reg2}, [TMP1]            /* the same two pixels, bottom row */
.endm
   2857 
.macro bilinear_load_0565 reg1, reg2, tmp
    mov       TMP1, X, asr #16
    add       X, X, UX
    add       TMP1, TOP, TMP1, asl #1   /* 2 bytes per r5g6b5 pixel */
    vld1.32   {reg2[0]}, [TMP1], STRIDE /* two adjacent pixels, top row */
    vld1.32   {reg2[1]}, [TMP1]         /* the same two pixels, bottom row */
    convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
.endm
   2866 
   2867 .macro bilinear_load_and_vertical_interpolate_two_8888 \
   2868                     acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
   2869 
   2870     bilinear_load_8888 reg1, reg2, tmp1
    vmull.u8  acc1, reg1, d28  /* top pixels * wt (d28 == wt) */
    vmlal.u8  acc1, reg2, d29  /* + bottom pixels * wb (d29 == wb) */
   2873     bilinear_load_8888 reg3, reg4, tmp2
   2874     vmull.u8  acc2, reg3, d28
   2875     vmlal.u8  acc2, reg4, d29
   2876 .endm
   2877 
   2878 .macro bilinear_load_and_vertical_interpolate_four_8888 \
   2879                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
   2880                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   2881 
   2882     bilinear_load_and_vertical_interpolate_two_8888 \
   2883                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
   2884     bilinear_load_and_vertical_interpolate_two_8888 \
   2885                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   2886 .endm
   2887 
   2888 .macro bilinear_load_and_vertical_interpolate_two_0565 \
   2889                 acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
   2890 
   2891     mov       TMP1, X, asr #16
   2892     add       X, X, UX
   2893     add       TMP1, TOP, TMP1, asl #1
   2894     mov       TMP2, X, asr #16
   2895     add       X, X, UX
   2896     add       TMP2, TOP, TMP2, asl #1
   2897     vld1.32   {acc2lo[0]}, [TMP1], STRIDE
   2898     vld1.32   {acc2hi[0]}, [TMP2], STRIDE
   2899     vld1.32   {acc2lo[1]}, [TMP1]
   2900     vld1.32   {acc2hi[1]}, [TMP2]
   2901     convert_0565_to_x888 acc2, reg3, reg2, reg1
   2902     vzip.u8   reg1, reg3
   2903     vzip.u8   reg2, reg4
   2904     vzip.u8   reg3, reg4
   2905     vzip.u8   reg1, reg2
   2906     vmull.u8  acc1, reg1, d28
   2907     vmlal.u8  acc1, reg2, d29
   2908     vmull.u8  acc2, reg3, d28
   2909     vmlal.u8  acc2, reg4, d29
   2910 .endm
   2911 
   2912 .macro bilinear_load_and_vertical_interpolate_four_0565 \
   2913                 xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
   2914                 yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
   2915 
   2916     mov       TMP1, X, asr #16
   2917     add       X, X, UX
   2918     add       TMP1, TOP, TMP1, asl #1
   2919     mov       TMP2, X, asr #16
   2920     add       X, X, UX
   2921     add       TMP2, TOP, TMP2, asl #1
   2922     vld1.32   {xacc2lo[0]}, [TMP1], STRIDE
   2923     vld1.32   {xacc2hi[0]}, [TMP2], STRIDE
   2924     vld1.32   {xacc2lo[1]}, [TMP1]
   2925     vld1.32   {xacc2hi[1]}, [TMP2]
   2926     convert_0565_to_x888 xacc2, xreg3, xreg2, xreg1
   2927     mov       TMP1, X, asr #16
   2928     add       X, X, UX
   2929     add       TMP1, TOP, TMP1, asl #1
   2930     mov       TMP2, X, asr #16
   2931     add       X, X, UX
   2932     add       TMP2, TOP, TMP2, asl #1
   2933     vld1.32   {yacc2lo[0]}, [TMP1], STRIDE
   2934     vzip.u8   xreg1, xreg3
   2935     vld1.32   {yacc2hi[0]}, [TMP2], STRIDE
   2936     vzip.u8   xreg2, xreg4
   2937     vld1.32   {yacc2lo[1]}, [TMP1]
   2938     vzip.u8   xreg3, xreg4
   2939     vld1.32   {yacc2hi[1]}, [TMP2]
   2940     vzip.u8   xreg1, xreg2
   2941     convert_0565_to_x888 yacc2, yreg3, yreg2, yreg1
   2942     vmull.u8  xacc1, xreg1, d28
   2943     vzip.u8   yreg1, yreg3
   2944     vmlal.u8  xacc1, xreg2, d29
   2945     vzip.u8   yreg2, yreg4
   2946     vmull.u8  xacc2, xreg3, d28
   2947     vzip.u8   yreg3, yreg4
   2948     vmlal.u8  xacc2, xreg4, d29
   2949     vzip.u8   yreg1, yreg2
   2950     vmull.u8  yacc1, yreg1, d28
   2951     vmlal.u8  yacc1, yreg2, d29
   2952     vmull.u8  yacc2, yreg3, d28
   2953     vmlal.u8  yacc2, yreg4, d29
   2954 .endm
   2955 
   2956 .macro bilinear_store_8888 numpix, tmp1, tmp2
   2957 .if numpix == 4
   2958     vst1.32   {d0, d1}, [OUT, :128]!
   2959 .elseif numpix == 2
   2960     vst1.32   {d0}, [OUT, :64]!
   2961 .elseif numpix == 1
   2962     vst1.32   {d0[0]}, [OUT, :32]!
   2963 .else
   2964     .error bilinear_store_8888 numpix is unsupported
   2965 .endif
   2966 .endm
   2967 
   2968 .macro bilinear_store_0565 numpix, tmp1, tmp2
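    /* deinterleave the packed x888 pixels into separate color planes */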
   2969     vuzp.u8 d0, d1
   2970     vuzp.u8 d2, d3
   2971     vuzp.u8 d1, d3
   2972     vuzp.u8 d0, d2
   2973     convert_8888_to_0565 d2, d1, d0, q1, tmp1, tmp2
   2974 .if numpix == 4
   2975     vst1.16   {d2}, [OUT, :64]!
   2976 .elseif numpix == 2
   2977     vst1.32   {d2[0]}, [OUT, :32]!
   2978 .elseif numpix == 1
   2979     vst1.16   {d2[0]}, [OUT, :16]!
   2980 .else
   2981     .error bilinear_store_0565 numpix is unsupported
   2982 .endif
   2983 .endm
   2984 
   2985 .macro bilinear_interpolate_last_pixel src_fmt, dst_fmt
   2986     bilinear_load_&src_fmt d0, d1, d2
   2987     vmull.u8  q1, d0, d28
   2988     vmlal.u8  q1, d1, d29
   2989     /* 5 cycles bubble */
   2990     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   2991     vmlsl.u16 q0, d2, d30
   2992     vmlal.u16 q0, d3, d30
   2993     /* 5 cycles bubble */
   2994     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   2995     /* 3 cycles bubble */
   2996     vmovn.u16 d0, q0
   2997     /* 1 cycle bubble */
   2998     bilinear_store_&dst_fmt 1, q2, q3
   2999 .endm
   3000 
   3001 .macro bilinear_interpolate_two_pixels src_fmt, dst_fmt
   3002     bilinear_load_and_vertical_interpolate_two_&src_fmt \
   3003                 q1, q11, d0, d1, d20, d21, d22, d23
   3004     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   3005     vmlsl.u16 q0, d2, d30
   3006     vmlal.u16 q0, d3, d30
   3007     vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
   3008     vmlsl.u16 q10, d22, d31
   3009     vmlal.u16 q10, d23, d31
   3010     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3011     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
   3012     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3013     vadd.u16  q12, q12, q13
   3014     vmovn.u16 d0, q0
   3015     bilinear_store_&dst_fmt 2, q2, q3
   3016 .endm
   3017 
   3018 .macro bilinear_interpolate_four_pixels src_fmt, dst_fmt
   3019     bilinear_load_and_vertical_interpolate_four_&src_fmt \
   3020                 q1, q11, d0, d1, d20, d21, d22, d23 \
   3021                 q3, q9,  d4, d5, d16, d17, d18, d19
   3022     pld       [TMP1, PF_OFFS]
   3023     sub       TMP1, TMP1, STRIDE
   3024     vshll.u16 q0, d2, #BILINEAR_INTERPOLATION_BITS
   3025     vmlsl.u16 q0, d2, d30
   3026     vmlal.u16 q0, d3, d30
   3027     vshll.u16 q10, d22, #BILINEAR_INTERPOLATION_BITS
   3028     vmlsl.u16 q10, d22, d31
   3029     vmlal.u16 q10, d23, d31
   3030     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3031     vshll.u16 q2, d6, #BILINEAR_INTERPOLATION_BITS
   3032     vmlsl.u16 q2, d6, d30
   3033     vmlal.u16 q2, d7, d30
   3034     vshll.u16 q8, d18, #BILINEAR_INTERPOLATION_BITS
   3035     pld       [TMP2, PF_OFFS]
   3036     vmlsl.u16 q8, d18, d31
   3037     vmlal.u16 q8, d19, d31
   3038     vadd.u16  q12, q12, q13
   3039     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3040     vshrn.u32 d1, q10, #(2 * BILINEAR_INTERPOLATION_BITS)
   3041     vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3042     vshrn.u32 d5, q8, #(2 * BILINEAR_INTERPOLATION_BITS)
   3043     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3044     vmovn.u16 d0, q0
   3045     vmovn.u16 d1, q2
   3046     vadd.u16  q12, q12, q13
   3047     bilinear_store_&dst_fmt 4, q2, q3
   3048 .endm
   3049 
   3050 .macro bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
   3051 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
   3052     bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_head
   3053 .else
   3054     bilinear_interpolate_four_pixels src_fmt, dst_fmt
   3055 .endif
   3056 .endm
   3057 
   3058 .macro bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
   3059 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
   3060     bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail
   3061 .endif
   3062 .endm
   3063 
   3064 .macro bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
   3065 .ifdef have_bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt
   3066     bilinear_interpolate_four_pixels_&src_fmt&_&dst_fmt&_tail_head
   3067 .else
   3068     bilinear_interpolate_four_pixels src_fmt, dst_fmt
   3069 .endif
   3070 .endm
   3071 
   3072 .macro bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
   3073 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
   3074     bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_head
   3075 .else
   3076     bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
   3077     bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
   3078 .endif
   3079 .endm
   3080 
   3081 .macro bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
   3082 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
   3083     bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail
   3084 .else
   3085     bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
   3086 .endif
   3087 .endm
   3088 
   3089 .macro bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
   3090 .ifdef have_bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt
   3091     bilinear_interpolate_eight_pixels_&src_fmt&_&dst_fmt&_tail_head
   3092 .else
   3093     bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
   3094     bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
   3095 .endif
   3096 .endm
   3097 
   3098 .set BILINEAR_FLAG_UNROLL_4,          0
   3099 .set BILINEAR_FLAG_UNROLL_8,          1
   3100 .set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
   3101 
/*
 * Main template macro for generating NEON optimized bilinear scanline
 * functions.
 *
 * The bilinear scanline scaler macro template takes the following arguments:
 *  fname             - name of the function to generate
 *  src_fmt           - source color format (8888 or 0565)
 *  dst_fmt           - destination color format (8888 or 0565)
 *  src_bpp_shift     - (1 << src_bpp_shift) is the size of a source pixel
 *                      in bytes
 *  dst_bpp_shift     - (1 << dst_bpp_shift) is the size of a destination
 *                      pixel in bytes
 *  prefetch_distance - prefetch in the source image by that many
 *                      pixels ahead
 *  flags             - BILINEAR_FLAG_UNROLL_4 (the default 4 pixels per
 *                      iteration), BILINEAR_FLAG_UNROLL_8 and, optionally,
 *                      BILINEAR_FLAG_USE_ALL_NEON_REGS
 */
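
/*
 * Judging from the register and stack argument assignments below, the
 * generated functions are meant to be called from C with a prototype
 * roughly like this (the exact argument types are an assumption here):
 *
 *     void fname (uint32_t *out, uint32_t *top, uint32_t *bottom,
 *                 int wt, int wb, pixman_fixed_t x, pixman_fixed_t ux,
 *                 int width);
 *
 * 'top' and 'bottom' point to the two source scanlines being interpolated,
 * 'wt'/'wb' are the vertical weights, and 'x'/'ux' are the 16.16 fixed
 * point source coordinate and its per-pixel increment.
 */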
   3114 
   3115 .macro generate_bilinear_scanline_func fname, src_fmt, dst_fmt, \
   3116                                        src_bpp_shift, dst_bpp_shift, \
   3117                                        prefetch_distance, flags
   3118 
   3119 pixman_asm_function fname
   3120     OUT       .req      r0
   3121     TOP       .req      r1
   3122     BOTTOM    .req      r2
   3123     WT        .req      r3
   3124     WB        .req      r4
   3125     X         .req      r5
   3126     UX        .req      r6
   3127     WIDTH     .req      ip
   3128     TMP1      .req      r3
   3129     TMP2      .req      r4
   3130     PF_OFFS   .req      r7
   3131     TMP3      .req      r8
   3132     TMP4      .req      r9
   3133     STRIDE    .req      r2
   3134 
    mov       ip, sp                  /* ip -> arguments passed on the stack */
    push      {r4, r5, r6, r7, r8, r9}
    mov       PF_OFFS, #prefetch_distance
    ldmia     ip, {WB, X, UX, WIDTH}  /* load the stack arguments */
    mul       PF_OFFS, PF_OFFS, UX    /* prefetch distance in 16.16 pixels */
   3140 
   3141 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
   3142     vpush     {d8-d15}
   3143 .endif
   3144 
   3145     sub       STRIDE, BOTTOM, TOP
   3146     .unreq    BOTTOM
   3147 
   3148     cmp       WIDTH, #0
   3149     ble       3f
   3150 
   3151     vdup.u16  q12, X
   3152     vdup.u16  q13, UX
   3153     vdup.u8   d28, WT
   3154     vdup.u8   d29, WB
   3155     vadd.u16  d25, d25, d26
   3156 
   3157     /* ensure good destination alignment  */
   3158     cmp       WIDTH, #1
   3159     blt       0f
   3160     tst       OUT, #(1 << dst_bpp_shift)
   3161     beq       0f
   3162     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3163     vadd.u16  q12, q12, q13
   3164     bilinear_interpolate_last_pixel src_fmt, dst_fmt
   3165     sub       WIDTH, WIDTH, #1
   3166 0:
   3167     vadd.u16  q13, q13, q13
   3168     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3169     vadd.u16  q12, q12, q13
   3170 
   3171     cmp       WIDTH, #2
   3172     blt       0f
   3173     tst       OUT, #(1 << (dst_bpp_shift + 1))
   3174     beq       0f
   3175     bilinear_interpolate_two_pixels src_fmt, dst_fmt
   3176     sub       WIDTH, WIDTH, #2
   3177 0:
   3178 .if ((flags) & BILINEAR_FLAG_UNROLL_8) != 0
   3179 /*********** 8 pixels per iteration *****************/
   3180     cmp       WIDTH, #4
   3181     blt       0f
   3182     tst       OUT, #(1 << (dst_bpp_shift + 2))
   3183     beq       0f
   3184     bilinear_interpolate_four_pixels src_fmt, dst_fmt
   3185     sub       WIDTH, WIDTH, #4
   3186 0:
   3187     subs      WIDTH, WIDTH, #8
   3188     blt       1f
   3189     mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
   3190     bilinear_interpolate_eight_pixels_head src_fmt, dst_fmt
   3191     subs      WIDTH, WIDTH, #8
   3192     blt       5f
   3193 0:
   3194     bilinear_interpolate_eight_pixels_tail_head src_fmt, dst_fmt
   3195     subs      WIDTH, WIDTH, #8
   3196     bge       0b
   3197 5:
   3198     bilinear_interpolate_eight_pixels_tail src_fmt, dst_fmt
   3199 1:
   3200     tst       WIDTH, #4
   3201     beq       2f
   3202     bilinear_interpolate_four_pixels src_fmt, dst_fmt
   3203 2:
   3204 .else
   3205 /*********** 4 pixels per iteration *****************/
   3206     subs      WIDTH, WIDTH, #4
   3207     blt       1f
   3208     mov       PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift)
   3209     bilinear_interpolate_four_pixels_head src_fmt, dst_fmt
   3210     subs      WIDTH, WIDTH, #4
   3211     blt       5f
   3212 0:
   3213     bilinear_interpolate_four_pixels_tail_head src_fmt, dst_fmt
   3214     subs      WIDTH, WIDTH, #4
   3215     bge       0b
   3216 5:
   3217     bilinear_interpolate_four_pixels_tail src_fmt, dst_fmt
   3218 1:
   3219 /****************************************************/
   3220 .endif
   3221     /* handle the remaining trailing pixels */
   3222     tst       WIDTH, #2
   3223     beq       2f
   3224     bilinear_interpolate_two_pixels src_fmt, dst_fmt
   3225 2:
   3226     tst       WIDTH, #1
   3227     beq       3f
   3228     bilinear_interpolate_last_pixel src_fmt, dst_fmt
   3229 3:
   3230 .if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
   3231     vpop      {d8-d15}
   3232 .endif
   3233     pop       {r4, r5, r6, r7, r8, r9}
   3234     bx        lr
   3235 
   3236     .unreq    OUT
   3237     .unreq    TOP
   3238     .unreq    WT
   3239     .unreq    WB
   3240     .unreq    X
   3241     .unreq    UX
   3242     .unreq    WIDTH
   3243     .unreq    TMP1
   3244     .unreq    TMP2
   3245     .unreq    PF_OFFS
   3246     .unreq    TMP3
   3247     .unreq    TMP4
   3248     .unreq    STRIDE
   3249 .endfunc
   3250 
   3251 .endm
   3252 
   3253 /*****************************************************************************/
   3254 
   3255 .set have_bilinear_interpolate_four_pixels_8888_8888, 1
   3256 
   3257 .macro bilinear_interpolate_four_pixels_8888_8888_head
   3258     mov       TMP1, X, asr #16
   3259     add       X, X, UX
   3260     add       TMP1, TOP, TMP1, asl #2
   3261     mov       TMP2, X, asr #16
   3262     add       X, X, UX
   3263     add       TMP2, TOP, TMP2, asl #2
   3264 
   3265     vld1.32   {d22}, [TMP1], STRIDE
   3266     vld1.32   {d23}, [TMP1]
   3267     mov       TMP3, X, asr #16
   3268     add       X, X, UX
   3269     add       TMP3, TOP, TMP3, asl #2
   3270     vmull.u8  q8, d22, d28
   3271     vmlal.u8  q8, d23, d29
   3272 
   3273     vld1.32   {d22}, [TMP2], STRIDE
   3274     vld1.32   {d23}, [TMP2]
   3275     mov       TMP4, X, asr #16
   3276     add       X, X, UX
   3277     add       TMP4, TOP, TMP4, asl #2
   3278     vmull.u8  q9, d22, d28
   3279     vmlal.u8  q9, d23, d29
   3280 
   3281     vld1.32   {d22}, [TMP3], STRIDE
   3282     vld1.32   {d23}, [TMP3]
   3283     vmull.u8  q10, d22, d28
   3284     vmlal.u8  q10, d23, d29
   3285 
   3286     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3287     vmlsl.u16 q0, d16, d30
   3288     vmlal.u16 q0, d17, d30
   3289 
   3290     pld       [TMP4, PF_OFFS]
   3291     vld1.32   {d16}, [TMP4], STRIDE
   3292     vld1.32   {d17}, [TMP4]
   3293     pld       [TMP4, PF_OFFS]
   3294     vmull.u8  q11, d16, d28
   3295     vmlal.u8  q11, d17, d29
   3296 
   3297     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3298     vmlsl.u16 q1, d18, d31
   3299 .endm
   3300 
   3301 .macro bilinear_interpolate_four_pixels_8888_8888_tail
   3302     vmlal.u16 q1, d19, d31
   3303     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3304     vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3305     vmlsl.u16 q2, d20, d30
   3306     vmlal.u16 q2, d21, d30
   3307     vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3308     vmlsl.u16 q3, d22, d31
   3309     vmlal.u16 q3, d23, d31
   3310     vadd.u16  q12, q12, q13
   3311     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3312     vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3313     vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3314     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3315     vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3316     vmovn.u16 d6, q0
   3317     vmovn.u16 d7, q2
   3318     vadd.u16  q12, q12, q13
   3319     vst1.32   {d6, d7}, [OUT, :128]!
   3320 .endm
   3321 
   3322 .macro bilinear_interpolate_four_pixels_8888_8888_tail_head
   3323     mov       TMP1, X, asr #16
   3324     add       X, X, UX
   3325     add       TMP1, TOP, TMP1, asl #2
   3326     mov       TMP2, X, asr #16
   3327     add       X, X, UX
   3328     add       TMP2, TOP, TMP2, asl #2
   3329         vmlal.u16 q1, d19, d31
   3330         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3331         vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3332         vmlsl.u16 q2, d20, d30
   3333         vmlal.u16 q2, d21, d30
   3334         vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3335     vld1.32   {d20}, [TMP1], STRIDE
   3336         vmlsl.u16 q3, d22, d31
   3337         vmlal.u16 q3, d23, d31
   3338     vld1.32   {d21}, [TMP1]
   3339     vmull.u8  q8, d20, d28
   3340     vmlal.u8  q8, d21, d29
   3341         vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3342         vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3343         vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3344     vld1.32   {d22}, [TMP2], STRIDE
   3345         vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3346         vadd.u16  q12, q12, q13
   3347     vld1.32   {d23}, [TMP2]
   3348     vmull.u8  q9, d22, d28
   3349     mov       TMP3, X, asr #16
   3350     add       X, X, UX
   3351     add       TMP3, TOP, TMP3, asl #2
   3352     mov       TMP4, X, asr #16
   3353     add       X, X, UX
   3354     add       TMP4, TOP, TMP4, asl #2
   3355     vmlal.u8  q9, d23, d29
   3356     vld1.32   {d22}, [TMP3], STRIDE
   3357         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3358     vld1.32   {d23}, [TMP3]
   3359     vmull.u8  q10, d22, d28
   3360     vmlal.u8  q10, d23, d29
   3361         vmovn.u16 d6, q0
   3362     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3363         vmovn.u16 d7, q2
   3364     vmlsl.u16 q0, d16, d30
   3365     vmlal.u16 q0, d17, d30
   3366     pld       [TMP4, PF_OFFS]
   3367     vld1.32   {d16}, [TMP4], STRIDE
   3368         vadd.u16  q12, q12, q13
   3369     vld1.32   {d17}, [TMP4]
   3370     pld       [TMP4, PF_OFFS]
   3371     vmull.u8  q11, d16, d28
   3372     vmlal.u8  q11, d17, d29
   3373         vst1.32   {d6, d7}, [OUT, :128]!
   3374     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3375     vmlsl.u16 q1, d18, d31
   3376 .endm
   3377 
   3378 /*****************************************************************************/
   3379 
   3380 .set have_bilinear_interpolate_eight_pixels_8888_0565, 1
   3381 
   3382 .macro bilinear_interpolate_eight_pixels_8888_0565_head
   3383     mov       TMP1, X, asr #16
   3384     add       X, X, UX
   3385     add       TMP1, TOP, TMP1, asl #2
   3386     mov       TMP2, X, asr #16
   3387     add       X, X, UX
   3388     add       TMP2, TOP, TMP2, asl #2
   3389     vld1.32   {d20}, [TMP1], STRIDE
   3390     vld1.32   {d21}, [TMP1]
   3391     vmull.u8  q8, d20, d28
   3392     vmlal.u8  q8, d21, d29
   3393     vld1.32   {d22}, [TMP2], STRIDE
   3394     vld1.32   {d23}, [TMP2]
   3395     vmull.u8  q9, d22, d28
   3396     mov       TMP3, X, asr #16
   3397     add       X, X, UX
   3398     add       TMP3, TOP, TMP3, asl #2
   3399     mov       TMP4, X, asr #16
   3400     add       X, X, UX
   3401     add       TMP4, TOP, TMP4, asl #2
   3402     vmlal.u8  q9, d23, d29
   3403     vld1.32   {d22}, [TMP3], STRIDE
   3404     vld1.32   {d23}, [TMP3]
   3405     vmull.u8  q10, d22, d28
   3406     vmlal.u8  q10, d23, d29
   3407     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3408     vmlsl.u16 q0, d16, d30
   3409     vmlal.u16 q0, d17, d30
   3410     pld       [TMP4, PF_OFFS]
   3411     vld1.32   {d16}, [TMP4], STRIDE
   3412     vld1.32   {d17}, [TMP4]
   3413     pld       [TMP4, PF_OFFS]
   3414     vmull.u8  q11, d16, d28
   3415     vmlal.u8  q11, d17, d29
   3416     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3417     vmlsl.u16 q1, d18, d31
   3418 
   3419     mov       TMP1, X, asr #16
   3420     add       X, X, UX
   3421     add       TMP1, TOP, TMP1, asl #2
   3422     mov       TMP2, X, asr #16
   3423     add       X, X, UX
   3424     add       TMP2, TOP, TMP2, asl #2
   3425         vmlal.u16 q1, d19, d31
   3426         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3427         vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3428         vmlsl.u16 q2, d20, d30
   3429         vmlal.u16 q2, d21, d30
   3430         vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3431     vld1.32   {d20}, [TMP1], STRIDE
   3432         vmlsl.u16 q3, d22, d31
   3433         vmlal.u16 q3, d23, d31
   3434     vld1.32   {d21}, [TMP1]
   3435     vmull.u8  q8, d20, d28
   3436     vmlal.u8  q8, d21, d29
   3437         vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3438         vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3439         vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3440     vld1.32   {d22}, [TMP2], STRIDE
   3441         vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3442         vadd.u16  q12, q12, q13
   3443     vld1.32   {d23}, [TMP2]
   3444     vmull.u8  q9, d22, d28
   3445     mov       TMP3, X, asr #16
   3446     add       X, X, UX
   3447     add       TMP3, TOP, TMP3, asl #2
   3448     mov       TMP4, X, asr #16
   3449     add       X, X, UX
   3450     add       TMP4, TOP, TMP4, asl #2
   3451     vmlal.u8  q9, d23, d29
   3452     vld1.32   {d22}, [TMP3], STRIDE
   3453         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3454     vld1.32   {d23}, [TMP3]
   3455     vmull.u8  q10, d22, d28
   3456     vmlal.u8  q10, d23, d29
   3457         vmovn.u16 d8, q0
   3458     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3459         vmovn.u16 d9, q2
   3460     vmlsl.u16 q0, d16, d30
   3461     vmlal.u16 q0, d17, d30
   3462     pld       [TMP4, PF_OFFS]
   3463     vld1.32   {d16}, [TMP4], STRIDE
   3464         vadd.u16  q12, q12, q13
   3465     vld1.32   {d17}, [TMP4]
   3466     pld       [TMP4, PF_OFFS]
   3467     vmull.u8  q11, d16, d28
   3468     vmlal.u8  q11, d17, d29
   3469     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3470     vmlsl.u16 q1, d18, d31
   3471 .endm
   3472 
   3473 .macro bilinear_interpolate_eight_pixels_8888_0565_tail
   3474     vmlal.u16 q1, d19, d31
   3475     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3476     vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3477     vmlsl.u16 q2, d20, d30
   3478     vmlal.u16 q2, d21, d30
   3479     vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3480     vmlsl.u16 q3, d22, d31
   3481     vmlal.u16 q3, d23, d31
   3482     vadd.u16  q12, q12, q13
   3483     vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3484     vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3485     vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3486     vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3487     vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3488     vmovn.u16 d10, q0
   3489     vmovn.u16 d11, q2
   3490     vadd.u16  q12, q12, q13
   3491 
   3492     vuzp.u8   d8, d9
   3493     vuzp.u8   d10, d11
   3494     vuzp.u8   d9, d11
   3495     vuzp.u8   d8, d10
   3496     vshll.u8  q6, d9, #8
   3497     vshll.u8  q5, d10, #8
   3498     vshll.u8  q7, d8, #8
   3499     vsri.u16  q5, q6, #5
   3500     vsri.u16  q5, q7, #11
   3501     vst1.32   {d10, d11}, [OUT, :128]!
   3502 .endm
   3503 
   3504 .macro bilinear_interpolate_eight_pixels_8888_0565_tail_head
   3505     mov       TMP1, X, asr #16
   3506     add       X, X, UX
   3507     add       TMP1, TOP, TMP1, asl #2
   3508     mov       TMP2, X, asr #16
   3509     add       X, X, UX
   3510     add       TMP2, TOP, TMP2, asl #2
   3511         vmlal.u16 q1, d19, d31
   3512         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3513             vuzp.u8 d8, d9
   3514         vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3515         vmlsl.u16 q2, d20, d30
   3516         vmlal.u16 q2, d21, d30
   3517         vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3518     vld1.32   {d20}, [TMP1], STRIDE
   3519         vmlsl.u16 q3, d22, d31
   3520         vmlal.u16 q3, d23, d31
   3521     vld1.32   {d21}, [TMP1]
   3522     vmull.u8  q8, d20, d28
   3523     vmlal.u8  q8, d21, d29
   3524         vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3525         vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3526         vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3527     vld1.32   {d22}, [TMP2], STRIDE
   3528         vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3529         vadd.u16  q12, q12, q13
   3530     vld1.32   {d23}, [TMP2]
   3531     vmull.u8  q9, d22, d28
   3532     mov       TMP3, X, asr #16
   3533     add       X, X, UX
   3534     add       TMP3, TOP, TMP3, asl #2
   3535     mov       TMP4, X, asr #16
   3536     add       X, X, UX
   3537     add       TMP4, TOP, TMP4, asl #2
   3538     vmlal.u8  q9, d23, d29
   3539     vld1.32   {d22}, [TMP3], STRIDE
   3540         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3541     vld1.32   {d23}, [TMP3]
   3542     vmull.u8  q10, d22, d28
   3543     vmlal.u8  q10, d23, d29
   3544         vmovn.u16 d10, q0
   3545     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3546         vmovn.u16 d11, q2
   3547     vmlsl.u16 q0, d16, d30
   3548     vmlal.u16 q0, d17, d30
   3549     pld       [TMP4, PF_OFFS]
   3550     vld1.32   {d16}, [TMP4], STRIDE
   3551         vadd.u16  q12, q12, q13
   3552     vld1.32   {d17}, [TMP4]
   3553     pld       [TMP4, PF_OFFS]
   3554     vmull.u8  q11, d16, d28
   3555     vmlal.u8  q11, d17, d29
   3556             vuzp.u8 d10, d11
   3557     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3558     vmlsl.u16 q1, d18, d31
   3559 
   3560     mov       TMP1, X, asr #16
   3561     add       X, X, UX
   3562     add       TMP1, TOP, TMP1, asl #2
   3563     mov       TMP2, X, asr #16
   3564     add       X, X, UX
   3565     add       TMP2, TOP, TMP2, asl #2
   3566         vmlal.u16 q1, d19, d31
   3567             vuzp.u8 d9, d11
   3568         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3569         vshll.u16 q2, d20, #BILINEAR_INTERPOLATION_BITS
   3570             vuzp.u8 d8, d10
   3571         vmlsl.u16 q2, d20, d30
   3572         vmlal.u16 q2, d21, d30
   3573         vshll.u16 q3, d22, #BILINEAR_INTERPOLATION_BITS
   3574     vld1.32   {d20}, [TMP1], STRIDE
   3575         vmlsl.u16 q3, d22, d31
   3576         vmlal.u16 q3, d23, d31
   3577     vld1.32   {d21}, [TMP1]
   3578     vmull.u8  q8, d20, d28
   3579     vmlal.u8  q8, d21, d29
   3580             vshll.u8  q6, d9, #8
   3581             vshll.u8  q5, d10, #8
   3582             vshll.u8  q7, d8, #8
   3583         vshrn.u32 d0, q0, #(2 * BILINEAR_INTERPOLATION_BITS)
   3584             vsri.u16  q5, q6, #5
   3585         vshrn.u32 d1, q1, #(2 * BILINEAR_INTERPOLATION_BITS)
   3586             vsri.u16  q5, q7, #11
   3587         vshrn.u32 d4, q2, #(2 * BILINEAR_INTERPOLATION_BITS)
   3588     vld1.32   {d22}, [TMP2], STRIDE
   3589         vshrn.u32 d5, q3, #(2 * BILINEAR_INTERPOLATION_BITS)
   3590         vadd.u16  q12, q12, q13
   3591     vld1.32   {d23}, [TMP2]
   3592     vmull.u8  q9, d22, d28
   3593     mov       TMP3, X, asr #16
   3594     add       X, X, UX
   3595     add       TMP3, TOP, TMP3, asl #2
   3596     mov       TMP4, X, asr #16
   3597     add       X, X, UX
   3598     add       TMP4, TOP, TMP4, asl #2
   3599     vmlal.u8  q9, d23, d29
   3600     vld1.32   {d22}, [TMP3], STRIDE
   3601         vshr.u16  q15, q12, #(16 - BILINEAR_INTERPOLATION_BITS)
   3602     vld1.32   {d23}, [TMP3]
   3603     vmull.u8  q10, d22, d28
   3604     vmlal.u8  q10, d23, d29
   3605         vmovn.u16 d8, q0
   3606     vshll.u16 q0, d16, #BILINEAR_INTERPOLATION_BITS
   3607         vmovn.u16 d9, q2
   3608     vmlsl.u16 q0, d16, d30
   3609     vmlal.u16 q0, d17, d30
   3610     pld       [TMP4, PF_OFFS]
   3611     vld1.32   {d16}, [TMP4], STRIDE
   3612         vadd.u16  q12, q12, q13
   3613     vld1.32   {d17}, [TMP4]
   3614     pld       [TMP4, PF_OFFS]
   3615     vmull.u8  q11, d16, d28
   3616     vmlal.u8  q11, d17, d29
   3617     vshll.u16 q1, d18, #BILINEAR_INTERPOLATION_BITS
   3618             vst1.32   {d10, d11}, [OUT, :128]!
   3619     vmlsl.u16 q1, d18, d31
   3620 .endm

/*****************************************************************************/
   3622 
   3623 generate_bilinear_scanline_func \
   3624     pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon, 8888, 8888, \
   3625     2, 2, 28, BILINEAR_FLAG_UNROLL_4
   3626 
   3627 generate_bilinear_scanline_func \
   3628     pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_neon, 8888, 0565, \
   3629     2, 1, 28, BILINEAR_FLAG_UNROLL_8 | BILINEAR_FLAG_USE_ALL_NEON_REGS
   3630 
   3631 generate_bilinear_scanline_func \
   3632     pixman_scaled_bilinear_scanline_0565_x888_SRC_asm_neon, 0565, 8888, \
   3633     1, 2, 28, BILINEAR_FLAG_UNROLL_4
   3634 
   3635 generate_bilinear_scanline_func \
   3636     pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_neon, 0565, 0565, \
   3637     1, 1, 28, BILINEAR_FLAG_UNROLL_4
   3638