      1 /*
      2  * Copyright © 2009 Nokia Corporation
      3  *
      4  * Permission is hereby granted, free of charge, to any person obtaining a
      5  * copy of this software and associated documentation files (the "Software"),
      6  * to deal in the Software without restriction, including without limitation
      7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8  * and/or sell copies of the Software, and to permit persons to whom the
      9  * Software is furnished to do so, subject to the following conditions:
     10  *
     11  * The above copyright notice and this permission notice (including the next
     12  * paragraph) shall be included in all copies or substantial portions of the
     13  * Software.
     14  *
     15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21  * DEALINGS IN THE SOFTWARE.
     22  *
     23  * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
     24  */
     25 
     26 /*
     27  * This file contains a macro ('generate_composite_function') which can
     28  * construct 2D image processing functions, based on a common template.
     29  * Any combinations of source, destination and mask images with 8bpp,
     30  * 16bpp, 24bpp, 32bpp color formats are supported.
     31  *
     32  * This macro takes care of:
     33  *  - handling of leading and trailing unaligned pixels
     34  *  - doing most of the work related to L2 cache preload
     35  *  - encouraging the use of software pipelining for better instruction
     36  *    scheduling
     37  *
     38  * The user of this macro has to provide some configuration parameters
     39  * (bit depths for the images, prefetch distance, etc.) and a set of
     40  * macros, which should implement the basic code chunks responsible for
     41  * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
     42  * examples.
     43  *
     44  * TODO:
     45  *  - try overlapped pixel method (from Ian Rickards) when processing
     46  *    exactly two blocks of pixels
     47  *  - maybe add an option to do reverse scanline processing
     48  */
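
        /*
         * As a hedged illustration of how the template is meant to be used (the
         * real, authoritative instantiations live in 'pixman-arm-neon-asm.S'),
         * a generated function would be requested roughly like the sketch below.
         * The function name and the three 'example_*' macros are placeholders,
         * not symbols defined in this file:
         *
         *     generate_composite_function \
         *         pixman_composite_example_asm_neon, 32, 0, 16, \
         *         FLAG_DST_READWRITE | FLAG_DEINTERLEAVE_32BPP, \
         *         8, 10, \
         *         default_init, \
         *         default_cleanup, \
         *         example_process_pixblock_head, \
         *         example_process_pixblock_tail, \
         *         example_process_pixblock_tail_head
         *
         * where 32, 0 and 16 are the source, mask and destination bit depths,
         * 8 is the pixblock size and 10 is the prefetch distance.
         */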
     49 
     50 /*
     51  * Bit flags for the 'generate_composite_function' macro, which are used
     52  * to tune the behavior of the generated functions.
     53  */
     54 .set FLAG_DST_WRITEONLY,       0
     55 .set FLAG_DST_READWRITE,       1
     56 .set FLAG_DEINTERLEAVE_32BPP,  2
     57 
     58 /*
     59  * Offset into the stack at which the mask and source pointer/stride can be
     60  * accessed from the 'init' macro. Useful for special handling of a solid mask.
     61  */
     62 .set ARGS_STACK_OFFSET,        40
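
        /*
         * Purely as an illustrative sketch (not a macro defined or used in this
         * file): an 'init' macro for an operation with a solid source could use
         * this offset to fetch the 32-bit color passed by the caller, e.g.
         *
         *     .macro init_example_solid_source
         *         add     DUMMY, sp, #ARGS_STACK_OFFSET
         *         vld1.32 {d3[0]}, [DUMMY]
         *         vdup.8  d0, d3[0]
         *         vdup.8  d1, d3[1]
         *         vdup.8  d2, d3[2]
         *         vdup.8  d3, d3[3]
         *     .endm
         *
         * DUMMY is the scratch register alias set up by the
         * 'generate_composite_function' macro below; the exact register layout
         * is up to the pixel processing code.
         */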
     63 
     64 /*
     65  * Constants for selecting the preferred prefetch type.
     66  */
     67 .set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
     68 .set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
     69 .set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
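
        /*
         * Note that PREFETCH_TYPE_DEFAULT, PREFETCH_DISTANCE_SIMPLE and
         * RESPECT_STRICT_ALIGNMENT are referenced by the macros below but are
         * not defined here; the including file is expected to set them first,
         * presumably along these lines (values shown only as an example):
         *
         *     .set RESPECT_STRICT_ALIGNMENT, 1
         *     .set PREFETCH_TYPE_DEFAULT, PREFETCH_TYPE_ADVANCED
         *     .set PREFETCH_DISTANCE_SIMPLE, 64
         */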
     70 
     71 /*
     72  * Definitions of supplementary pixld/pixst macros (for partial load/store of
     73  * pixel data).
     74  */
     75 
     76 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
     77 .if abits > 0
     78     op&.&elem_size {d&reg1}, [&mem_operand&, :&abits&]!
     79 .else
     80     op&.&elem_size {d&reg1}, [&mem_operand&]!
     81 .endif
     82 .endm
     83 
     84 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
     85 .if abits > 0
     86     op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&, :&abits&]!
     87 .else
     88     op&.&elem_size {d&reg1, d&reg2}, [&mem_operand&]!
     89 .endif
     90 .endm
     91 
     92 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
     93 .if abits > 0
     94     op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&, :&abits&]!
     95 .else
     96     op&.&elem_size {d&reg1, d&reg2, d&reg3, d&reg4}, [&mem_operand&]!
     97 .endif
     98 .endm
     99 
    100 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    101     op&.&elem_size {d&reg1[idx]}, [&mem_operand&]!
    102 .endm
    103 
    104 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    105     op&.&elem_size {d&reg1, d&reg2, d&reg3}, [&mem_operand&]!
    106 .endm
    107 
    108 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    109     op&.&elem_size {d&reg1[idx], d&reg2[idx], d&reg3[idx]}, [&mem_operand&]!
    110 .endm
    111 
    112 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
    113 .if numbytes == 32
    114     pixldst4 op, elem_size, %(basereg+4), %(basereg+5), \
    115                               %(basereg+6), %(basereg+7), mem_operand, abits
    116 .elseif numbytes == 16
    117     pixldst2 op, elem_size, %(basereg+2), %(basereg+3), mem_operand, abits
    118 .elseif numbytes == 8
    119     pixldst1 op, elem_size, %(basereg+1), mem_operand, abits
    120 .elseif numbytes == 4
    121     .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 32)
    122         pixldst0 op, 32, %(basereg+0), 1, mem_operand, abits
    123     .elseif elem_size == 16
    124         pixldst0 op, 16, %(basereg+0), 2, mem_operand, abits
    125         pixldst0 op, 16, %(basereg+0), 3, mem_operand, abits
    126     .else
    127         pixldst0 op, 8, %(basereg+0), 4, mem_operand, abits
    128         pixldst0 op, 8, %(basereg+0), 5, mem_operand, abits
    129         pixldst0 op, 8, %(basereg+0), 6, mem_operand, abits
    130         pixldst0 op, 8, %(basereg+0), 7, mem_operand, abits
    131     .endif
    132 .elseif numbytes == 2
    133     .if !RESPECT_STRICT_ALIGNMENT || (elem_size == 16)
    134         pixldst0 op, 16, %(basereg+0), 1, mem_operand, abits
    135     .else
    136         pixldst0 op, 8, %(basereg+0), 2, mem_operand, abits
    137         pixldst0 op, 8, %(basereg+0), 3, mem_operand, abits
    138     .endif
    139 .elseif numbytes == 1
    140     pixldst0 op, 8, %(basereg+0), 1, mem_operand, abits
    141 .else
    142     .error "unsupported size: numbytes"
    143 .endif
    144 .endm
    145 
    146 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
    147 .if bpp > 0
    148 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    149     pixldst4 vld4, 8, %(basereg+4), %(basereg+5), \
    150                       %(basereg+6), %(basereg+7), mem_operand, abits
    151 .elseif (bpp == 24) && (numpix == 8)
    152     pixldst3 vld3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
    153 .elseif (bpp == 24) && (numpix == 4)
    154     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    155     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    156     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    157     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
    158 .elseif (bpp == 24) && (numpix == 2)
    159     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    160     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
    161 .elseif (bpp == 24) && (numpix == 1)
    162     pixldst30 vld3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
    163 .else
    164     pixldst %(numpix * bpp / 8), vld1, %(bpp), basereg, mem_operand, abits
    165 .endif
    166 .endif
    167 .endm
    168 
    169 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
    170 .if bpp > 0
    171 .if (bpp == 32) && (numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    172     pixldst4 vst4, 8, %(basereg+4), %(basereg+5), \
    173                       %(basereg+6), %(basereg+7), mem_operand, abits
    174 .elseif (bpp == 24) && (numpix == 8)
    175     pixldst3 vst3, 8, %(basereg+3), %(basereg+4), %(basereg+5), mem_operand
    176 .elseif (bpp == 24) && (numpix == 4)
    177     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 4, mem_operand
    178     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 5, mem_operand
    179     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 6, mem_operand
    180     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 7, mem_operand
    181 .elseif (bpp == 24) && (numpix == 2)
    182     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 2, mem_operand
    183     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 3, mem_operand
    184 .elseif (bpp == 24) && (numpix == 1)
    185     pixldst30 vst3, 8, %(basereg+0), %(basereg+1), %(basereg+2), 1, mem_operand
    186 .else
    187     pixldst %(numpix * bpp / 8), vst1, %(bpp), basereg, mem_operand, abits
    188 .endif
    189 .endif
    190 .endm
    191 
    192 .macro pixld_a numpix, bpp, basereg, mem_operand
    193 .if (bpp * numpix) <= 128
    194     pixld numpix, bpp, basereg, mem_operand, %(bpp * numpix)
    195 .else
    196     pixld numpix, bpp, basereg, mem_operand, 128
    197 .endif
    198 .endm
    199 
    200 .macro pixst_a numpix, bpp, basereg, mem_operand
    201 .if (bpp * numpix) <= 128
    202     pixst numpix, bpp, basereg, mem_operand, %(bpp * numpix)
    203 .else
    204     pixst numpix, bpp, basereg, mem_operand, 128
    205 .endif
    206 .endm
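
        /*
         * As an informal example of how the macros above expand: loading one
         * block of eight 32bpp pixels with 'pixld 8, 32, 4, SRC' produces
         *
         *     vld1.32 {d8, d9, d10, d11}, [SRC]!
         *
         * (or a deinterleaving 'vld4.8' load of the same registers when
         * DEINTERLEAVE_32BPP_ENABLED is set), and the aligned variant
         * 'pixld_a 8, 32, 4, SRC' additionally appends a ':128' alignment hint
         * to the address.
         */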
    207 
    208 /*
    209  * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
    210  * aliases to be defined)
    211  */
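        /*
         * A short orientation note (an interpretation based on the code below,
         * not an addition to it): VX, UNIT_X and SRC_WIDTH_FIXED appear to be
         * 16.16 fixed-point values holding the current source x coordinate, the
         * per-destination-pixel step and the source width. For example,
         * UNIT_X = 0x8000 (0.5) corresponds to a 2x upscale, and the
         * 'subpls ... bpl' loops wrap VX back into [0, SRC_WIDTH_FIXED) so that
         * repeated (tiled) sources keep fetching from valid pixels.
         */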
    212 .macro pixld1_s elem_size, reg1, mem_operand
    213 .if elem_size == 16
    214     mov     TMP1, VX, asr #16
    215     adds    VX, VX, UNIT_X
    216 5:  subpls  VX, VX, SRC_WIDTH_FIXED
    217     bpl     5b
    218     add     TMP1, mem_operand, TMP1, asl #1
    219     mov     TMP2, VX, asr #16
    220     adds    VX, VX, UNIT_X
    221 5:  subpls  VX, VX, SRC_WIDTH_FIXED
    222     bpl     5b
    223     add     TMP2, mem_operand, TMP2, asl #1
    224     vld1.16 {d&reg1&[0]}, [TMP1, :16]
    225     mov     TMP1, VX, asr #16
    226     adds    VX, VX, UNIT_X
    227 5:  subpls  VX, VX, SRC_WIDTH_FIXED
    228     bpl     5b
    229     add     TMP1, mem_operand, TMP1, asl #1
    230     vld1.16 {d&reg1&[1]}, [TMP2, :16]
    231     mov     TMP2, VX, asr #16
    232     adds    VX, VX, UNIT_X
    233 5:  subpls  VX, VX, SRC_WIDTH_FIXED
    234     bpl     5b
    235     add     TMP2, mem_operand, TMP2, asl #1
    236     vld1.16 {d&reg1&[2]}, [TMP1, :16]
    237     vld1.16 {d&reg1&[3]}, [TMP2, :16]
    238 .elseif elem_size == 32
    239     mov     TMP1, VX, asr #16
    240     adds    VX, VX, UNIT_X
    241 5:  subpls  VX, VX, SRC_WIDTH_FIXED
    242     bpl     5b
    243     add     TMP1, mem_operand, TMP1, asl #2
    244     mov     TMP2, VX, asr #16
    245     adds    VX, VX, UNIT_X
    246 5:  subpls  VX, VX, SRC_WIDTH_FIXED
    247     bpl     5b
    248     add     TMP2, mem_operand, TMP2, asl #2
    249     vld1.32 {d&reg1&[0]}, [TMP1, :32]
    250     vld1.32 {d&reg1&[1]}, [TMP2, :32]
    251 .else
    252     .error "unsupported"
    253 .endif
    254 .endm
    255 
    256 .macro pixld2_s elem_size, reg1, reg2, mem_operand
    257 .if 0 /* elem_size == 32 */
    258     mov     TMP1, VX, asr #16
    259     add     VX, VX, UNIT_X, asl #1
    260     add     TMP1, mem_operand, TMP1, asl #2
    261     mov     TMP2, VX, asr #16
    262     sub     VX, VX, UNIT_X
    263     add     TMP2, mem_operand, TMP2, asl #2
    264     vld1.32 {d&reg1&[0]}, [TMP1, :32]
    265     mov     TMP1, VX, asr #16
    266     add     VX, VX, UNIT_X, asl #1
    267     add     TMP1, mem_operand, TMP1, asl #2
    268     vld1.32 {d&reg2&[0]}, [TMP2, :32]
    269     mov     TMP2, VX, asr #16
    270     add     VX, VX, UNIT_X
    271     add     TMP2, mem_operand, TMP2, asl #2
    272     vld1.32 {d&reg1&[1]}, [TMP1, :32]
    273     vld1.32 {d&reg2&[1]}, [TMP2, :32]
    274 .else
    275     pixld1_s elem_size, reg1, mem_operand
    276     pixld1_s elem_size, reg2, mem_operand
    277 .endif
    278 .endm
    279 
    280 .macro pixld0_s elem_size, reg1, idx, mem_operand
    281 .if elem_size == 16
    282     mov     TMP1, VX, asr #16
    283     adds    VX, VX, UNIT_X
    284 5:  subpls  VX, VX, SRC_WIDTH_FIXED
    285     bpl     5b
    286     add     TMP1, mem_operand, TMP1, asl #1
    287     vld1.16 {d&reg1&[idx]}, [TMP1, :16]
    288 .elseif elem_size == 32
    289     mov     TMP1, VX, asr #16
    290     adds    VX, VX, UNIT_X
    291 5:  subpls  VX, VX, SRC_WIDTH_FIXED
    292     bpl     5b
    293     add     TMP1, mem_operand, TMP1, asl #2
    294     vld1.32 {d&reg1&[idx]}, [TMP1, :32]
    295 .endif
    296 .endm
    297 
    298 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
    299 .if numbytes == 32
    300     pixld2_s elem_size, %(basereg+4), %(basereg+5), mem_operand
    301     pixld2_s elem_size, %(basereg+6), %(basereg+7), mem_operand
    302     pixdeinterleave elem_size, %(basereg+4)
    303 .elseif numbytes == 16
    304     pixld2_s elem_size, %(basereg+2), %(basereg+3), mem_operand
    305 .elseif numbytes == 8
    306     pixld1_s elem_size, %(basereg+1), mem_operand
    307 .elseif numbytes == 4
    308     .if elem_size == 32
    309         pixld0_s elem_size, %(basereg+0), 1, mem_operand
    310     .elseif elem_size == 16
    311         pixld0_s elem_size, %(basereg+0), 2, mem_operand
    312         pixld0_s elem_size, %(basereg+0), 3, mem_operand
    313     .else
    314         pixld0_s elem_size, %(basereg+0), 4, mem_operand
    315         pixld0_s elem_size, %(basereg+0), 5, mem_operand
    316         pixld0_s elem_size, %(basereg+0), 6, mem_operand
    317         pixld0_s elem_size, %(basereg+0), 7, mem_operand
    318     .endif
    319 .elseif numbytes == 2
    320     .if elem_size == 16
    321         pixld0_s elem_size, %(basereg+0), 1, mem_operand
    322     .else
    323         pixld0_s elem_size, %(basereg+0), 2, mem_operand
    324         pixld0_s elem_size, %(basereg+0), 3, mem_operand
    325     .endif
    326 .elseif numbytes == 1
    327     pixld0_s elem_size, %(basereg+0), 1, mem_operand
    328 .else
    329     .error "unsupported size: numbytes"
    330 .endif
    331 .endm
    332 
    333 .macro pixld_s numpix, bpp, basereg, mem_operand
    334 .if bpp > 0
    335     pixld_s_internal %(numpix * bpp / 8), %(bpp), basereg, mem_operand
    336 .endif
    337 .endm
    338 
    339 .macro vuzp8 reg1, reg2
    340     vuzp.8 d&reg1, d&reg2
    341 .endm
    342 
    343 .macro vzip8 reg1, reg2
    344     vzip.8 d&reg1, d&reg2
    345 .endm
    346 
    347 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
    348 .macro pixdeinterleave bpp, basereg
    349 .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    350     vuzp8 %(basereg+0), %(basereg+1)
    351     vuzp8 %(basereg+2), %(basereg+3)
    352     vuzp8 %(basereg+1), %(basereg+3)
    353     vuzp8 %(basereg+0), %(basereg+2)
    354 .endif
    355 .endm
    356 
    357 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
    358 .macro pixinterleave bpp, basereg
    359 .if (bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    360     vzip8 %(basereg+0), %(basereg+2)
    361     vzip8 %(basereg+1), %(basereg+3)
    362     vzip8 %(basereg+2), %(basereg+3)
    363     vzip8 %(basereg+0), %(basereg+1)
    364 .endif
    365 .endm
    366 
    367 /*
    368  * This is a macro for implementing cache preload. The main idea is that
    369  * the cache preload logic is mostly independent of the rest of the pixel
    370  * processing code. It starts at the top-left pixel and moves forward
    371  * across pixels and can jump across scanlines. Prefetch distance is
    372  * handled in an 'incremental' way: it starts from 0 and advances to the
    373  * optimal distance over time. After reaching optimal prefetch distance,
    374  * it is kept constant. There are some checks which prevent prefetching
    375  * unneeded pixel lines below the image (but it can still prefetch a bit
    376  * more data on the right side of the image - not a big issue, and it may
    377  * actually be helpful when rendering text glyphs). An additional trick is
    378  * the use of an LDR instruction for prefetch instead of PLD when moving to
    379  * the next line: the point is that we have a high chance of getting a TLB
    380  * miss in this case, and PLD would be useless.
    381  *
    382  * This sounds like it may introduce a noticeable overhead (when working with
    383  * fully cached data). But in reality, because the NEON unit in the ARM
    384  * Cortex-A8 has its own pipeline and instruction queue, normal ARM code can
    385  * execute simultaneously with NEON code and be completely shadowed by it.
    386  * Thus we get no performance overhead at all (*). This looks like a very
    387  * nice feature of the Cortex-A8, if used wisely. We don't have a hardware
    388  * prefetcher, but we can still implement some rather advanced prefetch
    389  * logic in software for almost zero cost!
    390  *
    391  * (*) The overhead of the prefetcher is visible when running some trivial
    392  * pixel processing such as a simple copy. Still, having prefetch is a must
    393  * when working with graphics data.
    394  */
    395 .macro PF a, x:vararg
    396 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    397     a x
    398 .endif
    399 .endm
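
        /*
         * For example, 'PF add PF_X, PF_X, #pixblock_size' expands to the plain
         * 'add' instruction only in functions generated with
         * PREFETCH_TYPE_ADVANCED; with the SIMPLE or NONE prefetch types the
         * statement vanishes, so the advanced-prefetch bookkeeping used below
         * costs nothing in those variants.
         */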
    400 
    401 .macro cache_preload std_increment, boost_increment
    402 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
    403 .if regs_shortage
    404     PF ldr ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
    405 .endif
    406 .if std_increment != 0
    407     PF add PF_X, PF_X, #std_increment
    408 .endif
    409     PF tst PF_CTL, #0xF
    410     PF addne PF_X, PF_X, #boost_increment
    411     PF subne PF_CTL, PF_CTL, #1
    412     PF cmp PF_X, ORIG_W
    413 .if src_bpp_shift >= 0
    414     PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    415 .endif
    416 .if dst_r_bpp != 0
    417     PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    418 .endif
    419 .if mask_bpp_shift >= 0
    420     PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    421 .endif
    422     PF subge PF_X, PF_X, ORIG_W
    423     PF subges PF_CTL, PF_CTL, #0x10
    424 .if src_bpp_shift >= 0
    425     PF ldrgeb DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    426 .endif
    427 .if dst_r_bpp != 0
    428     PF ldrgeb DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    429 .endif
    430 .if mask_bpp_shift >= 0
    431     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    432 .endif
    433 .endif
    434 .endm
    435 
    436 .macro cache_preload_simple
    437 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
    438 .if src_bpp > 0
    439     pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
    440 .endif
    441 .if dst_r_bpp > 0
    442     pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
    443 .endif
    444 .if mask_bpp > 0
    445     pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
    446 .endif
    447 .endif
    448 .endm
    449 
    450 .macro fetch_mask_pixblock
    451     pixld       pixblock_size, mask_bpp, \
    452                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    453 .endm
    454 
    455 /*
    456  * Macro which is used to process leading pixels until the destination
    457  * pointer is properly aligned (at a 16-byte boundary). When the destination
    458  * buffer uses a 24bpp format, this is unnecessary, or even pointless.
    459  */
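        /*
         * A small worked example of the alignment loop below: with
         * dst_w_bpp == 16 and pixblock_size == 8, the '.irp lowbit' pass can
         * consume 1, 2 and then 4 leading pixels (testing address bits 2, 4 and
         * 8 of DST_R), i.e. up to 7 pixels, which is exactly the maximum needed
         * to bring the destination pointer to a 16-byte boundary before the
         * main pipelined loop starts.
         */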
    460 .macro ensure_destination_ptr_alignment process_pixblock_head, \
    461                                         process_pixblock_tail, \
    462                                         process_pixblock_tail_head
    463 .if dst_w_bpp != 24
    464     tst         DST_R, #0xF
    465     beq         2f
    466 
    467 .irp lowbit, 1, 2, 4, 8, 16
    468 local skip1
    469 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
    470 .if lowbit < 16 /* we don't need more than 16-byte alignment */
    471     tst         DST_R, #lowbit
    472     beq         1f
    473 .endif
    474     pixld_src   (lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    475     pixld       (lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
    476 .if dst_r_bpp > 0
    477     pixld_a     (lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
    478 .else
    479     add         DST_R, DST_R, #lowbit
    480 .endif
    481     PF add      PF_X, PF_X, #(lowbit * 8 / dst_w_bpp)
    482     sub         W, W, #(lowbit * 8 / dst_w_bpp)
    483 1:
    484 .endif
    485 .endr
    486     pixdeinterleave src_bpp, src_basereg
    487     pixdeinterleave mask_bpp, mask_basereg
    488     pixdeinterleave dst_r_bpp, dst_r_basereg
    489 
    490     process_pixblock_head
    491     cache_preload 0, pixblock_size
    492     cache_preload_simple
    493     process_pixblock_tail
    494 
    495     pixinterleave dst_w_bpp, dst_w_basereg
    496 .irp lowbit, 1, 2, 4, 8, 16
    497 .if (dst_w_bpp <= (lowbit * 8)) && ((lowbit * 8) < (pixblock_size * dst_w_bpp))
    498 .if lowbit < 16 /* we don't need more than 16-byte alignment */
    499     tst         DST_W, #lowbit
    500     beq         1f
    501 .endif
    502     pixst_a     (lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
    503 1:
    504 .endif
    505 .endr
    506 .endif
    507 2:
    508 .endm
    509 
    510 /*
    511  * Special code for processing up to (pixblock_size - 1) remaining
    512  * trailing pixels. As SIMD processing operates on
    513  * pixblock_size pixels, anything smaller than this has to be loaded
    514  * and stored in a special way. Loading and storing of pixel data is
    515  * performed in such a way that we fill some 'slots' in the NEON
    516  * registers (some slots are naturally left unused), then perform the
    517  * compositing operation as usual. In the end, the data is taken from
    518  * these 'slots' and saved to memory.
    519  *
    520  * cache_preload_flag - prefetch is suppressed when
    521  *                      set to 0
    522  * dst_aligned_flag   - selects whether destination buffer
    523  *                      is aligned
    524  */
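        /*
         * A worked example: with pixblock_size == 8 and 5 trailing pixels left
         * (W & 7 == 5), the '.irp chunk_size' passes below handle them as one
         * chunk of 4 pixels plus one chunk of 1 pixel; chunk sizes 16 and 8 are
         * excluded by the 'pixblock_size > chunk_size' test, and the chunk of 2
         * is skipped because that bit of W is clear.
         */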
    525 .macro process_trailing_pixels cache_preload_flag, \
    526                                dst_aligned_flag, \
    527                                process_pixblock_head, \
    528                                process_pixblock_tail, \
    529                                process_pixblock_tail_head
    530     tst         W, #(pixblock_size - 1)
    531     beq         2f
    532 .irp chunk_size, 16, 8, 4, 2, 1
    533 .if pixblock_size > chunk_size
    534     tst         W, #chunk_size
    535     beq         1f
    536     pixld_src   chunk_size, src_bpp, src_basereg, SRC
    537     pixld       chunk_size, mask_bpp, mask_basereg, MASK
    538 .if dst_aligned_flag != 0
    539     pixld_a     chunk_size, dst_r_bpp, dst_r_basereg, DST_R
    540 .else
    541     pixld       chunk_size, dst_r_bpp, dst_r_basereg, DST_R
    542 .endif
    543 .if cache_preload_flag != 0
    544     PF add      PF_X, PF_X, #chunk_size
    545 .endif
    546 1:
    547 .endif
    548 .endr
    549     pixdeinterleave src_bpp, src_basereg
    550     pixdeinterleave mask_bpp, mask_basereg
    551     pixdeinterleave dst_r_bpp, dst_r_basereg
    552 
    553     process_pixblock_head
    554 .if cache_preload_flag != 0
    555     cache_preload 0, pixblock_size
    556     cache_preload_simple
    557 .endif
    558     process_pixblock_tail
    559     pixinterleave dst_w_bpp, dst_w_basereg
    560 .irp chunk_size, 16, 8, 4, 2, 1
    561 .if pixblock_size > chunk_size
    562     tst         W, #chunk_size
    563     beq         1f
    564 .if dst_aligned_flag != 0
    565     pixst_a     chunk_size, dst_w_bpp, dst_w_basereg, DST_W
    566 .else
    567     pixst       chunk_size, dst_w_bpp, dst_w_basereg, DST_W
    568 .endif
    569 1:
    570 .endif
    571 .endr
    572 2:
    573 .endm
    574 
    575 /*
    576  * Macro which performs all the operations needed to switch to the next
    577  * scanline and start the next loop iteration, unless all the scanlines
    578  * have already been processed.
    579  */
    580 .macro advance_to_next_scanline start_of_loop_label
    581 .if regs_shortage
    582     ldrd        W, [sp] /* load W and H (width and height) from stack */
    583 .else
    584     mov         W, ORIG_W
    585 .endif
    586     add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
    587 .if src_bpp != 0
    588     add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
    589 .endif
    590 .if mask_bpp != 0
    591     add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
    592 .endif
    593 .if (dst_w_bpp != 24)
    594     sub         DST_W, DST_W, W, lsl #dst_bpp_shift
    595 .endif
    596 .if (src_bpp != 24) && (src_bpp != 0)
    597     sub         SRC, SRC, W, lsl #src_bpp_shift
    598 .endif
    599 .if (mask_bpp != 24) && (mask_bpp != 0)
    600     sub         MASK, MASK, W, lsl #mask_bpp_shift
    601 .endif
    602     subs        H, H, #1
    603     mov         DST_R, DST_W
    604 .if regs_shortage
    605     str         H, [sp, #4] /* save updated height to stack */
    606 .endif
    607     bge         start_of_loop_label
    608 .endm
    609 
    610 /*
    611  * Registers are allocated in the following way by default:
    612  * d0, d1, d2, d3     - reserved for loading source pixel data
    613  * d4, d5, d6, d7     - reserved for loading destination pixel data
    614  * d24, d25, d26, d27 - reserved for loading mask pixel data
    615  * d28, d29, d30, d31 - final destination pixel data for writeback to memory
    616  */
    617 .macro generate_composite_function fname, \
    618                                    src_bpp_, \
    619                                    mask_bpp_, \
    620                                    dst_w_bpp_, \
    621                                    flags, \
    622                                    pixblock_size_, \
    623                                    prefetch_distance, \
    624                                    init, \
    625                                    cleanup, \
    626                                    process_pixblock_head, \
    627                                    process_pixblock_tail, \
    628                                    process_pixblock_tail_head, \
    629                                    dst_w_basereg_ = 28, \
    630                                    dst_r_basereg_ = 4, \
    631                                    src_basereg_   = 0, \
    632                                    mask_basereg_  = 24
    633 
    634     .func fname
    635     .global fname
    636     /* For ELF format also set function visibility to hidden */
    637 #ifdef __ELF__
    638     .hidden fname
    639     .type fname, %function
    640 #endif
    641 fname:
    642     push        {r4-r12, lr}        /* save all registers */
    643 
    644 /*
    645  * Select the prefetch type for this function. If the prefetch distance is
    646  * set to 0, prefetching is disabled entirely; if one of the color formats
    647  * is 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
    648  */
    649     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
    650 .if prefetch_distance == 0
    651     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
    652 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
    653         ((src_bpp_ == 24) || (mask_bpp_ == 24) || (dst_w_bpp_ == 24))
    654     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
    655 .endif
    656 
    657 /*
    658  * Make some macro arguments globally visible and accessible
    659  * from other macros
    660  */
    661     .set src_bpp, src_bpp_
    662     .set mask_bpp, mask_bpp_
    663     .set dst_w_bpp, dst_w_bpp_
    664     .set pixblock_size, pixblock_size_
    665     .set dst_w_basereg, dst_w_basereg_
    666     .set dst_r_basereg, dst_r_basereg_
    667     .set src_basereg, src_basereg_
    668     .set mask_basereg, mask_basereg_
    669 
    670     .macro pixld_src x:vararg
    671         pixld x
    672     .endm
    673     .macro fetch_src_pixblock
    674         pixld_src   pixblock_size, src_bpp, \
    675                     (src_basereg - pixblock_size * src_bpp / 64), SRC
    676     .endm
    677 /*
    678  * Assign symbolic names to registers
    679  */
    680     W           .req        r0      /* width (is updated during processing) */
    681     H           .req        r1      /* height (is updated during processing) */
    682     DST_W       .req        r2      /* destination buffer pointer for writes */
    683     DST_STRIDE  .req        r3      /* destination image stride */
    684     SRC         .req        r4      /* source buffer pointer */
    685     SRC_STRIDE  .req        r5      /* source image stride */
    686     DST_R       .req        r6      /* destination buffer pointer for reads */
    687 
    688     MASK        .req        r7      /* mask pointer */
    689     MASK_STRIDE .req        r8      /* mask stride */
    690 
    691     PF_CTL      .req        r9      /* combined lines counter and prefetch */
    692                                     /* distance increment counter */
    693     PF_X        .req        r10     /* pixel index in a scanline for current */
    694                                     /* prefetch position */
    695     PF_SRC      .req        r11     /* pointer to source scanline start */
    696                                     /* for prefetch purposes */
    697     PF_DST      .req        r12     /* pointer to destination scanline start */
    698                                     /* for prefetch purposes */
    699     PF_MASK     .req        r14     /* pointer to mask scanline start */
    700                                     /* for prefetch purposes */
    701 /*
    702  * Check whether we have enough registers for all the local variables.
    703  * If we don't have enough registers, the original width and height are
    704  * kept on top of the stack (and the 'regs_shortage' variable is set to
    705  * indicate this for the rest of the code). Even if there are enough
    706  * registers, the allocation scheme may be a bit different depending on
    707  * whether the source or the mask is unused.
    708  */
    709 .if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    710     ORIG_W      .req        r10     /* saved original width */
    711     DUMMY       .req        r12     /* temporary register */
    712     .set        regs_shortage, 0
    713 .elseif mask_bpp == 0
    714     ORIG_W      .req        r7      /* saved original width */
    715     DUMMY       .req        r8      /* temporary register */
    716     .set        regs_shortage, 0
    717 .elseif src_bpp == 0
    718     ORIG_W      .req        r4      /* saved original width */
    719     DUMMY       .req        r5      /* temporary register */
    720     .set        regs_shortage, 0
    721 .else
    722     ORIG_W      .req        r1      /* saved original width */
    723     DUMMY       .req        r1      /* temporary register */
    724     .set        regs_shortage, 1
    725 .endif
    726 
    727     .set mask_bpp_shift, -1
    728 .if src_bpp == 32
    729     .set src_bpp_shift, 2
    730 .elseif src_bpp == 24
    731     .set src_bpp_shift, 0
    732 .elseif src_bpp == 16
    733     .set src_bpp_shift, 1
    734 .elseif src_bpp == 8
    735     .set src_bpp_shift, 0
    736 .elseif src_bpp == 0
    737     .set src_bpp_shift, -1
    738 .else
    739     .error "requested src bpp (src_bpp) is not supported"
    740 .endif
    741 .if mask_bpp == 32
    742     .set mask_bpp_shift, 2
    743 .elseif mask_bpp == 24
    744     .set mask_bpp_shift, 0
    745 .elseif mask_bpp == 8
    746     .set mask_bpp_shift, 0
    747 .elseif mask_bpp == 0
    748     .set mask_bpp_shift, -1
    749 .else
    750     .error "requested mask bpp (mask_bpp) is not supported"
    751 .endif
    752 .if dst_w_bpp == 32
    753     .set dst_bpp_shift, 2
    754 .elseif dst_w_bpp == 24
    755     .set dst_bpp_shift, 0
    756 .elseif dst_w_bpp == 16
    757     .set dst_bpp_shift, 1
    758 .elseif dst_w_bpp == 8
    759     .set dst_bpp_shift, 0
    760 .else
    761     .error "requested dst bpp (dst_w_bpp) is not supported"
    762 .endif
    763 
    764 .if (((flags) & FLAG_DST_READWRITE) != 0)
    765     .set dst_r_bpp, dst_w_bpp
    766 .else
    767     .set dst_r_bpp, 0
    768 .endif
    769 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    770     .set DEINTERLEAVE_32BPP_ENABLED, 1
    771 .else
    772     .set DEINTERLEAVE_32BPP_ENABLED, 0
    773 .endif
    774 
    775 .if prefetch_distance < 0 || prefetch_distance > 15
    776     .error "invalid prefetch distance (prefetch_distance)"
    777 .endif
    778 
    779 .if src_bpp > 0
    780     ldr         SRC, [sp, #40]
    781 .endif
    782 .if mask_bpp > 0
    783     ldr         MASK, [sp, #48]
    784 .endif
    785     PF mov      PF_X, #0
    786 .if src_bpp > 0
    787     ldr         SRC_STRIDE, [sp, #44]
    788 .endif
    789 .if mask_bpp > 0
    790     ldr         MASK_STRIDE, [sp, #52]
    791 .endif
    792     mov         DST_R, DST_W
    793 
    794 .if src_bpp == 24
    795     sub         SRC_STRIDE, SRC_STRIDE, W
    796     sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
    797 .endif
    798 .if mask_bpp == 24
    799     sub         MASK_STRIDE, MASK_STRIDE, W
    800     sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
    801 .endif
    802 .if dst_w_bpp == 24
    803     sub         DST_STRIDE, DST_STRIDE, W
    804     sub         DST_STRIDE, DST_STRIDE, W, lsl #1
    805 .endif
    806 
    807 /*
    808  * Setup advanced prefetcher initial state
    809  */
    810     PF mov      PF_SRC, SRC
    811     PF mov      PF_DST, DST_R
    812     PF mov      PF_MASK, MASK
    813     /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    814     PF mov      PF_CTL, H, lsl #4
    815     PF add      PF_CTL, #(prefetch_distance - 0x10)
    816 
    817     init
    818 .if regs_shortage
    819     push        {r0, r1}
    820 .endif
    821     subs        H, H, #1
    822 .if regs_shortage
    823     str         H, [sp, #4] /* save updated height to stack */
    824 .else
    825     mov         ORIG_W, W
    826 .endif
    827     blt         9f
    828     cmp         W, #(pixblock_size * 2)
    829     blt         8f
    830 /*
    831  * This is the start of the pipelined loop, which is optimized for
    832  * long scanlines
    833  */
    834 0:
    835     ensure_destination_ptr_alignment process_pixblock_head, \
    836                                      process_pixblock_tail, \
    837                                      process_pixblock_tail_head
    838 
    839     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    840     pixld_a     pixblock_size, dst_r_bpp, \
    841                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    842     fetch_src_pixblock
    843     pixld       pixblock_size, mask_bpp, \
    844                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    845     PF add      PF_X, PF_X, #pixblock_size
    846     process_pixblock_head
    847     cache_preload 0, pixblock_size
    848     cache_preload_simple
    849     subs        W, W, #(pixblock_size * 2)
    850     blt         2f
    851 1:
    852     process_pixblock_tail_head
    853     cache_preload_simple
    854     subs        W, W, #pixblock_size
    855     bge         1b
    856 2:
    857     process_pixblock_tail
    858     pixst_a     pixblock_size, dst_w_bpp, \
    859                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
    860 
    861     /* Process the remaining trailing pixels in the scanline */
    862     process_trailing_pixels 1, 1, \
    863                             process_pixblock_head, \
    864                             process_pixblock_tail, \
    865                             process_pixblock_tail_head
    866     advance_to_next_scanline 0b
    867 
    868 .if regs_shortage
    869     pop         {r0, r1}
    870 .endif
    871     cleanup
    872     pop         {r4-r12, pc}  /* exit */
    873 /*
    874  * This is the start of the loop designed to process images with a small width
    875  * (less than pixblock_size * 2 pixels). In this case neither pipelining
    876  * nor prefetch is used.
    877  */
    878 8:
    879     /* Process exactly pixblock_size pixels if needed */
    880     tst         W, #pixblock_size
    881     beq         1f
    882     pixld       pixblock_size, dst_r_bpp, \
    883                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    884     fetch_src_pixblock
    885     pixld       pixblock_size, mask_bpp, \
    886                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    887     process_pixblock_head
    888     process_pixblock_tail
    889     pixst       pixblock_size, dst_w_bpp, \
    890                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
    891 1:
    892     /* Process the remaining trailing pixels in the scanline */
    893     process_trailing_pixels 0, 0, \
    894                             process_pixblock_head, \
    895                             process_pixblock_tail, \
    896                             process_pixblock_tail_head
    897     advance_to_next_scanline 8b
    898 9:
    899 .if regs_shortage
    900     pop         {r0, r1}
    901 .endif
    902     cleanup
    903     pop         {r4-r12, pc}  /* exit */
    904 
    905     .purgem     fetch_src_pixblock
    906     .purgem     pixld_src
    907 
    908     .unreq      SRC
    909     .unreq      MASK
    910     .unreq      DST_R
    911     .unreq      DST_W
    912     .unreq      ORIG_W
    913     .unreq      W
    914     .unreq      H
    915     .unreq      SRC_STRIDE
    916     .unreq      DST_STRIDE
    917     .unreq      MASK_STRIDE
    918     .unreq      PF_CTL
    919     .unreq      PF_X
    920     .unreq      PF_SRC
    921     .unreq      PF_DST
    922     .unreq      PF_MASK
    923     .unreq      DUMMY
    924     .endfunc
    925 .endm
    926 
    927 /*
    928  * A simplified variant of the function generation template for single
    929  * scanline processing (for implementing pixman combine functions)
    930  */
    931 .macro generate_composite_function_scanline        use_nearest_scaling, \
    932                                                    fname, \
    933                                                    src_bpp_, \
    934                                                    mask_bpp_, \
    935                                                    dst_w_bpp_, \
    936                                                    flags, \
    937                                                    pixblock_size_, \
    938                                                    init, \
    939                                                    cleanup, \
    940                                                    process_pixblock_head, \
    941                                                    process_pixblock_tail, \
    942                                                    process_pixblock_tail_head, \
    943                                                    dst_w_basereg_ = 28, \
    944                                                    dst_r_basereg_ = 4, \
    945                                                    src_basereg_   = 0, \
    946                                                    mask_basereg_  = 24
    947 
    948     .func fname
    949     .global fname
    950     /* For ELF format also set function visibility to hidden */
    951 #ifdef __ELF__
    952     .hidden fname
    953     .type fname, %function
    954 #endif
    955 fname:
    956     .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
    957 /*
    958  * Make some macro arguments globally visible and accessible
    959  * from other macros
    960  */
    961     .set src_bpp, src_bpp_
    962     .set mask_bpp, mask_bpp_
    963     .set dst_w_bpp, dst_w_bpp_
    964     .set pixblock_size, pixblock_size_
    965     .set dst_w_basereg, dst_w_basereg_
    966     .set dst_r_basereg, dst_r_basereg_
    967     .set src_basereg, src_basereg_
    968     .set mask_basereg, mask_basereg_
    969 
    970 .if use_nearest_scaling != 0
    971     /*
    972      * Assign symbolic names to registers for nearest scaling
    973      */
    974     W           .req        r0
    975     DST_W       .req        r1
    976     SRC         .req        r2
    977     VX          .req        r3
    978     UNIT_X      .req        ip
    979     MASK        .req        lr
    980     TMP1        .req        r4
    981     TMP2        .req        r5
    982     DST_R       .req        r6
    983     SRC_WIDTH_FIXED .req        r7
    984 
    985     .macro pixld_src x:vararg
    986         pixld_s x
    987     .endm
    988 
    989     ldr         UNIT_X, [sp]
    990     push        {r4-r8, lr}
    991     ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
    992     .if mask_bpp != 0
    993     ldr         MASK, [sp, #(24 + 8)]
    994     .endif
    995 .else
    996     /*
    997      * Assign symbolic names to registers
    998      */
    999     W           .req        r0      /* width (is updated during processing) */
   1000     DST_W       .req        r1      /* destination buffer pointer for writes */
   1001     SRC         .req        r2      /* source buffer pointer */
   1002     DST_R       .req        ip      /* destination buffer pointer for reads */
   1003     MASK        .req        r3      /* mask pointer */
   1004 
   1005     .macro pixld_src x:vararg
   1006         pixld x
   1007     .endm
   1008 .endif
   1009 
   1010 .if (((flags) & FLAG_DST_READWRITE) != 0)
   1011     .set dst_r_bpp, dst_w_bpp
   1012 .else
   1013     .set dst_r_bpp, 0
   1014 .endif
   1015 .if (((flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
   1016     .set DEINTERLEAVE_32BPP_ENABLED, 1
   1017 .else
   1018     .set DEINTERLEAVE_32BPP_ENABLED, 0
   1019 .endif
   1020 
   1021     .macro fetch_src_pixblock
   1022         pixld_src   pixblock_size, src_bpp, \
   1023                     (src_basereg - pixblock_size * src_bpp / 64), SRC
   1024     .endm
   1025 
   1026     init
   1027     mov         DST_R, DST_W
   1028 
   1029     cmp         W, #pixblock_size
   1030     blt         8f
   1031 
   1032     ensure_destination_ptr_alignment process_pixblock_head, \
   1033                                      process_pixblock_tail, \
   1034                                      process_pixblock_tail_head
   1035 
   1036     subs        W, W, #pixblock_size
   1037     blt         7f
   1038 
   1039     /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
   1040     pixld_a     pixblock_size, dst_r_bpp, \
   1041                 (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
   1042     fetch_src_pixblock
   1043     pixld       pixblock_size, mask_bpp, \
   1044                 (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   1045     process_pixblock_head
   1046     subs        W, W, #pixblock_size
   1047     blt         2f
   1048 1:
   1049     process_pixblock_tail_head
   1050     subs        W, W, #pixblock_size
   1051     bge         1b
   1052 2:
   1053     process_pixblock_tail
   1054     pixst_a     pixblock_size, dst_w_bpp, \
   1055                 (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
   1056 7:
   1057     /* Process the remaining trailing pixels in the scanline (dst aligned) */
   1058     process_trailing_pixels 0, 1, \
   1059                             process_pixblock_head, \
   1060                             process_pixblock_tail, \
   1061                             process_pixblock_tail_head
   1062 
   1063     cleanup
   1064 .if use_nearest_scaling != 0
   1065     pop         {r4-r8, pc}  /* exit */
   1066 .else
   1067     bx          lr  /* exit */
   1068 .endif
   1069 8:
   1070     /* Process the remaining trailing pixels in the scanline (dst unaligned) */
   1071     process_trailing_pixels 0, 0, \
   1072                             process_pixblock_head, \
   1073                             process_pixblock_tail, \
   1074                             process_pixblock_tail_head
   1075 
   1076     cleanup
   1077 
   1078 .if use_nearest_scaling != 0
   1079     pop         {r4-r8, pc}  /* exit */
   1080 
   1081     .unreq      DST_R
   1082     .unreq      SRC
   1083     .unreq      W
   1084     .unreq      VX
   1085     .unreq      UNIT_X
   1086     .unreq      TMP1
   1087     .unreq      TMP2
   1088     .unreq      DST_W
   1089     .unreq      MASK
   1090     .unreq      SRC_WIDTH_FIXED
   1091 
   1092 .else
   1093     bx          lr  /* exit */
   1094 
   1095     .unreq      SRC
   1096     .unreq      MASK
   1097     .unreq      DST_R
   1098     .unreq      DST_W
   1099     .unreq      W
   1100 .endif
   1101 
   1102     .purgem     fetch_src_pixblock
   1103     .purgem     pixld_src
   1104 
   1105     .endfunc
   1106 .endm
   1107 
   1108 .macro generate_composite_function_single_scanline x:vararg
   1109     generate_composite_function_scanline 0, x
   1110 .endm
   1111 
   1112 .macro generate_composite_function_nearest_scanline x:vararg
   1113     generate_composite_function_scanline 1, x
   1114 .endm
   1115 
   1116 /* Default prologue/epilogue, nothing special needs to be done */
   1117 
   1118 .macro default_init
   1119 .endm
   1120 
   1121 .macro default_cleanup
   1122 .endm
   1123 
   1124 /*
   1125  * Prologue/epilogue variant which additionally saves/restores d8-d15
   1126  * registers (they need to be saved/restored by the callee according to the ABI).
   1127  * This is required if the code needs to use all the NEON registers.
   1128  */
   1129 
   1130 .macro default_init_need_all_regs
   1131     vpush       {d8-d15}
   1132 .endm
   1133 
   1134 .macro default_cleanup_need_all_regs
   1135     vpop        {d8-d15}
   1136 .endm
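
        /*
         * These two are presumably intended to be passed, as a pair, for the
         * 'init'/'cleanup' arguments of the function generation macros whenever
         * the pixel processing code also touches d8-d15 (q4-q7), for example:
         *
         *     generate_composite_function ..., \
         *         default_init_need_all_regs, \
         *         default_cleanup_need_all_regs, \
         *         ...
         */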
   1137 
   1138 /******************************************************************************/
   1139 
   1140 /*
   1141  * Conversion of 8 r5g6b5 pixels packed in a 128-bit register (in)
   1142  * into a planar a8r8g8b8 format (with a, r, g, b color components
   1143  * stored in 64-bit registers out_a, out_r, out_g, out_b respectively).
   1144  *
   1145  * Warning: the conversion is destructive and the original
   1146  *          value (in) is lost.
   1147  */
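        /*
         * A rough sketch of the bit manipulation below, for one 16-bit pixel
         * laid out as rrrrrggggggbbbbb: the vshrn.u16 narrowing shifts by #8
         * and #3 leave R and G in the top bits of their destination bytes, the
         * in-place vsli.u16 by #5 repositions the pixel so that a third
         * vshrn.u16 (by #2) can do the same for B, and the vsri.u8 instructions
         * replicate the high bits of R and G into their low bits (B gets the
         * same treatment for free from the vsli/vshrn combination), so each
         * 5/6-bit channel becomes a full 8-bit one. Alpha is simply set to 255.
         */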
   1148 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
   1149     vshrn.u16   out_r, in,    #8
   1150     vshrn.u16   out_g, in,    #3
   1151     vsli.u16    in,    in,    #5
   1152     vmov.u8     out_a, #255
   1153     vsri.u8     out_r, out_r, #5
   1154     vsri.u8     out_g, out_g, #6
   1155     vshrn.u16   out_b, in,    #2
   1156 .endm
   1157 
   1158 .macro convert_0565_to_x888 in, out_r, out_g, out_b
   1159     vshrn.u16   out_r, in,    #8
   1160     vshrn.u16   out_g, in,    #3
   1161     vsli.u16    in,    in,    #5
   1162     vsri.u8     out_r, out_r, #5
   1163     vsri.u8     out_g, out_g, #6
   1164     vshrn.u16   out_b, in,    #2
   1165 .endm
   1166 
   1167 /*
   1168  * Conversion from planar a8r8g8b8 format (with r, g, b color components
   1169  * in 64-bit registers in_r, in_g, in_b respectively; alpha is not used)
   1170  * into 8 r5g6b5 pixels packed in a 128-bit register (out). Requires two
   1171  * temporary 128-bit registers (tmp1, tmp2).
   1172  */
   1173 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
   1174     vshll.u8    tmp1, in_g, #8
   1175     vshll.u8    out, in_r, #8
   1176     vshll.u8    tmp2, in_b, #8
   1177     vsri.u16    out, tmp1, #5
   1178     vsri.u16    out, tmp2, #11
   1179 .endm
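
        /*
         * The macro above packs the pixel with three widening shifts and two
         * shift-right-inserts: each vshll.u8 ... #8 places an 8-bit channel in
         * the top byte of a 16-bit lane, then 'vsri.u16 out, tmp1, #5' slots
         * green in just below the five red bits and 'vsri.u16 out, tmp2, #11'
         * slots blue in below that, producing rrrrrggggggbbbbb; the low bits of
         * each 8-bit channel are simply truncated.
         */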
   1180 
   1181 /*
   1182  * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
   1183  * returned in the (out0, out1) register pair. Requires one temporary
   1184  * 64-bit register (tmp). 'out1' and 'in' may overlap; the original
   1185  * value of 'in' is lost.
   1186  */
   1187 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
   1188     vshl.u16    out0, in,   #5  /* G top 6 bits */
   1189     vshl.u16    tmp,  in,   #11 /* B top 5 bits */
   1190     vsri.u16    in,   in,   #5  /* R is ready in top bits */
   1191     vsri.u16    out0, out0, #6  /* G is ready in top bits */
   1192     vsri.u16    tmp,  tmp,  #5  /* B is ready in top bits */
   1193     vshr.u16    out1, in,   #8  /* R is in place */
   1194     vsri.u16    out0, tmp,  #8  /* G & B is in place */
   1195     vzip.u16    out0, out1      /* everything is in place */
   1196 .endm
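
        /*
         * A brief walk-through of the macro above: out0 ends up with the
         * widened G byte in the high half and the widened B byte in the low
         * half of every 16-bit lane, out1 holds the widened R byte with zeroes
         * above it, and the final vzip.u16 interleaves the two so that each
         * 32-bit result reads 0x00RRGGBB, i.e. x8r8g8b8. As in the other
         * converters, the low bits of each channel are filled by replicating
         * its top bits.
         */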
   1197