/*
 * Copyright © 2012 Raspberry Pi Foundation
 * Copyright © 2012 RISC OS Open Ltd
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of the copyright holders not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  The copyright holders make no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Ben Avison (bavison@riscosopen.org)
 *
 */

/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row, can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *    processed, another cacheline is preloaded (the exact distance ahead is
 *    determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *    cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *    are preloaded to deal with data (if any) that hangs off the end of the
 *    last iteration of the inner loop, plus any trailing bytes that were not
 *    enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
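
/*
 * As a worked example of the case selection (see the comparisons in
 * generate_composite_function below): with an 8bpp destination, the
 * "medium" path requires at least 2*16*8/8 - 1 = 31 pixels, the smallest
 * row that is guaranteed to contain one aligned 16-byte write whatever its
 * initial alignment; the "wide" path additionally requires at least
 * (prefetch_distance+3)*pix_per_block - 1 pixels. Anything shorter than
 * the medium threshold takes the "narrow" path.
 */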

/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS

/*
 * Bit flags for the 'generate_composite_function' macro, which are used
 * to tune the behavior of the generated functions.
 */
.set FLAG_DST_WRITEONLY,         0
.set FLAG_DST_READWRITE,         1
.set FLAG_COND_EXEC,             0
.set FLAG_BRANCH_OVER,           2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR,  4
.set FLAG_PROCESS_DOESNT_STORE,  0
.set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS,        0
.set FLAG_SPILL_LINE_VARS_WIDE,      16
.set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
.set FLAG_SPILL_LINE_VARS,           48
.set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64

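/*
 * The flags are OR-ed together in a function's "flags" argument; for
 * example (an illustrative combination, not taken from a real fast path):
 *     FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_SPILL_LINE_VARS_WIDE
 * would describe a function that reads back the destination, branches
 * around the leading/trailing chunks instead of conditionally executing
 * them, and spills line variables to the stack only on the wide path.
 */
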
/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET,        (9*4+9*4)
#else
.set ARGS_STACK_OFFSET,        (9*4)
#endif

/*
 * Constants for selecting preferable prefetch type.
 */
.set PREFETCH_TYPE_NONE,       0
.set PREFETCH_TYPE_STANDARD,   1

/*
 * Definitions of macros for load/store of pixel data.
 */

.macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
 .if numbytes == 16
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
        op&r&cond    WK&reg2, [base], #4
        op&r&cond    WK&reg3, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
  .endif
 .elseif numbytes == 8
  .if unaligned == 1
        op&r&cond    WK&reg0, [base], #4
        op&r&cond    WK&reg1, [base], #4
  .else
        op&m&cond&ia base!, {WK&reg0,WK&reg1}
  .endif
 .elseif numbytes == 4
        op&r&cond    WK&reg0, [base], #4
 .elseif numbytes == 2
        op&r&cond&h  WK&reg0, [base], #2
 .elseif numbytes == 1
        op&r&cond&b  WK&reg0, [base], #1
 .else
  .error "unsupported size: numbytes"
 .endif
.endm
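
/*
 * For example (a sketch of the expansion, with WK0-WK3 being the r8-r11
 * aliases assigned in generate_composite_function below):
 *     pixldst ld, , 16, 0, 1, 2, 3, SRC
 * expands to "ldmia SRC!, {r8,r9,r10,r11}", whereas the unaligned==1 form
 * emits four post-indexed "ldr WKn, [SRC], #4" loads instead, because ldm
 * requires a word-aligned address.
 */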

.macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
 .if numbytes == 16
        stm&cond&db base, {WK&reg0,WK&reg1,WK&reg2,WK&reg3}
 .elseif numbytes == 8
        stm&cond&db base, {WK&reg0,WK&reg1}
 .elseif numbytes == 4
        str&cond    WK&reg0, [base, #-4]
 .elseif numbytes == 2
        str&cond&h  WK&reg0, [base, #-2]
 .elseif numbytes == 1
        str&cond&b  WK&reg0, [base, #-1]
 .else
  .error "unsupported size: numbytes"
 .endif
.endm

.macro pixld cond, numbytes, firstreg, base, unaligned
        pixldst ld, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base, unaligned
.endm

.macro pixst cond, numbytes, firstreg, base
 .if (flags) & FLAG_DST_READWRITE
        pixst_baseupdated cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .else
        pixldst st, cond, numbytes, %(firstreg+0), %(firstreg+1), %(firstreg+2), %(firstreg+3), base
 .endif
.endm

.macro PF a, x:vararg
 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
        a x
 .endif
.endm
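
/*
 * PF wraps each prefetch-related instruction so that, in functions
 * generated with prefetch_distance == 0 (PREFETCH_TYPE_NONE), all the
 * prefetch code assembles to nothing; for example "PF pld, [SRC]" only
 * emits "pld [SRC]" when PREFETCH_TYPE_CURRENT is PREFETCH_TYPE_STANDARD.
 */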


.macro preload_leading_step1  bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts.
 */
 .if bpp > 0
        PF  bic,    ptr, base, #31
  .set OFFSET, 0
  .rept prefetch_distance+1
        PF  pld,    [ptr, #OFFSET]
   .set OFFSET, OFFSET+32
  .endr
 .endif
.endm
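
/*
 * For example, with prefetch_distance == 3 the loop above emits pld
 * instructions at offsets 0, 32, 64 and 96 bytes from the cacheline-
 * aligned base, covering four cachelines at the start of the row.
 */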

.macro preload_leading_step2  bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte).
 */
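/* As a hypothetical worked instance of those formulas (32bpp source and
 * destination): if dst is 4 bytes into a 16-byte block, leading_bytes =
 * (-dst)&15 = 12; if src is 24 bytes into its cacheline, then
 * inner_loop_offset = (24+12)&31 = 4 and extra_needed = 12-4 = 8, which
 * falls in the "<= 32" band, so one extra cacheline is preloaded.
 */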
 .if bpp > 0
  .ifc base,DST
        /* The test can be simplified further when preloading the destination */
        PF  tst,    base, #16
        PF  beq,    61f
  .else
   .if bpp/dst_w_bpp == 4
        PF  add,    SCRATCH, base, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  and,    SCRATCH, SCRATCH, #31
        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #bpp_shift-dst_bpp_shift
        PF  sub,    SCRATCH, SCRATCH, #1    /* so now ranges are -16..-1 / 0..31 / 32..63 */
        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
        PF  bcs,    61f
        PF  bpl,    60f
        PF  pld,    [ptr, #32*(prefetch_distance+2)]
   .else
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+bpp_shift-dst_bpp_shift
        PF  bls,    61f
   .endif
  .endif
60:     PF  pld,    [ptr, #32*(prefetch_distance+1)]
61:
 .endif
.endm

#define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
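/* IS_END_OF_GROUP(INDEX,SIZE) tests whether the bit of value SIZE/2 falls
 * within the trailing run of 1 bits in INDEX, i.e. it is true once per
 * group of SIZE subblocks, at the last one (for SIZE == 4, at INDEX == 3,
 * 7, 11, ...), and is always true when SIZE < 2.
 */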
.macro preload_middle   bpp, base, scratch_holds_offset
 .if bpp > 0
        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/bpp)
   .if scratch_holds_offset
        PF  pld,    [base, SCRATCH]
   .else
        PF  bic,    SCRATCH, base, #31
        PF  pld,    [SCRATCH, #32*prefetch_distance]
   .endif
  .endif
 .endif
.endm

.macro preload_trailing  bpp, bpp_shift, base
 .if bpp > 0
  .if bpp*pix_per_block > 256
        /* Calculations are more complex if more than one fetch per block */
        PF  and,    WK1, base, #31
        PF  add,    WK1, WK1, WK0, lsl #bpp_shift
        PF  add,    WK1, WK1, #32*(bpp*pix_per_block/256-1)*(prefetch_distance+1)
        PF  bic,    SCRATCH, base, #31
80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
        PF  add,    SCRATCH, SCRATCH, #32
        PF  subs,   WK1, WK1, #32
        PF  bhi,    80b
  .else
        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
        PF  mov,    SCRATCH, base, lsl #32-5
        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+bpp_shift
        PF  adceqs, SCRATCH, SCRATCH, #0
        /* The instruction above has two effects: it ensures Z is only
         * set if C was clear (so Z indicates that both shifted quantities
         * were 0), and it clears C if Z was set (so C indicates that the
         * sum of the shifted quantities was strictly greater than 32) */
        PF  beq,    82f
        PF  bic,    SCRATCH, base, #31
        PF  bcc,    81f
        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
82:
  .endif
 .endif
.endm


.macro preload_line    narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 *    code path rather than the "medium" one - because in the narrow case,
 *    the row of pixels is known to output no more than 30 bytes, so
 *    (assuming the source pixels are no wider than the destination
 *    pixels) they cannot possibly straddle more than two 32-byte
 *    cachelines, meaning there's no need for a loop.
 * "bpp" - number of bits per pixel in the channel (source, mask or
 *    destination) that's being preloaded, or 0 if this channel is not used
 *    for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0, of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 */
 .if bpp > 0
  .if narrow_case && (bpp <= dst_w_bpp)
        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    90f
        PF  pld,    [WK1]
90:
  .else
        PF  bic,    WK0, base, #31
        PF  pld,    [WK0]
        PF  add,    WK1, base, X, lsl #bpp_shift
        PF  sub,    WK1, WK1, #1
        PF  bic,    WK1, WK1, #31
        PF  cmp,    WK1, WK0
        PF  beq,    92f
91:     PF  add,    WK0, WK0, #32
        PF  cmp,    WK0, WK1
        PF  pld,    [WK0]
        PF  bne,    91b
92:
  .endif
 .endif
.endm


.macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
        process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, 0
 .if decrementx
        sub&cond X, X, #8*numbytes/dst_w_bpp
 .endif
        process_tail  cond, numbytes, firstreg
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   cond, numbytes, firstreg, DST
 .endif
.endm

.macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .if (flags) & FLAG_BRANCH_OVER
  .ifc cond,mi
        bpl     100f
  .endif
  .ifc cond,cs
        bcc     100f
  .endif
  .ifc cond,ne
        beq     100f
  .endif
        conditional_process1_helper  , process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
100:
 .else
        conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
 .endif
.endm
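
/*
 * In other words, with FLAG_BRANCH_OVER a call like
 * "conditional_process1 mi, ..." assembles as an unconditional body
 * guarded by an opposite-condition branch ("bpl 100f") around it, while
 * without the flag the same body is emitted as conditionally-executed
 * instructions; which form is faster depends on the processing macros.
 */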

.macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
        /* Can't interleave reads and writes */
        test
        conditional_process1  cond1, process_head, process_tail, numbytes1, firstreg1, unaligned_src, unaligned_mask, decrementx
  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
        test
  .endif
        conditional_process1  cond2, process_head, process_tail, numbytes2, firstreg2, unaligned_src, unaligned_mask, decrementx
 .else
        /* Can interleave reads and writes for better scheduling */
        test
        process_head  cond1, numbytes1, firstreg1, unaligned_src, unaligned_mask, 0
        process_head  cond2, numbytes2, firstreg2, unaligned_src, unaligned_mask, 0
  .if decrementx
        sub&cond1 X, X, #8*numbytes1/dst_w_bpp
        sub&cond2 X, X, #8*numbytes2/dst_w_bpp
  .endif
        process_tail  cond1, numbytes1, firstreg1
        process_tail  cond2, numbytes2, firstreg2
        pixst   cond1, numbytes1, firstreg1, DST
        pixst   cond2, numbytes2, firstreg2, DST
 .endif
.endm


.macro test_bits_1_0_ptr
        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
.endm

.macro test_bits_3_2_ptr
        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3,2 of DST */
.endm

.macro leading_15bytes  process_head, process_tail
        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
        /* Use unaligned loads in all cases for simplicity */
 .if dst_w_bpp == 8
        conditional_process2  test_bits_1_0_ptr, mi, cs, process_head, process_tail, 1, 2, 1, 2, 1, 1, 1
 .elseif dst_w_bpp == 16
        test_bits_1_0_ptr
        conditional_process1  cs, process_head, process_tail, 2, 2, 1, 1, 1
 .endif
        conditional_process2  test_bits_3_2_ptr, mi, cs, process_head, process_tail, 4, 8, 1, 2, 1, 1, 1
.endm
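
/*
 * For example, if the destination needs 11 bytes to reach 16-byte
 * alignment (WK0 low bits = 0b1011), the tests above fire for bits 0, 1
 * and 3, so 1-, 2- and 8-byte chunks are processed: 1+2+8 = 11 bytes.
 */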

.macro test_bits_3_2_pix
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
.endm

.macro test_bits_1_0_pix
 .if dst_w_bpp == 8
        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
 .else
        movs    SCRATCH, X, lsr #1
 .endif
.endm

.macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
        conditional_process2  test_bits_3_2_pix, cs, mi, process_head, process_tail, 8, 4, 0, 2, unaligned_src, unaligned_mask, 0
 .if dst_w_bpp == 16
        test_bits_1_0_pix
        conditional_process1  cs, process_head, process_tail, 2, 0, unaligned_src, unaligned_mask, 0
 .elseif dst_w_bpp == 8
        conditional_process2  test_bits_1_0_pix, cs, mi, process_head, process_tail, 2, 1, 0, 1, unaligned_src, unaligned_mask, 0
 .endif
.endm


.macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
 .rept pix_per_block*dst_w_bpp/128
        process_head  , 16, 0, unaligned_src, unaligned_mask, 1
  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  src_bpp, SRC, 1
  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        preload_middle  mask_bpp, MASK, 1
  .else
        preload_middle  src_bpp, SRC, 0
        preload_middle  mask_bpp, MASK, 0
  .endif
  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0)
        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
         * preloads for, to achieve staggered prefetches for multiple channels, because there are
         * always two STMs per prefetch, so there is always an opposite STM on which to put the
         * preload. Note, no need to BIC the base register here */
        PF  pld,    [DST, #32*prefetch_distance - dst_alignment]
  .endif
        process_tail  , 16, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
  .endif
  .set SUBBLOCK, SUBBLOCK+1
 .endr
        subs    X, X, #pix_per_block
        bhs     110b
.endm

.macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
 .if dst_r_bpp > 0
        tst     DST, #16
        bne     111f
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 16
        b       112f
111:
 .endif
        process_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, 0
112:
        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        PF  and,    WK0, X, #pix_per_block-1
 .endif
        preload_trailing  src_bpp, src_bpp_shift, SRC
        preload_trailing  mask_bpp, mask_bpp_shift, MASK
        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
        /* The remainder of the line is handled identically to the medium case */
        medium_case_inner_loop_and_trailing_pixels  process_head, process_tail,, exit_label, unaligned_src, unaligned_mask
.endm

.macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
120:
        process_head  , 16, 0, unaligned_src, unaligned_mask, 0
        process_tail  , 16, 0
 .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 16, 0, DST
 .endif
        subs    X, X, #128/dst_w_bpp
        bhs     120b
        /* Trailing pixels */
        tst     X, #128/dst_w_bpp - 1
        beq     exit_label
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
        tst     X, #16*8/dst_w_bpp
        conditional_process1  ne, process_head, process_tail, 16, 0, unaligned_src, unaligned_mask, 0
        /* Trailing pixels */
        /* In the narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
        trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
.endm

.macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
 .if mask_bpp == 8 || mask_bpp == 16
        tst     MASK, #3
        bne     141f
 .endif
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     140f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 0
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
140:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 0
  .endif
 .if mask_bpp == 8 || mask_bpp == 16
        b       exit_label
141:
  .if src_bpp == 8 || src_bpp == 16
        tst     SRC, #3
        bne     142f
  .endif
        action  process_head, process_tail, process_inner_loop, exit_label, 0, 1
  .if src_bpp == 8 || src_bpp == 16
        b       exit_label
142:
        action  process_head, process_tail, process_inner_loop, exit_label, 1, 1
  .endif
 .endif
.endm


.macro end_of_line      restore_x, vars_spilled, loop_label, last_one
 .if vars_spilled
        /* Sadly, GAS doesn't seem to have an equivalent of the DCI directive. */
        /* This is ldmia sp,{} */
        .word   0xE89D0000 | LINE_SAVED_REGS
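        /* (0xE89D0000 encodes an unconditional LDMIA from sp with an empty
         * register list; bits 0-15 of an LDM/STM opcode are the register
         * list, so OR-ing in LINE_SAVED_REGS reloads exactly the registers
         * that line_saved_regs flagged, without writeback.) */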
 .endif
        subs    Y, Y, #1
 .if vars_spilled
  .if (LINE_SAVED_REGS) & (1<<1)
        str     Y, [sp]
  .endif
 .endif
        add     DST, DST, STRIDE_D
 .if src_bpp > 0
        add     SRC, SRC, STRIDE_S
 .endif
 .if mask_bpp > 0
        add     MASK, MASK, STRIDE_M
 .endif
 .if restore_x
        mov     X, ORIG_W
 .endif
        bhs     loop_label
 .ifc "last_one",""
  .if vars_spilled
        b       197f
  .else
        b       198f
  .endif
 .else
  .if (!vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
        b       198f
  .endif
 .endif
.endm


.macro generate_composite_function fname, \
                                   src_bpp_, \
                                   mask_bpp_, \
                                   dst_w_bpp_, \
                                   flags_, \
                                   prefetch_distance_, \
                                   init, \
                                   newline, \
                                   cleanup, \
                                   process_head, \
                                   process_tail, \
                                   process_inner_loop

 .func fname
 .global fname
 /* For ELF format also set function visibility to hidden */
#ifdef __ELF__
 .hidden fname
 .type fname, %function
#endif

/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
 .set src_bpp, src_bpp_
 .set mask_bpp, mask_bpp_
 .set dst_w_bpp, dst_w_bpp_
 .set flags, flags_
 .set prefetch_distance, prefetch_distance_

/*
 * Select prefetch type for this function.
 */
 .if prefetch_distance == 0
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
 .else
  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
 .endif

 .if src_bpp == 32
  .set src_bpp_shift, 2
 .elseif src_bpp == 24
  .set src_bpp_shift, 0
 .elseif src_bpp == 16
  .set src_bpp_shift, 1
 .elseif src_bpp == 8
  .set src_bpp_shift, 0
 .elseif src_bpp == 0
  .set src_bpp_shift, -1
 .else
  .error "requested src bpp (src_bpp) is not supported"
 .endif

 .if mask_bpp == 32
  .set mask_bpp_shift, 2
 .elseif mask_bpp == 24
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 8
  .set mask_bpp_shift, 0
 .elseif mask_bpp == 0
  .set mask_bpp_shift, -1
 .else
  .error "requested mask bpp (mask_bpp) is not supported"
 .endif

 .if dst_w_bpp == 32
  .set dst_bpp_shift, 2
 .elseif dst_w_bpp == 24
  .set dst_bpp_shift, 0
 .elseif dst_w_bpp == 16
  .set dst_bpp_shift, 1
 .elseif dst_w_bpp == 8
  .set dst_bpp_shift, 0
 .else
  .error "requested dst bpp (dst_w_bpp) is not supported"
 .endif

 .if (((flags) & FLAG_DST_READWRITE) != 0)
  .set dst_r_bpp, dst_w_bpp
 .else
  .set dst_r_bpp, 0
 .endif

 .set pix_per_block, 16*8/dst_w_bpp
 .if src_bpp != 0
  .if 32*8/src_bpp > pix_per_block
   .set pix_per_block, 32*8/src_bpp
  .endif
 .endif
 .if mask_bpp != 0
  .if 32*8/mask_bpp > pix_per_block
   .set pix_per_block, 32*8/mask_bpp
  .endif
 .endif
 .if dst_r_bpp != 0
  .if 32*8/dst_r_bpp > pix_per_block
   .set pix_per_block, 32*8/dst_r_bpp
  .endif
 .endif

/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */
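
/*
 * Viewed from C, a generated function therefore has a signature along the
 * lines of the following sketch (the exact prototypes used by the binding
 * macros live in pixman-arm-common.h):
 *
 *     void fname(int32_t width, int32_t height,
 *                dst_type *dst, int32_t dst_stride,
 *                src_type *src, int32_t src_stride,
 *                mask_type *mask, int32_t mask_stride);
 *
 * where the mask arguments are present only for masked operations, and
 * src/src_stride may instead be a solid pixel value and 0.
 */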

/*
 * Assign symbolic names to registers
 */
    X           .req    r0  /* pixels to go on this line */
    Y           .req    r1  /* lines to go */
    DST         .req    r2  /* destination pixel pointer */
    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
    SRC         .req    r4  /* source pixel pointer */
    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
    MASK        .req    r6  /* mask pixel pointer (if applicable) */
    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
    WK0         .req    r8  /* pixel data registers */
    WK1         .req    r9
    WK2         .req    r10
    WK3         .req    r11
    SCRATCH     .req    r12
    ORIG_W      .req    r14 /* width (pixels) */

fname:
        push    {r4-r11, lr}        /* save all registers */

        subs    Y, Y, #1
        blo     199f

#ifdef DEBUG_PARAMS
        sub     sp, sp, #9*4
#endif

 .if src_bpp > 0
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
 .endif
 .if mask_bpp > 0
        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
 .endif

#ifdef DEBUG_PARAMS
        add     Y, Y, #1
        stmia   sp, {r0-r7,pc}
        sub     Y, Y, #1
#endif

        init

        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
 .if src_bpp > 0
        lsl     STRIDE_S, #src_bpp_shift
        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
 .endif
 .if mask_bpp > 0
        lsl     STRIDE_M, #mask_bpp_shift
        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
 .endif

        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        cmp     X, #2*16*8/dst_w_bpp - 1
        blo     170f
 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
        blo     160f

        /* Wide case */
        /* Adjust X so that the decrement instruction can also test for
         * inner loop termination. We want it to stop when there are
         * (prefetch_distance+1) complete blocks to go. */
        sub     X, X, #(prefetch_distance+2)*pix_per_block
        mov     ORIG_W, X
  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
  .endif
151:    /* New line */
        newline
        preload_leading_step1  src_bpp, WK1, SRC
        preload_leading_step1  mask_bpp, WK2, MASK
        preload_leading_step1  dst_r_bpp, WK3, DST

        tst     DST, #15
        beq     154f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */
  .if (src_bpp != 0 && src_bpp != 2*dst_w_bpp) || (mask_bpp != 0 && mask_bpp != 2*dst_w_bpp)
        PF  and,    WK0, WK0, #15
  .endif

        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST

        leading_15bytes  process_head, process_tail

154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
 .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, SRC, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
        and     SCRATCH, MASK, #31
        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
 .endif
 .ifc "process_inner_loop",""
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, wide_case_inner_loop, 157f
 .else
        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, process_head, process_tail, process_inner_loop, 157f
 .endif

157:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
 .endif

 .ltorg

160:    /* Medium case */
        mov     ORIG_W, X
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
161:    /* New line */
        newline
        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 0, mask_bpp, mask_bpp_shift, MASK
        preload_line 0, dst_r_bpp, dst_bpp_shift, DST

        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
        tst     DST, #15
        beq     164f
        rsb     WK0, DST, #0 /* bits 0-3 = number of leading bytes until destination aligned */

        leading_15bytes  process_head, process_tail

164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 167f

167:    /* Check for another line */
        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b

 .ltorg

170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
 .if dst_w_bpp < 32
        mov     ORIG_W, X
 .endif
 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
        /* This is stmdb sp!,{} */
        .word   0xE92D0000 | LINE_SAVED_REGS
 .endif
171:    /* New line */
        newline
        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
        preload_line 1, mask_bpp, mask_bpp_shift, MASK
        preload_line 1, dst_r_bpp, dst_bpp_shift, DST

 .if dst_w_bpp == 8
        tst     DST, #3
        beq     174f
172:    subs    X, X, #1
        blo     177f
        process_head  , 1, 0, 1, 1, 0
        process_tail  , 1, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 1, 0, DST
  .endif
        tst     DST, #3
        bne     172b
 .elseif dst_w_bpp == 16
        tst     DST, #2
        beq     174f
        subs    X, X, #1
        blo     177f
        process_head  , 2, 0, 1, 1, 0
        process_tail  , 2, 0
  .if !((flags) & FLAG_PROCESS_DOES_STORE)
        pixst   , 2, 0, DST
  .endif
 .endif

174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, process_head, process_tail,, 177f

177:    /* Check for another line */
        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one

197:
 .if (flags) & FLAG_SPILL_LINE_VARS
        add     sp, sp, #LINE_SAVED_REG_COUNT*4
 .endif
198:
        cleanup

#ifdef DEBUG_PARAMS
        add     sp, sp, #9*4 /* junk the debug copy of arguments */
#endif
199:
        pop     {r4-r11, pc}  /* exit */

 .ltorg

    .unreq  X
    .unreq  Y
    .unreq  DST
    .unreq  STRIDE_D
    .unreq  SRC
    .unreq  STRIDE_S
    .unreq  MASK
    .unreq  STRIDE_M
    .unreq  WK0
    .unreq  WK1
    .unreq  WK2
    .unreq  WK3
    .unreq  SCRATCH
    .unreq  ORIG_W
    .endfunc
.endm

.macro line_saved_regs  x:vararg
 .set LINE_SAVED_REGS, 0
 .set LINE_SAVED_REG_COUNT, 0
 .irp SAVED_REG,x
  .ifc "SAVED_REG","Y"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_D"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_S"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","STRIDE_M"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
  .ifc "SAVED_REG","ORIG_W"
   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
  .endif
 .endr
.endm

.macro nop_macro x:vararg
.endm
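
/*
 * Putting it all together, a consumer (such as pixman-arm-simd-asm.S)
 * generates each fast path with these macros. The following is an
 * illustrative sketch only - the flag combination, prefetch distance and
 * process-macro names are assumptions for the example, not a copy of a
 * real invocation:
 *
 *     line_saved_regs  STRIDE_D, ORIG_W
 *     generate_composite_function \
 *         pixman_composite_src_8888_8888_asm_armv6, 32, 0, 32, \
 *         FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_SPILL_LINE_VARS_WIDE, \
 *         4, \
 *         nop_macro, nop_macro, nop_macro, \
 *         example_process_head, example_process_tail
 *
 * where example_process_head would typically be just a
 * "pixld cond, numbytes, firstreg, SRC, unaligned_src" and
 * example_process_tail a no-op, since a plain copy needs no per-pixel
 * arithmetic.
 */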
    909