/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;

/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
 * integer (bicubic has a little overshoot).  It would also be possible to add
 * a temporary DC bias to eliminate the sign bit for more precision, but that's
 * extra arithmetic.
 */
.set VERTBITS, 14
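/* With VERTBITS == 14 the scratch data is effectively Q1.14: a full-scale
 * input (e.g. a source byte of 255 against coefficients summing to ~1.0 in
 * 16-bit fixed point) comes out near 255 << (VERTBITS - 8) = 16320, just
 * below the 16384 that represents 1.0, leaving the rest of the signed 16-bit
 * range for the overshoot mentioned above.  (Illustrative arithmetic, based
 * on the shifts used in the vertical pass below.)
 */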

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.
 */
.set CHUNKSHIFT, 7       /* 5 tests better for uchar4, but 7 is necessary for ridiculous (10:1) scale factors */
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of components processed in a single iteration of the innermost
 * loop.
 */
.set VECSHIFT, 3
.set VECSIZE, (1<<VECSHIFT)

/* Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in v3[0..3], leaving the results in
 * v12.  This gives eight 16-bit results representing a horizontal line of 2-8
 * input pixels (depending on number of components per pixel) to be fed into
 * the horizontal scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values and VMLS is used to implement this).
 * Output is VERTBITS signed fixed-point, which must leave room for a little
 * overshoot.
 */
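/* In effect, each output element of vert8 below is computed as (a C sketch;
 * c[] are the u16 coefficients in v3, row0..row3 the four input rows, and
 * sat_s16() stands in for the saturating narrow):
 *
 *     int32_t acc = c[1]*row1[i] + c[2]*row2[i] - c[0]*row0[i] - c[3]*row3[i];
 *     out[i] = sat_s16(acc >> (8 + 16 - VERTBITS));
 */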
.macro vert8, dstlo=v12.4h, dsthi=v12.8h
        ld1         {v8.8b}, [x4], #8
        ld1         {v9.8b}, [x5], #8
        ld1         {v10.8b}, [x6], #8
        ld1         {v11.8b}, [x7], #8
        uxtl        v8.8h, v8.8b
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        umull       v12.4s, v9.4h, v3.h[1]
        umull2      v13.4s, v9.8h, v3.h[1]
        umlsl       v12.4s, v8.4h, v3.h[0]
        umlsl2      v13.4s, v8.8h, v3.h[0]
        umlal       v12.4s, v10.4h, v3.h[2]
        umlal2      v13.4s, v10.8h, v3.h[2]
        umlsl       v12.4s, v11.4h, v3.h[3]
        umlsl2      v13.4s, v11.8h, v3.h[3]

        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
         * minus VERTBITS (the number of fraction bits we want to keep from
         * here on).
         */
        sqshrn      \dstlo, v12.4s, #8 + (16 - VERTBITS)
        sqshrn2     \dsthi, v13.4s, #8 + (16 - VERTBITS)
.endm

/* As above, but only four 16-bit results into v12hi.
 */
.macro vert4, dst=v12.8h
        ld1         {v8.s}[0], [x4], #4
        ld1         {v9.s}[0], [x5], #4
        ld1         {v10.s}[0], [x6], #4
        ld1         {v11.s}[0], [x7], #4
        uxtl        v8.8h, v8.8b
        uxtl        v9.8h, v9.8b
        uxtl        v10.8h, v10.8b
        uxtl        v11.8h, v11.8b
        umull       v12.4s, v9.4h, v3.h[1]
        umlsl       v12.4s, v8.4h, v3.h[0]
        umlal       v12.4s, v10.4h, v3.h[2]
        umlsl       v12.4s, v11.4h, v3.h[3]
.ifc \dst,v12.8h
        sqshrn2     \dst, v12.4s, #8 + (16 - VERTBITS)
.else
        sqshrn      \dst, v12.4s, #8 + (16 - VERTBITS)
.endif
.endm


/* During horizontal resize having CHUNKSIZE input available means being able
 * to produce a varying amount of output, depending on the phase of the data.
 * This function calculates the minimum number of VECSIZE chunks extracted from
 * a CHUNKSIZE window (x1), and the threshold value for when the count will be
 * one higher than that (x0).
 * These work out, conveniently, to be the quotient and remainder from:
 *      (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 *
 * The two values are packed together in a uint64_t for convenience; and
 * they are, in fact, used this way as an arithmetic short-cut later on.
 */
/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc) */
ENTRY(rsdIntrinsicResize_oscctl_K)
        lsl         x2, x0, #VECSHIFT
        mov         x0, #(CHUNKSIZE << 16) - 1
        add         x0, x0, x2
        udiv        x1, x0, x2
        msub        x0, x1, x2, x0
        add         x0, x0, x1, LSL #32
        ret
END(rsdIntrinsicResize_oscctl_K)
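/* A C sketch of the function above (names other than the entry point are
 * illustrative):
 *
 *     uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc)
 *     {
 *         uint64_t step = (uint64_t) xinc << VECSHIFT;
 *         uint64_t x = ((uint64_t) CHUNKSIZE << 16) - 1 + step;
 *         return ((x / step) << 32) | (x % step);   // count high, threshold low
 *     }
 */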

/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
 * For the most part the vertical pass (the outer loop) is the same for all
 * versions.  Exceptions are handled in-line with conditional assembly.
 */
.irp comp, 1, 2, 4
.if \comp == 1
.set COMPONENT_SHIFT, 0
.elseif \comp == 2
.set COMPONENT_SHIFT, 1
.elseif \comp == 4
.set COMPONENT_SHIFT, 2
.else
.error "Unknown component count"
.endif
.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)

.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
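/* For reference, with CHUNKSHIFT == 7 and VECSHIFT == 3 this works out to:
 *   uchar1:  LOOP_OUTPUT_SIZE ==  8 bytes per iteration, BUFFER_SIZE ==  520
 *   uchar2:  LOOP_OUTPUT_SIZE == 16 bytes per iteration, BUFFER_SIZE == 1040
 *   uchar4:  LOOP_OUTPUT_SIZE == 32 bytes per iteration, BUFFER_SIZE == 2080
 * i.e. two CHUNKSIZE runs of 16-bit components plus four extra pixels.
 */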

/* void rsdIntrinsicResizeB1_K(
 *             uint8_t * restrict dst,          // x0
 *             size_t count,                    // x1
 *             uint32_t xf,                     // x2
 *             uint32_t xinc,                   // x3
 *             uint8_t const * restrict srcn,   // x4
 *             uint8_t const * restrict src0,   // x5
 *             uint8_t const * restrict src1,   // x6
 *             uint8_t const * restrict src2,   // x7
 *             size_t xclip,                    // [sp,#0]  -> [sp,#64] -> x13
 *             size_t avail,                    // [sp,#8]  -> [sp,#72] -> x11
 *             uint64_t osc_ctl,                // [sp,#16] -> [sp,#80] -> x10
 *             int32_t const *yr,               // [sp,#24] -> [sp,#88] -> v4   (copied to v3   for scalar access)
 */
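/* (As used below, xf and xinc are 16.16 fixed-point: output pixel i is sampled
 * from around source position (xf + i * xinc) >> 16, and osc_ctl is the value
 * packed by rsdIntrinsicResize_oscctl_K(xinc) above.)
 */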
ENTRY(rsdIntrinsicResizeB\comp\()_K)
            sub         x8, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d - v11.1d}, [sp]
            st1         {v12.1d - v15.1d}, [x8]

            /* align the working buffer on the stack to make it easy to use bit
             * twiddling for address calculations.
             */
            sub         x12, sp, #BUFFER_SIZE
            bic         x12, x12, #(1 << (CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1)) - 1
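            /* This rounds the buffer base down to a multiple of
             * 2 * CHUNKSIZE * COMPONENT_COUNT * 2 bytes (the size of the
             * two-chunk circular portion), which is what lets the wraparound
             * addressing below use tst/eor on the low address bits instead of
             * explicit bounds checks.
             */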

            ldr         x8, [sp,#88]            // yr
            adr         x9, 8f
            ld1         {v4.4s}, [x8]
            ld1         {v5.8h}, [x9]
            sqxtun      v4.4h, v4.4s            // yr
            dup         v6.8h, w2
            dup         v7.8h, w3
            mla         v6.8h, v5.8h, v7.8h     // vxf
            shl         v7.8h, v7.8h, #VECSHIFT // vxinc

            /* Compute starting condition for oscillator used to compute ahead
             * of time how many iterations are possible before needing to
             * refill the working buffer.  This is based on the fixed-point
             * index of the last element in the vector of pixels processed in
             * each iteration, counting up until it would overflow.
             */
            sub         x8, x2, x3
            lsl         x9, x3, #VECSHIFT
            add         x8, x8, x9
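            /* i.e. x8 = xf + xinc * (VECSIZE - 1), the 16.16 index of the last
             * pixel in the first vector of output, and x9 = xinc * VECSIZE,
             * the amount that index advances per inner-loop iteration.
             */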

            ldr         x10, [sp,#80]           // osc_ctl
            ldp         x13,x11, [sp,#64]       // xclip, avail

            mov         x18, sp
            mov         sp, x12

            /* x4-x7 contain pointers to the four lines of input to be
             * convolved.  These pointers have been clamped vertically and
             * horizontally (which is why it's not a simple row/stride pair),
             * and the xclip argument (now in x13) indicates how many pixels
             * from true the x position of the pointer is.  This value should
             * be 0, 1, or 2 only.
             *
             * Start by placing four pixels worth of input at the far end of
             * the buffer.  As many as two of these may be clipped, so four
             * pixels are fetched, and then the first pixel is duplicated and
             * the data shifted according to xclip.  The source pointers are
             * then also adjusted according to xclip so that subsequent fetches
             * match.
             */
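            /* For example, with \comp == 1 and xclip == 2 the eight halfwords
             * written at the buffer base are {p0,p0,p0,p0, p0,p1,p2,p3}; the
             * reload from x14 (base plus 4 - xclip pixels) then picks out
             * {p0,p0,p0,p1}, with the first source pixel standing in for the
             * clipped positions, and that is what lands at the far end.
             */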
            mov         v3.8b, v4.8b  /* make y coeffs available for vert4 and vert8 macros */
            sub         x14, x12, x13, LSL #(COMPONENT_SHIFT + 1)
            add         x15, x12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
            add         x14, x14, #4 * COMPONENT_COUNT * 2
.if \comp == 1
            vert4       v12.4h
            dup         v11.4h, v12.h[0]
            st1         {v11.4h,v12.4h}, [x12]
            ld1         {v12.4h}, [x14]
            st1         {v12.4h}, [x15]
.elseif \comp == 2
            vert8
            dup         v11.4s, v12.s[0]
            st1         {v11.8h,v12.8h}, [x12]
            ld1         {v12.8h}, [x14]
            st1         {v12.8h}, [x15]
.elseif \comp == 4
            vert8       v14.4h, v14.8h
            vert8       v15.4h, v15.8h
            dup         v12.2d, v14.d[0]
            dup         v13.2d, v14.d[0]
            st1         {v12.8h,v13.8h}, [x12], #32
            st1         {v14.8h,v15.8h}, [x12]
            sub         x12, x12, #32
            ld1         {v11.8h,v12.8h}, [x14]
            st1         {v11.8h,v12.8h}, [x15]
.endif
            /* Count off four pixels into the working buffer.
             */
            sub         x11, x11, #4
            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
             * were read unconditionally, but some may have been discarded by
             * xclip, so we rewind the pointers to compensate.
             */
            sub         x4, x4, x13, LSL #(COMPONENT_SHIFT)
            sub         x5, x5, x13, LSL #(COMPONENT_SHIFT)
            sub         x6, x6, x13, LSL #(COMPONENT_SHIFT)
            sub         x7, x7, x13, LSL #(COMPONENT_SHIFT)

            /* First tap starts where we just pre-filled, at the end of the
             * buffer.
             */
            add         x2, x2, #(CHUNKSIZE * 2 - 4) << 16

            /* Use overflowing arithmetic to implement wraparound array
             * indexing.
             */
            lsl         x2, x2, #(47 - CHUNKSHIFT)
            lsl         x3, x3, #(47 - CHUNKSHIFT)
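            /* After these shifts the buffer index (in pixels, modulo
             * 2 * CHUNKSIZE) sits in the top CHUNKSHIFT + 1 bits of x2, where
             * it can be recovered with lsr #(63 - CHUNKSHIFT); adding the
             * similarly-shifted x3 steps it with free wraparound at 64 bits.
             */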


            /* Start of outermost loop.
             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
             * number of iterations of the inner loop that can be performed and
             * get into that.
             *
             * The fill is complicated by the possibility of running out of
             * input before the scratch buffer is filled.  If this isn't a risk
             * then it's handled by the simple loop at 2:, otherwise the
             * horrible loop at 3:.
             */
1:          mov         v3.8b, v4.8b            /* put y scaling coefficients somewhere handy */
            subs        x11, x11, #CHUNKSIZE
            bge         2f                      /* if at least CHUNKSIZE are available... */
            add         x11, x11, #CHUNKSIZE    /* if they're not... */
            b           4f
            /* ..just sneaking a literal in here after this unconditional branch.. */
8:          .hword      0, 1, 2, 3, 4, 5, 6, 7
            /* basic fill loop, processing 8 bytes at a time until there are
             * fewer than eight bytes available.
             */
3:          vert8
            sub         x11, x11, #8 / COMPONENT_COUNT
            st1         {v12.8h}, [x12], #16
4:          cmp         x11, #8 / COMPONENT_COUNT - 1
            bgt         3b
.if \comp == 4
            blt         3f
            /* The last pixel (four bytes) if necessary */
            vert4
.else
            cmp         x11, #1
            blt         3f
            /* The last pixels if necessary */
            sub         x4, x4, #8
            sub         x5, x5, #8
            sub         x6, x6, #8
            sub         x7, x7, #8
            add         x4, x4, x11, LSL #(COMPONENT_SHIFT)
            add         x5, x5, x11, LSL #(COMPONENT_SHIFT)
            add         x6, x6, x11, LSL #(COMPONENT_SHIFT)
            add         x7, x7, x11, LSL #(COMPONENT_SHIFT)
            vert8
            sub         x11, sp, x11, LSL #(COMPONENT_SHIFT + 1)
            sub         sp, sp, #32
            sub         x11, x11, #16
.if \comp == 1
            dup         v13.8h, v12.h[7]
.elseif \comp == 2
            dup         v13.4s, v12.s[3]
.endif
            st1         {v12.8h,v13.8h}, [sp]
            ld1         {v12.8h}, [x11]
            add         sp, sp, #32
            b           4f
.endif
            /* Keep filling until we get to the end of this chunk of the buffer */
3:
.if \comp == 1
            dup         v12.8h, v12.h[7]
.elseif \comp == 2
            dup         v12.4s, v12.s[3]
.elseif \comp == 4
            dup         v12.2d, v12.d[1]
.endif
4:          st1         {v12.8h}, [x12], #16
            tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
            bne         3b
            b           4f

.align 4
2:          /* Quickly pull a chunk of data into the working buffer.
             */
            vert8
            st1         {v12.8h}, [x12], #16
            vert8
            st1         {v12.8h}, [x12], #16
            tst         x12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
            bne         2b
            cmp         x11, #0
            bne         3f
4:          /* if we end with 0 pixels left we'll have nothing handy to spread
             * across to the right, so we rewind a bit.
             */
            mov         x11, #1
            sub         x4, x4, #COMPONENT_COUNT
            sub         x5, x5, #COMPONENT_COUNT
            sub         x6, x6, #COMPONENT_COUNT
            sub         x7, x7, #COMPONENT_COUNT
3:          /* copy four taps (width of cubic window) to far end for overflow
             * address handling
             */
            sub         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
            eor         x12, x13, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
            ld1         {v14.4h}, [x13]
.elseif \comp == 2
            ld1         {v14.8h}, [x13]
.elseif \comp == 4
            ld1         {v14.8h,v15.8h}, [x13]
.endif
            add         x13, x12, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
            st1         {v14.4h}, [x13]
.elseif \comp == 2
            st1         {v14.8h}, [x13]
.elseif \comp == 4
            st1         {v14.8h,v15.8h}, [x13]
.endif
            /* The high 32 bits of x10 contain the maximum possible iteration
             * count, but if x8 is greater than the low 32 bits of x10 then
             * this indicates that the count must be reduced by one for this
             * iteration to avoid reading past the end of the available data.
             */
            sub         x13, x10, x8
            lsr         x13, x13, #32
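            /* In C terms: iters = (uint64_t)(osc_ctl - x8) >> 32, i.e. the
             * packed quotient, less one when the borrow from the low half
             * fires because x8 has passed the packed threshold.
             */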

            madd        x8, x13, x9, x8
            sub         x8, x8, #(CHUNKSIZE << 16)

            /* prefer to count pixels, rather than vectors, to clarify the tail
             * store case on exit.
             */
            lsl         x13, x13, #VECSHIFT
            cmp         x13, x1
            csel        x13, x1, x13, gt

            sub         x1, x1, x13

            lsl         x13, x13, #COMPONENT_SHIFT

            mov         w14, #0x8000
            movi        v30.8h, #3
            dup         v31.8h, w14

            cmp         x13, #0
            bgt         3f
            cmp         x1, #0
            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
            b           9f

            .align 4
2:          /* Inner loop continues here, but starts at 3:, see end of loop
             * below for explanation. */
.if LOOP_OUTPUT_SIZE == 4
            st1         {v8.s}[0], [x0], #4
.elseif LOOP_OUTPUT_SIZE == 8
            st1         {v8.8b}, [x0], #8
.elseif LOOP_OUTPUT_SIZE == 16
            st1         {v8.16b}, [x0], #16
.elseif LOOP_OUTPUT_SIZE == 32
            st1         {v8.16b,v9.16b}, [x0], #32
.endif
            /* Inner loop:  here the four x coefficients for each tap are
             * calculated in vector code, and the addresses are calculated in
             * scalar code, and these calculations are interleaved.
             */
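            /* What is built here amounts to the standard cubic resampling
             * weights for a fractional position x between the two middle taps
             * p1 and p2 (stated for orientation; the code below computes them
             * in fixed point with pre-halved values and a bias):
             *     w0 = (2*x^2 - x^3 - x) / 2
             *     w1 = (3*x^3 - 5*x^2 + 2) / 2
             *     w2 = (x + 4*x^2 - 3*x^3) / 2
             *     w3 = (x^3 - x^2) / 2
             * held in v0-v3 as Q15 values, with signs folded into the choice
             * of multiply-accumulate versus multiply-subtract further down.
             */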
3:          ushr        v8.8h, v6.8h, #1            // sxf
            lsr         x14, x2, #(63 - CHUNKSHIFT)
            sqrdmulh    v9.8h, v8.8h, v8.8h         // sxf**2
            add         x2, x2, x3
            sqrdmulh    v10.8h, v9.8h, v8.8h        // sxf**3
            lsr         x15, x2, #(63 - CHUNKSHIFT)
            sshll       v11.4s, v9.4h, #2
            sshll2      v12.4s, v9.8h, #2
            add         x2, x2, x3
            smlsl       v11.4s, v10.4h, v30.4h
            smlsl2      v12.4s, v10.8h, v30.8h
            lsr         x16, x2, #(63 - CHUNKSHIFT)

            shadd       v0.8h, v10.8h, v8.8h
            add         x2, x2, x3
            sub         v0.8h, v9.8h, v0.8h
            lsr         x17, x2, #(63 - CHUNKSHIFT)

            saddw       v1.4s, v11.4s, v9.4h
            saddw2      v13.4s, v12.4s, v9.8h
            add         x2, x2, x3
            shrn        v1.4h, v1.4s, #1
            shrn2       v1.8h, v13.4s, #1
            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
            sub         v1.8h, v1.8h, v31.8h
            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)

            saddw       v2.4s, v11.4s, v8.4h
            saddw2      v13.4s, v12.4s, v8.8h
            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
            shrn        v2.4h, v2.4s, #1
            shrn2       v2.8h, v13.4s, #1
            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
            neg         v2.8h, v2.8h

            shsub       v3.8h, v10.8h, v9.8h

            /* increment the x fractional parts (overflow is ignored, as the
             * scalar arithmetic shadows this addition with full precision).
             */
            add         v6.8h, v6.8h, v7.8h

            /* At this point we have four pointers in x14-x17, pointing to the
             * four taps in the scratch buffer that must be convolved together
             * to produce an output pixel (one output pixel per pointer).
             * These pointers usually overlap, but their spacing is irregular
             * so resolving the redundancy through L1 is a pragmatic solution.
             *
             * The scratch buffer is made of signed 16-bit data, holding over
             * some extra precision, and overshoot, from the vertical pass.
             *
             * We also have the 16-bit unsigned fixed-point weights for each
             * of the four taps in v0 - v3.  That's eight pixels worth of
             * coefficients when we have only four pointers, so calculations
             * for four more pixels are interleaved with the fetch and permute
             * code for each variant in the following code.
             *
             * The data arrangement is less than ideal for any pixel format,
             * but permuting loads help to mitigate most of the problems.
             *
             * Note also that the two outside taps of a bicubic are negative,
             * but these coefficients are unsigned.  The sign is hard-coded by
             * use of multiply-and-subtract operations.
             */
.if \comp == 1
            /* The uchar 1 case.
             * Issue one lanewise ld4.h to load four consecutive pixels from
             * one pointer (one pixel) into four different registers; then load
             * four consecutive s16 values from the next pointer (pixel) into
             * the next lane of those four registers, etc., so that we finish
             * with v12 - v15 representing the four taps, and each lane
             * representing a separate pixel.
             *
             * The first ld4 uses a splat to avoid any false dependency on
             * the previous state of the register.
             */
            ld4r        {v12.8h,v13.8h,v14.8h,v15.8h}, [x14]
            lsr         x14, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.h,v13.h,v14.h,v15.h}[1], [x15]
            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
            lsr         x15, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.h,v13.h,v14.h,v15.h}[2], [x16]
            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
            lsr         x16, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.h,v13.h,v14.h,v15.h}[3], [x17]
            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
            lsr         x17, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.h,v13.h,v14.h,v15.h}[4], [x14]
            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
            ld4         {v12.h,v13.h,v14.h,v15.h}[5], [x15]
            ld4         {v12.h,v13.h,v14.h,v15.h}[6], [x16]
            ld4         {v12.h,v13.h,v14.h,v15.h}[7], [x17]

            smull       v8.4s, v12.4h, v0.4h
            smull2      v9.4s, v12.8h, v0.8h
            smlsl       v8.4s, v13.4h, v1.4h
            smlsl2      v9.4s, v13.8h, v1.8h
            smlsl       v8.4s, v14.4h, v2.4h
            smlsl2      v9.4s, v14.8h, v2.8h
            smlal       v8.4s, v15.4h, v3.4h
            smlal2      v9.4s, v15.8h, v3.8h

            subs        x13, x13, #LOOP_OUTPUT_SIZE

            sqrshrn     v8.4h, v8.4s, #15
            sqrshrn2    v8.8h, v9.4s, #15

            sqrshrun    v8.8b, v8.8h, #VERTBITS - 8
.elseif \comp == 2
            /* The uchar2 case:
             * This time load pairs of values into adjacent lanes in v12 - v15
             * by aliasing them as u32 data; leaving room for only four pixels,
             * so the process has to be done twice.  This also means that the
             * coefficient registers fail to align with the coefficient data
             * (eight separate pixels), so that has to be doubled-up to match.
             */
            ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
            lsr         x14, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
            lsr         x15, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
            lsr         x16, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]
            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
            lsr         x17, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3

            /* double-up coefficients to align with component pairs */
            zip1        v16.8h, v0.8h, v0.8h
            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
            zip1        v17.8h, v1.8h, v1.8h
            zip1        v18.8h, v2.8h, v2.8h
            zip1        v19.8h, v3.8h, v3.8h

            smull       v8.4s, v12.4h, v16.4h
            smull2      v9.4s, v12.8h, v16.8h
            smlsl       v8.4s, v13.4h, v17.4h
            smlsl2      v9.4s, v13.8h, v17.8h
            smlsl       v8.4s, v14.4h, v18.4h
            smlsl2      v9.4s, v14.8h, v18.8h
            smlal       v8.4s, v15.4h, v19.4h
            smlal2      v9.4s, v15.8h, v19.8h

            sqrshrn     v8.4h, v8.4s, #15
            sqrshrn2    v8.8h, v9.4s, #15

            ld4r        {v12.4s,v13.4s,v14.4s,v15.4s}, [x14]
            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x15]
            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x16]
            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x17]

            /* double-up coefficients to align with component pairs */
            zip2        v16.8h, v0.8h, v0.8h
            zip2        v17.8h, v1.8h, v1.8h
            zip2        v18.8h, v2.8h, v2.8h
            zip2        v19.8h, v3.8h, v3.8h

            smull       v10.4s, v12.4h, v16.4h
            smull2      v11.4s, v12.8h, v16.8h
            smlsl       v10.4s, v13.4h, v17.4h
            smlsl2      v11.4s, v13.8h, v17.8h
            smlsl       v10.4s, v14.4h, v18.4h
            smlsl2      v11.4s, v14.8h, v18.8h
            smlal       v10.4s, v15.4h, v19.4h
            smlal2      v11.4s, v15.8h, v19.8h

            subs        x13, x13, #LOOP_OUTPUT_SIZE

            sqrshrn     v9.4h, v10.4s, #15
            sqrshrn2    v9.8h, v11.4s, #15

            sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
            sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8
.elseif \comp == 4
            /* The uchar4 case.
             * This case is comparatively painless because four s16s are the
             * smallest addressable unit for a vmul-by-scalar.  Rather than
             * permute the data, simply arrange the multiplies to suit the way
             * the data comes in.  That's a lot of data, though, so things
             * progress in pairs of pixels at a time.
             */
            ld1         {v12.8h,v13.8h}, [x14]
            lsr         x14, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld1         {v14.8h,v15.8h}, [x15]
            add         x14, sp, x14, LSL #(COMPONENT_SHIFT + 1)
            lsr         x15, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3

            smull       v8.4s, v12.4h, v0.h[0]
            smull       v9.4s, v14.4h, v0.h[1]
            smlsl2      v8.4s, v12.8h, v1.h[0]
            smlsl2      v9.4s, v14.8h, v1.h[1]
            smlsl       v8.4s, v13.4h, v2.h[0]
            smlsl       v9.4s, v15.4h, v2.h[1]
            smlal2      v8.4s, v13.8h, v3.h[0]
            smlal2      v9.4s, v15.8h, v3.h[1]

            /* And two more...  */
            ld1         {v12.8h,v13.8h}, [x16]
            add         x15, sp, x15, LSL #(COMPONENT_SHIFT + 1)
            lsr         x16, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3
            ld1         {v14.8h,v15.8h}, [x17]
            add         x16, sp, x16, LSL #(COMPONENT_SHIFT + 1)
            lsr         x17, x2, #(63 - CHUNKSHIFT)
            add         x2, x2, x3

            sqrshrn     v8.4h, v8.4s, #15
            add         x17, sp, x17, LSL #(COMPONENT_SHIFT + 1)
            sqrshrn2    v8.8h, v9.4s, #15

            smull       v10.4s, v12.4h, v0.h[2]
            smull       v11.4s, v14.4h, v0.h[3]
            smlsl2      v10.4s, v12.8h, v1.h[2]
            smlsl2      v11.4s, v14.8h, v1.h[3]
            smlsl       v10.4s, v13.4h, v2.h[2]
            smlsl       v11.4s, v15.4h, v2.h[3]
            smlal2      v10.4s, v13.8h, v3.h[2]
            smlal2      v11.4s, v15.8h, v3.h[3]

            sqrshrn     v9.4h, v10.4s, #15
            sqrshrn2    v9.8h, v11.4s, #15

            sqrshrun     v8.8b, v8.8h, #VERTBITS - 8
            sqrshrun2    v8.16b, v9.8h, #VERTBITS - 8

            /* And two more...  */
            ld1         {v12.8h,v13.8h}, [x14]
            ld1         {v14.8h,v15.8h}, [x15]

            smull       v10.4s, v12.4h, v0.h[4]
            smull       v11.4s, v14.4h, v0.h[5]
            smlsl2      v10.4s, v12.8h, v1.h[4]
            smlsl2      v11.4s, v14.8h, v1.h[5]
            smlsl       v10.4s, v13.4h, v2.h[4]
            smlsl       v11.4s, v15.4h, v2.h[5]
            smlal2      v10.4s, v13.8h, v3.h[4]
            smlal2      v11.4s, v15.8h, v3.h[5]

            /* And two more...  */
            ld1         {v12.8h,v13.8h}, [x16]
            ld1         {v14.8h,v15.8h}, [x17]

            subs        x13, x13, #LOOP_OUTPUT_SIZE

            sqrshrn     v9.4h, v10.4s, #15
            sqrshrn2    v9.8h, v11.4s, #15

            smull       v10.4s, v12.4h, v0.h[6]
            smull       v11.4s, v14.4h, v0.h[7]
            smlsl2      v10.4s, v12.8h, v1.h[6]
            smlsl2      v11.4s, v14.8h, v1.h[7]
            smlsl       v10.4s, v13.4h, v2.h[6]
            smlsl       v11.4s, v15.4h, v2.h[7]
            smlal2      v10.4s, v13.8h, v3.h[6]
            smlal2      v11.4s, v15.8h, v3.h[7]

            sqrshrn     v10.4h, v10.4s, #15
            sqrshrn2    v10.8h, v11.4s, #15

            sqrshrun     v9.8b, v9.8h, #VERTBITS - 8
            sqrshrun2    v9.16b, v10.8h, #VERTBITS - 8
.endif
            bgt         2b      /* continue inner loop */
            /* The inner loop has already been limited to ensure that none of
             * the earlier iterations could overfill the output, so the store
             * appears within the loop but after the conditional branch (at the
             * top).  At the end, provided it won't overfill, perform the final
             * store here.  If it would, then break out to the tricky tail case
             * instead.
             */
            blt         1f
            /* Store the amount of data appropriate to the configuration of the
             * instance being assembled.
             */
.if LOOP_OUTPUT_SIZE == 4
            st1         {v8.s}[0], [x0], #4
.elseif LOOP_OUTPUT_SIZE == 8
            st1         {v8.8b}, [x0], #8
.elseif LOOP_OUTPUT_SIZE == 16
            st1         {v8.16b}, [x0], #16
.elseif LOOP_OUTPUT_SIZE == 32
            st1         {v8.16b,v9.16b}, [x0], #32
.endif
            b           1b              /* resume outer loop */
            /* Partial tail store case:
             * Different versions of the code need different subsets of the
             * following partial stores.  Here the number of components and the
             * size of the chunk of data produced by each inner loop iteration
             * is tested to figure out whether or not each phrase is relevant.
             */
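            /* For example, in the uchar2 build a five-pixel tail (10 bytes)
             * falls through to an 8-byte store followed by a 2-byte store.
             */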
.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
1:          tst         x13, #16
            beq         1f
            st1         {v8.16b}, [x0], #16
            mov         v8.16b, v9.16b
.endif
.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
1:          tst         x13, #8
            beq         1f
            st1         {v8.8b}, [x0], #8
            ext         v8.16b, v8.16b, v8.16b, #8
.endif
.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
1:          tst         x13, #4
            beq         1f
            st1         {v8.s}[0], [x0], #4
            ext         v8.8b, v8.8b, v8.8b, #4
.endif
.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
1:          tst         x13, #2
            beq         1f
            st1         {v8.h}[0], [x0], #2
            ext         v8.8b, v8.8b, v8.8b, #2
.endif
.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
1:          tst         x13, #1
            beq         1f
            st1         {v8.b}[0], [x0], #1
.endif
1:
9:          mov         sp, x18
            ld1         {v8.1d - v11.1d}, [sp], #32
            ld1         {v12.1d - v15.1d}, [sp], #32
            ret
END(rsdIntrinsicResizeB\comp\()_K)
.endr