/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;

.eabi_attribute 25,1 @Tag_ABI_align8_preserved
.arm

/* Fixed-point precision after vertical pass -- 16 bit data minus 1 sign and 1
 * integer (bicubic has a little overshoot).  It would also be possible to add
 * a temporary DC bias to eliminate the sign bit for more precision, but that's
 * extra arithmetic.
 */
.set VERTBITS, 14

/* The size of the scratch buffer in which we store our vertically convolved
 * intermediates.
 */
.set CHUNKSHIFT, 7
.set CHUNKSIZE, (1 << CHUNKSHIFT)

/* The number of components processed in a single iteration of the innermost
 * loop.
 */
.set VECSHIFT, 3
.set VECSIZE, (1<<VECSHIFT)

/* Read four different lines (except at edges where addresses may be clamped,
 * which is why we don't simply take base and stride registers), and multiply
 * and accumulate them by the coefficients in d6[0..3], leaving the results in
 * q12.  This gives eight 16-bit results representing a horizontal line of 2-8
 * input pixels (depending on number of components per pixel) to be fed into
 * the horizontal scaling pass.
 *
 * Input coefficients are 16-bit unsigned fixed-point (although [0] and [3] are
 * known to represent negative values and VMLS is used to implement this).
 * Output is VERTBITS signed fixed-point, which must leave room for a little
 * bit of overshoot beyond [0,1.0).
 */
.macro vert8, dstlo=d24, dsthi=d25
        vld1.u8     d16, [r4]!
        vld1.u8     d18, [r5]!
        vld1.u8     d20, [r6]!
        vld1.u8     d22, [r7]!
        vmovl.u8    q8, d16
        vmovl.u8    q9, d18
        vmovl.u8    q10, d20
        vmovl.u8    q11, d22
        vmull.u16   q12, d18, d6[1]
        vmull.u16   q13, d19, d6[1]
        vmlsl.u16   q12, d16, d6[0]
        vmlsl.u16   q13, d17, d6[0]
        vmlal.u16   q12, d20, d6[2]
        vmlal.u16   q13, d21, d6[2]
        vmlsl.u16   q12, d22, d6[3]
        vmlsl.u16   q13, d23, d6[3]

        /* Shift by 8 (bits per pixel), plus 16 (the fixed-point multiplies),
         * minus VERTBITS (the number of fraction bits we want to keep from
         * here on).
         */
        vqshrn.s32  \dstlo, q12, #8 + 16 - VERTBITS
        vqshrn.s32  \dsthi, q13, #8 + 16 - VERTBITS
.endm
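
/* For reference, one output element of vert8/vert4 amounts to roughly the
 * following C (a sketch only; p0..p3 are the four vertically adjacent 8-bit
 * source bytes, c0..c3 are the 16-bit unsigned y coefficients held in d6 --
 * c0 and c3 applied by multiply-subtract because the outer bicubic taps are
 * negative -- and sat16() stands for saturation to the signed 16-bit range):
 *
 *     int32_t acc = (int32_t)p1 * c1 + (int32_t)p2 * c2
 *                 - (int32_t)p0 * c0 - (int32_t)p3 * c3;
 *     int16_t out = sat16(acc >> (8 + 16 - VERTBITS));
 */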

/* As above, but only four 16-bit results into d25.
 */
.macro vert4
        vld1.u32    d16[0], [r4]!
        vld1.u32    d18[0], [r5]!
        vld1.u32    d20[0], [r6]!
        vld1.u32    d22[0], [r7]!
        vmovl.u8    q8, d16
        vmovl.u8    q9, d18
        vmovl.u8    q10, d20
        vmovl.u8    q11, d22
        vmull.u16   q12, d18, d6[1]
        vmlsl.u16   q12, d16, d6[0]
        vmlal.u16   q12, d20, d6[2]
        vmlsl.u16   q12, d22, d6[3]
        vqshrn.s32  d25, q12, #8 + 16 - VERTBITS
.endm


/* During the horizontal resize, having CHUNKSIZE pixels of input available
 * means being able to produce a varying amount of output, depending on the
 * phase of the data.  This function calculates the minimum number of VECSIZE
 * chunks that can be extracted from a CHUNKSIZE window (r1), and the threshold
 * value for when the count will be one higher than that (r0).
 * These work out, conveniently, to be the quotient and remainder from:
 *      (CHUNKSIZE + xinc * VECSIZE - 1) / (xinc * VECSIZE)
 *
 * The two values can be packed together in a uint64_t for convenience; and
 * they are, in fact, used this way as an arithmetic short-cut later on.
 */

/* uint64_t rsdIntrinsicResize_oscctl_K(uint32_t xinc); */
ENTRY(rsdIntrinsicResize_oscctl_K)
        lsl         r2, r0, #VECSHIFT
        movw        r0, #:lower16:(CHUNKSIZE << 16) - 1
        movt        r0, #:upper16:(CHUNKSIZE << 16) - 1
        add         r0, r0, r2
#if defined(ARCH_ARM_USE_UDIV)
        udiv        r1, r0, r2
        mls         r0, r1, r2, r0
#else
        clz         r3, r2
        clz         r1, r0
        subs        r3, r3, r1
        movlt       r3, #0
        mov         r1, #1
        lsl         r2, r2, r3
        lsl         r3, r1, r3
        mov         r1, #0
1:      cmp         r2, r0
        addls       r1, r3
        subls       r0, r2
        lsrs        r3, r3, #1
        lsr         r2, r2, #1
        bne         1b
#endif
        bx          lr
END(rsdIntrinsicResize_oscctl_K)
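
/* In C terms, the function above computes roughly the following (a sketch,
 * assuming -- as the callers in this file do -- that xinc is a 16.16
 * fixed-point step; the quotient is returned in r1 and the remainder in r0,
 * i.e. the high and low words of the uint64_t under the AAPCS):
 *
 *     uint32_t step = xinc << VECSHIFT;               // xinc * VECSIZE
 *     uint32_t n    = ((CHUNKSIZE << 16) - 1) + step;
 *     uint32_t quot = n / step;                       // r1
 *     uint32_t rem  = n - quot * step;                // r0
 *     return ((uint64_t)quot << 32) | rem;
 *
 * The #else path is simply a shift-and-subtract long division for cores
 * without the UDIV instruction.
 */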

/* Iterate to generate the uchar1, uchar2, and uchar4 versions of the code.
 * For the most part the vertical pass (the outer loop) is the same for all
 * versions.  Exceptions are handled in-line with conditional assembly.
 */
.irep comp, 1, 2, 4
.if \comp == 1
.set COMPONENT_SHIFT, 0
.elseif \comp == 2
.set COMPONENT_SHIFT, 1
.elseif \comp == 4
.set COMPONENT_SHIFT, 2
.else
.error "Unknown component count"
.endif
.set COMPONENT_COUNT, (1 << COMPONENT_SHIFT)
.set LOOP_OUTPUT_SIZE, (VECSIZE * COMPONENT_COUNT)

.set BUFFER_SIZE, (CHUNKSIZE * 2 + 4) * COMPONENT_COUNT * 2
.set OSC_STORE, (BUFFER_SIZE + 0)
.set OSCSTEP_STORE, (BUFFER_SIZE + 4)
.set OSCCTL_STORE, (BUFFER_SIZE + 8)
.set AVAIL_STORE, (BUFFER_SIZE + 16)
.set SP_STORE, (BUFFER_SIZE + 24)   /* should be +20, but rounded up to make a legal constant somewhere */

/* void rsdIntrinsicResizeB\comp\()_K(
 *             uint8_t * restrict dst,          // r0
 *             size_t count,                    // r1
 *             uint32_t xf,                     // r2
 *             uint32_t xinc,                   // r3
 *             uint8_t const * restrict srcn,   // [sp]     -> [sp,#104] -> r4
 *             uint8_t const * restrict src0,   // [sp,#4]  -> [sp,#108] -> r5
 *             uint8_t const * restrict src1,   // [sp,#8]  -> [sp,#112] -> r6
 *             uint8_t const * restrict src2,   // [sp,#12] -> [sp,#116] -> r7
 *             size_t xclip,                    // [sp,#16] -> [sp,#120]
 *             size_t avail,                    // [sp,#20] -> [sp,#124] -> lr
 *             uint64_t osc_ctl,                // [sp,#24] -> [sp,#128]
 *             int32_t const *yr);              // [sp,#32] -> [sp,#136] -> d8 (copied to d6 for scalar access)
 */
ENTRY(rsdIntrinsicResizeB\comp\()_K)
            push        {r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
            vpush       {d8-d15}

            /* align the working buffer on the stack to make it easy to use bit
             * twiddling for address calculations and bounds tests.
             */
            sub         r12, sp, #BUFFER_SIZE + 32
            mov         lr, sp
            bfc         r12, #0, #CHUNKSHIFT + 1 + COMPONENT_SHIFT + 1
            mov         sp, r12
            str         lr, [sp,#SP_STORE]
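            /* The alignment above is, in effect (a sketch):
             *     sp = (old_sp - BUFFER_SIZE - 32)
             *            & ~(2 * CHUNKSIZE * COMPONENT_COUNT * 2 - 1);
             * i.e. the stack pointer is dropped past the scratch area and then
             * aligned down to the size of the double-length scratch buffer, so
             * that wraparound and bounds tests can use simple bit masks.
             */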

            ldr         r8, [lr,#136]           // yr
            adr         r9, 8f
            vld1.s32    {q4}, [r8]
            vld1.s16    {q5}, [r9]
            vqmovun.s32 d8, q4                  // yr
            vdup.s16    q6, r2
            vdup.s16    q7, r3
            vmla.s16    q6, q5, q7              // vxf
            vshl.s16    q7, q7, #VECSHIFT       // vxinc

            ldrd        r4,r5, [lr,#104]        // srcn, src0
            ldrd        r6,r7, [lr,#112]        // src1, src2

            /* Compute starting condition for oscillator used to compute ahead
             * of time how many iterations are possible before needing to
             * refill the working buffer.  This is based on the fixed-point
             * index of the last element in the vector of pixels processed in
             * each iteration, counting up until it would overflow.
             */
            sub         r8, r2, r3
            mov         r9, r3, LSL #VECSHIFT
            add         r8, r8, r9
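            /* In other words (a sketch; xf and xinc are the 16.16 fixed-point
             * x start and step arguments from r2 and r3):
             *     osc      = xf + xinc * (VECSIZE - 1);   // r8: x of the last lane
             *     osc_step = xinc * VECSIZE;              // r9: advance per iteration
             */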

            ldrd        r10,r11, [lr,#128]      // osc_ctl

            str         r8, [sp,#OSC_STORE]
            str         r9, [sp,#OSCSTEP_STORE]
            str         r10, [sp,#OSCCTL_STORE]
            str         r11, [sp,#OSCCTL_STORE+4]
            ldrd        r10,r11, [lr,#120]      // xclip,avail


            /* r4-r7 contain pointers to the four lines of input to be
             * convolved.  These pointers have been clamped vertically and
             * horizontally (which is why it's not a simple row/stride pair),
             * and the xclip argument (now in r10) indicates how many pixels
             * the x position of each pointer is from its true position.  This
             * value should be 0, 1, or 2 only.
             *
             * Start by placing four pixels' worth of input at the far end of
             * the buffer.  As many as two of these may be clipped, so four
             * pixels are fetched, and then the first pixel is duplicated and
             * the data shifted according to xclip.  The source pointers are
             * then also adjusted according to xclip so that subsequent fetches
             * match.
             */
            vmov        d6, d8  /* make y coeffs available for vert4 and vert8 macros */

            sub         r8, r12, r10, LSL #COMPONENT_SHIFT + 1
            add         r9, r12, #(2 * CHUNKSIZE - 4) * COMPONENT_COUNT * 2
            add         r8, r8, #4 * COMPONENT_COUNT * 2
.if \comp == 1
            vert4
            vdup.s16    d24, d25[0]
            vst1.s16    {q12}, [r12]
            vld1.s16    {d24}, [r8]
            vst1.s16    {d24}, [r9]
.elseif \comp == 2
            vert8
            vdup.u32    q11, d24[0]
            vst1.s16    {q11,q12}, [r12]
            vld1.s16    {q12}, [r8]
            vst1.s16    {q12}, [r9]
.elseif \comp == 4
            vert8       d28, d29
            vert8       d30, d31
            vmov.u64    d24, d28
            vmov.u64    d25, d28
            vmov.u64    d26, d28
            vmov.u64    d27, d28
            vst1.s16    {q12,q13}, [r12]!
            vst1.s16    {q14,q15}, [r12]
            sub         r12, r12, #32
            vld1.s16    {q11,q12}, [r8]
            vst1.s16    {q11,q12}, [r9]
.endif
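            /* In outline, what the \comp == 1 branch above does is (a sketch;
             * p0-p3 are the four vertically filtered pixels from vert4, and
             * the other variants follow the same pattern with wider elements):
             *     s16 buf[] = { p0,p0,p0,p0, p0,p1,p2,p3 };    // stored at r12
             *     copy 4 elements from &buf[4 - xclip] (r8)
             *          to &buf[2*CHUNKSIZE - 4] (r9);
             * so the four taps at the far end of the buffer begin with the
             * first legal pixel repeated xclip extra times.
             */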
            /* Count off four pixels into the working buffer, and move count to
             * its new home.
             */
            sub         lr, r11, #4
            /* Incoming pointers were to the first _legal_ pixel.  Four pixels
             * were read unconditionally, but some may have been discarded by
             * xclip, so we rewind the pointers to compensate.
             */
            sub         r4, r4, r10, LSL #COMPONENT_SHIFT
            sub         r5, r5, r10, LSL #COMPONENT_SHIFT
            sub         r6, r6, r10, LSL #COMPONENT_SHIFT
            sub         r7, r7, r10, LSL #COMPONENT_SHIFT

            /* First tap starts where we just pre-filled, at the end of the
             * buffer.
             */
            add         r2, r2, #(CHUNKSIZE * 2 - 4) << 16

            /* Use overflowing arithmetic to implement wraparound array
             * indexing.
             */
            mov         r2, r2, LSL #(15 - CHUNKSHIFT)
            mov         r3, r3, LSL #(15 - CHUNKSHIFT)
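            /* After these shifts the scratch-buffer index of the current tap
             * sits in the top bits of r2, so (a sketch):
             *     index  = x >> (31 - CHUNKSHIFT);           // 0 .. 2*CHUNKSIZE-1
             *     offset = index << (COMPONENT_SHIFT + 1);   // bytes of s16 data
             * and the wrap back to the start of the buffer falls out of 32-bit
             * overflow for free.
             */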

            str         lr, [sp,#AVAIL_STORE]

            /* Start of outermost loop.
             * Fetch CHUNKSIZE pixels into scratch buffer, then calculate the
             * number of iterations of the inner loop that can be performed and
             * get into that.
             *
             * The fill is complicated by the possibility of running out of
             * input before the scratch buffer is filled.  If this isn't a risk
             * then it's handled by the simple loop at 2:, otherwise the
             * horrible loop at 3:.
             */
1:          ldr         lr, [sp,#AVAIL_STORE]   /* get number of pixels available */
            vmov        d6, d8              /* put y scaling coefficients somewhere handy */
            subs        lr, #CHUNKSIZE
            bge         2f                  /* if at least CHUNKSIZE are available... */
            add         lr, #CHUNKSIZE      /* if they're not... */
            b           4f
            /* ..just sneaking a literal in here after this unconditional branch.. */
8:          .hword      0, 1, 2, 3, 4, 5, 6, 7
            /* basic fill loop, processing 8 bytes at a time until there are
             * fewer than eight bytes available.
             */
3:          vert8
            sub         lr, lr, #8 / COMPONENT_COUNT
            vst1.s16    {q12}, [r12]!
4:          cmp         lr, #8 / COMPONENT_COUNT - 1
            bgt         3b
.if \comp == 4
            blt         3f
            /* The last pixel (four bytes) if necessary */
            vert4
.else
            cmp         lr, #1
            blt         3f
            /* The last pixels if necessary */
            sub         r4, r4, #8
            sub         r5, r5, #8
            sub         r6, r6, #8
            sub         r7, r7, #8
            add         r4, r4, lr, LSL #COMPONENT_SHIFT
            add         r5, r5, lr, LSL #COMPONENT_SHIFT
            add         r6, r6, lr, LSL #COMPONENT_SHIFT
            add         r7, r7, lr, LSL #COMPONENT_SHIFT
            vert8
            sub         lr, sp, lr, LSL #COMPONENT_SHIFT + 1
            sub         sp, sp, #32
            sub         lr, lr, #16
.if \comp == 1
            vdup.s16    q13, d25[3]
.elseif \comp == 2
            vdup.u32    q13, d25[1]
.endif
            vst1.s16    {q12,q13}, [sp]
            vld1.s16    {q12}, [lr]
            add         sp, sp, #32
            b           4f
.endif
            /* Keep filling until we get to the end of this chunk of the buffer */
3:
.if \comp == 1
            vdup.s16    q12, d25[3]
.elseif \comp == 2
            vdup.u32    q12, d25[1]
.elseif \comp == 4
            vmov.u64    d24, d25
.endif
4:          vst1.s16    {q12}, [r12]!
            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
            bne         3b
            b           4f

.align 4
2:          /* Quickly pull a chunk of data into the working buffer.
             */
            vert8
            vst1.s16    {q12}, [r12]!
            vert8
            vst1.s16    {q12}, [r12]!
            tst         r12, #(CHUNKSIZE - 1) * COMPONENT_COUNT * 2
            bne         2b
            cmp         lr, #0
            bne         3f
4:          /* if we end with 0 pixels left we'll have nothing handy to spread
             * across to the right, so we rewind a bit.
             */
            mov         lr, #1
            sub         r4, r4, #COMPONENT_COUNT
            sub         r5, r5, #COMPONENT_COUNT
            sub         r6, r6, #COMPONENT_COUNT
            sub         r7, r7, #COMPONENT_COUNT
3:          str         lr, [sp,#AVAIL_STORE]       /* done with available pixel count */
            add         lr, sp, #OSC_STORE
            ldrd        r8,r9, [lr,#0]              /* need osc, osc_step soon */
            ldrd        r10,r11, [lr,#OSCCTL_STORE-OSC_STORE] /* need osc_ctl too */

            /* copy four taps (width of cubic window) to far end for overflow
             * address handling
             */
            sub         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
            eor         r12, lr, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
            vld1.s16    {d28}, [lr]
.elseif \comp == 2
            vld1.s16    {q14}, [lr]
.elseif \comp == 4
            vld1.s16    {q14,q15}, [lr]
.endif
            add         lr, r12, #CHUNKSIZE * COMPONENT_COUNT * 2
.if \comp == 1
            vst1.s16    {d28}, [lr]
.elseif \comp == 2
            vst1.s16    {q14}, [lr]
.elseif \comp == 4
            vst1.s16    {q14,q15}, [lr]
.endif
            /* r11 contains the maximum possible iteration count, but if r8 is
             * greater than r10 then this indicates that the count must be
             * reduced by one for this iteration to avoid reading past the end
             * of the available data.
             */
            cmp         r10, r8
            sbc         lr, r11, #0

            mla         r8, lr, r9, r8
            sub         r8, r8, #(CHUNKSIZE << 16)
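            /* Equivalent to (a sketch, using the quotient/remainder names from
             * the oscctl comment near the top of the file):
             *     count = osc_ctl.quot - (osc > osc_ctl.rem ? 1 : 0);   // lr
             *     osc  += count * osc_step - (CHUNKSIZE << 16);         // r8
             */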

            str         r8, [sp,#OSC_STORE]         /* done with osc */

            /* prefer to count pixels, rather than vectors, to clarify the tail
             * store case on exit.
             */
            mov         lr, lr, LSL #VECSHIFT
            cmp         lr, r1
            movgt       lr, r1

            sub         r1, r1, lr

            mov         lr, lr, LSL #COMPONENT_SHIFT

            vmov.i16    d10, #3
            vmov.i16    d11, #0x8000

            cmp         lr, #0
            bgt         3f
            cmp         r1, #0
            bgt         1b     /* an extreme case where we shouldn't use code in this structure */
            b           9f

            .align 4
2:          /* Inner loop continues here, but starts at 3:, see end of loop
             * below for explanation. */
.if LOOP_OUTPUT_SIZE == 4
            vst1.u32    {d16[0]}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 8
            vst1.u8     {d16}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 16
            vst1.u8     {q8}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 32
            vst1.u8     {q8,q9}, [r0]!
.endif
            /* Inner loop:  here the four x coefficients for each tap are
             * calculated in vector code, and the addresses are calculated in
             * scalar code, and these calculations are interleaved.
             */
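            /* The vector code below appears to build the usual Catmull-Rom
             * style cubic weights for the fraction x in [0,1), i.e. (a sketch;
             * the Q15 fixed-point scaling and the sign handling described
             * further down are folded into the arithmetic):
             *     w0 = (-x + 2*x*x - x*x*x) / 2
             *     w1 = (2 - 5*x*x + 3*x*x*x) / 2
             *     w2 = (x + 4*x*x - 3*x*x*x) / 2
             *     w3 = (-x*x + x*x*x) / 2
             * with one weight per tap in q0 - q3, eight pixels per register.
             */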
3:          vshr.u16    q8, q6, #1
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            vqrdmulh.s16 q9, q8, q8
            add         r2, r2, r3
            vqrdmulh.s16 q10, q9, q8
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            vshll.s16   q11, d18, #2
            vshll.s16   q12, d19, #2
            add         r2, r2, r3
            vmlsl.s16   q11, d20, d10
            vmlsl.s16   q12, d21, d10
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)

            vhadd.s16   q0, q10, q8
            add         r2, r2, r3
            vsub.s16    q0, q9, q0
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)

            vaddw.s16   q1, q11, d18
            vaddw.s16   q13, q12, d19
            add         r2, r2, r3
            vshrn.s32   d2, q1, #1
            vshrn.s32   d3, q13, #1
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            vsub.s16    d2, d2, d11
            vsub.s16    d3, d3, d11 // TODO: find a wider d11 and use q-reg operation
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)

            vaddw.s16   q2, q11, d16
            vaddw.s16   q13, q12, d17
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            vshrn.s32   d4, q2, #1
            vshrn.s32   d5, q13, #1
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vneg.s16    q2, q2

            vhsub.s16   q3, q10, q9

            /* increment the x fractional parts (overflow is ignored, as the
             * scalar arithmetic shadows this addition with full precision).
             */
            vadd.s16    q6, q6, q7

            /* At this point we have four pointers in r8-r11, pointing to the
             * four taps in the scratch buffer that must be convolved together
             * to produce an output pixel (one output pixel per pointer).
             * These pointers usually overlap, but their spacing is irregular
             * so resolving the redundancy through L1 is a pragmatic solution.
             *
             * The scratch buffer is made of signed 16-bit data, holding over
             * some extra precision, and overshoot, from the vertical pass.
             *
             * We also have the 16-bit unsigned fixed-point weights for each
             * of the four taps in q0 - q3.  That's eight pixels worth of
             * coefficients when we have only four pointers, so calculations
             * for four more pixels are interleaved with the fetch and permute
             * code for each variant in the following code.
             *
             * The data arrangement is less than ideal for any pixel format,
             * but permuting loads help to mitigate most of the problems.
             *
             * Note also that the two outside taps of a bicubic are negative,
             * but these coefficients are unsigned.  The sign is hard-coded by
             * use of multiply-and-subtract operations.
             */
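            /* For a single one-component output pixel the arithmetic below
             * amounts to roughly (a sketch; t0..t3 are the four s16 taps,
             * c0..c3 the per-tap weights in q0 - q3 with the sign convention
             * noted above, and sat16()/sat_u8() denote saturation to s16/u8):
             *     int32_t acc = t0*c0 - t1*c1 - t2*c2 + t3*c3;
             *     int16_t mid = sat16((acc + (1 << 14)) >> 15);       // vqrshrn #15
             *     uint8_t out = sat_u8((mid + (1 << (VERTBITS - 9)))
             *                          >> (VERTBITS - 8));            // vqrshrun
             */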
.if \comp == 1
            /* The uchar1 case.
             * Issue one lanewise vld4.s16 to load four consecutive pixels from
             * one pointer (one pixel) into four different registers; then load
             * four consecutive s16 values from the next pointer (pixel) into
             * the next lane of those four registers, etc., so that we finish
             * with q12 - q15 representing the four taps, and each lane
             * representing a separate pixel.
             *
             * The first vld4 uses a splat to avoid any false dependency on
             * the previous state of the register.
             */
            vld4.s16    {d24[],d26[],d28[],d30[]}, [r8]
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d24[1],d26[1],d28[1],d30[1]}, [r9]
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d24[2],d26[2],d28[2],d30[2]}, [r10]
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d24[3],d26[3],d28[3],d30[3]}, [r11]
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.s16    {d25[],d27[],d29[],d31[]}, [r8]
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vld4.s16    {d25[1],d27[1],d29[1],d31[1]}, [r9]
            vld4.s16    {d25[2],d27[2],d29[2],d31[2]}, [r10]
            vld4.s16    {d25[3],d27[3],d29[3],d31[3]}, [r11]

            vmull.s16   q8, d24, d0
            vmull.s16   q9, d25, d1
            vmlsl.s16   q8, d26, d2
            vmlsl.s16   q9, d27, d3
            vmlsl.s16   q8, d28, d4
            vmlsl.s16   q9, d29, d5
            vmlal.s16   q8, d30, d6
            vmlal.s16   q9, d31, d7

            subs        lr, lr, #LOOP_OUTPUT_SIZE

            vqrshrn.s32 d16, q8, #15
            vqrshrn.s32 d17, q9, #15

            vqrshrun.s16 d16, q8, #VERTBITS - 8
.elseif \comp == 2
            /* The uchar2 case:
             * This time load pairs of values into adjacent lanes in q12 - q15
             * by aliasing them as u32 data; leaving room for only four pixels,
             * so the process has to be done twice.  This also means that the
             * coefficient registers fail to align with the coefficient data
             * (eight separate pixels), so that has to be doubled-up to match.
             */
            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3

            /* double-up coefficients to align with component pairs */
            vmov        d20, d0
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vmov        d21, d2
            vmov        d22, d4
            vmov        d23, d6
            vzip.s16    d0, d20
            vzip.s16    d2, d21
            vzip.s16    d4, d22
            vzip.s16    d6, d23

            vmull.s16   q8, d24, d0
            vmull.s16   q9, d25, d20
            vmlsl.s16   q8, d26, d2
            vmlsl.s16   q9, d27, d21
            vmlsl.s16   q8, d28, d4
            vmlsl.s16   q9, d29, d22
            vmlal.s16   q8, d30, d6
            vmlal.s16   q9, d31, d23

            vqrshrn.s32 d16, q8, #15
            vqrshrn.s32 d17, q9, #15

            vld4.u32    {d24[],d26[],d28[],d30[]}, [r8]
            vld4.u32    {d24[1],d26[1],d28[1],d30[1]}, [r9]
            vld4.u32    {d25[],d27[],d29[],d31[]}, [r10]
            vld4.u32    {d25[1],d27[1],d29[1],d31[1]}, [r11]

            /* double-up coefficients to align with component pairs */
            vmov        d0, d1
            vmov        d2, d3
            vmov        d4, d5
            vmov        d6, d7
            vzip.s16    d0, d1
            vzip.s16    d2, d3
            vzip.s16    d4, d5
            vzip.s16    d6, d7

            vmull.s16   q10, d24, d0
            vmull.s16   q11, d25, d1
            vmlsl.s16   q10, d26, d2
            vmlsl.s16   q11, d27, d3
            vmlsl.s16   q10, d28, d4
            vmlsl.s16   q11, d29, d5
            vmlal.s16   q10, d30, d6
            vmlal.s16   q11, d31, d7

            subs        lr, lr, #LOOP_OUTPUT_SIZE

            vqrshrn.s32 d18, q10, #15
            vqrshrn.s32 d19, q11, #15

            vqrshrun.s16 d16, q8, #VERTBITS - 8
            vqrshrun.s16 d17, q9, #VERTBITS - 8
.elseif \comp == 4
            /* The uchar4 case.
             * This case is comparatively painless because four s16s are the
             * smallest addressable unit for a vmul-by-scalar.  Rather than
             * permute the data, simply arrange the multiplies to suit the way
             * the data comes in.  That's a lot of data, though, so things
             * progress in pairs of pixels at a time.
             */
            vld1.s16    {q12,q13}, [r8]
            mov         r8, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld1.s16    {q14,q15}, [r9]
            add         r8, sp, r8, LSL #(COMPONENT_SHIFT + 1)
            mov         r9, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3

            vmull.s16   q8, d24, d0[0]
            vmull.s16   q9, d28, d0[1]
            vmlsl.s16   q8, d25, d2[0]
            vmlsl.s16   q9, d29, d2[1]
            vmlsl.s16   q8, d26, d4[0]
            vmlsl.s16   q9, d30, d4[1]
            vmlal.s16   q8, d27, d6[0]
            vmlal.s16   q9, d31, d6[1]

            /* And two more...  */
            vld1.s16    {q12,q13}, [r10]
            add         r9, sp, r9, LSL #(COMPONENT_SHIFT + 1)
            mov         r10, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3
            vld1.s16    {q14,q15}, [r11]
            add         r10, sp, r10, LSL #(COMPONENT_SHIFT + 1)
            mov         r11, r2, LSR #(31 - CHUNKSHIFT)
            add         r2, r2, r3

            vqrshrn.s32 d16, q8, #15
            add         r11, sp, r11, LSL #(COMPONENT_SHIFT + 1)
            vqrshrn.s32 d17, q9, #15

            vmull.s16   q10, d24, d0[2]
            vmull.s16   q11, d28, d0[3]
            vmlsl.s16   q10, d25, d2[2]
            vmlsl.s16   q11, d29, d2[3]
            vmlsl.s16   q10, d26, d4[2]
            vmlsl.s16   q11, d30, d4[3]
            vmlal.s16   q10, d27, d6[2]
            vmlal.s16   q11, d31, d6[3]

            vqrshrn.s32 d18, q10, #15
            vqrshrn.s32 d19, q11, #15

            vqrshrun.s16 d16, q8, #VERTBITS - 8
            vqrshrun.s16 d17, q9, #VERTBITS - 8

            /* And two more...  */
            vld1.s16    {q12,q13}, [r8]
            vld1.s16    {q14,q15}, [r9]

            vmull.s16   q10, d24, d1[0]
            vmull.s16   q11, d28, d1[1]
            vmlsl.s16   q10, d25, d3[0]
            vmlsl.s16   q11, d29, d3[1]
            vmlsl.s16   q10, d26, d5[0]
            vmlsl.s16   q11, d30, d5[1]
            vmlal.s16   q10, d27, d7[0]
            vmlal.s16   q11, d31, d7[1]

            /* And two more...  */
            vld1.s16    {q12,q13}, [r10]
            vld1.s16    {q14,q15}, [r11]

            subs        lr, lr, #LOOP_OUTPUT_SIZE

            vqrshrn.s32 d18, q10, #15
            vqrshrn.s32 d19, q11, #15

            vmull.s16   q10, d24, d1[2]
            vmull.s16   q11, d28, d1[3]
            vmlsl.s16   q10, d25, d3[2]
            vmlsl.s16   q11, d29, d3[3]
            vmlsl.s16   q10, d26, d5[2]
            vmlsl.s16   q11, d30, d5[3]
            vmlal.s16   q10, d27, d7[2]
            vmlal.s16   q11, d31, d7[3]

            vqrshrn.s32 d20, q10, #15
            vqrshrn.s32 d21, q11, #15

            vqrshrun.s16 d18, q9, #VERTBITS - 8
            vqrshrun.s16 d19, q10, #VERTBITS - 8
.endif
            bgt         2b      /* continue inner loop */
            /* The inner loop has already been limited to ensure that none of
             * the earlier iterations could overfill the output, so the store
             * appears within the loop but after the conditional branch (at the
             * top).  At the end, provided it won't overfill, perform the final
             * store here.  If it would, then break out to the tricky tail case
             * instead.
             */
            blt         1f
            /* Store the amount of data appropriate to the configuration of the
             * instance being assembled.
             */
.if LOOP_OUTPUT_SIZE == 4
            vst1.u32    {d16[0]}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 8
            vst1.u8     {d16}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 16
            vst1.u8     {q8}, [r0]!
.elseif LOOP_OUTPUT_SIZE == 32
            vst1.u8     {q8,q9}, [r0]!
.endif
            b           1b              /* resume outer loop */
            /* Partial tail store case:
             * Different versions of the code need different subsets of the
             * following partial stores.  Here the number of components and the
             * size of the chunk of data produced by each inner loop iteration
             * are tested to figure out whether or not each phrase is relevant.
             */
.if 16 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 16
1:          tst         lr, #16
            beq         1f
            vst1.u8     {q8}, [r0]!
            vmov        q8, q9
.endif
.if 8 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 8
1:          tst         lr, #8
            beq         1f
            vst1.u8     {d16}, [r0]!
            vmov.u8     d16, d17
.endif
.if 4 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 4
1:          tst         lr, #4
            beq         1f
            vst1.u32    {d16[0]}, [r0]!
            vext.u32    d16, d16, d16, #1
.endif
.if 2 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 2
1:          tst         lr, #2
            beq         1f
            vst1.u16    {d16[0]}, [r0]!
            vext.u16    d16, d16, d16, #1
.endif
.if 1 < LOOP_OUTPUT_SIZE && COMPONENT_COUNT <= 1
1:          tst         lr, #1
            beq         1f
            vst1.u8     {d16[0]}, [r0]!
.endif
1:
9:          ldr         sp, [sp,#SP_STORE]
            vpop        {d8-d15}
            pop         {r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
END(rsdIntrinsicResizeB\comp\()_K)
.endr