      1 /*
      2  * ARMv7 NEON optimizations for libjpeg-turbo
      3  *
      4  * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
      5  * All rights reserved.
      6  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
      7  * Copyright (C) 2014 Siarhei Siamashka.  All Rights Reserved.
      8  * Copyright (C) 2014 Linaro Limited.  All Rights Reserved.
      9  *
     10  * This software is provided 'as-is', without any express or implied
     11  * warranty.  In no event will the authors be held liable for any damages
     12  * arising from the use of this software.
     13  *
     14  * Permission is granted to anyone to use this software for any purpose,
     15  * including commercial applications, and to alter it and redistribute it
     16  * freely, subject to the following restrictions:
     17  *
     18  * 1. The origin of this software must not be misrepresented; you must not
     19  *    claim that you wrote the original software. If you use this software
     20  *    in a product, an acknowledgment in the product documentation would be
     21  *    appreciated but is not required.
     22  * 2. Altered source versions must be plainly marked as such, and must not be
     23  *    misrepresented as being the original software.
     24  * 3. This notice may not be removed or altered from any source distribution.
     25  */
     26 
     27 #if defined(__linux__) && defined(__ELF__)
     28 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
     29 #endif
     30 
     31 .text
     32 .fpu neon
     33 .arch armv7a
     34 .object_arch armv4
     35 .arm
     36 
     37 
     38 #define RESPECT_STRICT_ALIGNMENT 1
     39 
     40 
     41 /*****************************************************************************/
     42 
     43 /* Supplementary macro for setting function attributes */
     44 .macro asm_function fname
     45 #ifdef __APPLE__
     46     .globl _\fname
     47 _\fname:
     48 #else
     49     .global \fname
     50 #ifdef __ELF__
     51     .hidden \fname
     52     .type \fname, %function
     53 #endif
     54 \fname:
     55 #endif
     56 .endm
     57 
     58 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
     59 .macro transpose_4x4 x0, x1, x2, x3
     60     vtrn.16 \x0, \x1
     61     vtrn.16 \x2, \x3
     62     vtrn.32 \x0, \x2
     63     vtrn.32 \x1, \x3
     64 .endm
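        /*
         * A note on the macro above (illustrative only): if x0..x3 initially hold
         * the rows {a0,a1,a2,a3}, {b0,b1,b2,b3}, {c0,c1,c2,c3}, {d0,d1,d2,d3},
         * the VTRN.16 pair interleaves neighbouring 16-bit elements and the
         * VTRN.32 pair then exchanges 32-bit halves, leaving x0..x3 holding the
         * columns {a0,b0,c0,d0} .. {a3,b3,c3,d3}, i.e. the transposed 4x4 block.
         */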
     65 
     66 
     67 #define CENTERJSAMPLE 128
     68 
     69 /*****************************************************************************/
     70 
     71 /*
     72  * Perform dequantization and inverse DCT on one block of coefficients.
     73  *
     74  * GLOBAL(void)
     75  * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
     76  *                        JSAMPARRAY output_buf, JDIMENSION output_col)
     77  */
     78 
     79 #define FIX_0_298631336  (2446)
     80 #define FIX_0_390180644  (3196)
     81 #define FIX_0_541196100  (4433)
     82 #define FIX_0_765366865  (6270)
     83 #define FIX_0_899976223  (7373)
     84 #define FIX_1_175875602  (9633)
     85 #define FIX_1_501321110  (12299)
     86 #define FIX_1_847759065  (15137)
     87 #define FIX_1_961570560  (16069)
     88 #define FIX_2_053119869  (16819)
     89 #define FIX_2_562915447  (20995)
     90 #define FIX_3_072711026  (25172)
     91 
     92 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
     93 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
     94 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
     95 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
     96 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
     97 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
     98 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
     99 #define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
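        /*
         * For reference (a sketch, not used by the code below): the values above are
         * the same scaled fixed-point constants as in jidctint.c, i.e. FIX(x) with
         * CONST_BITS = 13:
         *
         *   #define CONST_BITS  13
         *   #define FIX(x)      ((INT32) ((x) * (1 << CONST_BITS) + 0.5))
         *   // FIX(0.541196100) == 4433  == FIX_0_541196100
         *   // FIX(1.847759065) == 15137 == FIX_1_847759065
         */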
    100 
    101 /*
    102  * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
    103  * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
    104  */
    105 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
    106 {                                                                             \
    107     DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
    108     INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
    109     INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
    110                                                                               \
    111     /* 1-D iDCT input data */                                                 \
    112     row0 = xrow0;                                                             \
    113     row1 = xrow1;                                                             \
    114     row2 = xrow2;                                                             \
    115     row3 = xrow3;                                                             \
    116     row4 = xrow4;                                                             \
    117     row5 = xrow5;                                                             \
    118     row6 = xrow6;                                                             \
    119     row7 = xrow7;                                                             \
    120                                                                               \
    121     q5 = row7 + row3;                                                         \
    122     q4 = row5 + row1;                                                         \
    123     q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
    124          MULTIPLY(q4, FIX_1_175875602);                                       \
    125     q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
    126          MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
    127     q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
    128          MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
    129     q4 = q6;                                                                  \
    130     q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
    131     q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
    132           MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
    133     /* now we can use q1 (reloadable constants have been used up) */          \
    134     q1 = q3 + q2;                                                             \
    135     q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
    136           MULTIPLY(row1, -FIX_0_899976223);                                   \
    137     q5 = q7;                                                                  \
    138     q1 = q1 + q6;                                                             \
    139     q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
    140           MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
    141                                                                               \
    142     /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
    143     tmp11_plus_tmp2 = q1;                                                     \
    144     row1 = 0;                                                                 \
    145                                                                               \
    146     q1 = q1 - q6;                                                             \
    147     q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
    148           MULTIPLY(row3, -FIX_2_562915447);                                   \
    149     q1 = q1 - q6;                                                             \
    150     q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
    151          MULTIPLY(row6, FIX_0_541196100);                                     \
    152     q3 = q3 - q2;                                                             \
    153                                                                               \
    154     /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
    155     tmp11_minus_tmp2 = q1;                                                    \
    156                                                                               \
    157     q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
    158     q2 = q1 + q6;                                                             \
    159     q1 = q1 - q6;                                                             \
    160                                                                               \
    161     /* pick up the results */                                                 \
    162     tmp0  = q4;                                                               \
    163     tmp1  = q5;                                                               \
    164     tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
    165     tmp3  = q7;                                                               \
    166     tmp10 = q2;                                                               \
    167     tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
    168     tmp12 = q3;                                                               \
    169     tmp13 = q1;                                                               \
    170 }
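        /*
         * A hypothetical driver for the reference macro above (assumes the usual
         * DCTELEM/INT32 types and the DESCALE()/PASS1_BITS definitions from
         * jidctint.c); it only documents how the tmp* outputs map onto the eight
         * samples of one 1-D row pass:
         *
         *   INT32 tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
         *   REF_1D_IDCT(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7]);
         *   out[0] = (DCTELEM) DESCALE(tmp10 + tmp3, CONST_BITS - PASS1_BITS);
         *   out[7] = (DCTELEM) DESCALE(tmp10 - tmp3, CONST_BITS - PASS1_BITS);
         *   out[1] = (DCTELEM) DESCALE(tmp11 + tmp2, CONST_BITS - PASS1_BITS);
         *   out[6] = (DCTELEM) DESCALE(tmp11 - tmp2, CONST_BITS - PASS1_BITS);
         *   out[2] = (DCTELEM) DESCALE(tmp12 + tmp1, CONST_BITS - PASS1_BITS);
         *   out[5] = (DCTELEM) DESCALE(tmp12 - tmp1, CONST_BITS - PASS1_BITS);
         *   out[3] = (DCTELEM) DESCALE(tmp13 + tmp0, CONST_BITS - PASS1_BITS);
         *   out[4] = (DCTELEM) DESCALE(tmp13 - tmp0, CONST_BITS - PASS1_BITS);
         */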
    171 
    172 #define XFIX_0_899976223                    d0[0]
    173 #define XFIX_0_541196100                    d0[1]
    174 #define XFIX_2_562915447                    d0[2]
    175 #define XFIX_0_298631336_MINUS_0_899976223  d0[3]
    176 #define XFIX_1_501321110_MINUS_0_899976223  d1[0]
    177 #define XFIX_2_053119869_MINUS_2_562915447  d1[1]
    178 #define XFIX_0_541196100_PLUS_0_765366865   d1[2]
    179 #define XFIX_1_175875602                    d1[3]
    180 #define XFIX_1_175875602_MINUS_0_390180644  d2[0]
    181 #define XFIX_0_541196100_MINUS_1_847759065  d2[1]
    182 #define XFIX_3_072711026_MINUS_2_562915447  d2[2]
    183 #define XFIX_1_175875602_MINUS_1_961570560  d2[3]
    184 
    185 .balign 16
    186 jsimd_idct_islow_neon_consts:
    187     .short FIX_0_899976223                    /* d0[0] */
    188     .short FIX_0_541196100                    /* d0[1] */
    189     .short FIX_2_562915447                    /* d0[2] */
    190     .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    191     .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    192     .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    193     .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    194     .short FIX_1_175875602                    /* d1[3] */
    195     /* reloadable constants */
    196     .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    197     .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    198     .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    199     .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
    200 
    201 asm_function jsimd_idct_islow_neon
    202 
    203     DCT_TABLE       .req r0
    204     COEF_BLOCK      .req r1
    205     OUTPUT_BUF      .req r2
    206     OUTPUT_COL      .req r3
    207     TMP1            .req r0
    208     TMP2            .req r1
    209     TMP3            .req r2
    210     TMP4            .req ip
    211 
    212     ROW0L           .req d16
    213     ROW0R           .req d17
    214     ROW1L           .req d18
    215     ROW1R           .req d19
    216     ROW2L           .req d20
    217     ROW2R           .req d21
    218     ROW3L           .req d22
    219     ROW3R           .req d23
    220     ROW4L           .req d24
    221     ROW4R           .req d25
    222     ROW5L           .req d26
    223     ROW5R           .req d27
    224     ROW6L           .req d28
    225     ROW6R           .req d29
    226     ROW7L           .req d30
    227     ROW7R           .req d31
    228 
    229     /* Load and dequantize coefficients into NEON registers
    230      * with the following allocation:
    231      *       0 1 2 3 | 4 5 6 7
    232      *      ---------+--------
    233      *   0 | d16     | d17     ( q8  )
    234      *   1 | d18     | d19     ( q9  )
    235      *   2 | d20     | d21     ( q10 )
    236      *   3 | d22     | d23     ( q11 )
    237      *   4 | d24     | d25     ( q12 )
    238      *   5 | d26     | d27     ( q13 )
    239      *   6 | d28     | d29     ( q14 )
    240      *   7 | d30     | d31     ( q15 )
    241      */
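            /* Scalar equivalent of the load + dequantize sequence below (a sketch;
             * DCTELEM as in jpeglib, one multiply per coefficient):
             *
             *   for (i = 0; i < 64; i++)
             *     workspace[i] = (DCTELEM) (coef_block[i] * dct_table[i]);
             */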
    242     adr             ip, jsimd_idct_islow_neon_consts
    243     vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    244     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    245     vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    246     vmul.s16        q8, q8, q0
    247     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    248     vmul.s16        q9, q9, q1
    249     vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    250     vmul.s16        q10, q10, q2
    251     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    252     vmul.s16        q11, q11, q3
    253     vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    254     vmul.s16        q12, q12, q0
    255     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    256     vmul.s16        q14, q14, q2
    257     vmul.s16        q13, q13, q1
    258     vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
    259     add             ip, ip, #16
    260     vmul.s16        q15, q15, q3
    261     vpush           {d8-d15} /* save NEON registers */
    262     /* 1-D IDCT, pass 1, left 4x8 half */
    263     vadd.s16        d4,    ROW7L, ROW3L
    264     vadd.s16        d5,    ROW5L, ROW1L
    265     vmull.s16       q6,    d4,    XFIX_1_175875602_MINUS_1_961570560
    266     vmlal.s16       q6,    d5,    XFIX_1_175875602
    267     vmull.s16       q7,    d4,    XFIX_1_175875602
    268       /* Check for the zero coefficients in the right 4x8 half */
    269       push            {r4, r5}
    270     vmlal.s16       q7,    d5,    XFIX_1_175875602_MINUS_0_390180644
    271     vsubl.s16       q3,    ROW0L, ROW4L
    272       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    273     vmull.s16       q2,    ROW2L, XFIX_0_541196100
    274     vmlal.s16       q2,    ROW6L, XFIX_0_541196100_MINUS_1_847759065
    275       orr             r0,    r4,    r5
    276     vmov            q4,    q6
    277     vmlsl.s16       q6,    ROW5L, XFIX_2_562915447
    278       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    279     vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    280     vshl.s32        q3,    q3,    #13
    281       orr             r0,    r0,    r4
    282     vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
    283       orr             r0,    r0,    r5
    284     vadd.s32        q1,    q3,    q2
    285       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    286     vmov            q5,    q7
    287     vadd.s32        q1,    q1,    q6
    288       orr             r0,    r0,    r4
    289     vmlsl.s16       q7,    ROW7L, XFIX_0_899976223
    290       orr             r0,    r0,    r5
    291     vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    292     vrshrn.s32      ROW1L, q1,    #11
    293       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    294     vsub.s32        q1,    q1,    q6
    295     vmlal.s16       q5,    ROW5L, XFIX_2_053119869_MINUS_2_562915447
    296       orr             r0,    r0,    r4
    297     vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
    298       orr             r0,    r0,    r5
    299     vsub.s32        q1,    q1,    q6
    300     vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
    301       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    302     vmlal.s16       q6,    ROW6L, XFIX_0_541196100
    303     vsub.s32        q3,    q3,    q2
    304       orr             r0,    r0,    r4
    305     vrshrn.s32      ROW6L, q1,    #11
    306       orr             r0,    r0,    r5
    307     vadd.s32        q1,    q3,    q5
    308       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    309     vsub.s32        q3,    q3,    q5
    310     vaddl.s16       q5,    ROW0L, ROW4L
    311       orr             r0,    r0,    r4
    312     vrshrn.s32      ROW2L, q1,    #11
    313       orr             r0,    r0,    r5
    314     vrshrn.s32      ROW5L, q3,    #11
    315       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    316     vshl.s32        q5,    q5,    #13
    317     vmlal.s16       q4,    ROW7L, XFIX_0_298631336_MINUS_0_899976223
    318       orr             r0,    r0,    r4
    319     vadd.s32        q2,    q5,    q6
    320       orrs            r0,    r0,    r5
    321     vsub.s32        q1,    q5,    q6
    322     vadd.s32        q6,    q2,    q7
    323       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    324     vsub.s32        q2,    q2,    q7
    325     vadd.s32        q5,    q1,    q4
    326       orr             r0,    r4,    r5
    327     vsub.s32        q3,    q1,    q4
    328       pop             {r4, r5}
    329     vrshrn.s32      ROW7L, q2,    #11
    330     vrshrn.s32      ROW3L, q5,    #11
    331     vrshrn.s32      ROW0L, q6,    #11
    332     vrshrn.s32      ROW4L, q3,    #11
    333 
    334       beq             3f /* Go to do some special handling for the sparse right 4x8 half */
    335 
    336     /* 1-D IDCT, pass 1, right 4x8 half */
    337     vld1.s16        {d2},  [ip, :64]    /* reload constants */
    338     vadd.s16        d10,   ROW7R, ROW3R
    339     vadd.s16        d8,    ROW5R, ROW1R
    340       /* Transpose left 4x8 half */
    341       vtrn.16         ROW6L, ROW7L
    342     vmull.s16       q6,    d10,   XFIX_1_175875602_MINUS_1_961570560
    343     vmlal.s16       q6,    d8,    XFIX_1_175875602
    344       vtrn.16         ROW2L, ROW3L
    345     vmull.s16       q7,    d10,   XFIX_1_175875602
    346     vmlal.s16       q7,    d8,    XFIX_1_175875602_MINUS_0_390180644
    347       vtrn.16         ROW0L, ROW1L
    348     vsubl.s16       q3,    ROW0R, ROW4R
    349     vmull.s16       q2,    ROW2R, XFIX_0_541196100
    350     vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
    351       vtrn.16         ROW4L, ROW5L
    352     vmov            q4,    q6
    353     vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
    354     vmlal.s16       q6,    ROW3R, XFIX_3_072711026_MINUS_2_562915447
    355       vtrn.32         ROW1L, ROW3L
    356     vshl.s32        q3,    q3,    #13
    357     vmlsl.s16       q4,    ROW1R, XFIX_0_899976223
    358       vtrn.32         ROW4L, ROW6L
    359     vadd.s32        q1,    q3,    q2
    360     vmov            q5,    q7
    361     vadd.s32        q1,    q1,    q6
    362       vtrn.32         ROW0L, ROW2L
    363     vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
    364     vmlal.s16       q7,    ROW1R, XFIX_1_501321110_MINUS_0_899976223
    365     vrshrn.s32      ROW1R, q1,    #11
    366       vtrn.32         ROW5L, ROW7L
    367     vsub.s32        q1,    q1,    q6
    368     vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
    369     vmlsl.s16       q5,    ROW3R, XFIX_2_562915447
    370     vsub.s32        q1,    q1,    q6
    371     vmull.s16       q6,    ROW2R, XFIX_0_541196100_PLUS_0_765366865
    372     vmlal.s16       q6,    ROW6R, XFIX_0_541196100
    373     vsub.s32        q3,    q3,    q2
    374     vrshrn.s32      ROW6R, q1,    #11
    375     vadd.s32        q1,    q3,    q5
    376     vsub.s32        q3,    q3,    q5
    377     vaddl.s16       q5,    ROW0R, ROW4R
    378     vrshrn.s32      ROW2R, q1,    #11
    379     vrshrn.s32      ROW5R, q3,    #11
    380     vshl.s32        q5,    q5,    #13
    381     vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
    382     vadd.s32        q2,    q5,    q6
    383     vsub.s32        q1,    q5,    q6
    384     vadd.s32        q6,    q2,    q7
    385     vsub.s32        q2,    q2,    q7
    386     vadd.s32        q5,    q1,    q4
    387     vsub.s32        q3,    q1,    q4
    388     vrshrn.s32      ROW7R, q2,    #11
    389     vrshrn.s32      ROW3R, q5,    #11
    390     vrshrn.s32      ROW0R, q6,    #11
    391     vrshrn.s32      ROW4R, q3,    #11
    392     /* Transpose right 4x8 half */
    393     vtrn.16         ROW6R, ROW7R
    394     vtrn.16         ROW2R, ROW3R
    395     vtrn.16         ROW0R, ROW1R
    396     vtrn.16         ROW4R, ROW5R
    397     vtrn.32         ROW1R, ROW3R
    398     vtrn.32         ROW4R, ROW6R
    399     vtrn.32         ROW0R, ROW2R
    400     vtrn.32         ROW5R, ROW7R
    401 
    402 1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    403     vld1.s16        {d2},  [ip, :64]    /* reload constants */
    404     vmull.s16       q6,    ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    405     vmlal.s16       q6,    ROW1L, XFIX_1_175875602
    406     vmlal.s16       q6,    ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    407     vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
    408     vmull.s16       q7,    ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    409     vmlal.s16       q7,    ROW3L, XFIX_1_175875602
    410     vmlal.s16       q7,    ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    411     vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
    412     vsubl.s16       q3,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
    413     vmull.s16       q2,    ROW2L, XFIX_0_541196100
    414     vmlal.s16       q2,    ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
    415     vmov            q4,    q6
    416     vmlsl.s16       q6,    ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
    417     vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    418     vshl.s32        q3,    q3,    #13
    419     vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
    420     vadd.s32        q1,    q3,    q2
    421     vmov            q5,    q7
    422     vadd.s32        q1,    q1,    q6
    423     vmlsl.s16       q7,    ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
    424     vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    425     vshrn.s32       ROW1L, q1,    #16
    426     vsub.s32        q1,    q1,    q6
    427     vmlal.s16       q5,    ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
    428     vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
    429     vsub.s32        q1,    q1,    q6
    430     vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
    431     vmlal.s16       q6,    ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    432     vsub.s32        q3,    q3,    q2
    433     vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
    434     vadd.s32        q1,    q3,    q5
    435     vsub.s32        q3,    q3,    q5
    436     vaddl.s16       q5,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
    437     vshrn.s32       ROW2L, q1,    #16
    438     vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
    439     vshl.s32        q5,    q5,    #13
    440     vmlal.s16       q4,    ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
    441     vadd.s32        q2,    q5,    q6
    442     vsub.s32        q1,    q5,    q6
    443     vadd.s32        q6,    q2,    q7
    444     vsub.s32        q2,    q2,    q7
    445     vadd.s32        q5,    q1,    q4
    446     vsub.s32        q3,    q1,    q4
    447     vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
    448     vshrn.s32       ROW3L, q5,    #16
    449     vshrn.s32       ROW0L, q6,    #16
    450     vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
    451     /* 1-D IDCT, pass 2, right 4x8 half */
    452     vld1.s16        {d2},  [ip, :64]    /* reload constants */
    453     vmull.s16       q6,    ROW5R, XFIX_1_175875602
    454     vmlal.s16       q6,    ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    455     vmlal.s16       q6,    ROW7R, XFIX_1_175875602_MINUS_1_961570560
    456     vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    457     vmull.s16       q7,    ROW7R, XFIX_1_175875602
    458     vmlal.s16       q7,    ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    459     vmlal.s16       q7,    ROW5R, XFIX_1_175875602_MINUS_0_390180644
    460     vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    461     vsubl.s16       q3,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
    462     vmull.s16       q2,    ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    463     vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
    464     vmov            q4,    q6
    465     vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
    466     vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
    467     vshl.s32        q3,    q3,    #13
    468     vmlsl.s16       q4,    ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
    469     vadd.s32        q1,    q3,    q2
    470     vmov            q5,    q7
    471     vadd.s32        q1,    q1,    q6
    472     vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
    473     vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
    474     vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
    475     vsub.s32        q1,    q1,    q6
    476     vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
    477     vmlsl.s16       q5,    ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
    478     vsub.s32        q1,    q1,    q6
    479     vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
    480     vmlal.s16       q6,    ROW6R, XFIX_0_541196100
    481     vsub.s32        q3,    q3,    q2
    482     vshrn.s32       ROW6R, q1,    #16
    483     vadd.s32        q1,    q3,    q5
    484     vsub.s32        q3,    q3,    q5
    485     vaddl.s16       q5,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
    486     vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
    487     vshrn.s32       ROW5R, q3,    #16
    488     vshl.s32        q5,    q5,    #13
    489     vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
    490     vadd.s32        q2,    q5,    q6
    491     vsub.s32        q1,    q5,    q6
    492     vadd.s32        q6,    q2,    q7
    493     vsub.s32        q2,    q2,    q7
    494     vadd.s32        q5,    q1,    q4
    495     vsub.s32        q3,    q1,    q4
    496     vshrn.s32       ROW7R, q2,    #16
    497     vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
    498     vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
    499     vshrn.s32       ROW4R, q3,    #16
    500 
    501 2:  /* Descale to 8-bit and range limit */
    502     vqrshrn.s16     d16,   q8,    #2
    503     vqrshrn.s16     d17,   q9,    #2
    504     vqrshrn.s16     d18,   q10,   #2
    505     vqrshrn.s16     d19,   q11,   #2
    506     vpop            {d8-d15} /* restore NEON registers */
    507     vqrshrn.s16     d20,   q12,   #2
    508       /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    509       vtrn.16         q8,    q9
    510     vqrshrn.s16     d21,   q13,   #2
    511     vqrshrn.s16     d22,   q14,   #2
    512       vmov.u8         q0,    #(CENTERJSAMPLE)
    513     vqrshrn.s16     d23,   q15,   #2
    514       vtrn.8          d16,   d17
    515       vtrn.8          d18,   d19
    516       vadd.u8         q8,    q8,    q0
    517       vadd.u8         q9,    q9,    q0
    518       vtrn.16         q10,   q11
    519         /* Store results to the output buffer */
    520         ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    521         add             TMP1, TMP1, OUTPUT_COL
    522         add             TMP2, TMP2, OUTPUT_COL
    523         vst1.8          {d16}, [TMP1]
    524       vtrn.8          d20, d21
    525         vst1.8          {d17}, [TMP2]
    526         ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    527         add             TMP1, TMP1, OUTPUT_COL
    528         add             TMP2, TMP2, OUTPUT_COL
    529         vst1.8          {d18}, [TMP1]
    530       vadd.u8         q10,   q10,   q0
    531         vst1.8          {d19}, [TMP2]
    532         ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    533         add             TMP1, TMP1, OUTPUT_COL
    534         add             TMP2, TMP2, OUTPUT_COL
    535         add             TMP3, TMP3, OUTPUT_COL
    536         add             TMP4, TMP4, OUTPUT_COL
    537       vtrn.8          d22, d23
    538         vst1.8          {d20}, [TMP1]
    539       vadd.u8         q11,   q11,   q0
    540         vst1.8          {d21}, [TMP2]
    541         vst1.8          {d22}, [TMP3]
    542         vst1.8          {d23}, [TMP4]
    543     bx              lr
    544 
    545 3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
    546 
    547     /* Transpose left 4x8 half */
    548     vtrn.16         ROW6L, ROW7L
    549     vtrn.16         ROW2L, ROW3L
    550     vtrn.16         ROW0L, ROW1L
    551     vtrn.16         ROW4L, ROW5L
    552     vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
    553     vtrn.32         ROW1L, ROW3L
    554     vtrn.32         ROW4L, ROW6L
    555     vtrn.32         ROW0L, ROW2L
    556     vtrn.32         ROW5L, ROW7L
    557 
    558     cmp             r0, #0
    559     beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
    560 
    561     /* Only row 0 is non-zero for the right 4x8 half  */
    562     vdup.s16        ROW1R, ROW0R[1]
    563     vdup.s16        ROW2R, ROW0R[2]
    564     vdup.s16        ROW3R, ROW0R[3]
    565     vdup.s16        ROW4R, ROW0R[0]
    566     vdup.s16        ROW5R, ROW0R[1]
    567     vdup.s16        ROW6R, ROW0R[2]
    568     vdup.s16        ROW7R, ROW0R[3]
    569     vdup.s16        ROW0R, ROW0R[0]
    570     b               1b /* Go to 'normal' second pass */
    571 
    572 4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    573     vld1.s16        {d2},  [ip, :64]    /* reload constants */
    574     vmull.s16       q6,    ROW1L, XFIX_1_175875602
    575     vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
    576     vmull.s16       q7,    ROW3L, XFIX_1_175875602
    577     vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
    578     vmull.s16       q2,    ROW2L, XFIX_0_541196100
    579     vshll.s16       q3,    ROW0L, #13
    580     vmov            q4,    q6
    581     vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    582     vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
    583     vadd.s32        q1,    q3,    q2
    584     vmov            q5,    q7
    585     vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    586     vadd.s32        q1,    q1,    q6
    587     vadd.s32        q6,    q6,    q6
    588     vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
    589     vshrn.s32       ROW1L, q1,    #16
    590     vsub.s32        q1,    q1,    q6
    591     vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
    592     vsub.s32        q3,    q3,    q2
    593     vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
    594     vadd.s32        q1,    q3,    q5
    595     vsub.s32        q3,    q3,    q5
    596     vshll.s16       q5,    ROW0L, #13
    597     vshrn.s32       ROW2L, q1,    #16
    598     vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
    599     vadd.s32        q2,    q5,    q6
    600     vsub.s32        q1,    q5,    q6
    601     vadd.s32        q6,    q2,    q7
    602     vsub.s32        q2,    q2,    q7
    603     vadd.s32        q5,    q1,    q4
    604     vsub.s32        q3,    q1,    q4
    605     vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
    606     vshrn.s32       ROW3L, q5,    #16
    607     vshrn.s32       ROW0L, q6,    #16
    608     vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
    609     /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    610     vld1.s16        {d2},  [ip, :64]    /* reload constants */
    611     vmull.s16       q6,    ROW5L, XFIX_1_175875602
    612     vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560
    613     vmull.s16       q7,    ROW7L, XFIX_1_175875602
    614     vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644
    615     vmull.s16       q2,    ROW6L, XFIX_0_541196100
    616     vshll.s16       q3,    ROW4L, #13
    617     vmov            q4,    q6
    618     vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447
    619     vmlsl.s16       q4,    ROW5L, XFIX_0_899976223
    620     vadd.s32        q1,    q3,    q2
    621     vmov            q5,    q7
    622     vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223
    623     vadd.s32        q1,    q1,    q6
    624     vadd.s32        q6,    q6,    q6
    625     vmlsl.s16       q5,    ROW7L, XFIX_2_562915447
    626     vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
    627     vsub.s32        q1,    q1,    q6
    628     vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865
    629     vsub.s32        q3,    q3,    q2
    630     vshrn.s32       ROW6R, q1,    #16
    631     vadd.s32        q1,    q3,    q5
    632     vsub.s32        q3,    q3,    q5
    633     vshll.s16       q5,    ROW4L, #13
    634     vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
    635     vshrn.s32       ROW5R, q3,    #16
    636     vadd.s32        q2,    q5,    q6
    637     vsub.s32        q1,    q5,    q6
    638     vadd.s32        q6,    q2,    q7
    639     vsub.s32        q2,    q2,    q7
    640     vadd.s32        q5,    q1,    q4
    641     vsub.s32        q3,    q1,    q4
    642     vshrn.s32       ROW7R, q2,    #16
    643     vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
    644     vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
    645     vshrn.s32       ROW4R, q3,    #16
    646     b               2b /* Go to epilogue */
    647 
    648     .unreq          DCT_TABLE
    649     .unreq          COEF_BLOCK
    650     .unreq          OUTPUT_BUF
    651     .unreq          OUTPUT_COL
    652     .unreq          TMP1
    653     .unreq          TMP2
    654     .unreq          TMP3
    655     .unreq          TMP4
    656 
    657     .unreq          ROW0L
    658     .unreq          ROW0R
    659     .unreq          ROW1L
    660     .unreq          ROW1R
    661     .unreq          ROW2L
    662     .unreq          ROW2R
    663     .unreq          ROW3L
    664     .unreq          ROW3R
    665     .unreq          ROW4L
    666     .unreq          ROW4R
    667     .unreq          ROW5L
    668     .unreq          ROW5R
    669     .unreq          ROW6L
    670     .unreq          ROW6R
    671     .unreq          ROW7L
    672     .unreq          ROW7R
    673 
    674 
    675 /*****************************************************************************/
    676 
    677 /*
    678  * jsimd_idct_ifast_neon
    679  *
    680  * This function contains a fast but less accurate integer implementation of
    681  * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
    682  * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
    683  * function from jidctfst.c.
    684  *
    685  * Normally, a 1-D AAN DCT needs 5 multiplications and 29 additions.
    686  * In the ARM NEON case, however, some extra additions are required because the
    687  * VQDMULH instruction can't handle constants larger than 1. So expressions
    688  * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
    689  * which introduces an extra addition. Overall, there are 6 extra additions
    690  * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
    691  */
    692 
    693 #define XFIX_1_082392200 d0[0]
    694 #define XFIX_1_414213562 d0[1]
    695 #define XFIX_1_847759065 d0[2]
    696 #define XFIX_2_613125930 d0[3]
    697 
    698 .balign 16
    699 jsimd_idct_ifast_neon_consts:
    700     .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    701     .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    702     .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    703     .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
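        /*
         * How these table entries are built (a sketch, not authoritative): VQDMULH.S16
         * computes roughly x * (2 * k / 65536), so each entry k holds only the amount
         * by which the jidctfst.c scaling factor exceeds its integer part, rescaled
         * from the CONST_BITS = 8 domain (factor 256) to the 2^15 domain (hence the
         * "* 128"), e.g. for 1.082392200:
         *
         *   k = (277 - 256) * 128      (277 == FIX(1.082392200) at CONST_BITS = 8)
         *   x * 1.082392200 ~= x + ((2 * x * k) >> 16)
         *
         * which is the "x * 0.082392200 + x" rewrite mentioned above and appears
         * below as a VQDMULH.S16 followed by a VADD.S16.
         */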
    704 
    705 asm_function jsimd_idct_ifast_neon
    706 
    707     DCT_TABLE       .req r0
    708     COEF_BLOCK      .req r1
    709     OUTPUT_BUF      .req r2
    710     OUTPUT_COL      .req r3
    711     TMP1            .req r0
    712     TMP2            .req r1
    713     TMP3            .req r2
    714     TMP4            .req ip
    715 
    716     /* Load and dequantize coefficients into NEON registers
    717      * with the following allocation:
    718      *       0 1 2 3 | 4 5 6 7
    719      *      ---------+--------
    720      *   0 | d16     | d17     ( q8  )
    721      *   1 | d18     | d19     ( q9  )
    722      *   2 | d20     | d21     ( q10 )
    723      *   3 | d22     | d23     ( q11 )
    724      *   4 | d24     | d25     ( q12 )
    725      *   5 | d26     | d27     ( q13 )
    726      *   6 | d28     | d29     ( q14 )
    727      *   7 | d30     | d31     ( q15 )
    728      */
    729     adr             ip, jsimd_idct_ifast_neon_consts
    730     vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    731     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    732     vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    733     vmul.s16        q8,  q8,  q0
    734     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    735     vmul.s16        q9,  q9,  q1
    736     vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    737     vmul.s16        q10, q10, q2
    738     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    739     vmul.s16        q11, q11, q3
    740     vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    741     vmul.s16        q12, q12, q0
    742     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    743     vmul.s16        q14, q14, q2
    744     vmul.s16        q13, q13, q1
    745     vld1.16         {d0}, [ip, :64] /* load constants */
    746     vmul.s16        q15, q15, q3
    747     vpush           {d8-d13}        /* save NEON registers */
    748     /* 1-D IDCT, pass 1 */
    749     vsub.s16        q2,  q10, q14
    750     vadd.s16        q14, q10, q14
    751     vsub.s16        q1,  q11, q13
    752     vadd.s16        q13, q11, q13
    753     vsub.s16        q5,  q9,  q15
    754     vadd.s16        q15, q9,  q15
    755     vqdmulh.s16     q4,  q2,  XFIX_1_414213562
    756     vqdmulh.s16     q6,  q1,  XFIX_2_613125930
    757     vadd.s16        q3,  q1,  q1
    758     vsub.s16        q1,  q5,  q1
    759     vadd.s16        q10, q2,  q4
    760     vqdmulh.s16     q4,  q1,  XFIX_1_847759065
    761     vsub.s16        q2,  q15, q13
    762     vadd.s16        q3,  q3,  q6
    763     vqdmulh.s16     q6,  q2,  XFIX_1_414213562
    764     vadd.s16        q1,  q1,  q4
    765     vqdmulh.s16     q4,  q5,  XFIX_1_082392200
    766     vsub.s16        q10, q10, q14
    767     vadd.s16        q2,  q2,  q6
    768     vsub.s16        q6,  q8,  q12
    769     vadd.s16        q12, q8,  q12
    770     vadd.s16        q9,  q5,  q4
    771     vadd.s16        q5,  q6,  q10
    772     vsub.s16        q10, q6,  q10
    773     vadd.s16        q6,  q15, q13
    774     vadd.s16        q8,  q12, q14
    775     vsub.s16        q3,  q6,  q3
    776     vsub.s16        q12, q12, q14
    777     vsub.s16        q3,  q3,  q1
    778     vsub.s16        q1,  q9,  q1
    779     vadd.s16        q2,  q3,  q2
    780     vsub.s16        q15, q8,  q6
    781     vadd.s16        q1,  q1,  q2
    782     vadd.s16        q8,  q8,  q6
    783     vadd.s16        q14, q5,  q3
    784     vsub.s16        q9,  q5,  q3
    785     vsub.s16        q13, q10, q2
    786     vadd.s16        q10, q10, q2
    787       /* Transpose */
    788       vtrn.16         q8,  q9
    789     vsub.s16        q11, q12, q1
    790       vtrn.16         q14, q15
    791     vadd.s16        q12, q12, q1
    792       vtrn.16         q10, q11
    793       vtrn.16         q12, q13
    794       vtrn.32         q9,  q11
    795       vtrn.32         q12, q14
    796       vtrn.32         q8,  q10
    797       vtrn.32         q13, q15
    798       vswp            d28, d21
    799       vswp            d26, d19
    800     /* 1-D IDCT, pass 2 */
    801     vsub.s16        q2,  q10, q14
    802       vswp            d30, d23
    803     vadd.s16        q14, q10, q14
    804       vswp            d24, d17
    805     vsub.s16        q1,  q11, q13
    806     vadd.s16        q13, q11, q13
    807     vsub.s16        q5,  q9,  q15
    808     vadd.s16        q15, q9,  q15
    809     vqdmulh.s16     q4,  q2,  XFIX_1_414213562
    810     vqdmulh.s16     q6,  q1,  XFIX_2_613125930
    811     vadd.s16        q3,  q1,  q1
    812     vsub.s16        q1,  q5,  q1
    813     vadd.s16        q10, q2,  q4
    814     vqdmulh.s16     q4,  q1,  XFIX_1_847759065
    815     vsub.s16        q2,  q15, q13
    816     vadd.s16        q3,  q3,  q6
    817     vqdmulh.s16     q6,  q2,  XFIX_1_414213562
    818     vadd.s16        q1,  q1,  q4
    819     vqdmulh.s16     q4,  q5,  XFIX_1_082392200
    820     vsub.s16        q10, q10, q14
    821     vadd.s16        q2,  q2,  q6
    822     vsub.s16        q6,  q8,  q12
    823     vadd.s16        q12, q8,  q12
    824     vadd.s16        q9,  q5,  q4
    825     vadd.s16        q5,  q6,  q10
    826     vsub.s16        q10, q6,  q10
    827     vadd.s16        q6,  q15, q13
    828     vadd.s16        q8,  q12, q14
    829     vsub.s16        q3,  q6,  q3
    830     vsub.s16        q12, q12, q14
    831     vsub.s16        q3,  q3,  q1
    832     vsub.s16        q1,  q9,  q1
    833     vadd.s16        q2,  q3,  q2
    834     vsub.s16        q15, q8,  q6
    835     vadd.s16        q1,  q1,  q2
    836     vadd.s16        q8,  q8,  q6
    837     vadd.s16        q14, q5,  q3
    838     vsub.s16        q9,  q5,  q3
    839     vsub.s16        q13, q10, q2
    840     vpop            {d8-d13}        /* restore NEON registers */
    841     vadd.s16        q10, q10, q2
    842     vsub.s16        q11, q12, q1
    843     vadd.s16        q12, q12, q1
    844     /* Descale to 8-bit and range limit */
    845     vmov.u8         q0,  #0x80
    846     vqshrn.s16      d16, q8,  #5
    847     vqshrn.s16      d17, q9,  #5
    848     vqshrn.s16      d18, q10, #5
    849     vqshrn.s16      d19, q11, #5
    850     vqshrn.s16      d20, q12, #5
    851     vqshrn.s16      d21, q13, #5
    852     vqshrn.s16      d22, q14, #5
    853     vqshrn.s16      d23, q15, #5
    854     vadd.u8         q8,  q8,  q0
    855     vadd.u8         q9,  q9,  q0
    856     vadd.u8         q10, q10, q0
    857     vadd.u8         q11, q11, q0
    858     /* Transpose the final 8-bit samples */
    859     vtrn.16         q8,  q9
    860     vtrn.16         q10, q11
    861     vtrn.32         q8,  q10
    862     vtrn.32         q9,  q11
    863     vtrn.8          d16, d17
    864     vtrn.8          d18, d19
    865       /* Store results to the output buffer */
    866       ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    867       add             TMP1, TMP1, OUTPUT_COL
    868       add             TMP2, TMP2, OUTPUT_COL
    869       vst1.8          {d16}, [TMP1]
    870       vst1.8          {d17}, [TMP2]
    871       ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    872       add             TMP1, TMP1, OUTPUT_COL
    873       add             TMP2, TMP2, OUTPUT_COL
    874       vst1.8          {d18}, [TMP1]
    875     vtrn.8          d20, d21
    876       vst1.8          {d19}, [TMP2]
    877       ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    878       add             TMP1, TMP1, OUTPUT_COL
    879       add             TMP2, TMP2, OUTPUT_COL
    880       add             TMP3, TMP3, OUTPUT_COL
    881       add             TMP4, TMP4, OUTPUT_COL
    882       vst1.8          {d20}, [TMP1]
    883     vtrn.8          d22, d23
    884       vst1.8          {d21}, [TMP2]
    885       vst1.8          {d22}, [TMP3]
    886       vst1.8          {d23}, [TMP4]
    887     bx              lr
    888 
    889     .unreq          DCT_TABLE
    890     .unreq          COEF_BLOCK
    891     .unreq          OUTPUT_BUF
    892     .unreq          OUTPUT_COL
    893     .unreq          TMP1
    894     .unreq          TMP2
    895     .unreq          TMP3
    896     .unreq          TMP4
    897 
    898 
    899 /*****************************************************************************/
    900 
    901 /*
    902  * jsimd_idct_4x4_neon
    903  *
    904  * This function contains inverse-DCT code for getting reduced-size
    905  * 4x4-pixel output from an 8x8 DCT block. It uses the same calculations
    906  * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
    907  * function from jpeg-6b (jidctred.c).
    908  *
    909  * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
    910  *       requires far fewer arithmetic operations and hence should be faster.
    911  *       The primary purpose of this particular NEON-optimized function is
    912  *       bit-exact compatibility with jpeg-6b.
    913  *
    914  * TODO: slightly better instruction scheduling can be achieved by expanding
    915  *       the idct_helper/transpose_4x4 macros and reordering instructions,
    916  *       but readability would suffer somewhat.
    917  */
    918 
    919 #define CONST_BITS  13
    920 
    921 #define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
    922 #define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
    923 #define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
    924 #define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
    925 #define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
    926 #define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
    927 #define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
    928 #define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
    929 #define FIX_1_272758580  (10426) /* FIX(1.272758580) */
    930 #define FIX_1_451774981  (11893) /* FIX(1.451774981) */
    931 #define FIX_1_847759065  (15137) /* FIX(1.847759065) */
    932 #define FIX_2_172734803  (17799) /* FIX(2.172734803) */
    933 #define FIX_2_562915447  (20995) /* FIX(2.562915447) */
    934 #define FIX_3_624509785  (29692) /* FIX(3.624509785) */
    935 
    936 .balign 16
    937 jsimd_idct_4x4_neon_consts:
    938     .short     FIX_1_847759065     /* d0[0] */
    939     .short     -FIX_0_765366865    /* d0[1] */
    940     .short     -FIX_0_211164243    /* d0[2] */
    941     .short     FIX_1_451774981     /* d0[3] */
    942     .short     -FIX_2_172734803    /* d1[0] */
    943     .short     FIX_1_061594337     /* d1[1] */
    944     .short     -FIX_0_509795579    /* d1[2] */
    945     .short     -FIX_0_601344887    /* d1[3] */
    946     .short     FIX_0_899976223     /* d2[0] */
    947     .short     FIX_2_562915447     /* d2[1] */
    948     .short     1 << (CONST_BITS+1) /* d2[2] */
    949     .short     0                   /* d2[3] */
    950 
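        /*
         * A reading aid for the helper below (it mirrors the even/odd split of
         * 'jpeg_idct_4x4' in jidctred.c; row 4 of the input block is not used):
         * x4, x6, x8, x10, x12, x14, x16 carry input rows 0, 1, 2, 3, 5, 6, 7 and
         * y26, y27, y28, y29 receive output samples 0, 1, 2, 3 of one 1-D pass,
         * each formed as (even part from rows 0/2/6) +/- (odd part from rows 1/3/5/7).
         */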
    951 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    952     vmull.s16       q14, \x4,  d2[2]
    953     vmlal.s16       q14, \x8,  d0[0]
    954     vmlal.s16       q14, \x14, d0[1]
    955 
    956     vmull.s16       q13, \x16, d1[2]
    957     vmlal.s16       q13, \x12, d1[3]
    958     vmlal.s16       q13, \x10, d2[0]
    959     vmlal.s16       q13, \x6,  d2[1]
    960 
    961     vmull.s16       q15, \x4,  d2[2]
    962     vmlsl.s16       q15, \x8,  d0[0]
    963     vmlsl.s16       q15, \x14, d0[1]
    964 
    965     vmull.s16       q12, \x16, d0[2]
    966     vmlal.s16       q12, \x12, d0[3]
    967     vmlal.s16       q12, \x10, d1[0]
    968     vmlal.s16       q12, \x6,  d1[1]
    969 
    970     vadd.s32        q10, q14, q13
    971     vsub.s32        q14, q14, q13
    972 
    973 .if \shift > 16
    974     vrshr.s32       q10,  q10, #\shift
    975     vrshr.s32       q14,  q14, #\shift
    976     vmovn.s32       \y26, q10
    977     vmovn.s32       \y29, q14
    978 .else
    979     vrshrn.s32      \y26, q10, #\shift
    980     vrshrn.s32      \y29, q14, #\shift
    981 .endif
    982 
    983     vadd.s32        q10, q15, q12
    984     vsub.s32        q15, q15, q12
    985 
    986 .if \shift > 16
    987     vrshr.s32       q10,  q10, #\shift
    988     vrshr.s32       q15,  q15, #\shift
    989     vmovn.s32       \y27, q10
    990     vmovn.s32       \y28, q15
    991 .else
    992     vrshrn.s32      \y27, q10, #\shift
    993     vrshrn.s32      \y28, q15, #\shift
    994 .endif
    995 
    996 .endm
    997 
    998 asm_function jsimd_idct_4x4_neon
    999 
   1000     DCT_TABLE       .req r0
   1001     COEF_BLOCK      .req r1
   1002     OUTPUT_BUF      .req r2
   1003     OUTPUT_COL      .req r3
   1004     TMP1            .req r0
   1005     TMP2            .req r1
   1006     TMP3            .req r2
   1007     TMP4            .req ip
   1008 
   1009     vpush           {d8-d15}
   1010 
   1011     /* Load constants (d3 is just used for padding) */
   1012     adr             TMP4, jsimd_idct_4x4_neon_consts
   1013     vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
   1014 
   1015     /* Load all COEF_BLOCK into NEON registers with the following allocation:
   1016      *       0 1 2 3 | 4 5 6 7
   1017      *      ---------+--------
   1018      *   0 | d4      | d5
   1019      *   1 | d6      | d7
   1020      *   2 | d8      | d9
   1021      *   3 | d10     | d11
   1022      *   4 | -       | -
   1023      *   5 | d12     | d13
   1024      *   6 | d14     | d15
   1025      *   7 | d16     | d17
   1026      */
   1027     vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
   1028     vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
   1029     add             COEF_BLOCK, COEF_BLOCK, #16
   1030     vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
   1031     vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
   1032     /* dequantize */
   1033     vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
   1034     vmul.s16        q2, q2, q9
   1035     vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
   1036     vmul.s16        q3, q3, q10
   1037     vmul.s16        q4, q4, q11
   1038     add             DCT_TABLE, DCT_TABLE, #16
   1039     vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
   1040     vmul.s16        q5, q5, q12
   1041     vmul.s16        q6, q6, q13
   1042     vld1.16         {d30, d31}, [DCT_TABLE, :128]!
   1043     vmul.s16        q7, q7, q14
   1044     vmul.s16        q8, q8, q15
   1045 
   1046     /* Pass 1 */
   1047     idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
   1048     transpose_4x4   d4, d6, d8, d10
   1049     idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
   1050     transpose_4x4   d5, d7, d9, d11
   1051 
   1052     /* Pass 2 */
   1053     idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
   1054     transpose_4x4   d26, d27, d28, d29
   1055 
   1056     /* Range limit */
   1057     vmov.u16        q15, #0x80
   1058     vadd.s16        q13, q13, q15
   1059     vadd.s16        q14, q14, q15
   1060     vqmovun.s16     d26, q13
   1061     vqmovun.s16     d27, q14
   1062 
   1063     /* Store results to the output buffer */
   1064     ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
   1065     add             TMP1, TMP1, OUTPUT_COL
   1066     add             TMP2, TMP2, OUTPUT_COL
   1067     add             TMP3, TMP3, OUTPUT_COL
   1068     add             TMP4, TMP4, OUTPUT_COL
   1069 
   1070 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
   1071     /* We can use far fewer instructions on little-endian systems if the
   1072      * OS kernel is not configured to trap unaligned memory accesses.
   1073      */
   1074     vst1.32         {d26[0]}, [TMP1]!
   1075     vst1.32         {d27[0]}, [TMP3]!
   1076     vst1.32         {d26[1]}, [TMP2]!
   1077     vst1.32         {d27[1]}, [TMP4]!
   1078 #else
   1079     vst1.8          {d26[0]}, [TMP1]!
   1080     vst1.8          {d27[0]}, [TMP3]!
   1081     vst1.8          {d26[1]}, [TMP1]!
   1082     vst1.8          {d27[1]}, [TMP3]!
   1083     vst1.8          {d26[2]}, [TMP1]!
   1084     vst1.8          {d27[2]}, [TMP3]!
   1085     vst1.8          {d26[3]}, [TMP1]!
   1086     vst1.8          {d27[3]}, [TMP3]!
   1087 
   1088     vst1.8          {d26[4]}, [TMP2]!
   1089     vst1.8          {d27[4]}, [TMP4]!
   1090     vst1.8          {d26[5]}, [TMP2]!
   1091     vst1.8          {d27[5]}, [TMP4]!
   1092     vst1.8          {d26[6]}, [TMP2]!
   1093     vst1.8          {d27[6]}, [TMP4]!
   1094     vst1.8          {d26[7]}, [TMP2]!
   1095     vst1.8          {d27[7]}, [TMP4]!
   1096 #endif
   1097 
   1098     vpop            {d8-d15}
   1099     bx              lr
   1100 
   1101     .unreq          DCT_TABLE
   1102     .unreq          COEF_BLOCK
   1103     .unreq          OUTPUT_BUF
   1104     .unreq          OUTPUT_COL
   1105     .unreq          TMP1
   1106     .unreq          TMP2
   1107     .unreq          TMP3
   1108     .unreq          TMP4
   1109 
   1110 .purgem idct_helper
   1111 
   1112 
   1113 /*****************************************************************************/
   1114 
   1115 /*
   1116  * jsimd_idct_2x2_neon
   1117  *
   1118  * This function contains inverse-DCT code for getting reduced-size
   1119  * 2x2-pixel output from an 8x8 DCT block. It uses the same calculations
   1120  * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
   1121  * function from jpeg-6b (jidctred.c).
   1122  *
   1123  * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
   1124  *       requires far fewer arithmetic operations and hence should be faster.
   1125  *       The primary purpose of this particular NEON-optimized function is
   1126  *       bit-exact compatibility with jpeg-6b.
   1127  */
   1128 
   1129 .balign 8
   1130 jsimd_idct_2x2_neon_consts:
   1131     .short     -FIX_0_720959822    /* d0[0] */
   1132     .short     FIX_0_850430095     /* d0[1] */
   1133     .short     -FIX_1_272758580    /* d0[2] */
   1134     .short     FIX_3_624509785     /* d0[3] */
   1135 
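        /*
         * A reading aid for the helper below (it mirrors 'jpeg_idct_2x2' in
         * jidctred.c; rows 2, 4 and 6 of the input block are not used):
         * x4, x6, x10, x12, x16 carry input rows 0, 1, 3, 5, 7, and y26/y27
         * receive output samples 0 and 1 of one 1-D pass, formed as
         * (row0 << 15) +/- (a weighted sum of rows 1, 3, 5, 7).
         */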
   1136 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
   1137     vshll.s16  q14,  \x4,  #15
   1138     vmull.s16  q13,  \x6,  d0[3]
   1139     vmlal.s16  q13,  \x10, d0[2]
   1140     vmlal.s16  q13,  \x12, d0[1]
   1141     vmlal.s16  q13,  \x16, d0[0]
   1142 
   1143     vadd.s32   q10,  q14,  q13
   1144     vsub.s32   q14,  q14,  q13
   1145 
   1146 .if \shift > 16
   1147     vrshr.s32  q10,  q10,  #\shift
   1148     vrshr.s32  q14,  q14,  #\shift
   1149     vmovn.s32  \y26, q10
   1150     vmovn.s32  \y27, q14
   1151 .else
   1152     vrshrn.s32 \y26, q10,  #\shift
   1153     vrshrn.s32 \y27, q14,  #\shift
   1154 .endif
   1155 
   1156 .endm
   1157 
   1158 asm_function jsimd_idct_2x2_neon
   1159 
   1160     DCT_TABLE       .req r0
   1161     COEF_BLOCK      .req r1
   1162     OUTPUT_BUF      .req r2
   1163     OUTPUT_COL      .req r3
   1164     TMP1            .req r0
   1165     TMP2            .req ip
   1166 
   1167     vpush           {d8-d15}
   1168 
   1169     /* Load constants */
   1170     adr             TMP2, jsimd_idct_2x2_neon_consts
   1171     vld1.16         {d0}, [TMP2, :64]
   1172 
   1173     /* Load all COEF_BLOCK into NEON registers with the following allocation:
   1174      *       0 1 2 3 | 4 5 6 7
   1175      *      ---------+--------
   1176      *   0 | d4      | d5
   1177      *   1 | d6      | d7
   1178      *   2 | -       | -
   1179      *   3 | d10     | d11
   1180      *   4 | -       | -
   1181      *   5 | d12     | d13
   1182      *   6 | -       | -
   1183      *   7 | d16     | d17
   1184      */
   1185     vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
   1186     add             COEF_BLOCK, COEF_BLOCK, #16
   1187     vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
   1188     add             COEF_BLOCK, COEF_BLOCK, #16
   1189     vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
   1190     add             COEF_BLOCK, COEF_BLOCK, #16
   1191     vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
   1192     /* Dequantize */
   1193     vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
   1194     vmul.s16        q2, q2, q9
   1195     vmul.s16        q3, q3, q10
   1196     add             DCT_TABLE, DCT_TABLE, #16
   1197     vld1.16         {d24, d25}, [DCT_TABLE, :128]!
   1198     vmul.s16        q5, q5, q12
   1199     add             DCT_TABLE, DCT_TABLE, #16
   1200     vld1.16         {d26, d27}, [DCT_TABLE, :128]!
   1201     vmul.s16        q6, q6, q13
   1202     add             DCT_TABLE, DCT_TABLE, #16
   1203     vld1.16         {d30, d31}, [DCT_TABLE, :128]!
   1204     vmul.s16        q8, q8, q15
   1205 
   1206     /* Pass 1 */
   1207 #if 0
   1208     idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
   1209     transpose_4x4   d4, d6, d8,  d10
   1210     idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
   1211     transpose_4x4   d5, d7, d9,  d11
   1212 #else
   1213     vmull.s16       q13, d6,  d0[3]
   1214     vmlal.s16       q13, d10, d0[2]
   1215     vmlal.s16       q13, d12, d0[1]
   1216     vmlal.s16       q13, d16, d0[0]
   1217     vmull.s16       q12, d7,  d0[3]
   1218     vmlal.s16       q12, d11, d0[2]
   1219     vmlal.s16       q12, d13, d0[1]
   1220     vmlal.s16       q12, d17, d0[0]
   1221     vshll.s16       q14, d4,  #15
   1222     vshll.s16       q15, d5,  #15
   1223     vadd.s32        q10, q14, q13
   1224     vsub.s32        q14, q14, q13
   1225     vrshrn.s32      d4,  q10, #13
   1226     vrshrn.s32      d6,  q14, #13
   1227     vadd.s32        q10, q15, q12
   1228     vsub.s32        q14, q15, q12
   1229     vrshrn.s32      d5,  q10, #13
   1230     vrshrn.s32      d7,  q14, #13
   1231     vtrn.16         q2,  q3
   1232     vtrn.32         q3,  q5
   1233 #endif
   1234 
   1235     /* Pass 2 */
   1236     idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
   1237 
   1238     /* Range limit */
   1239     vmov.u16        q15, #0x80
   1240     vadd.s16        q13, q13, q15
   1241     vqmovun.s16     d26, q13
   1242     vqmovun.s16     d27, q13
   1243 
   1244     /* Store results to the output buffer */
   1245     ldmia           OUTPUT_BUF, {TMP1, TMP2}
   1246     add             TMP1, TMP1, OUTPUT_COL
   1247     add             TMP2, TMP2, OUTPUT_COL
   1248 
   1249     vst1.8          {d26[0]}, [TMP1]!
   1250     vst1.8          {d27[4]}, [TMP1]!
   1251     vst1.8          {d26[1]}, [TMP2]!
   1252     vst1.8          {d27[5]}, [TMP2]!
   1253 
   1254     vpop            {d8-d15}
   1255     bx              lr
   1256 
   1257     .unreq          DCT_TABLE
   1258     .unreq          COEF_BLOCK
   1259     .unreq          OUTPUT_BUF
   1260     .unreq          OUTPUT_COL
   1261     .unreq          TMP1
   1262     .unreq          TMP2
   1263 
   1264 .purgem idct_helper
   1265 
   1266 
   1267 /*****************************************************************************/
   1268 
   1269 /*
   1270  * jsimd_ycc_extrgb_convert_neon
   1271  * jsimd_ycc_extbgr_convert_neon
   1272  * jsimd_ycc_extrgbx_convert_neon
   1273  * jsimd_ycc_extbgrx_convert_neon
   1274  * jsimd_ycc_extxbgr_convert_neon
   1275  * jsimd_ycc_extxrgb_convert_neon
   1276  *
   1277  * Colorspace conversion YCbCr -> RGB
   1278  */
   1279 
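/*
 * For reference, a C sketch of the per-pixel fixed-point arithmetic that the
 * NEON code below implements.  The constants are those from the
 * jsimd_ycc_*_neon_consts tables; function and variable names are
 * illustrative assumptions (the authoritative scalar code lives in jdcolor.c).
 */
#if 0
static unsigned char clamp255 (int x)
{
    return (unsigned char) (x < 0 ? 0 : (x > 255 ? 255 : x));
}

static void ycc_to_rgb_pixel (int y, int cb, int cr,
                              unsigned char * r, unsigned char * g,
                              unsigned char * b)
{
    int u = cb - 128;
    int v = cr - 128;
    /* 22971 ~ 1.40200 * 2^14 and 29033 ~ 1.77200 * 2^14 (rounded >> 14);
     * 11277 ~ 0.34414 * 2^15 and 23401 ~ 0.71414 * 2^15 (rounded >> 15). */
    *r = clamp255 (y + (( 22971 * v             + (1 << 13)) >> 14));
    *g = clamp255 (y + ((-11277 * u - 23401 * v + (1 << 14)) >> 15));
    *b = clamp255 (y + (( 29033 * u             + (1 << 13)) >> 14));
}
#endif
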
   1280 
   1281 .macro do_load size
   1282     .if \size == 8
   1283         vld1.8  {d4}, [U, :64]!
   1284         vld1.8  {d5}, [V, :64]!
   1285         vld1.8  {d0}, [Y, :64]!
   1286         pld     [U, #64]
   1287         pld     [V, #64]
   1288         pld     [Y, #64]
   1289     .elseif \size == 4
   1290         vld1.8  {d4[0]}, [U]!
   1291         vld1.8  {d4[1]}, [U]!
   1292         vld1.8  {d4[2]}, [U]!
   1293         vld1.8  {d4[3]}, [U]!
   1294         vld1.8  {d5[0]}, [V]!
   1295         vld1.8  {d5[1]}, [V]!
   1296         vld1.8  {d5[2]}, [V]!
   1297         vld1.8  {d5[3]}, [V]!
   1298         vld1.8  {d0[0]}, [Y]!
   1299         vld1.8  {d0[1]}, [Y]!
   1300         vld1.8  {d0[2]}, [Y]!
   1301         vld1.8  {d0[3]}, [Y]!
   1302     .elseif \size == 2
   1303         vld1.8  {d4[4]}, [U]!
   1304         vld1.8  {d4[5]}, [U]!
   1305         vld1.8  {d5[4]}, [V]!
   1306         vld1.8  {d5[5]}, [V]!
   1307         vld1.8  {d0[4]}, [Y]!
   1308         vld1.8  {d0[5]}, [Y]!
   1309     .elseif \size == 1
   1310         vld1.8  {d4[6]}, [U]!
   1311         vld1.8  {d5[6]}, [V]!
   1312         vld1.8  {d0[6]}, [Y]!
   1313     .else
   1314         .error unsupported macroblock size
   1315     .endif
   1316 .endm
   1317 
   1318 .macro do_store bpp, size
   1319     .if \bpp == 24
   1320         .if \size == 8
   1321             vst3.8  {d10, d11, d12}, [RGB]!
   1322         .elseif \size == 4
   1323             vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
   1324             vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
   1325             vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
   1326             vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
   1327         .elseif \size == 2
   1328             vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
   1329             vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
   1330         .elseif \size == 1
   1331             vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
   1332         .else
   1333             .error unsupported macroblock size
   1334         .endif
   1335     .elseif \bpp == 32
   1336         .if \size == 8
   1337             vst4.8  {d10, d11, d12, d13}, [RGB]!
   1338         .elseif \size == 4
   1339             vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
   1340             vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
   1341             vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
   1342             vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
   1343         .elseif \size == 2
   1344             vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
   1345             vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
   1346         .elseif \size == 1
   1347             vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
   1348         .else
   1349             .error unsupported macroblock size
   1350         .endif
   1351     .elseif \bpp == 16
   1352         .if \size == 8
   1353             vst1.16  {q15}, [RGB]!
   1354         .elseif \size == 4
   1355             vst1.16  {d30}, [RGB]!
   1356         .elseif \size == 2
   1357             vst1.16  {d31[0]}, [RGB]!
   1358             vst1.16  {d31[1]}, [RGB]!
   1359         .elseif \size == 1
   1360             vst1.16  {d31[2]}, [RGB]!
   1361         .else
   1362             .error unsupported macroblock size
   1363         .endif
   1364     .else
   1365         .error unsupported bpp
   1366     .endif
   1367 .endm
   1368 
   1369 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
   1370 
   1371 /*
    1372  * 2-stage pipelined YCbCr->RGB conversion
   1373  */
   1374 
   1375 .macro do_yuv_to_rgb_stage1
   1376     vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    1377     vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
   1378     vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
   1379     vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
   1380     vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
   1381     vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
   1382     vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
   1383     vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
   1384     vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
   1385     vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
   1386 .endm
   1387 
   1388 .macro do_yuv_to_rgb_stage2
   1389     vrshrn.s32      d20, q10, #15
   1390     vrshrn.s32      d21, q11, #15
   1391     vrshrn.s32      d24, q12, #14
   1392     vrshrn.s32      d25, q13, #14
   1393     vrshrn.s32      d28, q14, #14
   1394     vrshrn.s32      d29, q15, #14
   1395     vaddw.u8        q11, q10, d0
   1396     vaddw.u8        q12, q12, d0
   1397     vaddw.u8        q14, q14, d0
   1398 .if \bpp != 16
   1399     vqmovun.s16     d1\g_offs, q11
   1400     vqmovun.s16     d1\r_offs, q12
   1401     vqmovun.s16     d1\b_offs, q14
   1402 .else /* rgb565 */
   1403     vqshlu.s16      q13, q11, #8
   1404     vqshlu.s16      q15, q12, #8
   1405     vqshlu.s16      q14, q14, #8
   1406     vsri.u16        q15, q13, #5
   1407     vsri.u16        q15, q14, #11
   1408 .endif
   1409 .endm
   1410 
   1411 .macro do_yuv_to_rgb_stage2_store_load_stage1
   1412                                        /* "do_yuv_to_rgb_stage2" and "store" */
   1413                                        vrshrn.s32      d20, q10, #15
   1414     /* "load" and "do_yuv_to_rgb_stage1" */
   1415     pld             [U, #64]
   1416                                        vrshrn.s32      d21, q11, #15
   1417     pld             [V, #64]
   1418                                        vrshrn.s32      d24, q12, #14
   1419                                        vrshrn.s32      d25, q13, #14
   1420     vld1.8          {d4}, [U, :64]!
   1421                                        vrshrn.s32      d28, q14, #14
   1422     vld1.8          {d5}, [V, :64]!
   1423                                        vrshrn.s32      d29, q15, #14
   1424     vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    1425     vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
   1426                                        vaddw.u8        q11, q10, d0
   1427     vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
   1428     vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
   1429                                        vaddw.u8        q12, q12, d0
   1430                                        vaddw.u8        q14, q14, d0
   1431 .if \bpp != 16 /**************** rgb24/rgb32 *********************************/
   1432                                        vqmovun.s16     d1\g_offs, q11
   1433     pld             [Y, #64]
   1434                                        vqmovun.s16     d1\r_offs, q12
   1435     vld1.8          {d0}, [Y, :64]!
   1436                                        vqmovun.s16     d1\b_offs, q14
   1437     vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
   1438     vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
   1439                                        do_store        \bpp, 8
   1440     vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
   1441     vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
   1442     vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
   1443     vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
   1444 .else /**************************** rgb565 ***********************************/
   1445                                        vqshlu.s16      q13, q11, #8
   1446     pld             [Y, #64]
   1447                                        vqshlu.s16      q15, q12, #8
   1448                                        vqshlu.s16      q14, q14, #8
   1449     vld1.8          {d0}, [Y, :64]!
   1450     vmull.s16       q11, d7, d1[1]
   1451     vmlal.s16       q11, d9, d1[2]
   1452                                        vsri.u16        q15, q13, #5
   1453     vmull.s16       q12, d8, d1[0]
   1454                                        vsri.u16        q15, q14, #11
   1455     vmull.s16       q13, d9, d1[0]
   1456     vmull.s16       q14, d6, d1[3]
   1457                                        do_store        \bpp, 8
   1458     vmull.s16       q15, d7, d1[3]
   1459 .endif
   1460 .endm
   1461 
   1462 .macro do_yuv_to_rgb
   1463     do_yuv_to_rgb_stage1
   1464     do_yuv_to_rgb_stage2
   1465 .endm
   1466 
    1467 /* Apple's gas crashes on adrl, so work around that by using adr.
    1468  * This requires a copy of these constants for each function.
   1469  */
   1470 
   1471 .balign 16
   1472 jsimd_ycc_\colorid\()_neon_consts:
   1473     .short          0,      0,     0,      0
   1474     .short          22971, -11277, -23401, 29033
   1475     .short          -128,  -128,   -128,   -128
   1476     .short          -128,  -128,   -128,   -128
   1477 
   1478 asm_function jsimd_ycc_\colorid\()_convert_neon
   1479     OUTPUT_WIDTH    .req r0
   1480     INPUT_BUF       .req r1
   1481     INPUT_ROW       .req r2
   1482     OUTPUT_BUF      .req r3
   1483     NUM_ROWS        .req r4
   1484 
   1485     INPUT_BUF0      .req r5
   1486     INPUT_BUF1      .req r6
   1487     INPUT_BUF2      .req INPUT_BUF
   1488 
   1489     RGB             .req r7
   1490     Y               .req r8
   1491     U               .req r9
   1492     V               .req r10
   1493     N               .req ip
   1494 
   1495     /* Load constants to d1, d2, d3 (d0 is just used for padding) */
   1496     adr             ip, jsimd_ycc_\colorid\()_neon_consts
   1497     vld1.16         {d0, d1, d2, d3}, [ip, :128]
   1498 
   1499     /* Save ARM registers and handle input arguments */
   1500     push            {r4, r5, r6, r7, r8, r9, r10, lr}
   1501     ldr             NUM_ROWS, [sp, #(4 * 8)]
   1502     ldr             INPUT_BUF0, [INPUT_BUF]
   1503     ldr             INPUT_BUF1, [INPUT_BUF, #4]
   1504     ldr             INPUT_BUF2, [INPUT_BUF, #8]
   1505     .unreq          INPUT_BUF
   1506 
   1507     /* Save NEON registers */
   1508     vpush           {d8-d15}
   1509 
   1510     /* Initially set d10, d11, d12, d13 to 0xFF */
   1511     vmov.u8         q5, #255
   1512     vmov.u8         q6, #255
   1513 
   1514     /* Outer loop over scanlines */
   1515     cmp             NUM_ROWS, #1
   1516     blt             9f
   1517 0:
   1518     ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
   1519     ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
   1520     mov             N, OUTPUT_WIDTH
   1521     ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
   1522     add             INPUT_ROW, INPUT_ROW, #1
   1523     ldr             RGB, [OUTPUT_BUF], #4
   1524 
   1525     /* Inner loop over pixels */
   1526     subs            N, N, #8
   1527     blt             3f
   1528     do_load         8
   1529     do_yuv_to_rgb_stage1
   1530     subs            N, N, #8
   1531     blt             2f
   1532 1:
   1533     do_yuv_to_rgb_stage2_store_load_stage1
   1534     subs            N, N, #8
   1535     bge             1b
   1536 2:
   1537     do_yuv_to_rgb_stage2
   1538     do_store        \bpp, 8
   1539     tst             N, #7
   1540     beq             8f
   1541 3:
   1542     tst             N, #4
   1543     beq             3f
   1544     do_load         4
   1545 3:
   1546     tst             N, #2
   1547     beq             4f
   1548     do_load         2
   1549 4:
   1550     tst             N, #1
   1551     beq             5f
   1552     do_load         1
   1553 5:
   1554     do_yuv_to_rgb
   1555     tst             N, #4
   1556     beq             6f
   1557     do_store        \bpp, 4
   1558 6:
   1559     tst             N, #2
   1560     beq             7f
   1561     do_store        \bpp, 2
   1562 7:
   1563     tst             N, #1
   1564     beq             8f
   1565     do_store        \bpp, 1
   1566 8:
   1567     subs            NUM_ROWS, NUM_ROWS, #1
   1568     bgt             0b
   1569 9:
   1570     /* Restore all registers and return */
   1571     vpop            {d8-d15}
   1572     pop             {r4, r5, r6, r7, r8, r9, r10, pc}
   1573 
   1574     .unreq          OUTPUT_WIDTH
   1575     .unreq          INPUT_ROW
   1576     .unreq          OUTPUT_BUF
   1577     .unreq          NUM_ROWS
   1578     .unreq          INPUT_BUF0
   1579     .unreq          INPUT_BUF1
   1580     .unreq          INPUT_BUF2
   1581     .unreq          RGB
   1582     .unreq          Y
   1583     .unreq          U
   1584     .unreq          V
   1585     .unreq          N
   1586 
   1587 .purgem do_yuv_to_rgb
   1588 .purgem do_yuv_to_rgb_stage1
   1589 .purgem do_yuv_to_rgb_stage2
   1590 .purgem do_yuv_to_rgb_stage2_store_load_stage1
   1591 
   1592 .endm
   1593 
   1594 /*--------------------------------- id ----- bpp R  G  B */
   1595 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
   1596 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
   1597 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
   1598 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
   1599 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
   1600 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
   1601 generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
   1602 
   1603 .purgem do_load
   1604 .purgem do_store
   1605 
   1606 
   1607 /*****************************************************************************/
   1608 
   1609 /*
   1610  * jsimd_extrgb_ycc_convert_neon
   1611  * jsimd_extbgr_ycc_convert_neon
   1612  * jsimd_extrgbx_ycc_convert_neon
   1613  * jsimd_extbgrx_ycc_convert_neon
   1614  * jsimd_extxbgr_ycc_convert_neon
   1615  * jsimd_extxrgb_ycc_convert_neon
   1616  *
   1617  * Colorspace conversion RGB -> YCbCr
   1618  */
   1619 
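/*
 * For reference, a C sketch of the per-pixel fixed-point arithmetic that the
 * NEON code below implements.  The constants are those from the
 * jsimd_*_ycc_neon_consts tables, all scaled by 2^16; names are illustrative
 * assumptions (the authoritative scalar code lives in jccolor.c), and the
 * +32767 terms correspond to the bias words loaded into q1.
 */
#if 0
static void rgb_to_ycc_pixel (int r, int g, int b,          /* each 0..255 */
                              unsigned char * y, unsigned char * cb,
                              unsigned char * cr)
{
    /* 19595 ~ 0.29900 * 65536, 38470 ~ 0.58700 * 65536,  7471 ~ 0.11400 * 65536,
     * 11059 ~ 0.16874 * 65536, 21709 ~ 0.33126 * 65536, 32768 =  0.50000 * 65536,
     * 27439 ~ 0.41869 * 65536,  5329 ~ 0.08131 * 65536 */
    *y  = (unsigned char) (( 19595 * r + 38470 * g +  7471 * b + 32768) >> 16);
    *cb = (unsigned char) ((-11059 * r - 21709 * g + 32768 * b
                            + (128 << 16) + 32767) >> 16);
    *cr = (unsigned char) (( 32768 * r - 27439 * g -  5329 * b
                            + (128 << 16) + 32767) >> 16);
}
#endif
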
   1620 .macro do_store size
   1621     .if \size == 8
   1622         vst1.8  {d20}, [Y]!
   1623         vst1.8  {d21}, [U]!
   1624         vst1.8  {d22}, [V]!
   1625     .elseif \size == 4
   1626         vst1.8  {d20[0]}, [Y]!
   1627         vst1.8  {d20[1]}, [Y]!
   1628         vst1.8  {d20[2]}, [Y]!
   1629         vst1.8  {d20[3]}, [Y]!
   1630         vst1.8  {d21[0]}, [U]!
   1631         vst1.8  {d21[1]}, [U]!
   1632         vst1.8  {d21[2]}, [U]!
   1633         vst1.8  {d21[3]}, [U]!
   1634         vst1.8  {d22[0]}, [V]!
   1635         vst1.8  {d22[1]}, [V]!
   1636         vst1.8  {d22[2]}, [V]!
   1637         vst1.8  {d22[3]}, [V]!
   1638     .elseif \size == 2
   1639         vst1.8  {d20[4]}, [Y]!
   1640         vst1.8  {d20[5]}, [Y]!
   1641         vst1.8  {d21[4]}, [U]!
   1642         vst1.8  {d21[5]}, [U]!
   1643         vst1.8  {d22[4]}, [V]!
   1644         vst1.8  {d22[5]}, [V]!
   1645     .elseif \size == 1
   1646         vst1.8  {d20[6]}, [Y]!
   1647         vst1.8  {d21[6]}, [U]!
   1648         vst1.8  {d22[6]}, [V]!
   1649     .else
   1650         .error unsupported macroblock size
   1651     .endif
   1652 .endm
   1653 
   1654 .macro do_load bpp, size
   1655     .if \bpp == 24
   1656         .if \size == 8
   1657             vld3.8  {d10, d11, d12}, [RGB]!
   1658             pld     [RGB, #128]
   1659         .elseif \size == 4
   1660             vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
   1661             vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
   1662             vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
   1663             vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
   1664         .elseif \size == 2
   1665             vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
   1666             vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
   1667         .elseif \size == 1
   1668             vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
   1669         .else
   1670             .error unsupported macroblock size
   1671         .endif
   1672     .elseif \bpp == 32
   1673         .if \size == 8
   1674             vld4.8  {d10, d11, d12, d13}, [RGB]!
   1675             pld     [RGB, #128]
   1676         .elseif \size == 4
   1677             vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
   1678             vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
   1679             vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
   1680             vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
   1681         .elseif \size == 2
   1682             vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
   1683             vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
   1684         .elseif \size == 1
   1685             vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
   1686         .else
   1687             .error unsupported macroblock size
   1688         .endif
   1689     .else
   1690         .error unsupported bpp
   1691     .endif
   1692 .endm
   1693 
   1694 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
   1695 
   1696 /*
    1697  * 2-stage pipelined RGB->YCbCr conversion
   1698  */
   1699 
   1700 .macro do_rgb_to_yuv_stage1
   1701     vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
   1702     vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
   1703     vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
   1704     vmull.u16   q7, d4, d0[0]
   1705     vmlal.u16   q7, d6, d0[1]
   1706     vmlal.u16   q7, d8, d0[2]
   1707     vmull.u16   q8, d5, d0[0]
   1708     vmlal.u16   q8, d7, d0[1]
   1709     vmlal.u16   q8, d9, d0[2]
   1710     vrev64.32   q9,  q1
   1711     vrev64.32   q13, q1
   1712     vmlsl.u16   q9,  d4, d0[3]
   1713     vmlsl.u16   q9,  d6, d1[0]
   1714     vmlal.u16   q9,  d8, d1[1]
   1715     vmlsl.u16   q13, d5, d0[3]
   1716     vmlsl.u16   q13, d7, d1[0]
   1717     vmlal.u16   q13, d9, d1[1]
   1718     vrev64.32   q14, q1
   1719     vrev64.32   q15, q1
   1720     vmlal.u16   q14, d4, d1[1]
   1721     vmlsl.u16   q14, d6, d1[2]
   1722     vmlsl.u16   q14, d8, d1[3]
   1723     vmlal.u16   q15, d5, d1[1]
   1724     vmlsl.u16   q15, d7, d1[2]
   1725     vmlsl.u16   q15, d9, d1[3]
   1726 .endm
   1727 
   1728 .macro do_rgb_to_yuv_stage2
   1729     vrshrn.u32  d20, q7,  #16
   1730     vrshrn.u32  d21, q8,  #16
   1731     vshrn.u32   d22, q9,  #16
   1732     vshrn.u32   d23, q13, #16
   1733     vshrn.u32   d24, q14, #16
   1734     vshrn.u32   d25, q15, #16
   1735     vmovn.u16   d20, q10      /* d20 = y */
   1736     vmovn.u16   d21, q11      /* d21 = u */
   1737     vmovn.u16   d22, q12      /* d22 = v */
   1738 .endm
   1739 
   1740 .macro do_rgb_to_yuv
   1741     do_rgb_to_yuv_stage1
   1742     do_rgb_to_yuv_stage2
   1743 .endm
   1744 
   1745 .macro do_rgb_to_yuv_stage2_store_load_stage1
   1746       vrshrn.u32  d20, q7,  #16
   1747       vrshrn.u32  d21, q8,  #16
   1748       vshrn.u32   d22, q9,  #16
   1749     vrev64.32   q9,  q1
   1750       vshrn.u32   d23, q13, #16
   1751     vrev64.32   q13, q1
   1752       vshrn.u32   d24, q14, #16
   1753       vshrn.u32   d25, q15, #16
   1754     do_load     \bpp, 8
   1755       vmovn.u16   d20, q10      /* d20 = y */
   1756     vmovl.u8    q2, d1\r_offs   /* r = { d4, d5 } */
   1757       vmovn.u16   d21, q11      /* d21 = u */
   1758     vmovl.u8    q3, d1\g_offs   /* g = { d6, d7 } */
   1759       vmovn.u16   d22, q12      /* d22 = v */
   1760     vmovl.u8    q4, d1\b_offs   /* b = { d8, d9 } */
   1761     vmull.u16   q7, d4, d0[0]
   1762     vmlal.u16   q7, d6, d0[1]
   1763     vmlal.u16   q7, d8, d0[2]
   1764       vst1.8      {d20}, [Y]!
   1765     vmull.u16   q8, d5, d0[0]
   1766     vmlal.u16   q8, d7, d0[1]
   1767     vmlal.u16   q8, d9, d0[2]
   1768     vmlsl.u16   q9,  d4, d0[3]
   1769     vmlsl.u16   q9,  d6, d1[0]
   1770     vmlal.u16   q9,  d8, d1[1]
   1771       vst1.8      {d21}, [U]!
   1772     vmlsl.u16   q13, d5, d0[3]
   1773     vmlsl.u16   q13, d7, d1[0]
   1774     vmlal.u16   q13, d9, d1[1]
   1775     vrev64.32   q14, q1
   1776     vrev64.32   q15, q1
   1777     vmlal.u16   q14, d4, d1[1]
   1778     vmlsl.u16   q14, d6, d1[2]
   1779     vmlsl.u16   q14, d8, d1[3]
   1780       vst1.8      {d22}, [V]!
   1781     vmlal.u16   q15, d5, d1[1]
   1782     vmlsl.u16   q15, d7, d1[2]
   1783     vmlsl.u16   q15, d9, d1[3]
   1784 .endm
   1785 
   1786 .balign 16
   1787 jsimd_\colorid\()_ycc_neon_consts:
   1788     .short          19595, 38470, 7471,  11059
   1789     .short          21709, 32768, 27439, 5329
   1790     .short          32767, 128,   32767, 128
   1791     .short          32767, 128,   32767, 128
   1792 
   1793 asm_function jsimd_\colorid\()_ycc_convert_neon
   1794     OUTPUT_WIDTH    .req r0
   1795     INPUT_BUF       .req r1
   1796     OUTPUT_BUF      .req r2
   1797     OUTPUT_ROW      .req r3
   1798     NUM_ROWS        .req r4
   1799 
   1800     OUTPUT_BUF0     .req r5
   1801     OUTPUT_BUF1     .req r6
   1802     OUTPUT_BUF2     .req OUTPUT_BUF
   1803 
   1804     RGB             .req r7
   1805     Y               .req r8
   1806     U               .req r9
   1807     V               .req r10
   1808     N               .req ip
   1809 
   1810     /* Load constants to d0, d1, d2, d3 */
   1811     adr             ip, jsimd_\colorid\()_ycc_neon_consts
   1812     vld1.16         {d0, d1, d2, d3}, [ip, :128]
   1813 
   1814     /* Save ARM registers and handle input arguments */
   1815     push            {r4, r5, r6, r7, r8, r9, r10, lr}
   1816     ldr             NUM_ROWS, [sp, #(4 * 8)]
   1817     ldr             OUTPUT_BUF0, [OUTPUT_BUF]
   1818     ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
   1819     ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
   1820     .unreq          OUTPUT_BUF
   1821 
   1822     /* Save NEON registers */
   1823     vpush           {d8-d15}
   1824 
   1825     /* Outer loop over scanlines */
   1826     cmp             NUM_ROWS, #1
   1827     blt             9f
   1828 0:
   1829     ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
   1830     ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
   1831     mov             N, OUTPUT_WIDTH
   1832     ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
   1833     add             OUTPUT_ROW, OUTPUT_ROW, #1
   1834     ldr             RGB, [INPUT_BUF], #4
   1835 
   1836     /* Inner loop over pixels */
   1837     subs            N, N, #8
   1838     blt             3f
   1839     do_load         \bpp, 8
   1840     do_rgb_to_yuv_stage1
   1841     subs            N, N, #8
   1842     blt             2f
   1843 1:
   1844     do_rgb_to_yuv_stage2_store_load_stage1
   1845     subs            N, N, #8
   1846     bge             1b
   1847 2:
   1848     do_rgb_to_yuv_stage2
   1849     do_store        8
   1850     tst             N, #7
   1851     beq             8f
   1852 3:
   1853     tst             N, #4
   1854     beq             3f
   1855     do_load         \bpp, 4
   1856 3:
   1857     tst             N, #2
   1858     beq             4f
   1859     do_load         \bpp, 2
   1860 4:
   1861     tst             N, #1
   1862     beq             5f
   1863     do_load         \bpp, 1
   1864 5:
   1865     do_rgb_to_yuv
   1866     tst             N, #4
   1867     beq             6f
   1868     do_store        4
   1869 6:
   1870     tst             N, #2
   1871     beq             7f
   1872     do_store        2
   1873 7:
   1874     tst             N, #1
   1875     beq             8f
   1876     do_store        1
   1877 8:
   1878     subs            NUM_ROWS, NUM_ROWS, #1
   1879     bgt             0b
   1880 9:
   1881     /* Restore all registers and return */
   1882     vpop            {d8-d15}
   1883     pop             {r4, r5, r6, r7, r8, r9, r10, pc}
   1884 
   1885     .unreq          OUTPUT_WIDTH
   1886     .unreq          OUTPUT_ROW
   1887     .unreq          INPUT_BUF
   1888     .unreq          NUM_ROWS
   1889     .unreq          OUTPUT_BUF0
   1890     .unreq          OUTPUT_BUF1
   1891     .unreq          OUTPUT_BUF2
   1892     .unreq          RGB
   1893     .unreq          Y
   1894     .unreq          U
   1895     .unreq          V
   1896     .unreq          N
   1897 
   1898 .purgem do_rgb_to_yuv
   1899 .purgem do_rgb_to_yuv_stage1
   1900 .purgem do_rgb_to_yuv_stage2
   1901 .purgem do_rgb_to_yuv_stage2_store_load_stage1
   1902 
   1903 .endm
   1904 
   1905 /*--------------------------------- id ----- bpp R  G  B */
   1906 generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
   1907 generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
   1908 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
   1909 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
   1910 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
   1911 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
   1912 
   1913 .purgem do_load
   1914 .purgem do_store
   1915 
   1916 
   1917 /*****************************************************************************/
   1918 
   1919 /*
   1920  * Load data into workspace, applying unsigned->signed conversion
   1921  *
   1922  * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
   1923  *       rid of VST1.16 instructions
   1924  */
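
/*
 * In C terms, the conversion below amounts to the following (a sketch only;
 * function and variable names are illustrative assumptions):
 */
#if 0
static void convsamp (const unsigned char * sample_rows[8], int start_col,
                      short * workspace)
{
    int row, col;
    for (row = 0; row < 8; row++)
        for (col = 0; col < 8; col++)
            *workspace++ = (short) (sample_rows[row][start_col + col]
                                    - CENTERJSAMPLE);   /* CENTERJSAMPLE == 128 */
}
#endif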
   1925 
   1926 asm_function jsimd_convsamp_neon
   1927     SAMPLE_DATA     .req r0
   1928     START_COL       .req r1
   1929     WORKSPACE       .req r2
   1930     TMP1            .req r3
   1931     TMP2            .req r4
   1932     TMP3            .req r5
   1933     TMP4            .req ip
   1934 
   1935     push            {r4, r5}
   1936     vmov.u8         d0, #128
   1937 
   1938     ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
   1939     add             TMP1, TMP1, START_COL
   1940     add             TMP2, TMP2, START_COL
   1941     add             TMP3, TMP3, START_COL
   1942     add             TMP4, TMP4, START_COL
   1943     vld1.8          {d16}, [TMP1]
   1944     vsubl.u8        q8, d16, d0
   1945     vld1.8          {d18}, [TMP2]
   1946     vsubl.u8        q9, d18, d0
   1947     vld1.8          {d20}, [TMP3]
   1948     vsubl.u8        q10, d20, d0
   1949     vld1.8          {d22}, [TMP4]
   1950     ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
   1951     vsubl.u8        q11, d22, d0
   1952     vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
   1953     add             TMP1, TMP1, START_COL
   1954     add             TMP2, TMP2, START_COL
   1955     vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
   1956     add             TMP3, TMP3, START_COL
   1957     add             TMP4, TMP4, START_COL
   1958     vld1.8          {d24}, [TMP1]
   1959     vsubl.u8        q12, d24, d0
   1960     vld1.8          {d26}, [TMP2]
   1961     vsubl.u8        q13, d26, d0
   1962     vld1.8          {d28}, [TMP3]
   1963     vsubl.u8        q14, d28, d0
   1964     vld1.8          {d30}, [TMP4]
   1965     vsubl.u8        q15, d30, d0
   1966     vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
   1967     vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
   1968     pop             {r4, r5}
   1969     bx              lr
   1970 
   1971     .unreq          SAMPLE_DATA
   1972     .unreq          START_COL
   1973     .unreq          WORKSPACE
   1974     .unreq          TMP1
   1975     .unreq          TMP2
   1976     .unreq          TMP3
   1977     .unreq          TMP4
   1978 
   1979 
   1980 /*****************************************************************************/
   1981 
   1982 /*
   1983  * jsimd_fdct_ifast_neon
   1984  *
    1985  * This function contains a fast but less accurate integer implementation of
   1986  * the forward DCT (Discrete Cosine Transform). It uses the same calculations
   1987  * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
   1988  * function from jfdctfst.c
   1989  *
   1990  * TODO: can be combined with 'jsimd_convsamp_neon' to get
   1991  *       rid of a bunch of VLD1.16 instructions
   1992  */
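
/*
 * For reference, a condensed C sketch of one 1-D pass of the 'ifast' FDCT,
 * following jfdctfst.c.  MULTIPLY here stands for the fixed-point multiply
 * (x * c) >> 8 with 8-bit fractional constants; the NEON code obtains the
 * same results with VQDMULH and the pre-scaled XFIX_* values below.  Names
 * are illustrative assumptions.
 */
#if 0
#define MULTIPLY(v, c)  (((v) * (c)) >> 8)   /* e.g. 181 ~ 0.707106781 * 256 */

static void fdct_ifast_1d (short d[8])
{
    short tmp0 = d[0] + d[7], tmp7 = d[0] - d[7];
    short tmp1 = d[1] + d[6], tmp6 = d[1] - d[6];
    short tmp2 = d[2] + d[5], tmp5 = d[2] - d[5];
    short tmp3 = d[3] + d[4], tmp4 = d[3] - d[4];
    short tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;

    /* Even part */
    tmp10 = tmp0 + tmp3;  tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;  tmp12 = tmp1 - tmp2;
    d[0] = tmp10 + tmp11;
    d[4] = tmp10 - tmp11;
    z1   = MULTIPLY (tmp12 + tmp13, 181);    /* 0.707106781 */
    d[2] = tmp13 + z1;
    d[6] = tmp13 - z1;

    /* Odd part */
    tmp10 = tmp4 + tmp5;  tmp11 = tmp5 + tmp6;  tmp12 = tmp6 + tmp7;
    z5  = MULTIPLY (tmp10 - tmp12, 98);      /* 0.382683433 */
    z2  = MULTIPLY (tmp10, 139) + z5;        /* 0.541196100 */
    z4  = MULTIPLY (tmp12, 334) + z5;        /* 1.306562965 */
    z3  = MULTIPLY (tmp11, 181);             /* 0.707106781 */
    z11 = tmp7 + z3;
    z13 = tmp7 - z3;
    d[5] = z13 + z2;
    d[3] = z13 - z2;
    d[1] = z11 + z4;
    d[7] = z11 - z4;
}
#endif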
   1993 
   1994 #define XFIX_0_382683433 d0[0]
   1995 #define XFIX_0_541196100 d0[1]
   1996 #define XFIX_0_707106781 d0[2]
   1997 #define XFIX_1_306562965 d0[3]
   1998 
   1999 .balign 16
   2000 jsimd_fdct_ifast_neon_consts:
   2001     .short (98 * 128)              /* XFIX_0_382683433 */
   2002     .short (139 * 128)             /* XFIX_0_541196100 */
   2003     .short (181 * 128)             /* XFIX_0_707106781 */
   2004     .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
   2005 
   2006 asm_function jsimd_fdct_ifast_neon
   2007 
   2008     DATA            .req r0
   2009     TMP             .req ip
   2010 
   2011     vpush           {d8-d15}
   2012 
   2013     /* Load constants */
   2014     adr             TMP, jsimd_fdct_ifast_neon_consts
   2015     vld1.16         {d0}, [TMP, :64]
   2016 
   2017     /* Load all DATA into NEON registers with the following allocation:
   2018      *       0 1 2 3 | 4 5 6 7
   2019      *      ---------+--------
   2020      *   0 | d16     | d17    | q8
   2021      *   1 | d18     | d19    | q9
   2022      *   2 | d20     | d21    | q10
   2023      *   3 | d22     | d23    | q11
   2024      *   4 | d24     | d25    | q12
   2025      *   5 | d26     | d27    | q13
   2026      *   6 | d28     | d29    | q14
   2027      *   7 | d30     | d31    | q15
   2028      */
   2029 
   2030     vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
   2031     vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
   2032     vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
   2033     vld1.16         {d28, d29, d30, d31}, [DATA, :128]
   2034     sub             DATA, DATA, #(128 - 32)
   2035 
   2036     mov             TMP, #2
   2037 1:
   2038     /* Transpose */
   2039     vtrn.16         q12, q13
   2040     vtrn.16         q10, q11
   2041     vtrn.16         q8,  q9
   2042     vtrn.16         q14, q15
   2043     vtrn.32         q9,  q11
   2044     vtrn.32         q13, q15
   2045     vtrn.32         q8,  q10
   2046     vtrn.32         q12, q14
   2047     vswp            d30, d23
   2048     vswp            d24, d17
   2049     vswp            d26, d19
   2050       /* 1-D FDCT */
   2051       vadd.s16        q2,  q11, q12
   2052     vswp            d28, d21
   2053       vsub.s16        q12, q11, q12
   2054       vsub.s16        q6,  q10, q13
   2055       vadd.s16        q10, q10, q13
   2056       vsub.s16        q7,  q9,  q14
   2057       vadd.s16        q9,  q9,  q14
   2058       vsub.s16        q1,  q8,  q15
   2059       vadd.s16        q8,  q8,  q15
   2060       vsub.s16        q4,  q9,  q10
   2061       vsub.s16        q5,  q8,  q2
   2062       vadd.s16        q3,  q9,  q10
   2063       vadd.s16        q4,  q4,  q5
   2064       vadd.s16        q2,  q8,  q2
   2065       vqdmulh.s16     q4,  q4,  XFIX_0_707106781
   2066       vadd.s16        q11, q12, q6
   2067       vadd.s16        q8,  q2,  q3
   2068       vsub.s16        q12, q2,  q3
   2069       vadd.s16        q3,  q6,  q7
   2070       vadd.s16        q7,  q7,  q1
   2071       vqdmulh.s16     q3,  q3,  XFIX_0_707106781
   2072       vsub.s16        q6,  q11, q7
   2073       vadd.s16        q10, q5,  q4
   2074       vqdmulh.s16     q6,  q6,  XFIX_0_382683433
   2075       vsub.s16        q14, q5,  q4
   2076       vqdmulh.s16     q11, q11, XFIX_0_541196100
   2077       vqdmulh.s16     q5,  q7,  XFIX_1_306562965
   2078       vadd.s16        q4,  q1,  q3
   2079       vsub.s16        q3,  q1,  q3
   2080       vadd.s16        q7,  q7,  q6
   2081       vadd.s16        q11, q11, q6
   2082       vadd.s16        q7,  q7,  q5
   2083       vadd.s16        q13, q3,  q11
   2084       vsub.s16        q11, q3,  q11
   2085       vadd.s16        q9,  q4,  q7
   2086       vsub.s16        q15, q4,  q7
   2087     subs            TMP, TMP, #1
   2088     bne             1b
   2089 
   2090     /* store results */
   2091     vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
   2092     vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
   2093     vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
   2094     vst1.16         {d28, d29, d30, d31}, [DATA, :128]
   2095 
   2096     vpop            {d8-d15}
   2097     bx              lr
   2098 
   2099     .unreq          DATA
   2100     .unreq          TMP
   2101 
   2102 
   2103 /*****************************************************************************/
   2104 
   2105 /*
   2106  * GLOBAL(void)
   2107  * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
   2108  *                      DCTELEM * workspace);
   2109  *
    2110  * Note: the code uses 2-stage pipelining in order to improve instruction
   2111  *       scheduling and eliminate stalls (this provides ~15% better
   2112  *       performance for this function on both ARM Cortex-A8 and
   2113  *       ARM Cortex-A9 when compared to the non-pipelined variant).
   2114  *       The instructions which belong to the second stage use different
    2115  *       indentation for better readability.
   2116  */
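
/*
 * For reference, a C sketch of the per-coefficient data flow implemented
 * below.  Names are illustrative assumptions; the divisor table offsets
 * (reciprocals at element 0, corrections at element 64, shift counts at
 * element 192) follow the address arithmetic in the code.
 */
#if 0
static void quantize_block (short * coef_block,
                            const unsigned short * divisors,
                            const short * workspace)
{
    const unsigned short * recip = divisors;            /* DIVISORS           */
    const unsigned short * corr  = divisors + 64;       /* DIVISORS + 64 * 2  */
    const unsigned short * shift = divisors + 64 * 3;   /* DIVISORS + 64 * 6  */
    int i;

    for (i = 0; i < 64; i++) {
        int sign = workspace[i] >> 15;                         /* 0 or -1     */
        unsigned temp = (unsigned) (workspace[i] < 0 ? -workspace[i]
                                                     : workspace[i]);
        temp += corr[i];                                       /* add correction */
        temp  = (temp * recip[i]) >> 16;                       /* multiply by reciprocal */
        temp >>= shift[i];                                     /* shift       */
        coef_block[i] = (short) (((int) temp ^ sign) - sign);  /* restore sign */
    }
}
#endif
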
   2117 asm_function jsimd_quantize_neon
   2118 
   2119     COEF_BLOCK      .req r0
   2120     DIVISORS        .req r1
   2121     WORKSPACE       .req r2
   2122 
   2123     RECIPROCAL      .req DIVISORS
   2124     CORRECTION      .req r3
   2125     SHIFT           .req ip
   2126     LOOP_COUNT      .req r4
   2127 
   2128     vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
   2129     vabs.s16        q12, q0
   2130     add             CORRECTION, DIVISORS, #(64 * 2)
   2131     add             SHIFT, DIVISORS, #(64 * 6)
   2132     vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
   2133     vabs.s16        q13, q1
   2134     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
   2135     vadd.u16        q12, q12, q10 /* add correction */
   2136     vadd.u16        q13, q13, q11
   2137     vmull.u16       q10, d24, d16 /* multiply by reciprocal */
   2138     vmull.u16       q11, d25, d17
   2139     vmull.u16       q8,  d26, d18
   2140     vmull.u16       q9,  d27, d19
   2141     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
   2142     vshrn.u32       d20, q10, #16
   2143     vshrn.u32       d21, q11, #16
   2144     vshrn.u32       d22, q8,  #16
   2145     vshrn.u32       d23, q9,  #16
   2146     vneg.s16        q12, q12
   2147     vneg.s16        q13, q13
   2148     vshr.s16        q2,  q0,  #15 /* extract sign */
   2149     vshr.s16        q3,  q1,  #15
   2150     vshl.u16        q14, q10, q12 /* shift */
   2151     vshl.u16        q15, q11, q13
   2152 
   2153     push            {r4, r5}
   2154     mov             LOOP_COUNT, #3
   2155 1:
   2156     vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
   2157       veor.u16        q14, q14, q2  /* restore sign */
   2158     vabs.s16        q12, q0
   2159     vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
   2160     vabs.s16        q13, q1
   2161       veor.u16        q15, q15, q3
   2162     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
   2163     vadd.u16        q12, q12, q10 /* add correction */
   2164     vadd.u16        q13, q13, q11
   2165     vmull.u16       q10, d24, d16 /* multiply by reciprocal */
   2166     vmull.u16       q11, d25, d17
   2167     vmull.u16       q8,  d26, d18
   2168     vmull.u16       q9,  d27, d19
   2169       vsub.u16        q14, q14, q2
   2170     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
   2171       vsub.u16        q15, q15, q3
   2172     vshrn.u32       d20, q10, #16
   2173     vshrn.u32       d21, q11, #16
   2174       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
   2175     vshrn.u32       d22, q8,  #16
   2176     vshrn.u32       d23, q9,  #16
   2177     vneg.s16        q12, q12
   2178     vneg.s16        q13, q13
   2179     vshr.s16        q2,  q0,  #15 /* extract sign */
   2180     vshr.s16        q3,  q1,  #15
   2181     vshl.u16        q14, q10, q12 /* shift */
   2182     vshl.u16        q15, q11, q13
   2183     subs            LOOP_COUNT, LOOP_COUNT, #1
   2184     bne             1b
   2185     pop             {r4, r5}
   2186 
   2187       veor.u16        q14, q14, q2  /* restore sign */
   2188       veor.u16        q15, q15, q3
   2189       vsub.u16        q14, q14, q2
   2190       vsub.u16        q15, q15, q3
   2191       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
   2192 
   2193     bx              lr /* return */
   2194 
   2195     .unreq          COEF_BLOCK
   2196     .unreq          DIVISORS
   2197     .unreq          WORKSPACE
   2198     .unreq          RECIPROCAL
   2199     .unreq          CORRECTION
   2200     .unreq          SHIFT
   2201     .unreq          LOOP_COUNT
   2202 
   2203 
   2204 /*****************************************************************************/
   2205 
   2206 /*
   2207  * GLOBAL(void)
   2208  * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
   2209  *                                 JDIMENSION   downsampled_width,
   2210  *                                 JSAMPARRAY   input_data,
   2211  *                                 JSAMPARRAY * output_data_ptr);
   2212  *
   2213  * Note: the use of unaligned writes is the main remaining bottleneck in
    2214  *       this code; addressing it could potentially yield a performance
    2215  *       improvement of up to tens of percent on Cortex-A8/Cortex-A9.
   2216  */
   2217 
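/*
 * The algorithm implemented here is libjpeg's "triangle filter" h2v1 fancy
 * upsampling.  A C sketch of one row is given below for reference; names are
 * illustrative assumptions, and the edge pixels are special-cased exactly as
 * in jdsample.c.
 */
#if 0
static void h2v1_fancy_upsample_row (const unsigned char * in,
                                     unsigned char * out, int width)
{
    int i;
    out[0] = in[0];                         /* first output column is copied  */
    out[2 * width - 1] = in[width - 1];     /* last output column is copied   */
    for (i = 0; i < width; i++) {
        /* 3/4 of the nearer source pixel plus 1/4 of the further one. */
        if (i > 0)
            out[2 * i]     = (unsigned char) ((3 * in[i] + in[i - 1] + 1) >> 2);
        if (i < width - 1)
            out[2 * i + 1] = (unsigned char) ((3 * in[i] + in[i + 1] + 2) >> 2);
    }
}
#endif
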
   2218 /*
   2219  * Upsample 16 source pixels to 32 destination pixels. The new 16 source
   2220  * pixels are loaded to q0. The previous 16 source pixels are in q1. The
   2221  * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
   2222  * Register d28 is used for multiplication by 3. Register q15 is used
   2223  * for adding +1 bias.
   2224  */
   2225 .macro upsample16   OUTPTR, INPTR
   2226     vld1.8          {q0}, [\INPTR]!
   2227     vmovl.u8        q8,  d0
   2228     vext.8          q2,  q1,  q0, #15
   2229     vmovl.u8        q9,  d1
   2230     vaddw.u8        q10, q15, d4
   2231     vaddw.u8        q11, q15, d5
   2232     vmlal.u8        q8,  d4,  d28
   2233     vmlal.u8        q9,  d5,  d28
   2234     vmlal.u8        q10, d0,  d28
   2235     vmlal.u8        q11, d1,  d28
   2236     vmov            q1,  q0       /* backup source pixels to q1 */
   2237     vrshrn.u16      d6,  q8,  #2
   2238     vrshrn.u16      d7,  q9,  #2
   2239     vshrn.u16       d8,  q10, #2
   2240     vshrn.u16       d9,  q11, #2
   2241     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
   2242 .endm
   2243 
   2244 /*
    2245  * Upsample 32 source pixels to 64 destination pixels. Compared to the 'upsample16'
    2246  * macro, the roles of the q0 and q1 registers are reversed for even and odd
    2247  * groups of 16 pixels, which is why the "vmov q1, q0" instruction is not needed.
    2248  * This unrolling also allows loads and stores to be reordered to compensate
    2249  * for multiplication latency and reduce stalls.
   2250  */
   2251 .macro upsample32   OUTPTR, INPTR
   2252     /* even 16 pixels group */
   2253     vld1.8          {q0}, [\INPTR]!
   2254     vmovl.u8        q8,  d0
   2255     vext.8          q2,  q1,  q0, #15
   2256     vmovl.u8        q9,  d1
   2257     vaddw.u8        q10, q15, d4
   2258     vaddw.u8        q11, q15, d5
   2259     vmlal.u8        q8,  d4,  d28
   2260     vmlal.u8        q9,  d5,  d28
   2261     vmlal.u8        q10, d0,  d28
   2262     vmlal.u8        q11, d1,  d28
   2263         /* odd 16 pixels group */
   2264         vld1.8          {q1}, [\INPTR]!
   2265     vrshrn.u16      d6,  q8,  #2
   2266     vrshrn.u16      d7,  q9,  #2
   2267     vshrn.u16       d8,  q10, #2
   2268     vshrn.u16       d9,  q11, #2
   2269         vmovl.u8        q8,  d2
   2270         vext.8          q2,  q0,  q1, #15
   2271         vmovl.u8        q9,  d3
   2272         vaddw.u8        q10, q15, d4
   2273         vaddw.u8        q11, q15, d5
   2274         vmlal.u8        q8,  d4,  d28
   2275         vmlal.u8        q9,  d5,  d28
   2276         vmlal.u8        q10, d2,  d28
   2277         vmlal.u8        q11, d3,  d28
   2278     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
   2279         vrshrn.u16      d6,  q8,  #2
   2280         vrshrn.u16      d7,  q9,  #2
   2281         vshrn.u16       d8,  q10, #2
   2282         vshrn.u16       d9,  q11, #2
   2283         vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
   2284 .endm
   2285 
   2286 /*
   2287  * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
   2288  */
   2289 .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
   2290     /* special case for the first and last pixels */
   2291     sub             \WIDTH, \WIDTH, #1
   2292     add             \OUTPTR, \OUTPTR, #1
   2293     ldrb            \TMP1, [\INPTR, \WIDTH]
   2294     strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
   2295     ldrb            \TMP1, [\INPTR], #1
   2296     strb            \TMP1, [\OUTPTR, #-1]
   2297     vmov.8          d3[7], \TMP1
   2298 
   2299     subs            \WIDTH, \WIDTH, #32
   2300     blt             5f
   2301 0:  /* process 32 pixels per iteration */
   2302     upsample32      \OUTPTR, \INPTR
   2303     subs            \WIDTH, \WIDTH, #32
   2304     bge             0b
   2305 5:
   2306     adds            \WIDTH, \WIDTH, #16
   2307     blt             1f
   2308 0:  /* process 16 pixels if needed */
   2309     upsample16      \OUTPTR, \INPTR
   2310     subs            \WIDTH, \WIDTH, #16
   2311 1:
   2312     adds            \WIDTH, \WIDTH, #16
   2313     beq             9f
   2314 
   2315     /* load the remaining 1-15 pixels */
   2316     add             \INPTR, \INPTR, \WIDTH
   2317     tst             \WIDTH, #1
   2318     beq             2f
   2319     sub             \INPTR, \INPTR, #1
   2320     vld1.8          {d0[0]}, [\INPTR]
   2321 2:
   2322     tst             \WIDTH, #2
   2323     beq             2f
   2324     vext.8          d0, d0, d0, #6
   2325     sub             \INPTR, \INPTR, #1
   2326     vld1.8          {d0[1]}, [\INPTR]
   2327     sub             \INPTR, \INPTR, #1
   2328     vld1.8          {d0[0]}, [\INPTR]
   2329 2:
   2330     tst             \WIDTH, #4
   2331     beq             2f
   2332     vrev64.32       d0, d0
   2333     sub             \INPTR, \INPTR, #1
   2334     vld1.8          {d0[3]}, [\INPTR]
   2335     sub             \INPTR, \INPTR, #1
   2336     vld1.8          {d0[2]}, [\INPTR]
   2337     sub             \INPTR, \INPTR, #1
   2338     vld1.8          {d0[1]}, [\INPTR]
   2339     sub             \INPTR, \INPTR, #1
   2340     vld1.8          {d0[0]}, [\INPTR]
   2341 2:
   2342     tst             \WIDTH, #8
   2343     beq             2f
   2344     vmov            d1,  d0
   2345     sub             \INPTR, \INPTR, #8
   2346     vld1.8          {d0}, [\INPTR]
   2347 2:  /* upsample the remaining pixels */
   2348     vmovl.u8        q8,  d0
   2349     vext.8          q2,  q1,  q0, #15
   2350     vmovl.u8        q9,  d1
   2351     vaddw.u8        q10, q15, d4
   2352     vaddw.u8        q11, q15, d5
   2353     vmlal.u8        q8,  d4,  d28
   2354     vmlal.u8        q9,  d5,  d28
   2355     vmlal.u8        q10, d0,  d28
   2356     vmlal.u8        q11, d1,  d28
   2357     vrshrn.u16      d10, q8,  #2
   2358     vrshrn.u16      d12, q9,  #2
   2359     vshrn.u16       d11, q10, #2
   2360     vshrn.u16       d13, q11, #2
   2361     vzip.8          d10, d11
   2362     vzip.8          d12, d13
   2363     /* store the remaining pixels */
   2364     tst             \WIDTH, #8
   2365     beq             2f
   2366     vst1.8          {d10, d11}, [\OUTPTR]!
   2367     vmov            q5,  q6
   2368 2:
   2369     tst             \WIDTH, #4
   2370     beq             2f
   2371     vst1.8          {d10}, [\OUTPTR]!
   2372     vmov            d10,  d11
   2373 2:
   2374     tst             \WIDTH, #2
   2375     beq             2f
   2376     vst1.8          {d10[0]}, [\OUTPTR]!
   2377     vst1.8          {d10[1]}, [\OUTPTR]!
   2378     vst1.8          {d10[2]}, [\OUTPTR]!
   2379     vst1.8          {d10[3]}, [\OUTPTR]!
   2380     vext.8          d10, d10, d10, #4
   2381 2:
   2382     tst             \WIDTH, #1
   2383     beq             2f
   2384     vst1.8          {d10[0]}, [\OUTPTR]!
   2385     vst1.8          {d10[1]}, [\OUTPTR]!
   2386 2:
   2387 9:
   2388 .endm
   2389 
   2390 asm_function jsimd_h2v1_fancy_upsample_neon
   2391 
   2392     MAX_V_SAMP_FACTOR .req r0
   2393     DOWNSAMPLED_WIDTH .req r1
   2394     INPUT_DATA        .req r2
   2395     OUTPUT_DATA_PTR   .req r3
   2396     OUTPUT_DATA       .req OUTPUT_DATA_PTR
   2397 
   2398     OUTPTR            .req r4
   2399     INPTR             .req r5
   2400     WIDTH             .req ip
   2401     TMP               .req lr
   2402 
   2403     push            {r4, r5, r6, lr}
   2404     vpush           {d8-d15}
   2405 
   2406     ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
   2407     cmp             MAX_V_SAMP_FACTOR, #0
   2408     ble             99f
   2409 
   2410     /* initialize constants */
   2411     vmov.u8         d28, #3
   2412     vmov.u16        q15, #1
   2413 11:
   2414     ldr             INPTR, [INPUT_DATA], #4
   2415     ldr             OUTPTR, [OUTPUT_DATA], #4
   2416     mov             WIDTH, DOWNSAMPLED_WIDTH
   2417     upsample_row    OUTPTR, INPTR, WIDTH, TMP
   2418     subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
   2419     bgt             11b
   2420 
   2421 99:
   2422     vpop            {d8-d15}
   2423     pop             {r4, r5, r6, pc}
   2424 
   2425     .unreq          MAX_V_SAMP_FACTOR
   2426     .unreq          DOWNSAMPLED_WIDTH
   2427     .unreq          INPUT_DATA
   2428     .unreq          OUTPUT_DATA_PTR
   2429     .unreq          OUTPUT_DATA
   2430 
   2431     .unreq          OUTPTR
   2432     .unreq          INPTR
   2433     .unreq          WIDTH
   2434     .unreq          TMP
   2435 
   2436 
   2437 .purgem upsample16
   2438 .purgem upsample32
   2439 .purgem upsample_row
   2440