      1 /*
      2  * ARMv7 NEON optimizations for libjpeg-turbo
      3  *
      4  * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
      5  * All Rights Reserved.
      6  * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
      7  * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
      8  * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
      9  * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
     10  * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
     11  *
     12  * This software is provided 'as-is', without any express or implied
     13  * warranty.  In no event will the authors be held liable for any damages
     14  * arising from the use of this software.
     15  *
     16  * Permission is granted to anyone to use this software for any purpose,
     17  * including commercial applications, and to alter it and redistribute it
     18  * freely, subject to the following restrictions:
     19  *
     20  * 1. The origin of this software must not be misrepresented; you must not
     21  *    claim that you wrote the original software. If you use this software
     22  *    in a product, an acknowledgment in the product documentation would be
     23  *    appreciated but is not required.
     24  * 2. Altered source versions must be plainly marked as such, and must not be
     25  *    misrepresented as being the original software.
     26  * 3. This notice may not be removed or altered from any source distribution.
     27  */
     28 
     29 #if defined(__linux__) && defined(__ELF__)
     30 .section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
     31 #endif
     32 
     33 .text
     34 .fpu neon
     35 .arch armv7a
     36 .object_arch armv4
     37 .arm
     38 .syntax unified
     39 
     40 
     41 #define RESPECT_STRICT_ALIGNMENT 1
     42 
     43 
     44 /*****************************************************************************/
     45 
     46 /* Supplementary macro for setting function attributes */
     47 .macro asm_function fname
     48 #ifdef __APPLE__
     49     .globl _\fname
     50 _\fname:
     51 #else
     52     .global \fname
     53 #ifdef __ELF__
     54     .hidden \fname
     55     .type \fname, %function
     56 #endif
     57 \fname:
     58 #endif
     59 .endm
     60 
     61 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
     62 .macro transpose_4x4 x0, x1, x2, x3
     63     vtrn.16         \x0, \x1
     64     vtrn.16         \x2, \x3
     65     vtrn.32         \x0, \x2
     66     vtrn.32         \x1, \x3
     67 .endm
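        /*
         * Illustration of the data movement performed by the two vtrn passes
         * above (a sketch of the intent, not assembled code):
         *
         *   input rows         after vtrn.16 (pairs)   after vtrn.32 (pairs)
         *   x0: a0 a1 a2 a3    a0 b0 a2 b2             a0 b0 c0 d0
         *   x1: b0 b1 b2 b3    a1 b1 a3 b3             a1 b1 c1 d1
         *   x2: c0 c1 c2 c3    c0 d0 c2 d2             a2 b2 c2 d2
         *   x3: d0 d1 d2 d3    c1 d1 c3 d3             a3 b3 c3 d3
         */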
     68 
     69 
     70 #define CENTERJSAMPLE 128
     71 
     72 /*****************************************************************************/
     73 
     74 /*
     75  * Perform dequantization and inverse DCT on one block of coefficients.
     76  *
     77  * GLOBAL(void)
     78  * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
     79  *                        JSAMPARRAY output_buf, JDIMENSION output_col)
     80  */
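        /*
         * Calling-convention note (paraphrasing the usual libjpeg conventions,
         * which the store code below relies on): output_buf is an array of row
         * pointers (JSAMPARRAY) and output_col is an offset into each row, so
         * row i of the 8x8 result is written to output_buf[i] + output_col.
         * A rough C sketch of the store step ('pixels' is a hypothetical local
         * buffer holding the final samples):
         *
         *   for (int i = 0; i < DCTSIZE; i++)
         *     memcpy(output_buf[i] + output_col, &pixels[i * DCTSIZE], DCTSIZE);
         */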
     81 
     82 #define FIX_0_298631336 (2446)
     83 #define FIX_0_390180644 (3196)
     84 #define FIX_0_541196100 (4433)
     85 #define FIX_0_765366865 (6270)
     86 #define FIX_0_899976223 (7373)
     87 #define FIX_1_175875602 (9633)
     88 #define FIX_1_501321110 (12299)
     89 #define FIX_1_847759065 (15137)
     90 #define FIX_1_961570560 (16069)
     91 #define FIX_2_053119869 (16819)
     92 #define FIX_2_562915447 (20995)
     93 #define FIX_3_072711026 (25172)
     94 
     95 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
     96 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
     97 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
     98 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
     99 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
    100 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
    101 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
    102 #define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
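        /*
         * The FIX_* values above are 13-bit fixed-point constants (see CONST_BITS
         * and FIX() in jidctint.c).  The combined MINUS/PLUS constants let a
         * rotation be written as two multiply-accumulates on the original inputs;
         * for example (an algebraic identity, shown here only for clarity):
         *
         *   row2 * FIX_0_541196100 + row6 * FIX_0_541196100_MINUS_1_847759065
         *     == (row2 + row6) * FIX(0.541196100) - row6 * FIX(1.847759065)
         *
         * which matches the even-part rotation in jidctint.c.
         */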
    103 
    104 /*
    105  * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
    106  * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
    107  */
    108 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
    109 {                                                                             \
    110     DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
    111     JLONG   q1, q2, q3, q4, q5, q6, q7;                                       \
    112     JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
    113                                                                               \
    114     /* 1-D iDCT input data */                                                 \
    115     row0 = xrow0;                                                             \
    116     row1 = xrow1;                                                             \
    117     row2 = xrow2;                                                             \
    118     row3 = xrow3;                                                             \
    119     row4 = xrow4;                                                             \
    120     row5 = xrow5;                                                             \
    121     row6 = xrow6;                                                             \
    122     row7 = xrow7;                                                             \
    123                                                                               \
    124     q5 = row7 + row3;                                                         \
    125     q4 = row5 + row1;                                                         \
    126     q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
    127          MULTIPLY(q4, FIX_1_175875602);                                       \
    128     q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
    129          MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
    130     q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
    131          MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
    132     q4 = q6;                                                                  \
    133     q3 = ((JLONG) row0 - (JLONG) row4) << 13;                                 \
    134     q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
    135           MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
    136     /* now we can use q1 (reloadable constants have been used up) */          \
    137     q1 = q3 + q2;                                                             \
    138     q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
    139           MULTIPLY(row1, -FIX_0_899976223);                                   \
    140     q5 = q7;                                                                  \
    141     q1 = q1 + q6;                                                             \
    142     q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
    143           MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
    144                                                                               \
    145     /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
    146     tmp11_plus_tmp2 = q1;                                                     \
    147     row1 = 0;                                                                 \
    148                                                                               \
    149     q1 = q1 - q6;                                                             \
    150     q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
    151           MULTIPLY(row3, -FIX_2_562915447);                                   \
    152     q1 = q1 - q6;                                                             \
    153     q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
    154          MULTIPLY(row6, FIX_0_541196100);                                     \
    155     q3 = q3 - q2;                                                             \
    156                                                                               \
    157     /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
    158     tmp11_minus_tmp2 = q1;                                                    \
    159                                                                               \
    160     q1 = ((JLONG) row0 + (JLONG) row4) << 13;                                 \
    161     q2 = q1 + q6;                                                             \
    162     q1 = q1 - q6;                                                             \
    163                                                                               \
    164     /* pick up the results */                                                 \
    165     tmp0  = q4;                                                               \
    166     tmp1  = q5;                                                               \
    167     tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
    168     tmp3  = q7;                                                               \
    169     tmp10 = q2;                                                               \
    170     tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
    171     tmp12 = q3;                                                               \
    172     tmp13 = q1;                                                               \
    173 }
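        /*
         * For reference, jpeg_idct_islow() (jidctint.c) combines these
         * intermediates into the eight outputs as follows (descaling omitted):
         *
         *   out0 = tmp10 + tmp3;   out7 = tmp10 - tmp3;
         *   out1 = tmp11 + tmp2;   out6 = tmp11 - tmp2;
         *   out2 = tmp12 + tmp1;   out5 = tmp12 - tmp1;
         *   out3 = tmp13 + tmp0;   out4 = tmp13 - tmp0;
         */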
    174 
    175 #define XFIX_0_899976223                   d0[0]
    176 #define XFIX_0_541196100                   d0[1]
    177 #define XFIX_2_562915447                   d0[2]
    178 #define XFIX_0_298631336_MINUS_0_899976223 d0[3]
    179 #define XFIX_1_501321110_MINUS_0_899976223 d1[0]
    180 #define XFIX_2_053119869_MINUS_2_562915447 d1[1]
    181 #define XFIX_0_541196100_PLUS_0_765366865  d1[2]
    182 #define XFIX_1_175875602                   d1[3]
    183 #define XFIX_1_175875602_MINUS_0_390180644 d2[0]
    184 #define XFIX_0_541196100_MINUS_1_847759065 d2[1]
    185 #define XFIX_3_072711026_MINUS_2_562915447 d2[2]
    186 #define XFIX_1_175875602_MINUS_1_961570560 d2[3]
    187 
    188 .balign 16
    189 jsimd_idct_islow_neon_consts:
    190   .short FIX_0_899976223                    /* d0[0] */
    191   .short FIX_0_541196100                    /* d0[1] */
    192   .short FIX_2_562915447                    /* d0[2] */
    193   .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    194   .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    195   .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    196   .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    197   .short FIX_1_175875602                    /* d1[3] */
    198   /* reloadable constants */
    199   .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    200   .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    201   .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    202   .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
    203 
    204 asm_function jsimd_idct_islow_neon
    205 
    206     DCT_TABLE       .req r0
    207     COEF_BLOCK      .req r1
    208     OUTPUT_BUF      .req r2
    209     OUTPUT_COL      .req r3
    210     TMP1            .req r0
    211     TMP2            .req r1
    212     TMP3            .req r2
    213     TMP4            .req ip
    214 
    215     ROW0L           .req d16
    216     ROW0R           .req d17
    217     ROW1L           .req d18
    218     ROW1R           .req d19
    219     ROW2L           .req d20
    220     ROW2R           .req d21
    221     ROW3L           .req d22
    222     ROW3R           .req d23
    223     ROW4L           .req d24
    224     ROW4R           .req d25
    225     ROW5L           .req d26
    226     ROW5R           .req d27
    227     ROW6L           .req d28
    228     ROW6R           .req d29
    229     ROW7L           .req d30
    230     ROW7R           .req d31
    231 
    232     /* Load and dequantize coefficients into NEON registers
    233      * with the following allocation:
    234      *       0 1 2 3 | 4 5 6 7
    235      *      ---------+--------
    236      *   0 | d16     | d17     ( q8  )
    237      *   1 | d18     | d19     ( q9  )
    238      *   2 | d20     | d21     ( q10 )
    239      *   3 | d22     | d23     ( q11 )
    240      *   4 | d24     | d25     ( q12 )
    241      *   5 | d26     | d27     ( q13 )
    242      *   6 | d28     | d29     ( q14 )
    243      *   7 | d30     | d31     ( q15 )
    244      */
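            /* In C terms, the dequantization below is just an element-wise
             * multiply of the coefficient block by the quantization table
             * (a sketch of the intent; 'workspace' is a hypothetical buffer):
             *
             *   for (i = 0; i < DCTSIZE2; i++)
             *     workspace[i] = (DCTELEM) (coef_block[i] * dct_table[i]);
             */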
    245     adr             ip, jsimd_idct_islow_neon_consts
    246     vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    247     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    248     vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    249     vmul.s16        q8, q8, q0
    250     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    251     vmul.s16        q9, q9, q1
    252     vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    253     vmul.s16        q10, q10, q2
    254     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    255     vmul.s16        q11, q11, q3
    256     vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    257     vmul.s16        q12, q12, q0
    258     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    259     vmul.s16        q14, q14, q2
    260     vmul.s16        q13, q13, q1
    261     vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
    262     add             ip, ip, #16
    263     vmul.s16        q15, q15, q3
    264     vpush           {d8-d15}                      /* save NEON registers */
    265     /* 1-D IDCT, pass 1, left 4x8 half */
    266     vadd.s16        d4, ROW7L, ROW3L
    267     vadd.s16        d5, ROW5L, ROW1L
    268     vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
    269     vmlal.s16       q6, d5, XFIX_1_175875602
    270     vmull.s16       q7, d4, XFIX_1_175875602
    271       /* Check for the zero coefficients in the right 4x8 half */
    272       push            {r4, r5}
    273     vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
    274     vsubl.s16       q3, ROW0L, ROW4L
    275       ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    276     vmull.s16       q2, ROW2L, XFIX_0_541196100
    277     vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
    278       orr             r0, r4, r5
    279     vmov            q4, q6
    280     vmlsl.s16       q6, ROW5L, XFIX_2_562915447
    281       ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    282     vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    283     vshl.s32        q3, q3, #13
    284       orr             r0, r0, r4
    285     vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    286       orr             r0, r0, r5
    287     vadd.s32        q1, q3, q2
    288       ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    289     vmov            q5, q7
    290     vadd.s32        q1, q1, q6
    291       orr             r0, r0, r4
    292     vmlsl.s16       q7, ROW7L, XFIX_0_899976223
    293       orr             r0, r0, r5
    294     vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    295     vrshrn.s32      ROW1L, q1, #11
    296       ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    297     vsub.s32        q1, q1, q6
    298     vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
    299       orr             r0, r0, r4
    300     vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    301       orr             r0, r0, r5
    302     vsub.s32        q1, q1, q6
    303     vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    304       ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    305     vmlal.s16       q6, ROW6L, XFIX_0_541196100
    306     vsub.s32        q3, q3, q2
    307       orr             r0, r0, r4
    308     vrshrn.s32      ROW6L, q1, #11
    309       orr             r0, r0, r5
    310     vadd.s32        q1, q3, q5
    311       ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    312     vsub.s32        q3, q3, q5
    313     vaddl.s16       q5, ROW0L, ROW4L
    314       orr             r0, r0, r4
    315     vrshrn.s32      ROW2L, q1, #11
    316       orr             r0, r0, r5
    317     vrshrn.s32      ROW5L, q3, #11
    318       ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    319     vshl.s32        q5, q5, #13
    320     vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
    321       orr             r0, r0, r4
    322     vadd.s32        q2, q5, q6
    323       orrs            r0, r0, r5
    324     vsub.s32        q1, q5, q6
    325     vadd.s32        q6, q2, q7
    326       ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    327     vsub.s32        q2, q2, q7
    328     vadd.s32        q5, q1, q4
    329       orr             r0, r4, r5
    330     vsub.s32        q3, q1, q4
    331       pop             {r4, r5}
    332     vrshrn.s32      ROW7L, q2, #11
    333     vrshrn.s32      ROW3L, q5, #11
    334     vrshrn.s32      ROW0L, q6, #11
    335     vrshrn.s32      ROW4L, q3, #11
    336 
    337       beq             3f  /* Branch to special handling for the sparse
    338                              right 4x8 half */
    339 
    340     /* 1-D IDCT, pass 1, right 4x8 half */
    341     vld1.s16        {d2}, [ip, :64]  /* reload constants */
    342     vadd.s16        d10, ROW7R, ROW3R
    343     vadd.s16        d8, ROW5R, ROW1R
    344       /* Transpose left 4x8 half */
    345       vtrn.16         ROW6L, ROW7L
    346     vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
    347     vmlal.s16       q6, d8, XFIX_1_175875602
    348       vtrn.16         ROW2L, ROW3L
    349     vmull.s16       q7, d10, XFIX_1_175875602
    350     vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
    351       vtrn.16         ROW0L, ROW1L
    352     vsubl.s16       q3, ROW0R, ROW4R
    353     vmull.s16       q2, ROW2R, XFIX_0_541196100
    354     vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    355       vtrn.16         ROW4L, ROW5L
    356     vmov            q4, q6
    357     vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    358     vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
    359       vtrn.32         ROW1L, ROW3L
    360     vshl.s32        q3, q3, #13
    361     vmlsl.s16       q4, ROW1R, XFIX_0_899976223
    362       vtrn.32         ROW4L, ROW6L
    363     vadd.s32        q1, q3, q2
    364     vmov            q5, q7
    365     vadd.s32        q1, q1, q6
    366       vtrn.32         ROW0L, ROW2L
    367     vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    368     vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
    369     vrshrn.s32      ROW1R, q1, #11
    370       vtrn.32         ROW5L, ROW7L
    371     vsub.s32        q1, q1, q6
    372     vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    373     vmlsl.s16       q5, ROW3R, XFIX_2_562915447
    374     vsub.s32        q1, q1, q6
    375     vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
    376     vmlal.s16       q6, ROW6R, XFIX_0_541196100
    377     vsub.s32        q3, q3, q2
    378     vrshrn.s32      ROW6R, q1, #11
    379     vadd.s32        q1, q3, q5
    380     vsub.s32        q3, q3, q5
    381     vaddl.s16       q5, ROW0R, ROW4R
    382     vrshrn.s32      ROW2R, q1, #11
    383     vrshrn.s32      ROW5R, q3, #11
    384     vshl.s32        q5, q5, #13
    385     vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    386     vadd.s32        q2, q5, q6
    387     vsub.s32        q1, q5, q6
    388     vadd.s32        q6, q2, q7
    389     vsub.s32        q2, q2, q7
    390     vadd.s32        q5, q1, q4
    391     vsub.s32        q3, q1, q4
    392     vrshrn.s32      ROW7R, q2, #11
    393     vrshrn.s32      ROW3R, q5, #11
    394     vrshrn.s32      ROW0R, q6, #11
    395     vrshrn.s32      ROW4R, q3, #11
    396     /* Transpose right 4x8 half */
    397     vtrn.16         ROW6R, ROW7R
    398     vtrn.16         ROW2R, ROW3R
    399     vtrn.16         ROW0R, ROW1R
    400     vtrn.16         ROW4R, ROW5R
    401     vtrn.32         ROW1R, ROW3R
    402     vtrn.32         ROW4R, ROW6R
    403     vtrn.32         ROW0R, ROW2R
    404     vtrn.32         ROW5R, ROW7R
    405 
    406 1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    407     vld1.s16        {d2}, [ip, :64]               /* reload constants */
    408     vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
    409     vmlal.s16       q6, ROW1L, XFIX_1_175875602
    410     vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    411     vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    412     vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
    413     vmlal.s16       q7, ROW3L, XFIX_1_175875602
    414     vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    415     vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    416     vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
    417     vmull.s16       q2, ROW2L, XFIX_0_541196100
    418     vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
    419     vmov            q4, q6
    420     vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
    421     vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    422     vshl.s32        q3, q3, #13
    423     vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    424     vadd.s32        q1, q3, q2
    425     vmov            q5, q7
    426     vadd.s32        q1, q1, q6
    427     vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
    428     vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    429     vshrn.s32       ROW1L, q1, #16
    430     vsub.s32        q1, q1, q6
    431     vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
    432     vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    433     vsub.s32        q1, q1, q6
    434     vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    435     vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
    436     vsub.s32        q3, q3, q2
    437     vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
    438     vadd.s32        q1, q3, q5
    439     vsub.s32        q3, q3, q5
    440     vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
    441     vshrn.s32       ROW2L, q1, #16
    442     vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
    443     vshl.s32        q5, q5, #13
    444     vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
    445     vadd.s32        q2, q5, q6
    446     vsub.s32        q1, q5, q6
    447     vadd.s32        q6, q2, q7
    448     vsub.s32        q2, q2, q7
    449     vadd.s32        q5, q1, q4
    450     vsub.s32        q3, q1, q4
    451     vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
    452     vshrn.s32       ROW3L, q5, #16
    453     vshrn.s32       ROW0L, q6, #16
    454     vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
    455     /* 1-D IDCT, pass 2, right 4x8 half */
    456     vld1.s16        {d2}, [ip, :64]               /* reload constants */
    457     vmull.s16       q6, ROW5R, XFIX_1_175875602
    458     vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
    459     vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
    460     vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
    461     vmull.s16       q7, ROW7R, XFIX_1_175875602
    462     vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
    463     vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
    464     vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
    465     vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
    466     vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
    467     vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
    468     vmov            q4, q6
    469     vmlsl.s16       q6, ROW5R, XFIX_2_562915447
    470     vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
    471     vshl.s32        q3, q3, #13
    472     vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
    473     vadd.s32        q1, q3, q2
    474     vmov            q5, q7
    475     vadd.s32        q1, q1, q6
    476     vmlsl.s16       q7, ROW7R, XFIX_0_899976223
    477     vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
    478     vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
    479     vsub.s32        q1, q1, q6
    480     vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
    481     vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
    482     vsub.s32        q1, q1, q6
    483     vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
    484     vmlal.s16       q6, ROW6R, XFIX_0_541196100
    485     vsub.s32        q3, q3, q2
    486     vshrn.s32       ROW6R, q1, #16
    487     vadd.s32        q1, q3, q5
    488     vsub.s32        q3, q3, q5
    489     vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
    490     vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
    491     vshrn.s32       ROW5R, q3, #16
    492     vshl.s32        q5, q5, #13
    493     vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
    494     vadd.s32        q2, q5, q6
    495     vsub.s32        q1, q5, q6
    496     vadd.s32        q6, q2, q7
    497     vsub.s32        q2, q2, q7
    498     vadd.s32        q5, q1, q4
    499     vsub.s32        q3, q1, q4
    500     vshrn.s32       ROW7R, q2, #16
    501     vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
    502     vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
    503     vshrn.s32       ROW4R, q3, #16
    504 
    505 2:  /* Descale to 8-bit and range limit */
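            /* At this point each sample is still scaled up by a factor of 4 and
             * centered around zero.  The rounding narrow by #2, the signed
             * saturation to 8 bits, and the +CENTERJSAMPLE addition below
             * together implement the final descale and range limiting.
             */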
    506     vqrshrn.s16     d16, q8, #2
    507     vqrshrn.s16     d17, q9, #2
    508     vqrshrn.s16     d18, q10, #2
    509     vqrshrn.s16     d19, q11, #2
    510     vpop            {d8-d15}                      /* restore NEON registers */
    511     vqrshrn.s16     d20, q12, #2
    512       /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    513       vtrn.16         q8, q9
    514     vqrshrn.s16     d21, q13, #2
    515     vqrshrn.s16     d22, q14, #2
    516       vmov.u8         q0, #(CENTERJSAMPLE)
    517     vqrshrn.s16     d23, q15, #2
    518       vtrn.8          d16, d17
    519       vtrn.8          d18, d19
    520       vadd.u8         q8, q8, q0
    521       vadd.u8         q9, q9, q0
    522       vtrn.16         q10, q11
    523         /* Store results to the output buffer */
    524         ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    525         add             TMP1, TMP1, OUTPUT_COL
    526         add             TMP2, TMP2, OUTPUT_COL
    527         vst1.8          {d16}, [TMP1]
    528       vtrn.8          d20, d21
    529         vst1.8          {d17}, [TMP2]
    530         ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    531         add             TMP1, TMP1, OUTPUT_COL
    532         add             TMP2, TMP2, OUTPUT_COL
    533         vst1.8          {d18}, [TMP1]
    534       vadd.u8         q10, q10, q0
    535         vst1.8          {d19}, [TMP2]
    536         ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    537         add             TMP1, TMP1, OUTPUT_COL
    538         add             TMP2, TMP2, OUTPUT_COL
    539         add             TMP3, TMP3, OUTPUT_COL
    540         add             TMP4, TMP4, OUTPUT_COL
    541       vtrn.8          d22, d23
    542         vst1.8          {d20}, [TMP1]
    543       vadd.u8         q11, q11, q0
    544         vst1.8          {d21}, [TMP2]
    545         vst1.8          {d22}, [TMP3]
    546         vst1.8          {d23}, [TMP4]
    547     bx              lr
    548 
    549 3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
    550 
    551     /* Transpose left 4x8 half */
    552     vtrn.16         ROW6L, ROW7L
    553     vtrn.16         ROW2L, ROW3L
    554     vtrn.16         ROW0L, ROW1L
    555     vtrn.16         ROW4L, ROW5L
    556     vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
    557     vtrn.32         ROW1L, ROW3L
    558     vtrn.32         ROW4L, ROW6L
    559     vtrn.32         ROW0L, ROW2L
    560     vtrn.32         ROW5L, ROW7L
    561 
    562     cmp             r0, #0
    563     beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
    564                            pass */
    565 
    566     /* Only row 0 is non-zero for the right 4x8 half  */
    567     vdup.s16        ROW1R, ROW0R[1]
    568     vdup.s16        ROW2R, ROW0R[2]
    569     vdup.s16        ROW3R, ROW0R[3]
    570     vdup.s16        ROW4R, ROW0R[0]
    571     vdup.s16        ROW5R, ROW0R[1]
    572     vdup.s16        ROW6R, ROW0R[2]
    573     vdup.s16        ROW7R, ROW0R[3]
    574     vdup.s16        ROW0R, ROW0R[0]
    575     b               1b  /* Go to 'normal' second pass */
    576 
    577 4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    578     vld1.s16        {d2}, [ip, :64]               /* reload constants */
    579     vmull.s16       q6, ROW1L, XFIX_1_175875602
    580     vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
    581     vmull.s16       q7, ROW3L, XFIX_1_175875602
    582     vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
    583     vmull.s16       q2, ROW2L, XFIX_0_541196100
    584     vshll.s16       q3, ROW0L, #13
    585     vmov            q4, q6
    586     vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
    587     vmlsl.s16       q4, ROW1L, XFIX_0_899976223
    588     vadd.s32        q1, q3, q2
    589     vmov            q5, q7
    590     vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
    591     vadd.s32        q1, q1, q6
    592     vadd.s32        q6, q6, q6
    593     vmlsl.s16       q5, ROW3L, XFIX_2_562915447
    594     vshrn.s32       ROW1L, q1, #16
    595     vsub.s32        q1, q1, q6
    596     vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
    597     vsub.s32        q3, q3, q2
    598     vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
    599     vadd.s32        q1, q3, q5
    600     vsub.s32        q3, q3, q5
    601     vshll.s16       q5, ROW0L, #13
    602     vshrn.s32       ROW2L, q1, #16
    603     vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
    604     vadd.s32        q2, q5, q6
    605     vsub.s32        q1, q5, q6
    606     vadd.s32        q6, q2, q7
    607     vsub.s32        q2, q2, q7
    608     vadd.s32        q5, q1, q4
    609     vsub.s32        q3, q1, q4
    610     vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
    611     vshrn.s32       ROW3L, q5, #16
    612     vshrn.s32       ROW0L, q6, #16
    613     vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
    614     /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    615     vld1.s16        {d2}, [ip, :64]               /* reload constants */
    616     vmull.s16       q6, ROW5L, XFIX_1_175875602
    617     vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
    618     vmull.s16       q7, ROW7L, XFIX_1_175875602
    619     vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
    620     vmull.s16       q2, ROW6L, XFIX_0_541196100
    621     vshll.s16       q3, ROW4L, #13
    622     vmov            q4, q6
    623     vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
    624     vmlsl.s16       q4, ROW5L, XFIX_0_899976223
    625     vadd.s32        q1, q3, q2
    626     vmov            q5, q7
    627     vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
    628     vadd.s32        q1, q1, q6
    629     vadd.s32        q6, q6, q6
    630     vmlsl.s16       q5, ROW7L, XFIX_2_562915447
    631     vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
    632     vsub.s32        q1, q1, q6
    633     vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
    634     vsub.s32        q3, q3, q2
    635     vshrn.s32       ROW6R, q1, #16
    636     vadd.s32        q1, q3, q5
    637     vsub.s32        q3, q3, q5
    638     vshll.s16       q5, ROW4L, #13
    639     vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
    640     vshrn.s32       ROW5R, q3, #16
    641     vadd.s32        q2, q5, q6
    642     vsub.s32        q1, q5, q6
    643     vadd.s32        q6, q2, q7
    644     vsub.s32        q2, q2, q7
    645     vadd.s32        q5, q1, q4
    646     vsub.s32        q3, q1, q4
    647     vshrn.s32       ROW7R, q2, #16
    648     vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
    649     vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
    650     vshrn.s32       ROW4R, q3, #16
    651     b               2b                            /* Go to epilogue */
    652 
    653     .unreq          DCT_TABLE
    654     .unreq          COEF_BLOCK
    655     .unreq          OUTPUT_BUF
    656     .unreq          OUTPUT_COL
    657     .unreq          TMP1
    658     .unreq          TMP2
    659     .unreq          TMP3
    660     .unreq          TMP4
    661 
    662     .unreq          ROW0L
    663     .unreq          ROW0R
    664     .unreq          ROW1L
    665     .unreq          ROW1R
    666     .unreq          ROW2L
    667     .unreq          ROW2R
    668     .unreq          ROW3L
    669     .unreq          ROW3R
    670     .unreq          ROW4L
    671     .unreq          ROW4R
    672     .unreq          ROW5L
    673     .unreq          ROW5R
    674     .unreq          ROW6L
    675     .unreq          ROW6R
    676     .unreq          ROW7L
    677     .unreq          ROW7R
    678 
    679 
    680 /*****************************************************************************/
    681 
    682 /*
    683  * jsimd_idct_ifast_neon
    684  *
    685  * This function contains a fast, but less accurate, integer implementation
    686  * of the inverse DCT (Discrete Cosine Transform). It uses the same
    687  * calculations and produces exactly the same output as IJG's original
    688  * 'jpeg_idct_ifast' function from jidctfst.c.
    689  *
    690  * Normally, a 1-D AAN DCT needs 5 multiplications and 29 additions.
    691  * In the ARM NEON case, however, some extra additions are required because
    692  * the VQDMULH instruction cannot handle constants larger than 1, so
    693  * expressions like "x * 1.082392200" have to be converted to
    694  * "x * 0.082392200 + x", which costs an extra addition. Overall, there are
    695  * 6 extra additions per pass, for a per-pass total of 5 VQDMULH and 35 VADD/VSUB.
    696  */
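        /*
         * Worked example of the constant splitting described above (an
         * illustrative C sketch, assuming VQDMULH computes (a * b * 2) >> 16):
         *
         *   // x * 1.082392200 is computed as x + vqdmulh(x, XFIX_1_082392200),
         *   // where XFIX_1_082392200 = (277 - 256) * 128, i.e. (277/256 - 1) in Q15.
         *   int16_t mul_1_082392200(int16_t x)
         *   {
         *     int16_t frac = (int16_t) (((int32_t) x * ((277 - 256) * 128) * 2) >> 16);
         *     return (int16_t) (x + frac);
         *   }
         *
         * Constants of 2 or more (XFIX_2_613125930) additionally add the input
         * twice, as done with 'vadd.s16 q3, q1, q1' below.
         */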
    697 
    698 #define XFIX_1_082392200 d0[0]
    699 #define XFIX_1_414213562 d0[1]
    700 #define XFIX_1_847759065 d0[2]
    701 #define XFIX_2_613125930 d0[3]
    702 
    703 .balign 16
    704 jsimd_idct_ifast_neon_consts:
    705   .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
    706   .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
    707   .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
    708   .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
    709 
    710 asm_function jsimd_idct_ifast_neon
    711 
    712     DCT_TABLE       .req r0
    713     COEF_BLOCK      .req r1
    714     OUTPUT_BUF      .req r2
    715     OUTPUT_COL      .req r3
    716     TMP1            .req r0
    717     TMP2            .req r1
    718     TMP3            .req r2
    719     TMP4            .req ip
    720 
    721     /* Load and dequantize coefficients into NEON registers
    722      * with the following allocation:
    723      *       0 1 2 3 | 4 5 6 7
    724      *      ---------+--------
    725      *   0 | d16     | d17     ( q8  )
    726      *   1 | d18     | d19     ( q9  )
    727      *   2 | d20     | d21     ( q10 )
    728      *   3 | d22     | d23     ( q11 )
    729      *   4 | d24     | d25     ( q12 )
    730      *   5 | d26     | d27     ( q13 )
    731      *   6 | d28     | d29     ( q14 )
    732      *   7 | d30     | d31     ( q15 )
    733      */
    734     adr             ip, jsimd_idct_ifast_neon_consts
    735     vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    736     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    737     vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    738     vmul.s16        q8, q8, q0
    739     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    740     vmul.s16        q9, q9, q1
    741     vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    742     vmul.s16        q10, q10, q2
    743     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    744     vmul.s16        q11, q11, q3
    745     vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    746     vmul.s16        q12, q12, q0
    747     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    748     vmul.s16        q14, q14, q2
    749     vmul.s16        q13, q13, q1
    750     vld1.16         {d0}, [ip, :64]  /* load constants */
    751     vmul.s16        q15, q15, q3
    752     vpush           {d8-d13}         /* save NEON registers */
    753     /* 1-D IDCT, pass 1 */
    754     vsub.s16        q2, q10, q14
    755     vadd.s16        q14, q10, q14
    756     vsub.s16        q1, q11, q13
    757     vadd.s16        q13, q11, q13
    758     vsub.s16        q5, q9, q15
    759     vadd.s16        q15, q9, q15
    760     vqdmulh.s16     q4, q2, XFIX_1_414213562
    761     vqdmulh.s16     q6, q1, XFIX_2_613125930
    762     vadd.s16        q3, q1, q1
    763     vsub.s16        q1, q5, q1
    764     vadd.s16        q10, q2, q4
    765     vqdmulh.s16     q4, q1, XFIX_1_847759065
    766     vsub.s16        q2, q15, q13
    767     vadd.s16        q3, q3, q6
    768     vqdmulh.s16     q6, q2, XFIX_1_414213562
    769     vadd.s16        q1, q1, q4
    770     vqdmulh.s16     q4, q5, XFIX_1_082392200
    771     vsub.s16        q10, q10, q14
    772     vadd.s16        q2, q2, q6
    773     vsub.s16        q6, q8, q12
    774     vadd.s16        q12, q8, q12
    775     vadd.s16        q9, q5, q4
    776     vadd.s16        q5, q6, q10
    777     vsub.s16        q10, q6, q10
    778     vadd.s16        q6, q15, q13
    779     vadd.s16        q8, q12, q14
    780     vsub.s16        q3, q6, q3
    781     vsub.s16        q12, q12, q14
    782     vsub.s16        q3, q3, q1
    783     vsub.s16        q1, q9, q1
    784     vadd.s16        q2, q3, q2
    785     vsub.s16        q15, q8, q6
    786     vadd.s16        q1, q1, q2
    787     vadd.s16        q8, q8, q6
    788     vadd.s16        q14, q5, q3
    789     vsub.s16        q9, q5, q3
    790     vsub.s16        q13, q10, q2
    791     vadd.s16        q10, q10, q2
    792       /* Transpose */
    793       vtrn.16         q8, q9
    794     vsub.s16        q11, q12, q1
    795       vtrn.16         q14, q15
    796     vadd.s16        q12, q12, q1
    797       vtrn.16         q10, q11
    798       vtrn.16         q12, q13
    799       vtrn.32         q9, q11
    800       vtrn.32         q12, q14
    801       vtrn.32         q8, q10
    802       vtrn.32         q13, q15
    803       vswp            d28, d21
    804       vswp            d26, d19
    805     /* 1-D IDCT, pass 2 */
    806     vsub.s16        q2, q10, q14
    807       vswp            d30, d23
    808     vadd.s16        q14, q10, q14
    809       vswp            d24, d17
    810     vsub.s16        q1, q11, q13
    811     vadd.s16        q13, q11, q13
    812     vsub.s16        q5, q9, q15
    813     vadd.s16        q15, q9, q15
    814     vqdmulh.s16     q4, q2, XFIX_1_414213562
    815     vqdmulh.s16     q6, q1, XFIX_2_613125930
    816     vadd.s16        q3, q1, q1
    817     vsub.s16        q1, q5, q1
    818     vadd.s16        q10, q2, q4
    819     vqdmulh.s16     q4, q1, XFIX_1_847759065
    820     vsub.s16        q2, q15, q13
    821     vadd.s16        q3, q3, q6
    822     vqdmulh.s16     q6, q2, XFIX_1_414213562
    823     vadd.s16        q1, q1, q4
    824     vqdmulh.s16     q4, q5, XFIX_1_082392200
    825     vsub.s16        q10, q10, q14
    826     vadd.s16        q2, q2, q6
    827     vsub.s16        q6, q8, q12
    828     vadd.s16        q12, q8, q12
    829     vadd.s16        q9, q5, q4
    830     vadd.s16        q5, q6, q10
    831     vsub.s16        q10, q6, q10
    832     vadd.s16        q6, q15, q13
    833     vadd.s16        q8, q12, q14
    834     vsub.s16        q3, q6, q3
    835     vsub.s16        q12, q12, q14
    836     vsub.s16        q3, q3, q1
    837     vsub.s16        q1, q9, q1
    838     vadd.s16        q2, q3, q2
    839     vsub.s16        q15, q8, q6
    840     vadd.s16        q1, q1, q2
    841     vadd.s16        q8, q8, q6
    842     vadd.s16        q14, q5, q3
    843     vsub.s16        q9, q5, q3
    844     vsub.s16        q13, q10, q2
    845     vpop            {d8-d13}      /* restore NEON registers */
    846     vadd.s16        q10, q10, q2
    847     vsub.s16        q11, q12, q1
    848     vadd.s16        q12, q12, q1
    849     /* Descale to 8-bit and range limit */
    850     vmov.u8         q0, #0x80
    851     vqshrn.s16      d16, q8, #5
    852     vqshrn.s16      d17, q9, #5
    853     vqshrn.s16      d18, q10, #5
    854     vqshrn.s16      d19, q11, #5
    855     vqshrn.s16      d20, q12, #5
    856     vqshrn.s16      d21, q13, #5
    857     vqshrn.s16      d22, q14, #5
    858     vqshrn.s16      d23, q15, #5
    859     vadd.u8         q8, q8, q0
    860     vadd.u8         q9, q9, q0
    861     vadd.u8         q10, q10, q0
    862     vadd.u8         q11, q11, q0
    863     /* Transpose the final 8-bit samples */
    864     vtrn.16         q8, q9
    865     vtrn.16         q10, q11
    866     vtrn.32         q8, q10
    867     vtrn.32         q9, q11
    868     vtrn.8          d16, d17
    869     vtrn.8          d18, d19
    870       /* Store results to the output buffer */
    871       ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    872       add             TMP1, TMP1, OUTPUT_COL
    873       add             TMP2, TMP2, OUTPUT_COL
    874       vst1.8          {d16}, [TMP1]
    875       vst1.8          {d17}, [TMP2]
    876       ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    877       add             TMP1, TMP1, OUTPUT_COL
    878       add             TMP2, TMP2, OUTPUT_COL
    879       vst1.8          {d18}, [TMP1]
    880     vtrn.8          d20, d21
    881       vst1.8          {d19}, [TMP2]
    882       ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    883       add             TMP1, TMP1, OUTPUT_COL
    884       add             TMP2, TMP2, OUTPUT_COL
    885       add             TMP3, TMP3, OUTPUT_COL
    886       add             TMP4, TMP4, OUTPUT_COL
    887       vst1.8          {d20}, [TMP1]
    888     vtrn.8          d22, d23
    889       vst1.8          {d21}, [TMP2]
    890       vst1.8          {d22}, [TMP3]
    891       vst1.8          {d23}, [TMP4]
    892     bx              lr
    893 
    894     .unreq          DCT_TABLE
    895     .unreq          COEF_BLOCK
    896     .unreq          OUTPUT_BUF
    897     .unreq          OUTPUT_COL
    898     .unreq          TMP1
    899     .unreq          TMP2
    900     .unreq          TMP3
    901     .unreq          TMP4
    902 
    903 
    904 /*****************************************************************************/
    905 
    906 /*
    907  * jsimd_idct_4x4_neon
    908  *
    909  * This function contains inverse-DCT code for producing reduced-size
    910  * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
    911  * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
    912  * function from jpeg-6b (jidctred.c).
    913  *
    914  * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
    915  *       requires far fewer arithmetic operations and hence should be faster.
    916  *       The primary purpose of this particular NEON-optimized function is
    917  *       bit-exact compatibility with jpeg-6b.
    918  *
    919  * TODO: Slightly better instruction scheduling could be achieved by expanding
    920  *       the idct_helper/transpose_4x4 macros and reordering instructions,
    921  *       but readability would suffer somewhat.
    922  */
    923 
    924 #define CONST_BITS  13
    925 
    926 #define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
    927 #define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
    928 #define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
    929 #define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
    930 #define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
    931 #define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
    932 #define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
    933 #define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
    934 #define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
    935 #define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
    936 #define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
    937 #define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
    938 #define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
    939 #define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
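        /*
         * The FIX() annotations above use the usual libjpeg convention (a sketch
         * mirroring jidctred.c):
         *
         *   #define FIX(x)  ((JLONG) ((x) * (1 << CONST_BITS) + 0.5))
         *
         *   e.g. FIX(1.847759065) = (JLONG) (1.847759065 * 8192 + 0.5) = 15137
         */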
    940 
    941 .balign 16
    942 jsimd_idct_4x4_neon_consts:
    943   .short FIX_1_847759065      /* d0[0] */
    944   .short -FIX_0_765366865     /* d0[1] */
    945   .short -FIX_0_211164243     /* d0[2] */
    946   .short FIX_1_451774981      /* d0[3] */
    947   .short -FIX_2_172734803     /* d1[0] */
    948   .short FIX_1_061594337      /* d1[1] */
    949   .short -FIX_0_509795579     /* d1[2] */
    950   .short -FIX_0_601344887     /* d1[3] */
    951   .short FIX_0_899976223      /* d2[0] */
    952   .short FIX_2_562915447      /* d2[1] */
    953   .short 1 << (CONST_BITS+1)  /* d2[2] */
    954   .short 0                    /* d2[3] */
    955 
    956 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    957     vmull.s16       q14, \x4, d2[2]
    958     vmlal.s16       q14, \x8, d0[0]
    959     vmlal.s16       q14, \x14, d0[1]
    960 
    961     vmull.s16       q13, \x16, d1[2]
    962     vmlal.s16       q13, \x12, d1[3]
    963     vmlal.s16       q13, \x10, d2[0]
    964     vmlal.s16       q13, \x6, d2[1]
    965 
    966     vmull.s16       q15, \x4, d2[2]
    967     vmlsl.s16       q15, \x8, d0[0]
    968     vmlsl.s16       q15, \x14, d0[1]
    969 
    970     vmull.s16       q12, \x16, d0[2]
    971     vmlal.s16       q12, \x12, d0[3]
    972     vmlal.s16       q12, \x10, d1[0]
    973     vmlal.s16       q12, \x6, d1[1]
    974 
    975     vadd.s32        q10, q14, q13
    976     vsub.s32        q14, q14, q13
    977 
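          /* vrshrn.s32 can only encode a narrowing shift of 1..16 bits, so
           * descales larger than 16 (the second pass uses 19) are presumably
           * done as a full-width rounding shift followed by a separate narrow.
           */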
    978   .if \shift > 16
    979     vrshr.s32       q10, q10, #\shift
    980     vrshr.s32       q14, q14, #\shift
    981     vmovn.s32       \y26, q10
    982     vmovn.s32       \y29, q14
    983   .else
    984     vrshrn.s32      \y26, q10, #\shift
    985     vrshrn.s32      \y29, q14, #\shift
    986   .endif
    987 
    988     vadd.s32        q10, q15, q12
    989     vsub.s32        q15, q15, q12
    990 
    991   .if \shift > 16
    992     vrshr.s32       q10, q10, #\shift
    993     vrshr.s32       q15, q15, #\shift
    994     vmovn.s32       \y27, q10
    995     vmovn.s32       \y28, q15
    996   .else
    997     vrshrn.s32      \y27, q10, #\shift
    998     vrshrn.s32      \y28, q15, #\shift
    999   .endif
   1000 .endm
   1001 
   1002 asm_function jsimd_idct_4x4_neon
   1003 
   1004     DCT_TABLE       .req r0
   1005     COEF_BLOCK      .req r1
   1006     OUTPUT_BUF      .req r2
   1007     OUTPUT_COL      .req r3
   1008     TMP1            .req r0
   1009     TMP2            .req r1
   1010     TMP3            .req r2
   1011     TMP4            .req ip
   1012 
   1013     vpush           {d8-d15}
   1014 
   1015     /* Load constants (d3 is just used for padding) */
   1016     adr             TMP4, jsimd_idct_4x4_neon_consts
   1017     vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
   1018 
   1019     /* Load all COEF_BLOCK into NEON registers with the following allocation:
   1020      *       0 1 2 3 | 4 5 6 7
   1021      *      ---------+--------
   1022      *   0 | d4      | d5
   1023      *   1 | d6      | d7
   1024      *   2 | d8      | d9
   1025      *   3 | d10     | d11
   1026      *   4 | -       | -
   1027      *   5 | d12     | d13
   1028      *   6 | d14     | d15
   1029      *   7 | d16     | d17
   1030      */
   1031     vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
   1032     vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
   1033     add             COEF_BLOCK, COEF_BLOCK, #16
   1034     vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
   1035     vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
   1036     /* dequantize */
   1037     vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
   1038     vmul.s16        q2, q2, q9
   1039     vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
   1040     vmul.s16        q3, q3, q10
   1041     vmul.s16        q4, q4, q11
   1042     add             DCT_TABLE, DCT_TABLE, #16
   1043     vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
   1044     vmul.s16        q5, q5, q12
   1045     vmul.s16        q6, q6, q13
   1046     vld1.16         {d30, d31}, [DCT_TABLE, :128]!
   1047     vmul.s16        q7, q7, q14
   1048     vmul.s16        q8, q8, q15
   1049 
   1050     /* Pass 1 */
   1051     idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
   1052     transpose_4x4   d4, d6, d8, d10
   1053     idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
   1054     transpose_4x4   d5, d7, d9, d11
   1055 
   1056     /* Pass 2 */
   1057     idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
   1058     transpose_4x4   d26, d27, d28, d29
   1059 
   1060     /* Range limit */
   1061     vmov.u16        q15, #0x80
   1062     vadd.s16        q13, q13, q15
   1063     vadd.s16        q14, q14, q15
   1064     vqmovun.s16     d26, q13
   1065     vqmovun.s16     d27, q14
   1066 
   1067     /* Store results to the output buffer */
   1068     ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
   1069     add             TMP1, TMP1, OUTPUT_COL
   1070     add             TMP2, TMP2, OUTPUT_COL
   1071     add             TMP3, TMP3, OUTPUT_COL
   1072     add             TMP4, TMP4, OUTPUT_COL
   1073 
   1074 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
   1075     /* We can use far fewer instructions on little-endian systems if the
   1076      * OS kernel is not configured to trap unaligned memory accesses.
   1077      */
   1078     vst1.32         {d26[0]}, [TMP1]!
   1079     vst1.32         {d27[0]}, [TMP3]!
   1080     vst1.32         {d26[1]}, [TMP2]!
   1081     vst1.32         {d27[1]}, [TMP4]!
   1082 #else
   1083     vst1.8          {d26[0]}, [TMP1]!
   1084     vst1.8          {d27[0]}, [TMP3]!
   1085     vst1.8          {d26[1]}, [TMP1]!
   1086     vst1.8          {d27[1]}, [TMP3]!
   1087     vst1.8          {d26[2]}, [TMP1]!
   1088     vst1.8          {d27[2]}, [TMP3]!
   1089     vst1.8          {d26[3]}, [TMP1]!
   1090     vst1.8          {d27[3]}, [TMP3]!
   1091 
   1092     vst1.8          {d26[4]}, [TMP2]!
   1093     vst1.8          {d27[4]}, [TMP4]!
   1094     vst1.8          {d26[5]}, [TMP2]!
   1095     vst1.8          {d27[5]}, [TMP4]!
   1096     vst1.8          {d26[6]}, [TMP2]!
   1097     vst1.8          {d27[6]}, [TMP4]!
   1098     vst1.8          {d26[7]}, [TMP2]!
   1099     vst1.8          {d27[7]}, [TMP4]!
   1100 #endif
   1101 
   1102     vpop            {d8-d15}
   1103     bx              lr
   1104 
   1105     .unreq          DCT_TABLE
   1106     .unreq          COEF_BLOCK
   1107     .unreq          OUTPUT_BUF
   1108     .unreq          OUTPUT_COL
   1109     .unreq          TMP1
   1110     .unreq          TMP2
   1111     .unreq          TMP3
   1112     .unreq          TMP4
   1113 
   1114 .purgem idct_helper
   1115 
   1116 
   1117 /*****************************************************************************/
   1118 
   1119 /*
   1120  * jsimd_idct_2x2_neon
   1121  *
   1122  * This function contains inverse-DCT code for producing reduced-size
   1123  * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
   1124  * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
   1125  * function from jpeg-6b (jidctred.c).
   1126  *
   1127  * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
   1128  *       requires far fewer arithmetic operations and hence should be faster.
   1129  *       The primary purpose of this particular NEON-optimized function is
   1130  *       bit-exact compatibility with jpeg-6b.
   1131  */
   1132 
   1133 .balign 8
   1134 jsimd_idct_2x2_neon_consts:
   1135   .short -FIX_0_720959822  /* d0[0] */
   1136   .short FIX_0_850430095   /* d0[1] */
   1137   .short -FIX_1_272758580  /* d0[2] */
   1138   .short FIX_3_624509785   /* d0[3] */
   1139 
   1140 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
   1141     vshll.s16       q14, \x4, #15
   1142     vmull.s16       q13, \x6, d0[3]
   1143     vmlal.s16       q13, \x10, d0[2]
   1144     vmlal.s16       q13, \x12, d0[1]
   1145     vmlal.s16       q13, \x16, d0[0]
   1146 
   1147     vadd.s32        q10, q14, q13
   1148     vsub.s32        q14, q14, q13
   1149 
   1150   .if \shift > 16
   1151     vrshr.s32       q10, q10, #\shift
   1152     vrshr.s32       q14, q14, #\shift
   1153     vmovn.s32       \y26, q10
   1154     vmovn.s32       \y27, q14
   1155   .else
   1156     vrshrn.s32      \y26, q10, #\shift
   1157     vrshrn.s32      \y27, q14, #\shift
   1158   .endif
   1159 .endm
   1160 
   1161 asm_function jsimd_idct_2x2_neon
   1162 
   1163     DCT_TABLE       .req r0
   1164     COEF_BLOCK      .req r1
   1165     OUTPUT_BUF      .req r2
   1166     OUTPUT_COL      .req r3
   1167     TMP1            .req r0
   1168     TMP2            .req ip
   1169 
   1170     vpush           {d8-d15}
   1171 
   1172     /* Load constants */
   1173     adr             TMP2, jsimd_idct_2x2_neon_consts
   1174     vld1.16         {d0}, [TMP2, :64]
   1175 
   1176     /* Load all COEF_BLOCK into NEON registers with the following allocation:
   1177      *       0 1 2 3 | 4 5 6 7
   1178      *      ---------+--------
   1179      *   0 | d4      | d5
   1180      *   1 | d6      | d7
   1181      *   2 | -       | -
   1182      *   3 | d10     | d11
   1183      *   4 | -       | -
   1184      *   5 | d12     | d13
   1185      *   6 | -       | -
   1186      *   7 | d16     | d17
   1187      */
   1188     vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
   1189     add             COEF_BLOCK, COEF_BLOCK, #16
   1190     vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
   1191     add             COEF_BLOCK, COEF_BLOCK, #16
   1192     vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
   1193     add             COEF_BLOCK, COEF_BLOCK, #16
   1194     vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
   1195     /* Dequantize */
   1196     vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
   1197     vmul.s16        q2, q2, q9
   1198     vmul.s16        q3, q3, q10
   1199     add             DCT_TABLE, DCT_TABLE, #16
   1200     vld1.16         {d24, d25}, [DCT_TABLE, :128]!
   1201     vmul.s16        q5, q5, q12
   1202     add             DCT_TABLE, DCT_TABLE, #16
   1203     vld1.16         {d26, d27}, [DCT_TABLE, :128]!
   1204     vmul.s16        q6, q6, q13
   1205     add             DCT_TABLE, DCT_TABLE, #16
   1206     vld1.16         {d30, d31}, [DCT_TABLE, :128]!
   1207     vmul.s16        q8, q8, q15
   1208 
   1209     /* Pass 1 */
   1210 #if 0
   1211     idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
   1212     transpose_4x4   d4, d6, d8, d10
   1213     idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
   1214     transpose_4x4   d5, d7, d9, d11
   1215 #else
   1216     vmull.s16       q13, d6, d0[3]
   1217     vmlal.s16       q13, d10, d0[2]
   1218     vmlal.s16       q13, d12, d0[1]
   1219     vmlal.s16       q13, d16, d0[0]
   1220     vmull.s16       q12, d7, d0[3]
   1221     vmlal.s16       q12, d11, d0[2]
   1222     vmlal.s16       q12, d13, d0[1]
   1223     vmlal.s16       q12, d17, d0[0]
   1224     vshll.s16       q14, d4, #15
   1225     vshll.s16       q15, d5, #15
   1226     vadd.s32        q10, q14, q13
   1227     vsub.s32        q14, q14, q13
   1228     vrshrn.s32      d4, q10, #13
   1229     vrshrn.s32      d6, q14, #13
   1230     vadd.s32        q10, q15, q12
   1231     vsub.s32        q14, q15, q12
   1232     vrshrn.s32      d5, q10, #13
   1233     vrshrn.s32      d7, q14, #13
   1234     vtrn.16         q2, q3
   1235     vtrn.32         q3, q5
   1236 #endif
   1237 
   1238     /* Pass 2 */
   1239     idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
   1240 
   1241     /* Range limit */
   1242     vmov.u16        q15, #0x80
   1243     vadd.s16        q13, q13, q15
   1244     vqmovun.s16     d26, q13
   1245     vqmovun.s16     d27, q13
   1246 
   1247     /* Store results to the output buffer */
   1248     ldmia           OUTPUT_BUF, {TMP1, TMP2}
   1249     add             TMP1, TMP1, OUTPUT_COL
   1250     add             TMP2, TMP2, OUTPUT_COL
   1251 
   1252     vst1.8          {d26[0]}, [TMP1]!
   1253     vst1.8          {d27[4]}, [TMP1]!
   1254     vst1.8          {d26[1]}, [TMP2]!
   1255     vst1.8          {d27[5]}, [TMP2]!
   1256 
   1257     vpop            {d8-d15}
   1258     bx              lr
   1259 
   1260     .unreq          DCT_TABLE
   1261     .unreq          COEF_BLOCK
   1262     .unreq          OUTPUT_BUF
   1263     .unreq          OUTPUT_COL
   1264     .unreq          TMP1
   1265     .unreq          TMP2
   1266 
   1267 .purgem idct_helper
   1268 
   1269 
   1270 /*****************************************************************************/
   1271 
   1272 /*
   1273  * jsimd_ycc_extrgb_convert_neon
   1274  * jsimd_ycc_extbgr_convert_neon
   1275  * jsimd_ycc_extrgbx_convert_neon
   1276  * jsimd_ycc_extbgrx_convert_neon
   1277  * jsimd_ycc_extxbgr_convert_neon
   1278  * jsimd_ycc_extxrgb_convert_neon
   1279  *
   1280  * Colorspace conversion YCbCr -> RGB
   1281  */
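
/*
 * The fixed-point arithmetic implemented by the conversion macros below
 * corresponds roughly to the following scalar C sketch (illustrative only,
 * not part of the build; y, cb, cr are the 8-bit input samples):
 *
 *   int r = y + ((22971 * (cr - 128) + (1 << 13)) >> 14);
 *   int g = y + ((-11277 * (cb - 128) - 23401 * (cr - 128) + (1 << 14)) >> 15);
 *   int b = y + ((29033 * (cb - 128) + (1 << 13)) >> 14);
 *   r = r < 0 ? 0 : r > 255 ? 255 : r;   // and likewise for g and b
 *
 * i.e. the usual JPEG YCbCr->RGB equations (Cr-to-R factor 1.40200, etc.)
 * with the coefficients scaled to 14- or 15-bit fixed point.
 */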
   1282 
   1283 
   1284 .macro do_load size
   1285   .if \size == 8
   1286     vld1.8          {d4}, [U, :64]!
   1287     vld1.8          {d5}, [V, :64]!
   1288     vld1.8          {d0}, [Y, :64]!
   1289     pld             [U, #64]
   1290     pld             [V, #64]
   1291     pld             [Y, #64]
   1292   .elseif \size == 4
   1293     vld1.8          {d4[0]}, [U]!
   1294     vld1.8          {d4[1]}, [U]!
   1295     vld1.8          {d4[2]}, [U]!
   1296     vld1.8          {d4[3]}, [U]!
   1297     vld1.8          {d5[0]}, [V]!
   1298     vld1.8          {d5[1]}, [V]!
   1299     vld1.8          {d5[2]}, [V]!
   1300     vld1.8          {d5[3]}, [V]!
   1301     vld1.8          {d0[0]}, [Y]!
   1302     vld1.8          {d0[1]}, [Y]!
   1303     vld1.8          {d0[2]}, [Y]!
   1304     vld1.8          {d0[3]}, [Y]!
   1305   .elseif \size == 2
   1306     vld1.8          {d4[4]}, [U]!
   1307     vld1.8          {d4[5]}, [U]!
   1308     vld1.8          {d5[4]}, [V]!
   1309     vld1.8          {d5[5]}, [V]!
   1310     vld1.8          {d0[4]}, [Y]!
   1311     vld1.8          {d0[5]}, [Y]!
   1312   .elseif \size == 1
   1313     vld1.8          {d4[6]}, [U]!
   1314     vld1.8          {d5[6]}, [V]!
   1315     vld1.8          {d0[6]}, [Y]!
   1316   .else
   1317     .error unsupported macroblock size
   1318   .endif
   1319 .endm
   1320 
   1321 .macro do_store bpp, size
   1322   .if \bpp == 24
   1323     .if \size == 8
   1324       vst3.8        {d10, d11, d12}, [RGB]!
   1325     .elseif \size == 4
   1326       vst3.8        {d10[0], d11[0], d12[0]}, [RGB]!
   1327       vst3.8        {d10[1], d11[1], d12[1]}, [RGB]!
   1328       vst3.8        {d10[2], d11[2], d12[2]}, [RGB]!
   1329       vst3.8        {d10[3], d11[3], d12[3]}, [RGB]!
   1330     .elseif \size == 2
   1331       vst3.8        {d10[4], d11[4], d12[4]}, [RGB]!
   1332       vst3.8        {d10[5], d11[5], d12[5]}, [RGB]!
   1333     .elseif \size == 1
   1334       vst3.8        {d10[6], d11[6], d12[6]}, [RGB]!
   1335     .else
   1336       .error unsupported macroblock size
   1337     .endif
   1338   .elseif \bpp == 32
   1339     .if \size == 8
   1340       vst4.8        {d10, d11, d12, d13}, [RGB]!
   1341     .elseif \size == 4
   1342       vst4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
   1343       vst4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
   1344       vst4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
   1345       vst4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
   1346     .elseif \size == 2
   1347       vst4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
   1348       vst4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
   1349     .elseif \size == 1
   1350       vst4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
   1351     .else
   1352       .error unsupported macroblock size
   1353     .endif
   1354   .elseif \bpp == 16
   1355     .if \size == 8
   1356       vst1.16       {q15}, [RGB]!
   1357     .elseif \size == 4
   1358       vst1.16       {d30}, [RGB]!
   1359     .elseif \size == 2
   1360       vst1.16       {d31[0]}, [RGB]!
   1361       vst1.16       {d31[1]}, [RGB]!
   1362     .elseif \size == 1
   1363       vst1.16       {d31[2]}, [RGB]!
   1364     .else
   1365       .error unsupported macroblock size
   1366     .endif
   1367   .else
   1368     .error unsupported bpp
   1369   .endif
   1370 .endm
   1371 
   1372 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
   1373 
   1374 /*
   1375  * 2-stage pipelined YCbCr->RGB conversion
   1376  */
   1377 
   1378 .macro do_yuv_to_rgb_stage1
   1379     vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
   1381     vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
   1382     vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
   1383     vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
   1384     vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
   1385     vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
   1386     vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
   1387     vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
   1388     vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
   1389 .endm
   1390 
   1391 .macro do_yuv_to_rgb_stage2
   1392     vrshrn.s32      d20, q10, #15
   1393     vrshrn.s32      d21, q11, #15
   1394     vrshrn.s32      d24, q12, #14
   1395     vrshrn.s32      d25, q13, #14
   1396     vrshrn.s32      d28, q14, #14
   1397     vrshrn.s32      d29, q15, #14
   1398     vaddw.u8        q11, q10, d0
   1399     vaddw.u8        q12, q12, d0
   1400     vaddw.u8        q14, q14, d0
   1401   .if \bpp != 16
   1402     vqmovun.s16     d1\g_offs, q11
   1403     vqmovun.s16     d1\r_offs, q12
   1404     vqmovun.s16     d1\b_offs, q14
   1405   .else  /* rgb565 */
   1406     vqshlu.s16      q13, q11, #8
   1407     vqshlu.s16      q15, q12, #8
   1408     vqshlu.s16      q14, q14, #8
   1409     vsri.u16        q15, q13, #5
   1410     vsri.u16        q15, q14, #11
   1411   .endif
   1412 .endm
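
/*
 * In the RGB565 case, the vqshlu/vsri sequence above packs the three
 * clamped 8-bit channels into one 16-bit pixel.  A C sketch of the
 * equivalent packing (illustrative only), with r, g, b already clamped
 * to [0, 255]:
 *
 *   uint16_t px = (uint16_t)(((r & 0xF8) << 8) | ((g & 0xFC) << 3) | (b >> 3));
 */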
   1413 
   1414 .macro do_yuv_to_rgb_stage2_store_load_stage1
   1415                                        /* "do_yuv_to_rgb_stage2" and "store" */
   1416                                        vrshrn.s32      d20, q10, #15
   1417     /* "load" and "do_yuv_to_rgb_stage1" */
   1418     pld             [U, #64]
   1419                                        vrshrn.s32      d21, q11, #15
   1420     pld             [V, #64]
   1421                                        vrshrn.s32      d24, q12, #14
   1422                                        vrshrn.s32      d25, q13, #14
   1423     vld1.8          {d4}, [U, :64]!
   1424                                        vrshrn.s32      d28, q14, #14
   1425     vld1.8          {d5}, [V, :64]!
   1426                                        vrshrn.s32      d29, q15, #14
   1427     vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
   1429                                        vaddw.u8        q11, q10, d0
   1430     vmull.s16       q10, d6, d1[1]  /* multiply by -11277 */
   1431     vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
   1432                                        vaddw.u8        q12, q12, d0
   1433                                        vaddw.u8        q14, q14, d0
   1434   .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
   1435                                        vqmovun.s16     d1\g_offs, q11
   1436     pld             [Y, #64]
   1437                                        vqmovun.s16     d1\r_offs, q12
   1438     vld1.8          {d0}, [Y, :64]!
   1439                                        vqmovun.s16     d1\b_offs, q14
   1440     vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
   1441     vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
   1442                                        do_store        \bpp, 8
   1443     vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
   1444     vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
   1445     vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
   1446     vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
   1447   .else  /**************************** rgb565 ********************************/
   1448                                        vqshlu.s16      q13, q11, #8
   1449     pld             [Y, #64]
   1450                                        vqshlu.s16      q15, q12, #8
   1451                                        vqshlu.s16      q14, q14, #8
   1452     vld1.8          {d0}, [Y, :64]!
   1453     vmull.s16       q11, d7, d1[1]
   1454     vmlal.s16       q11, d9, d1[2]
   1455                                        vsri.u16        q15, q13, #5
   1456     vmull.s16       q12, d8, d1[0]
   1457                                        vsri.u16        q15, q14, #11
   1458     vmull.s16       q13, d9, d1[0]
   1459     vmull.s16       q14, d6, d1[3]
   1460                                        do_store        \bpp, 8
   1461     vmull.s16       q15, d7, d1[3]
   1462   .endif
   1463 .endm
   1464 
   1465 .macro do_yuv_to_rgb
   1466     do_yuv_to_rgb_stage1
   1467     do_yuv_to_rgb_stage2
   1468 .endm
   1469 
/* Apple's gas crashes on adrl, so work around that by using adr instead.
 * This requires a copy of these constants for each function.
   1472  */
   1473 
   1474 .balign 16
   1475 jsimd_ycc_\colorid\()_neon_consts:
   1476   .short 0,      0,     0,      0
   1477   .short 22971, -11277, -23401, 29033
   1478   .short -128,  -128,   -128,   -128
   1479   .short -128,  -128,   -128,   -128
   1480 
   1481 asm_function jsimd_ycc_\colorid\()_convert_neon
   1482     OUTPUT_WIDTH    .req r0
   1483     INPUT_BUF       .req r1
   1484     INPUT_ROW       .req r2
   1485     OUTPUT_BUF      .req r3
   1486     NUM_ROWS        .req r4
   1487 
   1488     INPUT_BUF0      .req r5
   1489     INPUT_BUF1      .req r6
   1490     INPUT_BUF2      .req INPUT_BUF
   1491 
   1492     RGB             .req r7
   1493     Y               .req r8
   1494     U               .req r9
   1495     V               .req r10
   1496     N               .req ip
   1497 
   1498     /* Load constants to d1, d2, d3 (d0 is just used for padding) */
   1499     adr             ip, jsimd_ycc_\colorid\()_neon_consts
   1500     vld1.16         {d0, d1, d2, d3}, [ip, :128]
   1501 
   1502     /* Save ARM registers and handle input arguments */
   1503     push            {r4, r5, r6, r7, r8, r9, r10, lr}
   1504     ldr             NUM_ROWS, [sp, #(4 * 8)]
   1505     ldr             INPUT_BUF0, [INPUT_BUF]
   1506     ldr             INPUT_BUF1, [INPUT_BUF, #4]
   1507     ldr             INPUT_BUF2, [INPUT_BUF, #8]
   1508     .unreq          INPUT_BUF
   1509 
   1510     /* Save NEON registers */
   1511     vpush           {d8-d15}
   1512 
   1513     /* Initially set d10, d11, d12, d13 to 0xFF */
   1514     vmov.u8         q5, #255
   1515     vmov.u8         q6, #255
   1516 
   1517     /* Outer loop over scanlines */
   1518     cmp             NUM_ROWS, #1
   1519     blt             9f
   1520 0:
   1521     ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
   1522     ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
   1523     mov             N, OUTPUT_WIDTH
   1524     ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
   1525     add             INPUT_ROW, INPUT_ROW, #1
   1526     ldr             RGB, [OUTPUT_BUF], #4
   1527 
   1528     /* Inner loop over pixels */
   1529     subs            N, N, #8
   1530     blt             3f
   1531     do_load         8
   1532     do_yuv_to_rgb_stage1
   1533     subs            N, N, #8
   1534     blt             2f
   1535 1:
   1536     do_yuv_to_rgb_stage2_store_load_stage1
   1537     subs            N, N, #8
   1538     bge             1b
   1539 2:
   1540     do_yuv_to_rgb_stage2
   1541     do_store        \bpp, 8
   1542     tst             N, #7
   1543     beq             8f
   1544 3:
   1545     tst             N, #4
   1546     beq             3f
   1547     do_load         4
   1548 3:
   1549     tst             N, #2
   1550     beq             4f
   1551     do_load         2
   1552 4:
   1553     tst             N, #1
   1554     beq             5f
   1555     do_load         1
   1556 5:
   1557     do_yuv_to_rgb
   1558     tst             N, #4
   1559     beq             6f
   1560     do_store        \bpp, 4
   1561 6:
   1562     tst             N, #2
   1563     beq             7f
   1564     do_store        \bpp, 2
   1565 7:
   1566     tst             N, #1
   1567     beq             8f
   1568     do_store        \bpp, 1
   1569 8:
   1570     subs            NUM_ROWS, NUM_ROWS, #1
   1571     bgt             0b
   1572 9:
   1573     /* Restore all registers and return */
   1574     vpop            {d8-d15}
   1575     pop             {r4, r5, r6, r7, r8, r9, r10, pc}
   1576 
   1577     .unreq          OUTPUT_WIDTH
   1578     .unreq          INPUT_ROW
   1579     .unreq          OUTPUT_BUF
   1580     .unreq          NUM_ROWS
   1581     .unreq          INPUT_BUF0
   1582     .unreq          INPUT_BUF1
   1583     .unreq          INPUT_BUF2
   1584     .unreq          RGB
   1585     .unreq          Y
   1586     .unreq          U
   1587     .unreq          V
   1588     .unreq          N
   1589 
   1590 .purgem do_yuv_to_rgb
   1591 .purgem do_yuv_to_rgb_stage1
   1592 .purgem do_yuv_to_rgb_stage2
   1593 .purgem do_yuv_to_rgb_stage2_store_load_stage1
   1594 
   1595 .endm
   1596 
   1597 /*--------------------------------- id ----- bpp R  G  B */
   1598 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
   1599 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
   1600 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
   1601 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
   1602 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
   1603 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
   1604 generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
   1605 
   1606 .purgem do_load
   1607 .purgem do_store
   1608 
   1609 
   1610 /*****************************************************************************/
   1611 
   1612 /*
   1613  * jsimd_extrgb_ycc_convert_neon
   1614  * jsimd_extbgr_ycc_convert_neon
   1615  * jsimd_extrgbx_ycc_convert_neon
   1616  * jsimd_extbgrx_ycc_convert_neon
   1617  * jsimd_extxbgr_ycc_convert_neon
   1618  * jsimd_extxrgb_ycc_convert_neon
   1619  *
   1620  * Colorspace conversion RGB -> YCbCr
   1621  */
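
/*
 * Scalar C sketch of the fixed-point arithmetic used by the conversion
 * macros below (illustrative only, not part of the build; r, g, b are the
 * 8-bit input samples):
 *
 *   y  = ( 19595 * r + 38470 * g +  7471 * b + 32768) >> 16;
 *   cb = (-11059 * r - 21709 * g + 32768 * b + (128 << 16) + 32767) >> 16;
 *   cr = ( 32768 * r - 27439 * g -  5329 * b + (128 << 16) + 32767) >> 16;
 *
 * i.e. the usual JPEG RGB->YCbCr equations (0.29900/0.58700/0.11400 for Y,
 * etc.) with the coefficients scaled by 2^16.
 */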
   1622 
   1623 .macro do_store size
   1624   .if \size == 8
   1625     vst1.8          {d20}, [Y]!
   1626     vst1.8          {d21}, [U]!
   1627     vst1.8          {d22}, [V]!
   1628   .elseif \size == 4
   1629     vst1.8          {d20[0]}, [Y]!
   1630     vst1.8          {d20[1]}, [Y]!
   1631     vst1.8          {d20[2]}, [Y]!
   1632     vst1.8          {d20[3]}, [Y]!
   1633     vst1.8          {d21[0]}, [U]!
   1634     vst1.8          {d21[1]}, [U]!
   1635     vst1.8          {d21[2]}, [U]!
   1636     vst1.8          {d21[3]}, [U]!
   1637     vst1.8          {d22[0]}, [V]!
   1638     vst1.8          {d22[1]}, [V]!
   1639     vst1.8          {d22[2]}, [V]!
   1640     vst1.8          {d22[3]}, [V]!
   1641   .elseif \size == 2
   1642     vst1.8          {d20[4]}, [Y]!
   1643     vst1.8          {d20[5]}, [Y]!
   1644     vst1.8          {d21[4]}, [U]!
   1645     vst1.8          {d21[5]}, [U]!
   1646     vst1.8          {d22[4]}, [V]!
   1647     vst1.8          {d22[5]}, [V]!
   1648   .elseif \size == 1
   1649     vst1.8          {d20[6]}, [Y]!
   1650     vst1.8          {d21[6]}, [U]!
   1651     vst1.8          {d22[6]}, [V]!
   1652   .else
   1653     .error unsupported macroblock size
   1654   .endif
   1655 .endm
   1656 
   1657 .macro do_load bpp, size
   1658   .if \bpp == 24
   1659     .if \size == 8
   1660       vld3.8        {d10, d11, d12}, [RGB]!
   1661       pld           [RGB, #128]
   1662     .elseif \size == 4
   1663       vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
   1664       vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
   1665       vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
   1666       vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
   1667     .elseif \size == 2
   1668       vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
   1669       vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
   1670     .elseif \size == 1
   1671       vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
   1672     .else
   1673       .error unsupported macroblock size
   1674     .endif
   1675   .elseif \bpp == 32
   1676     .if \size == 8
   1677       vld4.8        {d10, d11, d12, d13}, [RGB]!
   1678       pld           [RGB, #128]
   1679     .elseif \size == 4
   1680       vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
   1681       vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
   1682       vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
   1683       vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
   1684     .elseif \size == 2
   1685       vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
   1686       vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
   1687     .elseif \size == 1
   1688       vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
   1689     .else
   1690       .error unsupported macroblock size
   1691     .endif
   1692   .else
   1693     .error unsupported bpp
   1694   .endif
   1695 .endm
   1696 
   1697 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
   1698 
   1699 /*
   1700  * 2-stage pipelined RGB->YCbCr conversion
   1701  */
   1702 
   1703 .macro do_rgb_to_yuv_stage1
   1704     vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
   1705     vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
   1706     vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
   1707     vmull.u16       q7, d4, d0[0]
   1708     vmlal.u16       q7, d6, d0[1]
   1709     vmlal.u16       q7, d8, d0[2]
   1710     vmull.u16       q8, d5, d0[0]
   1711     vmlal.u16       q8, d7, d0[1]
   1712     vmlal.u16       q8, d9, d0[2]
   1713     vrev64.32       q9, q1
   1714     vrev64.32       q13, q1
   1715     vmlsl.u16       q9, d4, d0[3]
   1716     vmlsl.u16       q9, d6, d1[0]
   1717     vmlal.u16       q9, d8, d1[1]
   1718     vmlsl.u16       q13, d5, d0[3]
   1719     vmlsl.u16       q13, d7, d1[0]
   1720     vmlal.u16       q13, d9, d1[1]
   1721     vrev64.32       q14, q1
   1722     vrev64.32       q15, q1
   1723     vmlal.u16       q14, d4, d1[1]
   1724     vmlsl.u16       q14, d6, d1[2]
   1725     vmlsl.u16       q14, d8, d1[3]
   1726     vmlal.u16       q15, d5, d1[1]
   1727     vmlsl.u16       q15, d7, d1[2]
   1728     vmlsl.u16       q15, d9, d1[3]
   1729 .endm
   1730 
   1731 .macro do_rgb_to_yuv_stage2
   1732     vrshrn.u32      d20, q7, #16
   1733     vrshrn.u32      d21, q8, #16
   1734     vshrn.u32       d22, q9, #16
   1735     vshrn.u32       d23, q13, #16
   1736     vshrn.u32       d24, q14, #16
   1737     vshrn.u32       d25, q15, #16
   1738     vmovn.u16       d20, q10       /* d20 = y */
   1739     vmovn.u16       d21, q11       /* d21 = u */
   1740     vmovn.u16       d22, q12       /* d22 = v */
   1741 .endm
   1742 
   1743 .macro do_rgb_to_yuv
   1744     do_rgb_to_yuv_stage1
   1745     do_rgb_to_yuv_stage2
   1746 .endm
   1747 
   1748 .macro do_rgb_to_yuv_stage2_store_load_stage1
   1749       vrshrn.u32      d20, q7, #16
   1750       vrshrn.u32      d21, q8, #16
   1751       vshrn.u32       d22, q9, #16
   1752     vrev64.32       q9, q1
   1753       vshrn.u32       d23, q13, #16
   1754     vrev64.32       q13, q1
   1755       vshrn.u32       d24, q14, #16
   1756       vshrn.u32       d25, q15, #16
   1757     do_load         \bpp, 8
   1758       vmovn.u16       d20, q10     /* d20 = y */
   1759     vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
   1760       vmovn.u16       d21, q11     /* d21 = u */
   1761     vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
   1762       vmovn.u16       d22, q12     /* d22 = v */
   1763     vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
   1764     vmull.u16       q7, d4, d0[0]
   1765     vmlal.u16       q7, d6, d0[1]
   1766     vmlal.u16       q7, d8, d0[2]
   1767       vst1.8          {d20}, [Y]!
   1768     vmull.u16       q8, d5, d0[0]
   1769     vmlal.u16       q8, d7, d0[1]
   1770     vmlal.u16       q8, d9, d0[2]
   1771     vmlsl.u16       q9, d4, d0[3]
   1772     vmlsl.u16       q9, d6, d1[0]
   1773     vmlal.u16       q9, d8, d1[1]
   1774       vst1.8          {d21}, [U]!
   1775     vmlsl.u16       q13, d5, d0[3]
   1776     vmlsl.u16       q13, d7, d1[0]
   1777     vmlal.u16       q13, d9, d1[1]
   1778     vrev64.32       q14, q1
   1779     vrev64.32       q15, q1
   1780     vmlal.u16       q14, d4, d1[1]
   1781     vmlsl.u16       q14, d6, d1[2]
   1782     vmlsl.u16       q14, d8, d1[3]
   1783       vst1.8          {d22}, [V]!
   1784     vmlal.u16       q15, d5, d1[1]
   1785     vmlsl.u16       q15, d7, d1[2]
   1786     vmlsl.u16       q15, d9, d1[3]
   1787 .endm
   1788 
   1789 .balign 16
   1790 jsimd_\colorid\()_ycc_neon_consts:
   1791   .short 19595, 38470, 7471,  11059
   1792   .short 21709, 32768, 27439, 5329
   1793   .short 32767, 128,   32767, 128
   1794   .short 32767, 128,   32767, 128
   1795 
   1796 asm_function jsimd_\colorid\()_ycc_convert_neon
   1797     OUTPUT_WIDTH    .req r0
   1798     INPUT_BUF       .req r1
   1799     OUTPUT_BUF      .req r2
   1800     OUTPUT_ROW      .req r3
   1801     NUM_ROWS        .req r4
   1802 
   1803     OUTPUT_BUF0     .req r5
   1804     OUTPUT_BUF1     .req r6
   1805     OUTPUT_BUF2     .req OUTPUT_BUF
   1806 
   1807     RGB             .req r7
   1808     Y               .req r8
   1809     U               .req r9
   1810     V               .req r10
   1811     N               .req ip
   1812 
   1813     /* Load constants to d0, d1, d2, d3 */
   1814     adr             ip, jsimd_\colorid\()_ycc_neon_consts
   1815     vld1.16         {d0, d1, d2, d3}, [ip, :128]
   1816 
   1817     /* Save ARM registers and handle input arguments */
   1818     push            {r4, r5, r6, r7, r8, r9, r10, lr}
   1819     ldr             NUM_ROWS, [sp, #(4 * 8)]
   1820     ldr             OUTPUT_BUF0, [OUTPUT_BUF]
   1821     ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
   1822     ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
   1823     .unreq          OUTPUT_BUF
   1824 
   1825     /* Save NEON registers */
   1826     vpush           {d8-d15}
   1827 
   1828     /* Outer loop over scanlines */
   1829     cmp             NUM_ROWS, #1
   1830     blt             9f
   1831 0:
   1832     ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
   1833     ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
   1834     mov             N, OUTPUT_WIDTH
   1835     ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
   1836     add             OUTPUT_ROW, OUTPUT_ROW, #1
   1837     ldr             RGB, [INPUT_BUF], #4
   1838 
   1839     /* Inner loop over pixels */
   1840     subs            N, N, #8
   1841     blt             3f
   1842     do_load         \bpp, 8
   1843     do_rgb_to_yuv_stage1
   1844     subs            N, N, #8
   1845     blt             2f
   1846 1:
   1847     do_rgb_to_yuv_stage2_store_load_stage1
   1848     subs            N, N, #8
   1849     bge             1b
   1850 2:
   1851     do_rgb_to_yuv_stage2
   1852     do_store        8
   1853     tst             N, #7
   1854     beq             8f
   1855 3:
   1856     tst             N, #4
   1857     beq             3f
   1858     do_load         \bpp, 4
   1859 3:
   1860     tst             N, #2
   1861     beq             4f
   1862     do_load         \bpp, 2
   1863 4:
   1864     tst             N, #1
   1865     beq             5f
   1866     do_load         \bpp, 1
   1867 5:
   1868     do_rgb_to_yuv
   1869     tst             N, #4
   1870     beq             6f
   1871     do_store        4
   1872 6:
   1873     tst             N, #2
   1874     beq             7f
   1875     do_store        2
   1876 7:
   1877     tst             N, #1
   1878     beq             8f
   1879     do_store        1
   1880 8:
   1881     subs            NUM_ROWS, NUM_ROWS, #1
   1882     bgt             0b
   1883 9:
   1884     /* Restore all registers and return */
   1885     vpop            {d8-d15}
   1886     pop             {r4, r5, r6, r7, r8, r9, r10, pc}
   1887 
   1888     .unreq          OUTPUT_WIDTH
   1889     .unreq          OUTPUT_ROW
   1890     .unreq          INPUT_BUF
   1891     .unreq          NUM_ROWS
   1892     .unreq          OUTPUT_BUF0
   1893     .unreq          OUTPUT_BUF1
   1894     .unreq          OUTPUT_BUF2
   1895     .unreq          RGB
   1896     .unreq          Y
   1897     .unreq          U
   1898     .unreq          V
   1899     .unreq          N
   1900 
   1901 .purgem do_rgb_to_yuv
   1902 .purgem do_rgb_to_yuv_stage1
   1903 .purgem do_rgb_to_yuv_stage2
   1904 .purgem do_rgb_to_yuv_stage2_store_load_stage1
   1905 
   1906 .endm
   1907 
   1908 /*--------------------------------- id ----- bpp R  G  B */
   1909 generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
   1910 generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
   1911 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
   1912 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
   1913 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
   1914 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
   1915 
   1916 .purgem do_load
   1917 .purgem do_store
   1918 
   1919 
   1920 /*****************************************************************************/
   1921 
   1922 /*
   1923  * Load data into workspace, applying unsigned->signed conversion
   1924  *
   1925  * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
   1926  *       rid of VST1.16 instructions
   1927  */
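
/*
 * Equivalent scalar operation (a sketch; CENTERJSAMPLE == 128):
 *
 *   workspace[i] = (int16_t)sample_data[row][start_col + i] - CENTERJSAMPLE;
 */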
   1928 
   1929 asm_function jsimd_convsamp_neon
   1930     SAMPLE_DATA     .req r0
   1931     START_COL       .req r1
   1932     WORKSPACE       .req r2
   1933     TMP1            .req r3
   1934     TMP2            .req r4
   1935     TMP3            .req r5
   1936     TMP4            .req ip
   1937 
   1938     push            {r4, r5}
   1939     vmov.u8         d0, #128
   1940 
   1941     ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
   1942     add             TMP1, TMP1, START_COL
   1943     add             TMP2, TMP2, START_COL
   1944     add             TMP3, TMP3, START_COL
   1945     add             TMP4, TMP4, START_COL
   1946     vld1.8          {d16}, [TMP1]
   1947     vsubl.u8        q8, d16, d0
   1948     vld1.8          {d18}, [TMP2]
   1949     vsubl.u8        q9, d18, d0
   1950     vld1.8          {d20}, [TMP3]
   1951     vsubl.u8        q10, d20, d0
   1952     vld1.8          {d22}, [TMP4]
   1953     ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
   1954     vsubl.u8        q11, d22, d0
   1955     vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
   1956     add             TMP1, TMP1, START_COL
   1957     add             TMP2, TMP2, START_COL
   1958     vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
   1959     add             TMP3, TMP3, START_COL
   1960     add             TMP4, TMP4, START_COL
   1961     vld1.8          {d24}, [TMP1]
   1962     vsubl.u8        q12, d24, d0
   1963     vld1.8          {d26}, [TMP2]
   1964     vsubl.u8        q13, d26, d0
   1965     vld1.8          {d28}, [TMP3]
   1966     vsubl.u8        q14, d28, d0
   1967     vld1.8          {d30}, [TMP4]
   1968     vsubl.u8        q15, d30, d0
   1969     vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
   1970     vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
   1971     pop             {r4, r5}
   1972     bx              lr
   1973 
   1974     .unreq          SAMPLE_DATA
   1975     .unreq          START_COL
   1976     .unreq          WORKSPACE
   1977     .unreq          TMP1
   1978     .unreq          TMP2
   1979     .unreq          TMP3
   1980     .unreq          TMP4
   1981 
   1982 
   1983 /*****************************************************************************/
   1984 
   1985 /*
   1986  * jsimd_fdct_ifast_neon
   1987  *
   1988  * This function contains a fast, not so accurate integer implementation of
   1989  * the forward DCT (Discrete Cosine Transform). It uses the same calculations
   1990  * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
   1991  * function from jfdctfst.c
   1992  *
   1993  * TODO: can be combined with 'jsimd_convsamp_neon' to get
   1994  *       rid of a bunch of VLD1.16 instructions
   1995  */
   1996 
   1997 #define XFIX_0_382683433 d0[0]
   1998 #define XFIX_0_541196100 d0[1]
   1999 #define XFIX_0_707106781 d0[2]
   2000 #define XFIX_1_306562965 d0[3]
   2001 
   2002 .balign 16
   2003 jsimd_fdct_ifast_neon_consts:
   2004   .short (98 * 128)               /* XFIX_0_382683433 */
   2005   .short (139 * 128)              /* XFIX_0_541196100 */
   2006   .short (181 * 128)              /* XFIX_0_707106781 */
   2007   .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
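
/*
 * The constants above are encoded for VQDMULH, which computes
 * ((a * b * 2) >> 16) with saturation, so storing k * 128 gives an
 * effective multiplication by k / 256.  A C model of one such multiply
 * (illustrative only, saturation omitted):
 *
 *   static inline int16_t vqdmulh_s16(int16_t a, int16_t b)
 *   {
 *     return (int16_t)(((int32_t)a * b * 2) >> 16);
 *   }
 *
 *   // e.g. vqdmulh_s16(x, 98 * 128) ~= x * 0.382683433
 *
 * XFIX_1_306562965 is stored minus 1.0 (i.e. (334 - 256) * 128) because the
 * effective multiplier must stay below 1.0; the code below accounts for the
 * remaining 1.0 * x term by adding the scaled result back to the original
 * value.
 */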
   2008 
   2009 asm_function jsimd_fdct_ifast_neon
   2010 
   2011     DATA            .req r0
   2012     TMP             .req ip
   2013 
   2014     vpush           {d8-d15}
   2015 
   2016     /* Load constants */
   2017     adr             TMP, jsimd_fdct_ifast_neon_consts
   2018     vld1.16         {d0}, [TMP, :64]
   2019 
   2020     /* Load all DATA into NEON registers with the following allocation:
   2021      *       0 1 2 3 | 4 5 6 7
   2022      *      ---------+--------
   2023      *   0 | d16     | d17    | q8
   2024      *   1 | d18     | d19    | q9
   2025      *   2 | d20     | d21    | q10
   2026      *   3 | d22     | d23    | q11
   2027      *   4 | d24     | d25    | q12
   2028      *   5 | d26     | d27    | q13
   2029      *   6 | d28     | d29    | q14
   2030      *   7 | d30     | d31    | q15
   2031      */
   2032 
   2033     vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
   2034     vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
   2035     vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
   2036     vld1.16         {d28, d29, d30, d31}, [DATA, :128]
   2037     sub             DATA, DATA, #(128 - 32)
   2038 
   2039     mov             TMP, #2
   2040 1:
   2041     /* Transpose */
   2042     vtrn.16         q12, q13
   2043     vtrn.16         q10, q11
   2044     vtrn.16         q8, q9
   2045     vtrn.16         q14, q15
   2046     vtrn.32         q9, q11
   2047     vtrn.32         q13, q15
   2048     vtrn.32         q8, q10
   2049     vtrn.32         q12, q14
   2050     vswp            d30, d23
   2051     vswp            d24, d17
   2052     vswp            d26, d19
   2053       /* 1-D FDCT */
   2054       vadd.s16        q2, q11, q12
   2055     vswp            d28, d21
   2056       vsub.s16        q12, q11, q12
   2057       vsub.s16        q6, q10, q13
   2058       vadd.s16        q10, q10, q13
   2059       vsub.s16        q7, q9, q14
   2060       vadd.s16        q9, q9, q14
   2061       vsub.s16        q1, q8, q15
   2062       vadd.s16        q8, q8, q15
   2063       vsub.s16        q4, q9, q10
   2064       vsub.s16        q5, q8, q2
   2065       vadd.s16        q3, q9, q10
   2066       vadd.s16        q4, q4, q5
   2067       vadd.s16        q2, q8, q2
   2068       vqdmulh.s16     q4, q4, XFIX_0_707106781
   2069       vadd.s16        q11, q12, q6
   2070       vadd.s16        q8, q2, q3
   2071       vsub.s16        q12, q2, q3
   2072       vadd.s16        q3, q6, q7
   2073       vadd.s16        q7, q7, q1
   2074       vqdmulh.s16     q3, q3, XFIX_0_707106781
   2075       vsub.s16        q6, q11, q7
   2076       vadd.s16        q10, q5, q4
   2077       vqdmulh.s16     q6, q6, XFIX_0_382683433
   2078       vsub.s16        q14, q5, q4
   2079       vqdmulh.s16     q11, q11, XFIX_0_541196100
   2080       vqdmulh.s16     q5, q7, XFIX_1_306562965
   2081       vadd.s16        q4, q1, q3
   2082       vsub.s16        q3, q1, q3
   2083       vadd.s16        q7, q7, q6
   2084       vadd.s16        q11, q11, q6
   2085       vadd.s16        q7, q7, q5
   2086       vadd.s16        q13, q3, q11
   2087       vsub.s16        q11, q3, q11
   2088       vadd.s16        q9, q4, q7
   2089       vsub.s16        q15, q4, q7
   2090     subs            TMP, TMP, #1
   2091     bne             1b
   2092 
   2093     /* store results */
   2094     vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
   2095     vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
   2096     vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
   2097     vst1.16         {d28, d29, d30, d31}, [DATA, :128]
   2098 
   2099     vpop            {d8-d15}
   2100     bx              lr
   2101 
   2102     .unreq          DATA
   2103     .unreq          TMP
   2104 
   2105 
   2106 /*****************************************************************************/
   2107 
   2108 /*
   2109  * GLOBAL(void)
   2110  * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
   2111  *                      DCTELEM *workspace);
   2112  *
 * Note: the code uses two-stage pipelining in order to improve instruction
 *       scheduling and eliminate stalls (this provides ~15% better
 *       performance for this function on both ARM Cortex-A8 and
 *       ARM Cortex-A9 when compared with the non-pipelined variant).
 *       The instructions that belong to the second stage use different
 *       indentation for better readability.
   2119  */
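
/*
 * Scalar C sketch of the divide-free quantization performed below
 * (illustrative only; the divisors table holds reciprocals at offset 0,
 * correction terms at byte offset 64 * 2 and shift counts at byte offset
 * 64 * 6, as used by the code):
 *
 *   uint16_t temp = (uint16_t)abs(workspace[i]) + correction[i];
 *   temp = (uint16_t)(((uint32_t)temp * reciprocal[i]) >> 16);
 *   temp >>= shift[i];
 *   coef_block[i] = (workspace[i] < 0) ? -(int16_t)temp : (int16_t)temp;
 */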
   2120 asm_function jsimd_quantize_neon
   2121 
   2122     COEF_BLOCK      .req r0
   2123     DIVISORS        .req r1
   2124     WORKSPACE       .req r2
   2125 
   2126     RECIPROCAL      .req DIVISORS
   2127     CORRECTION      .req r3
   2128     SHIFT           .req ip
   2129     LOOP_COUNT      .req r4
   2130 
   2131     vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
   2132     vabs.s16        q12, q0
   2133     add             CORRECTION, DIVISORS, #(64 * 2)
   2134     add             SHIFT, DIVISORS, #(64 * 6)
   2135     vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
   2136     vabs.s16        q13, q1
   2137     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
   2138     vadd.u16        q12, q12, q10  /* add correction */
   2139     vadd.u16        q13, q13, q11
   2140     vmull.u16       q10, d24, d16  /* multiply by reciprocal */
   2141     vmull.u16       q11, d25, d17
   2142     vmull.u16       q8, d26, d18
   2143     vmull.u16       q9, d27, d19
   2144     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
   2145     vshrn.u32       d20, q10, #16
   2146     vshrn.u32       d21, q11, #16
   2147     vshrn.u32       d22, q8, #16
   2148     vshrn.u32       d23, q9, #16
   2149     vneg.s16        q12, q12
   2150     vneg.s16        q13, q13
   2151     vshr.s16        q2, q0, #15    /* extract sign */
   2152     vshr.s16        q3, q1, #15
   2153     vshl.u16        q14, q10, q12  /* shift */
   2154     vshl.u16        q15, q11, q13
   2155 
   2156     push            {r4, r5}
   2157     mov             LOOP_COUNT, #3
   2158 1:
   2159     vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
   2160       veor.u16        q14, q14, q2  /* restore sign */
   2161     vabs.s16        q12, q0
   2162     vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
   2163     vabs.s16        q13, q1
   2164       veor.u16        q15, q15, q3
   2165     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
   2166     vadd.u16        q12, q12, q10  /* add correction */
   2167     vadd.u16        q13, q13, q11
   2168     vmull.u16       q10, d24, d16  /* multiply by reciprocal */
   2169     vmull.u16       q11, d25, d17
   2170     vmull.u16       q8, d26, d18
   2171     vmull.u16       q9, d27, d19
   2172       vsub.u16        q14, q14, q2
   2173     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
   2174       vsub.u16        q15, q15, q3
   2175     vshrn.u32       d20, q10, #16
   2176     vshrn.u32       d21, q11, #16
   2177       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
   2178     vshrn.u32       d22, q8, #16
   2179     vshrn.u32       d23, q9, #16
   2180     vneg.s16        q12, q12
   2181     vneg.s16        q13, q13
   2182     vshr.s16        q2, q0, #15    /* extract sign */
   2183     vshr.s16        q3, q1, #15
   2184     vshl.u16        q14, q10, q12  /* shift */
   2185     vshl.u16        q15, q11, q13
   2186     subs            LOOP_COUNT, LOOP_COUNT, #1
   2187     bne             1b
   2188     pop             {r4, r5}
   2189 
   2190       veor.u16        q14, q14, q2  /* restore sign */
   2191       veor.u16        q15, q15, q3
   2192       vsub.u16        q14, q14, q2
   2193       vsub.u16        q15, q15, q3
   2194       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
   2195 
   2196     bx              lr  /* return */
   2197 
   2198     .unreq          COEF_BLOCK
   2199     .unreq          DIVISORS
   2200     .unreq          WORKSPACE
   2201     .unreq          RECIPROCAL
   2202     .unreq          CORRECTION
   2203     .unreq          SHIFT
   2204     .unreq          LOOP_COUNT
   2205 
   2206 
   2207 /*****************************************************************************/
   2208 
   2209 /*
   2210  * GLOBAL(void)
   2211  * jsimd_h2v1_fancy_upsample_neon (int max_v_samp_factor,
   2212  *                                 JDIMENSION downsampled_width,
   2213  *                                 JSAMPARRAY input_data,
   2214  *                                 JSAMPARRAY *output_data_ptr);
   2215  *
 * Note: the use of unaligned writes is the main remaining bottleneck in
 *       this code; eliminating it could potentially yield a performance
 *       improvement of up to tens of percent on Cortex-A8/Cortex-A9.
   2219  */
   2220 
   2221 /*
 * Upsample 16 source pixels to 32 destination pixels. The new 16 source
 * pixels are loaded into q0. The previous 16 source pixels are in q1. The
 * shifted-by-one source pixels are constructed in q2 from q0 and q1.
 * Register d28 holds the constant 3 used for multiplication, and register
 * q15 holds the +1 bias.
   2227  */
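
/*
 * For each pair of adjacent source pixels (prev, cur) the macro emits two
 * output pixels; in scalar C terms (a sketch, illustrative only):
 *
 *   out_even = (uint8_t)((3 * prev + cur + 2) >> 2);
 *   out_odd  = (uint8_t)((3 * cur + prev + 1) >> 2);
 */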
   2228 .macro upsample16 OUTPTR, INPTR
   2229     vld1.8          {q0}, [\INPTR]!
   2230     vmovl.u8        q8, d0
   2231     vext.8          q2, q1, q0, #15
   2232     vmovl.u8        q9, d1
   2233     vaddw.u8        q10, q15, d4
   2234     vaddw.u8        q11, q15, d5
   2235     vmlal.u8        q8, d4, d28
   2236     vmlal.u8        q9, d5, d28
   2237     vmlal.u8        q10, d0, d28
   2238     vmlal.u8        q11, d1, d28
   2239     vmov            q1, q0        /* backup source pixels to q1 */
   2240     vrshrn.u16      d6, q8, #2
   2241     vrshrn.u16      d7, q9, #2
   2242     vshrn.u16       d8, q10, #2
   2243     vshrn.u16       d9, q11, #2
   2244     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
   2245 .endm
   2246 
   2247 /*
 * Upsample 32 source pixels to 64 destination pixels. Compared to the
 * 'upsample16' macro, the roles of the q0 and q1 registers are swapped for
 * the even and odd groups of 16 pixels, which is why no "vmov q1, q0"
 * instruction is needed. This unrolling also allows loads and stores to be
 * reordered to hide the multiplication latency and reduce stalls.
   2253  */
   2254 .macro upsample32 OUTPTR, INPTR
   2255     /* even 16 pixels group */
   2256     vld1.8          {q0}, [\INPTR]!
   2257     vmovl.u8        q8, d0
   2258     vext.8          q2, q1, q0, #15
   2259     vmovl.u8        q9, d1
   2260     vaddw.u8        q10, q15, d4
   2261     vaddw.u8        q11, q15, d5
   2262     vmlal.u8        q8, d4, d28
   2263     vmlal.u8        q9, d5, d28
   2264     vmlal.u8        q10, d0, d28
   2265     vmlal.u8        q11, d1, d28
   2266       /* odd 16 pixels group */
   2267       vld1.8          {q1}, [\INPTR]!
   2268     vrshrn.u16      d6, q8, #2
   2269     vrshrn.u16      d7, q9, #2
   2270     vshrn.u16       d8, q10, #2
   2271     vshrn.u16       d9, q11, #2
   2272       vmovl.u8        q8, d2
   2273       vext.8          q2, q0, q1, #15
   2274       vmovl.u8        q9, d3
   2275       vaddw.u8        q10, q15, d4
   2276       vaddw.u8        q11, q15, d5
   2277       vmlal.u8        q8, d4, d28
   2278       vmlal.u8        q9, d5, d28
   2279       vmlal.u8        q10, d2, d28
   2280       vmlal.u8        q11, d3, d28
   2281     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
   2282       vrshrn.u16      d6, q8, #2
   2283       vrshrn.u16      d7, q9, #2
   2284       vshrn.u16       d8, q10, #2
   2285       vshrn.u16       d9, q11, #2
   2286       vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
   2287 .endm
   2288 
   2289 /*
   2290  * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
   2291  */
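
/*
 * The first and last output pixels are handled specially; in scalar terms
 * (a sketch) the row edges reduce to plain copies,
 *
 *   out[0]             = in[0];
 *   out[2 * width - 1] = in[width - 1];
 *
 * which is what the ldrb/strb pair at the top of the macro implements.
 */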
   2292 .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
   2293     /* special case for the first and last pixels */
   2294     sub             \WIDTH, \WIDTH, #1
   2295     add             \OUTPTR, \OUTPTR, #1
   2296     ldrb            \TMP1, [\INPTR, \WIDTH]
   2297     strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
   2298     ldrb            \TMP1, [\INPTR], #1
   2299     strb            \TMP1, [\OUTPTR, #-1]
   2300     vmov.8          d3[7], \TMP1
   2301 
   2302     subs            \WIDTH, \WIDTH, #32
   2303     blt             5f
   2304 0:  /* process 32 pixels per iteration */
   2305     upsample32      \OUTPTR, \INPTR
   2306     subs            \WIDTH, \WIDTH, #32
   2307     bge             0b
   2308 5:
   2309     adds            \WIDTH, \WIDTH, #16
   2310     blt             1f
   2311 0:  /* process 16 pixels if needed */
   2312     upsample16      \OUTPTR, \INPTR
   2313     subs            \WIDTH, \WIDTH, #16
   2314 1:
   2315     adds            \WIDTH, \WIDTH, #16
   2316     beq             9f
   2317 
   2318     /* load the remaining 1-15 pixels */
   2319     add             \INPTR, \INPTR, \WIDTH
   2320     tst             \WIDTH, #1
   2321     beq             2f
   2322     sub             \INPTR, \INPTR, #1
   2323     vld1.8          {d0[0]}, [\INPTR]
   2324 2:
   2325     tst             \WIDTH, #2
   2326     beq             2f
   2327     vext.8          d0, d0, d0, #6
   2328     sub             \INPTR, \INPTR, #1
   2329     vld1.8          {d0[1]}, [\INPTR]
   2330     sub             \INPTR, \INPTR, #1
   2331     vld1.8          {d0[0]}, [\INPTR]
   2332 2:
   2333     tst             \WIDTH, #4
   2334     beq             2f
   2335     vrev64.32       d0, d0
   2336     sub             \INPTR, \INPTR, #1
   2337     vld1.8          {d0[3]}, [\INPTR]
   2338     sub             \INPTR, \INPTR, #1
   2339     vld1.8          {d0[2]}, [\INPTR]
   2340     sub             \INPTR, \INPTR, #1
   2341     vld1.8          {d0[1]}, [\INPTR]
   2342     sub             \INPTR, \INPTR, #1
   2343     vld1.8          {d0[0]}, [\INPTR]
   2344 2:
   2345     tst             \WIDTH, #8
   2346     beq             2f
   2347     vmov            d1, d0
   2348     sub             \INPTR, \INPTR, #8
   2349     vld1.8          {d0}, [\INPTR]
   2350 2:  /* upsample the remaining pixels */
   2351     vmovl.u8        q8, d0
   2352     vext.8          q2, q1, q0, #15
   2353     vmovl.u8        q9, d1
   2354     vaddw.u8        q10, q15, d4
   2355     vaddw.u8        q11, q15, d5
   2356     vmlal.u8        q8, d4, d28
   2357     vmlal.u8        q9, d5, d28
   2358     vmlal.u8        q10, d0, d28
   2359     vmlal.u8        q11, d1, d28
   2360     vrshrn.u16      d10, q8, #2
   2361     vrshrn.u16      d12, q9, #2
   2362     vshrn.u16       d11, q10, #2
   2363     vshrn.u16       d13, q11, #2
   2364     vzip.8          d10, d11
   2365     vzip.8          d12, d13
   2366     /* store the remaining pixels */
   2367     tst             \WIDTH, #8
   2368     beq             2f
   2369     vst1.8          {d10, d11}, [\OUTPTR]!
   2370     vmov            q5, q6
   2371 2:
   2372     tst             \WIDTH, #4
   2373     beq             2f
   2374     vst1.8          {d10}, [\OUTPTR]!
   2375     vmov            d10, d11
   2376 2:
   2377     tst             \WIDTH, #2
   2378     beq             2f
   2379     vst1.8          {d10[0]}, [\OUTPTR]!
   2380     vst1.8          {d10[1]}, [\OUTPTR]!
   2381     vst1.8          {d10[2]}, [\OUTPTR]!
   2382     vst1.8          {d10[3]}, [\OUTPTR]!
   2383     vext.8          d10, d10, d10, #4
   2384 2:
   2385     tst             \WIDTH, #1
   2386     beq             2f
   2387     vst1.8          {d10[0]}, [\OUTPTR]!
   2388     vst1.8          {d10[1]}, [\OUTPTR]!
   2389 2:
   2390 9:
   2391 .endm
   2392 
   2393 asm_function jsimd_h2v1_fancy_upsample_neon
   2394 
   2395     MAX_V_SAMP_FACTOR .req r0
   2396     DOWNSAMPLED_WIDTH .req r1
   2397     INPUT_DATA        .req r2
   2398     OUTPUT_DATA_PTR   .req r3
   2399     OUTPUT_DATA       .req OUTPUT_DATA_PTR
   2400 
   2401     OUTPTR            .req r4
   2402     INPTR             .req r5
   2403     WIDTH             .req ip
   2404     TMP               .req lr
   2405 
   2406     push            {r4, r5, r6, lr}
   2407     vpush           {d8-d15}
   2408 
   2409     ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
   2410     cmp             MAX_V_SAMP_FACTOR, #0
   2411     ble             99f
   2412 
   2413     /* initialize constants */
   2414     vmov.u8         d28, #3
   2415     vmov.u16        q15, #1
   2416 11:
   2417     ldr             INPTR, [INPUT_DATA], #4
   2418     ldr             OUTPTR, [OUTPUT_DATA], #4
   2419     mov             WIDTH, DOWNSAMPLED_WIDTH
   2420     upsample_row    OUTPTR, INPTR, WIDTH, TMP
   2421     subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
   2422     bgt             11b
   2423 
   2424 99:
   2425     vpop            {d8-d15}
   2426     pop             {r4, r5, r6, pc}
   2427 
   2428     .unreq          MAX_V_SAMP_FACTOR
   2429     .unreq          DOWNSAMPLED_WIDTH
   2430     .unreq          INPUT_DATA
   2431     .unreq          OUTPUT_DATA_PTR
   2432     .unreq          OUTPUT_DATA
   2433 
   2434     .unreq          OUTPTR
   2435     .unreq          INPTR
   2436     .unreq          WIDTH
   2437     .unreq          TMP
   2438 
   2439 .purgem upsample16
   2440 .purgem upsample32
   2441 .purgem upsample_row
   2442 
   2443 
   2444 /*****************************************************************************/
   2445 
   2446 /*
   2447  * GLOBAL(JOCTET*)
   2448  * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
   2449  *                              JCOEFPTR block, int last_dc_val,
   2450  *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
   2451  *
   2452  */
   2453 
   2454 .macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
   2455     sub             \PUT_BITS, \PUT_BITS, #0x8
   2456     lsr             \TMP, \PUT_BUFFER, \PUT_BITS
   2457     uxtb            \TMP, \TMP
   2458     strb            \TMP, [\BUFFER, #1]!
   2459     cmp             \TMP, #0xff
   2460     /*it eq*/
   2461     strbeq          \ZERO, [\BUFFER, #1]!
   2462 .endm
   2463 
   2464 .macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
   2465     /*lsl             \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
   2466     add             \PUT_BITS, \SIZE
   2467     /*orr             \PUT_BUFFER, \PUT_BUFFER, \CODE*/
   2468     orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
   2469 .endm
   2470 
   2471 .macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
   2472   cmp               \PUT_BITS, #0x10
   2473   blt               15f
   2474     eor               \ZERO, \ZERO, \ZERO
   2475     emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
   2476     emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
   2477 15:
   2478 .endm
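
/*
 * Scalar C sketch of the bit-buffer handling implemented by the macros
 * above (illustrative only; put_buffer/put_bits mirror the registers used
 * in the code):
 *
 *   // put_bits: queue 'size' more bits of 'code'
 *   put_buffer = (put_buffer << size) | code;
 *   put_bits  += size;
 *
 *   // checkbuf15: once at least 16 bits are queued, flush two bytes
 *   if (put_bits >= 16) {
 *     for (int n = 0; n < 2; n++) {          // emit_byte, twice
 *       put_bits -= 8;
 *       uint8_t byte = (uint8_t)(put_buffer >> put_bits);
 *       *++buffer = byte;
 *       if (byte == 0xFF)
 *         *++buffer = 0;                     // JPEG 0xFF byte stuffing
 *     }
 *   }
 */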
   2479 
   2480 .balign 16
   2481 jsimd_huff_encode_one_block_neon_consts:
   2482   .byte 0x01
   2483   .byte 0x02
   2484   .byte 0x04
   2485   .byte 0x08
   2486   .byte 0x10
   2487   .byte 0x20
   2488   .byte 0x40
   2489   .byte 0x80
   2490 
   2491 asm_function jsimd_huff_encode_one_block_neon
   2492     push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
   2493     add             r7, sp, #0x1c
   2494     sub             r4, sp, #0x40
   2495     bfc             r4, #0, #5
   2496     mov             sp, r4           /* align sp on 32 bytes */
   2497     vst1.64         {d8, d9, d10, d11}, [r4, :128]!
   2498     vst1.64         {d12, d13, d14, d15}, [r4, :128]
   2499     sub             sp, #0x140       /* reserve 320 bytes */
    str             r0, [sp, #0x18]  /* working state => sp + 0x18 */
   2501     add             r4, sp, #0x20    /* r4 = t1 */
   2502     ldr             lr, [r7, #0x8]   /* lr = dctbl */
   2503     sub             r10, r1, #0x1    /* r10=buffer-- */
   2504     ldrsh           r1, [r2]
   2505     mov             r9, #0x10
   2506     mov             r8, #0x1
   2507     adr             r5, jsimd_huff_encode_one_block_neon_consts
   2508     /* prepare data */
   2509     vld1.8          {d26}, [r5, :64]
   2510     veor            q8, q8, q8
   2511     veor            q9, q9, q9
   2512     vdup.16         q14, r9
   2513     vdup.16         q15, r8
   2514     veor            q10, q10, q10
   2515     veor            q11, q11, q11
   2516     sub             r1, r1, r3
   2517     add             r9, r2, #0x22
   2518     add             r8, r2, #0x18
   2519     add             r3, r2, #0x36
   2520     vmov.16         d0[0], r1
   2521     vld1.16         {d2[0]}, [r9, :16]
   2522     vld1.16         {d4[0]}, [r8, :16]
   2523     vld1.16         {d6[0]}, [r3, :16]
   2524     add             r1, r2, #0x2
   2525     add             r9, r2, #0x30
   2526     add             r8, r2, #0x26
   2527     add             r3, r2, #0x28
   2528     vld1.16         {d0[1]}, [r1, :16]
   2529     vld1.16         {d2[1]}, [r9, :16]
   2530     vld1.16         {d4[1]}, [r8, :16]
   2531     vld1.16         {d6[1]}, [r3, :16]
   2532     add             r1, r2, #0x10
   2533     add             r9, r2, #0x40
   2534     add             r8, r2, #0x34
   2535     add             r3, r2, #0x1a
   2536     vld1.16         {d0[2]}, [r1, :16]
   2537     vld1.16         {d2[2]}, [r9, :16]
   2538     vld1.16         {d4[2]}, [r8, :16]
   2539     vld1.16         {d6[2]}, [r3, :16]
   2540     add             r1, r2, #0x20
   2541     add             r9, r2, #0x32
   2542     add             r8, r2, #0x42
   2543     add             r3, r2, #0xc
   2544     vld1.16         {d0[3]}, [r1, :16]
   2545     vld1.16         {d2[3]}, [r9, :16]
   2546     vld1.16         {d4[3]}, [r8, :16]
   2547     vld1.16         {d6[3]}, [r3, :16]
   2548     add             r1, r2, #0x12
   2549     add             r9, r2, #0x24
   2550     add             r8, r2, #0x50
   2551     add             r3, r2, #0xe
   2552     vld1.16         {d1[0]}, [r1, :16]
   2553     vld1.16         {d3[0]}, [r9, :16]
   2554     vld1.16         {d5[0]}, [r8, :16]
   2555     vld1.16         {d7[0]}, [r3, :16]
   2556     add             r1, r2, #0x4
   2557     add             r9, r2, #0x16
   2558     add             r8, r2, #0x60
   2559     add             r3, r2, #0x1c
   2560     vld1.16         {d1[1]}, [r1, :16]
   2561     vld1.16         {d3[1]}, [r9, :16]
   2562     vld1.16         {d5[1]}, [r8, :16]
   2563     vld1.16         {d7[1]}, [r3, :16]
   2564     add             r1, r2, #0x6
   2565     add             r9, r2, #0x8
   2566     add             r8, r2, #0x52
   2567     add             r3, r2, #0x2a
   2568     vld1.16         {d1[2]}, [r1, :16]
   2569     vld1.16         {d3[2]}, [r9, :16]
   2570     vld1.16         {d5[2]}, [r8, :16]
   2571     vld1.16         {d7[2]}, [r3, :16]
   2572     add             r1, r2, #0x14
   2573     add             r9, r2, #0xa
   2574     add             r8, r2, #0x44
   2575     add             r3, r2, #0x38
   2576     vld1.16         {d1[3]}, [r1, :16]
   2577     vld1.16         {d3[3]}, [r9, :16]
   2578     vld1.16         {d5[3]}, [r8, :16]
   2579     vld1.16         {d7[3]}, [r3, :16]
    vcgt.s16        q8, q8, q0
    vcgt.s16        q9, q9, q1
    vcgt.s16        q10, q10, q2
    vcgt.s16        q11, q11, q3
    vabs.s16        q0, q0
    vabs.s16        q1, q1
    vabs.s16        q2, q2
    vabs.s16        q3, q3
    veor            q8, q8, q0
    veor            q9, q9, q1
    veor            q10, q10, q2
    veor            q11, q11, q3
    add             r9, r4, #0x20
    add             r8, r4, #0x80
    add             r3, r4, #0xa0
    vclz.i16        q0, q0
    vclz.i16        q1, q1
    vclz.i16        q2, q2
    vclz.i16        q3, q3
    vsub.i16        q0, q14, q0
    vsub.i16        q1, q14, q1
    vsub.i16        q2, q14, q2
    vsub.i16        q3, q14, q3
    vst1.16         {d0, d1, d2, d3}, [r4, :256]
    vst1.16         {d4, d5, d6, d7}, [r9, :256]
    vshl.s16        q0, q15, q0
    vshl.s16        q1, q15, q1
    vshl.s16        q2, q15, q2
    vshl.s16        q3, q15, q3
    vsub.i16        q0, q0, q15
    vsub.i16        q1, q1, q15
    vsub.i16        q2, q2, q15
    vsub.i16        q3, q3, q15
    vand            q8, q8, q0
    vand            q9, q9, q1
    vand            q10, q10, q2
    vand            q11, q11, q3
    vst1.16         {d16, d17, d18, d19}, [r8, :256]
    vst1.16         {d20, d21, d22, d23}, [r3, :256]
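    /* nbits for zig-zag positions 0-31 is now stored in the t1 buffer (r4)
     * and the masked coefficient bits in the t2 buffer (r4 + 0x80).  Gather
     * zig-zag positions 32-63 into q4-q7 the same way. */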
    add             r1, r2, #0x46
    add             r9, r2, #0x3a
    add             r8, r2, #0x74
    add             r3, r2, #0x6a
    vld1.16         {d8[0]}, [r1, :16]
    vld1.16         {d10[0]}, [r9, :16]
    vld1.16         {d12[0]}, [r8, :16]
    vld1.16         {d14[0]}, [r3, :16]
    veor            q8, q8, q8
    veor            q9, q9, q9
    veor            q10, q10, q10
    veor            q11, q11, q11
    add             r1, r2, #0x54
    add             r9, r2, #0x2c
    add             r8, r2, #0x76
    add             r3, r2, #0x78
    vld1.16         {d8[1]}, [r1, :16]
    vld1.16         {d10[1]}, [r9, :16]
    vld1.16         {d12[1]}, [r8, :16]
    vld1.16         {d14[1]}, [r3, :16]
    add             r1, r2, #0x62
    add             r9, r2, #0x1e
    add             r8, r2, #0x68
    add             r3, r2, #0x7a
    vld1.16         {d8[2]}, [r1, :16]
    vld1.16         {d10[2]}, [r9, :16]
    vld1.16         {d12[2]}, [r8, :16]
    vld1.16         {d14[2]}, [r3, :16]
    add             r1, r2, #0x70
    add             r9, r2, #0x2e
    add             r8, r2, #0x5a
    add             r3, r2, #0x6c
    vld1.16         {d8[3]}, [r1, :16]
    vld1.16         {d10[3]}, [r9, :16]
    vld1.16         {d12[3]}, [r8, :16]
    vld1.16         {d14[3]}, [r3, :16]
    add             r1, r2, #0x72
    add             r9, r2, #0x3c
    add             r8, r2, #0x4c
    add             r3, r2, #0x5e
    vld1.16         {d9[0]}, [r1, :16]
    vld1.16         {d11[0]}, [r9, :16]
    vld1.16         {d13[0]}, [r8, :16]
    vld1.16         {d15[0]}, [r3, :16]
    add             r1, r2, #0x64
    add             r9, r2, #0x4a
    add             r8, r2, #0x3e
    add             r3, r2, #0x6e
    vld1.16         {d9[1]}, [r1, :16]
    vld1.16         {d11[1]}, [r9, :16]
    vld1.16         {d13[1]}, [r8, :16]
    vld1.16         {d15[1]}, [r3, :16]
    add             r1, r2, #0x56
    add             r9, r2, #0x58
    add             r8, r2, #0x4e
    add             r3, r2, #0x7c
    vld1.16         {d9[2]}, [r1, :16]
    vld1.16         {d11[2]}, [r9, :16]
    vld1.16         {d13[2]}, [r8, :16]
    vld1.16         {d15[2]}, [r3, :16]
    add             r1, r2, #0x48
    add             r9, r2, #0x66
    add             r8, r2, #0x5c
    add             r3, r2, #0x7e
    vld1.16         {d9[3]}, [r1, :16]
    vld1.16         {d11[3]}, [r9, :16]
    vld1.16         {d13[3]}, [r8, :16]
    vld1.16         {d15[3]}, [r3, :16]
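    /* q4-q7 now hold zig-zag positions 32-63.  Repeat the sign/magnitude
     * conversion and nbits computation for this half, storing the results to
     * the second halves of the t1 and t2 buffers. */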
    vcgt.s16        q8, q8, q4
    vcgt.s16        q9, q9, q5
    vcgt.s16        q10, q10, q6
    vcgt.s16        q11, q11, q7
    vabs.s16        q4, q4
    vabs.s16        q5, q5
    vabs.s16        q6, q6
    vabs.s16        q7, q7
    veor            q8, q8, q4
    veor            q9, q9, q5
    veor            q10, q10, q6
    veor            q11, q11, q7
    add             r1, r4, #0x40
    add             r9, r4, #0x60
    add             r8, r4, #0xc0
    add             r3, r4, #0xe0
    vclz.i16        q4, q4
    vclz.i16        q5, q5
    vclz.i16        q6, q6
    vclz.i16        q7, q7
    vsub.i16        q4, q14, q4
    vsub.i16        q5, q14, q5
    vsub.i16        q6, q14, q6
    vsub.i16        q7, q14, q7
    vst1.16         {d8, d9, d10, d11}, [r1, :256]
    vst1.16         {d12, d13, d14, d15}, [r9, :256]
    vshl.s16        q4, q15, q4
    vshl.s16        q5, q15, q5
    vshl.s16        q6, q15, q6
    vshl.s16        q7, q15, q7
    vsub.i16        q4, q4, q15
    vsub.i16        q5, q5, q15
    vsub.i16        q6, q6, q15
    vsub.i16        q7, q7, q15
    vand            q8, q8, q4
    vand            q9, q9, q5
    vand            q10, q10, q6
    vand            q11, q11, q7
    vst1.16         {d16, d17, d18, d19}, [r8, :256]
    vst1.16         {d20, d21, d22, d23}, [r3, :256]
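    /* All 64 nbits values and coefficient bit patterns are now staged in
     * t1/t2.  Switch to scalar code to emit the Huffman bitstream, starting
     * with the DC coefficient (encoded via dctbl). */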
    ldr             r12, [r7, #0xc]       /* r12 = actbl */
    add             r1, lr, #0x400        /* r1 = dctbl->ehufsi */
    mov             r9, r12               /* r9 = actbl */
    add             r6, r4, #0x80         /* r6 = t2 */
    ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
    ldr             r4, [r0, #0xc]        /* r4  = put_bits */
    ldrh            r2, [r6, #-128]       /* r2  = nbits */
    ldrh            r3, [r6]              /* r3  = temp2 & ((1 << nbits) - 1) */
    ldr             r0, [lr, r2, lsl #2]  /* r0 = dctbl->ehufco[nbits] */
    ldrb            r5, [r1, r2]          /* r5 = dctbl->ehufsi[nbits] */
    put_bits        r11, r4, r0, r5       /* emit the DC Huffman code */
    checkbuf15      r10, r11, r4, r5, r0
    put_bits        r11, r4, r3, r2       /* emit nbits bits of the DC value */
    checkbuf15      r10, r11, r4, r5, r0
    mov             lr, r6                /* lr = t2 */
    add             r5, r9, #0x400        /* r5 = actbl->ehufsi */
    ldrsb           r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
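    /* Build a 64-bit map of the block with one bit per coefficient: compare
     * the masks left in q0-q7 against zero (set lanes mark zero
     * coefficients), narrow to bytes, AND with the per-lane bit masks in d26
     * and pairwise-add down to a single 64-bit value in d0.  The two words
     * are then inverted, shifted to drop the DC bit, and bit-reversed so that
     * CLZ locates the next nonzero AC coefficient. */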
    veor            q8, q8, q8
    vceq.i16        q0, q0, q8
    vceq.i16        q1, q1, q8
    vceq.i16        q2, q2, q8
    vceq.i16        q3, q3, q8
    vceq.i16        q4, q4, q8
    vceq.i16        q5, q5, q8
    vceq.i16        q6, q6, q8
    vceq.i16        q7, q7, q8
    vmovn.i16       d0, q0
    vmovn.i16       d2, q1
    vmovn.i16       d4, q2
    vmovn.i16       d6, q3
    vmovn.i16       d8, q4
    vmovn.i16       d10, q5
    vmovn.i16       d12, q6
    vmovn.i16       d14, q7
    vand            d0, d0, d26
    vand            d2, d2, d26
    vand            d4, d4, d26
    vand            d6, d6, d26
    vand            d8, d8, d26
    vand            d10, d10, d26
    vand            d12, d12, d26
    vand            d14, d14, d26
    vpadd.i8        d0, d0, d2
    vpadd.i8        d4, d4, d6
    vpadd.i8        d8, d8, d10
    vpadd.i8        d12, d12, d14
    vpadd.i8        d0, d0, d4
    vpadd.i8        d8, d8, d12
    vpadd.i8        d0, d0, d8
    vmov.32         r1, d0[1]
    vmov.32         r8, d0[0]
    mvn             r1, r1            /* invert: set bits now mark nonzero coefficients */
    mvn             r8, r8
    lsrs            r1, r1, #0x1
    rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
    rbit            r1, r1            /* r1 = index1 */
    rbit            r8, r8            /* r8 = index0 */
    ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
    str             r1, [sp, #0x14]   /* index1 -> sp + 0x14 */
    cmp             r8, #0x0
    beq             6f                /* skip if index0 is empty */
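    /* Scan index0: CLZ gives the length of each run of zero coefficients,
     * runs of 16 or more emit ZRL (0xF0) codes, and each nonzero coefficient
     * emits its run/size Huffman code followed by its value bits from t2. */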
1:
    clz             r2, r8                /* r2 = length of the zero run */
    add             lr, lr, r2, lsl #1    /* skip the zero coefficients in t2 */
    lsl             r8, r8, r2
    ldrh            r1, [lr, #-126]       /* r1 = nbits of the next nonzero coefficient */
2:
    cmp             r2, #0x10
    blt             3f
    sub             r2, r2, #0x10
    put_bits        r11, r4, r0, r6       /* emit a ZRL (0xF0) code */
    cmp             r4, #0x10
    blt             2b
    eor             r3, r3, r3
    emit_byte       r10, r11, r4, r3, r12
    emit_byte       r10, r11, r4, r3, r12
    b               2b
3:
    add             r2, r1, r2, lsl #4    /* symbol = (run << 4) + nbits */
    ldrh            r3, [lr, #2]!         /* r3 = coefficient bits from t2 */
    ldr             r12, [r9, r2, lsl #2] /* r12 = actbl->ehufco[symbol] */
    ldrb            r2, [r5, r2]          /* r2  = actbl->ehufsi[symbol] */
    put_bits        r11, r4, r12, r2
    checkbuf15      r10, r11, r4, r2, r12
    put_bits        r11, r4, r3, r1       /* emit nbits bits of the coefficient */
    checkbuf15      r10, r11, r4, r2, r12
    lsls            r8, r8, #0x1
    bne             1b
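    /* Continue the scan with index1.  The zero run that crosses from the
     * first bitmap into the second is accounted for by adding the distance
     * between the current t2 position and the start of the second half of t2
     * to the run length. */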
6:
    add             r12, sp, #0x20   /* r12 = t1 */
    ldr             r8, [sp, #0x14]  /* r8 = index1 */
    adds            r12, #0xc0       /* r12 = t2 + (DCTSIZE2/2) */
    cmp             r8, #0x0
    beq             6f
    clz             r2, r8
    sub             r12, r12, lr
    lsl             r8, r8, r2
    add             r2, r2, r12, lsr #1
    add             lr, lr, r2, lsl #1
    b               7f
1:
    clz             r2, r8
    add             lr, lr, r2, lsl #1
    lsl             r8, r8, r2
7:
    ldrh            r1, [lr, #-126]
2:
    cmp             r2, #0x10
    blt             3f
    sub             r2, r2, #0x10
    put_bits        r11, r4, r0, r6
    cmp             r4, #0x10
    blt             2b
    eor             r3, r3, r3
    emit_byte       r10, r11, r4, r3, r12
    emit_byte       r10, r11, r4, r3, r12
    b               2b
3:
    add             r2, r1, r2, lsl #4
    ldrh            r3, [lr, #2]!
    ldr             r12, [r9, r2, lsl #2]
    ldrb            r2, [r5, r2]
    put_bits        r11, r4, r12, r2
    checkbuf15      r10, r11, r4, r2, r12
    put_bits        r11, r4, r3, r1
    checkbuf15      r10, r11, r4, r2, r12
    lsls            r8, r8, #0x1
    bne             1b
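    /* If the last nonzero coefficient is not the final one in the block,
     * emit the end-of-block (EOB) code, actbl->ehufco[0]. */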
6:
    add             r0, sp, #0x20
    add             r0, #0xfe
    cmp             lr, r0
    bhs             1f
    ldr             r1, [r9]
    ldrb            r0, [r5]
    put_bits        r11, r4, r1, r0
    checkbuf15      r10, r11, r4, r0, r1
1:
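    /* Flush: store the updated bit buffer and bit count back to the working
     * state, set up the return value, restore the callee-saved NEON
     * registers and the stack frame, and return. */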
    ldr             r12, [sp, #0x18]
    str             r11, [r12, #0x8]      /* store put_buffer */
    str             r4, [r12, #0xc]       /* store put_bits */
    add             r0, r10, #0x1
    add             r4, sp, #0x140
    vld1.64         {d8, d9, d10, d11}, [r4, :128]!   /* restore NEON registers */
    vld1.64         {d12, d13, d14, d15}, [r4, :128]
    sub             r4, r7, #0x1c
    mov             sp, r4
    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}

.purgem emit_byte
.purgem put_bits
.purgem checkbuf15