      1 /*
      2  * ARMv7 NEON optimizations for libjpeg-turbo
      3  *
      4  * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
      5  * All rights reserved.
      6  * Author: Siarhei Siamashka <siarhei.siamashka (at) nokia.com>
      7  *
      8  * This software is provided 'as-is', without any express or implied
      9  * warranty.  In no event will the authors be held liable for any damages
     10  * arising from the use of this software.
     11  *
     12  * Permission is granted to anyone to use this software for any purpose,
     13  * including commercial applications, and to alter it and redistribute it
     14  * freely, subject to the following restrictions:
     15  *
     16  * 1. The origin of this software must not be misrepresented; you must not
     17  *    claim that you wrote the original software. If you use this software
     18  *    in a product, an acknowledgment in the product documentation would be
     19  *    appreciated but is not required.
     20  * 2. Altered source versions must be plainly marked as such, and must not be
     21  *    misrepresented as being the original software.
     22  * 3. This notice may not be removed or altered from any source distribution.
     23  */
     24 
     25 #if defined(__linux__) && defined(__ELF__)
     26 .section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
     27 #endif
     28 
     29 .text
     30 .fpu neon
     31 .arch armv7a
     32 .object_arch armv4
     33 .arm
     34 
     35 
     36 #define RESPECT_STRICT_ALIGNMENT 1
     37 
     38 
     39 /*****************************************************************************/
     40 
     41 /* Supplementary macro for setting function attributes */
     42 .macro asm_function fname
     43 #ifdef __APPLE__
     44     .globl _\fname
     45 _\fname:
     46 #else
     47     .global \fname
     48 #ifdef __ELF__
     49     .hidden \fname
     50     .type \fname, %function
     51 #endif
     52 \fname:
     53 #endif
     54 .endm
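
/* For reference (illustrative expansion only): on an ELF target a line like
 *
 *     asm_function jsimd_idct_islow_neon
 *
 * expands to roughly
 *
 *     .global jsimd_idct_islow_neon
 *     .hidden jsimd_idct_islow_neon
 *     .type   jsimd_idct_islow_neon, %function
 * jsimd_idct_islow_neon:
 *
 * while on Apple platforms only the underscore-prefixed .globl and label are
 * emitted.
 */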
     55 
     56 /* Transpose a block of 4x4 coefficients in four 64-bit registers */
     57 .macro transpose_4x4 x0, x1, x2, x3
     58     vtrn.16 \x0, \x1
     59     vtrn.16 \x2, \x3
     60     vtrn.32 \x0, \x2
     61     vtrn.32 \x1, \x3
     62 .endm
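
/* Worked example (illustrative): with four rows of 16-bit lanes
 *     x0 = [ a0 a1 a2 a3 ]          x2 = [ c0 c1 c2 c3 ]
 *     x1 = [ b0 b1 b2 b3 ]          x3 = [ d0 d1 d2 d3 ]
 * the two VTRN.16 steps produce
 *     x0 = [ a0 b0 a2 b2 ]          x2 = [ c0 d0 c2 d2 ]
 *     x1 = [ a1 b1 a3 b3 ]          x3 = [ c1 d1 c3 d3 ]
 * and the two VTRN.32 steps complete the transpose:
 *     x0 = [ a0 b0 c0 d0 ]          x2 = [ a2 b2 c2 d2 ]
 *     x1 = [ a1 b1 c1 d1 ]          x3 = [ a3 b3 c3 d3 ]
 */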
     63 
     64 
     65 #define CENTERJSAMPLE 128
     66 
     67 /*****************************************************************************/
     68 
     69 /*
     70  * Perform dequantization and inverse DCT on one block of coefficients.
     71  *
     72  * GLOBAL(void)
     73  * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
     74  *                        JSAMPARRAY output_buf, JDIMENSION output_col)
     75  */
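
/* Note: dct_table points to the 64 16-bit dequantization multipliers for the
 * current component, so dequantization below is simply an element-wise
 * VMUL.S16 of the coefficient block with that table (conceptually,
 * dequantized[i] = coef_block[i] * dct_table[i] for i = 0..63), while
 * output_buf/output_col select the destination rows and starting column for
 * the resulting 8x8 block of samples.
 */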
     76 
     77 #define FIX_0_298631336  (2446)
     78 #define FIX_0_390180644  (3196)
     79 #define FIX_0_541196100  (4433)
     80 #define FIX_0_765366865  (6270)
     81 #define FIX_0_899976223  (7373)
     82 #define FIX_1_175875602  (9633)
     83 #define FIX_1_501321110  (12299)
     84 #define FIX_1_847759065  (15137)
     85 #define FIX_1_961570560  (16069)
     86 #define FIX_2_053119869  (16819)
     87 #define FIX_2_562915447  (20995)
     88 #define FIX_3_072711026  (25172)
     89 
     90 #define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
     91 #define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
     92 #define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
     93 #define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
     94 #define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
     95 #define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
     96 #define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
     97 #define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
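
/* The FIX_* values above are the usual libjpeg 13-bit fixed-point constants,
 * i.e. round(x * 2^13): for example 0.298631336 * 8192 ~= 2446.4 -> 2446 and
 * 1.175875602 * 8192 ~= 9632.8 -> 9633.  The _MINUS_ and _PLUS_ combinations
 * are pre-computed so that each product in the NEON code can be formed with a
 * single VMLAL/VMLSL against one 16-bit constant lane.
 */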
     98 
     99 /*
    100  * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
    101  * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
    102  */
    103 #define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
    104 {                                                                             \
    105     DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
    106     INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
    107     INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
    108                                                                               \
    109     /* 1-D iDCT input data */                                                 \
    110     row0 = xrow0;                                                             \
    111     row1 = xrow1;                                                             \
    112     row2 = xrow2;                                                             \
    113     row3 = xrow3;                                                             \
    114     row4 = xrow4;                                                             \
    115     row5 = xrow5;                                                             \
    116     row6 = xrow6;                                                             \
    117     row7 = xrow7;                                                             \
    118                                                                               \
    119     q5 = row7 + row3;                                                         \
    120     q4 = row5 + row1;                                                         \
    121     q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
    122          MULTIPLY(q4, FIX_1_175875602);                                       \
    123     q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
    124          MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
    125     q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
    126          MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
    127     q4 = q6;                                                                  \
    128     q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
    129     q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
    130           MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
    131     /* now we can use q1 (reloadable constants have been used up) */          \
    132     q1 = q3 + q2;                                                             \
    133     q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
    134           MULTIPLY(row1, -FIX_0_899976223);                                   \
    135     q5 = q7;                                                                  \
    136     q1 = q1 + q6;                                                             \
    137     q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
    138           MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
    139                                                                               \
    140     /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
    141     tmp11_plus_tmp2 = q1;                                                     \
    142     row1 = 0;                                                                 \
    143                                                                               \
    144     q1 = q1 - q6;                                                             \
    145     q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
    146           MULTIPLY(row3, -FIX_2_562915447);                                   \
    147     q1 = q1 - q6;                                                             \
    148     q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
    149          MULTIPLY(row6, FIX_0_541196100);                                     \
    150     q3 = q3 - q2;                                                             \
    151                                                                               \
    152     /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
    153     tmp11_minus_tmp2 = q1;                                                    \
    154                                                                               \
    155     q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
    156     q2 = q1 + q6;                                                             \
    157     q1 = q1 - q6;                                                             \
    158                                                                               \
    159     /* pick up the results */                                                 \
    160     tmp0  = q4;                                                               \
    161     tmp1  = q5;                                                               \
    162     tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
    163     tmp3  = q7;                                                               \
    164     tmp10 = q2;                                                               \
    165     tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
    166     tmp12 = q3;                                                               \
    167     tmp13 = q1;                                                               \
    168 }
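
/* Descaling sketch for the NEON code below: the FIX_* constants are scaled
 * by 2^13, so pass 1 narrows with a rounding right shift of 11 bits
 * (13 - PASS1_BITS, with PASS1_BITS = 2, cf. VRSHRN #11), while pass 2
 * descales by 13 + 2 + 3 = 18 bits in total, split into VSHRN #16 followed
 * by the final saturating VQRSHRN #2 before CENTERJSAMPLE is added.
 */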
    169 
    170 #define XFIX_0_899976223                    d0[0]
    171 #define XFIX_0_541196100                    d0[1]
    172 #define XFIX_2_562915447                    d0[2]
    173 #define XFIX_0_298631336_MINUS_0_899976223  d0[3]
    174 #define XFIX_1_501321110_MINUS_0_899976223  d1[0]
    175 #define XFIX_2_053119869_MINUS_2_562915447  d1[1]
    176 #define XFIX_0_541196100_PLUS_0_765366865   d1[2]
    177 #define XFIX_1_175875602                    d1[3]
    178 #define XFIX_1_175875602_MINUS_0_390180644  d2[0]
    179 #define XFIX_0_541196100_MINUS_1_847759065  d2[1]
    180 #define XFIX_3_072711026_MINUS_2_562915447  d2[2]
    181 #define XFIX_1_175875602_MINUS_1_961570560  d2[3]
    182 
    183 .balign 16
    184 jsimd_idct_islow_neon_consts:
    185     .short FIX_0_899976223                    /* d0[0] */
    186     .short FIX_0_541196100                    /* d0[1] */
    187     .short FIX_2_562915447                    /* d0[2] */
    188     .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
    189     .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
    190     .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
    191     .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
    192     .short FIX_1_175875602                    /* d1[3] */
    193     /* reloadable constants */
    194     .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
    195     .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
    196     .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
    197     .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
    198 
    199 asm_function jsimd_idct_islow_neon
    200 
    201     DCT_TABLE       .req r0
    202     COEF_BLOCK      .req r1
    203     OUTPUT_BUF      .req r2
    204     OUTPUT_COL      .req r3
    205     TMP1            .req r0
    206     TMP2            .req r1
    207     TMP3            .req r2
    208     TMP4            .req ip
    209 
    210     ROW0L           .req d16
    211     ROW0R           .req d17
    212     ROW1L           .req d18
    213     ROW1R           .req d19
    214     ROW2L           .req d20
    215     ROW2R           .req d21
    216     ROW3L           .req d22
    217     ROW3R           .req d23
    218     ROW4L           .req d24
    219     ROW4R           .req d25
    220     ROW5L           .req d26
    221     ROW5R           .req d27
    222     ROW6L           .req d28
    223     ROW6R           .req d29
    224     ROW7L           .req d30
    225     ROW7R           .req d31
    226 
    227     /* Load and dequantize coefficients into NEON registers
    228      * with the following allocation:
    229      *       0 1 2 3 | 4 5 6 7
    230      *      ---------+--------
    231      *   0 | d16     | d17     ( q8  )
    232      *   1 | d18     | d19     ( q9  )
    233      *   2 | d20     | d21     ( q10 )
    234      *   3 | d22     | d23     ( q11 )
    235      *   4 | d24     | d25     ( q12 )
    236      *   5 | d26     | d27     ( q13 )
    237      *   6 | d28     | d29     ( q14 )
    238      *   7 | d30     | d31     ( q15 )
    239      */
    240     adr             ip, jsimd_idct_islow_neon_consts
    241     vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    242     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    243     vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    244     vmul.s16        q8, q8, q0
    245     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    246     vmul.s16        q9, q9, q1
    247     vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    248     vmul.s16        q10, q10, q2
    249     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    250     vmul.s16        q11, q11, q3
    251     vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    252     vmul.s16        q12, q12, q0
    253     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    254     vmul.s16        q14, q14, q2
    255     vmul.s16        q13, q13, q1
    256     vld1.16         {d0, d1, d2, d3}, [ip, :128] /* load constants */
    257     add             ip, ip, #16
    258     vmul.s16        q15, q15, q3
    259     vpush           {d8-d15} /* save NEON registers */
    260     /* 1-D IDCT, pass 1, left 4x8 half */
    261     vadd.s16        d4,    ROW7L, ROW3L
    262     vadd.s16        d5,    ROW5L, ROW1L
    263     vmull.s16       q6,    d4,    XFIX_1_175875602_MINUS_1_961570560
    264     vmlal.s16       q6,    d5,    XFIX_1_175875602
    265     vmull.s16       q7,    d4,    XFIX_1_175875602
    266       /* Check for the zero coefficients in the right 4x8 half */
    267       push            {r4, r5}
    268     vmlal.s16       q7,    d5,    XFIX_1_175875602_MINUS_0_390180644
    269     vsubl.s16       q3,    ROW0L, ROW4L
    270       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
    271     vmull.s16       q2,    ROW2L, XFIX_0_541196100
    272     vmlal.s16       q2,    ROW6L, XFIX_0_541196100_MINUS_1_847759065
    273       orr             r0,    r4,    r5
    274     vmov            q4,    q6
    275     vmlsl.s16       q6,    ROW5L, XFIX_2_562915447
    276       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
    277     vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    278     vshl.s32        q3,    q3,    #13
    279       orr             r0,    r0,    r4
    280     vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
    281       orr             r0,    r0,    r5
    282     vadd.s32        q1,    q3,    q2
    283       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
    284     vmov            q5,    q7
    285     vadd.s32        q1,    q1,    q6
    286       orr             r0,    r0,    r4
    287     vmlsl.s16       q7,    ROW7L, XFIX_0_899976223
    288       orr             r0,    r0,    r5
    289     vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    290     vrshrn.s32      ROW1L, q1,    #11
    291       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
    292     vsub.s32        q1,    q1,    q6
    293     vmlal.s16       q5,    ROW5L, XFIX_2_053119869_MINUS_2_562915447
    294       orr             r0,    r0,    r4
    295     vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
    296       orr             r0,    r0,    r5
    297     vsub.s32        q1,    q1,    q6
    298     vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
    299       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
    300     vmlal.s16       q6,    ROW6L, XFIX_0_541196100
    301     vsub.s32        q3,    q3,    q2
    302       orr             r0,    r0,    r4
    303     vrshrn.s32      ROW6L, q1,    #11
    304       orr             r0,    r0,    r5
    305     vadd.s32        q1,    q3,    q5
    306       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
    307     vsub.s32        q3,    q3,    q5
    308     vaddl.s16       q5,    ROW0L, ROW4L
    309       orr             r0,    r0,    r4
    310     vrshrn.s32      ROW2L, q1,    #11
    311       orr             r0,    r0,    r5
    312     vrshrn.s32      ROW5L, q3,    #11
    313       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
    314     vshl.s32        q5,    q5,    #13
    315     vmlal.s16       q4,    ROW7L, XFIX_0_298631336_MINUS_0_899976223
    316       orr             r0,    r0,    r4
    317     vadd.s32        q2,    q5,    q6
    318       orrs            r0,    r0,    r5
    319     vsub.s32        q1,    q5,    q6
    320     vadd.s32        q6,    q2,    q7
    321       ldrd            r4,    [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
    322     vsub.s32        q2,    q2,    q7
    323     vadd.s32        q5,    q1,    q4
    324       orr             r0,    r4,    r5
    325     vsub.s32        q3,    q1,    q4
    326       pop             {r4, r5}
    327     vrshrn.s32      ROW7L, q2,    #11
    328     vrshrn.s32      ROW3L, q5,    #11
    329     vrshrn.s32      ROW0L, q6,    #11
    330     vrshrn.s32      ROW4L, q3,    #11
    331 
      beq             3f /* Branch to special handling for the sparse right 4x8 half */
    333 
    334     /* 1-D IDCT, pass 1, right 4x8 half */
    335     vld1.s16        {d2},  [ip, :64]    /* reload constants */
    336     vadd.s16        d10,   ROW7R, ROW3R
    337     vadd.s16        d8,    ROW5R, ROW1R
    338       /* Transpose left 4x8 half */
    339       vtrn.16         ROW6L, ROW7L
    340     vmull.s16       q6,    d10,   XFIX_1_175875602_MINUS_1_961570560
    341     vmlal.s16       q6,    d8,    XFIX_1_175875602
    342       vtrn.16         ROW2L, ROW3L
    343     vmull.s16       q7,    d10,   XFIX_1_175875602
    344     vmlal.s16       q7,    d8,    XFIX_1_175875602_MINUS_0_390180644
    345       vtrn.16         ROW0L, ROW1L
    346     vsubl.s16       q3,    ROW0R, ROW4R
    347     vmull.s16       q2,    ROW2R, XFIX_0_541196100
    348     vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
    349       vtrn.16         ROW4L, ROW5L
    350     vmov            q4,    q6
    351     vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
    352     vmlal.s16       q6,    ROW3R, XFIX_3_072711026_MINUS_2_562915447
    353       vtrn.32         ROW1L, ROW3L
    354     vshl.s32        q3,    q3,    #13
    355     vmlsl.s16       q4,    ROW1R, XFIX_0_899976223
    356       vtrn.32         ROW4L, ROW6L
    357     vadd.s32        q1,    q3,    q2
    358     vmov            q5,    q7
    359     vadd.s32        q1,    q1,    q6
    360       vtrn.32         ROW0L, ROW2L
    361     vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
    362     vmlal.s16       q7,    ROW1R, XFIX_1_501321110_MINUS_0_899976223
    363     vrshrn.s32      ROW1R, q1,    #11
    364       vtrn.32         ROW5L, ROW7L
    365     vsub.s32        q1,    q1,    q6
    366     vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
    367     vmlsl.s16       q5,    ROW3R, XFIX_2_562915447
    368     vsub.s32        q1,    q1,    q6
    369     vmull.s16       q6,    ROW2R, XFIX_0_541196100_PLUS_0_765366865
    370     vmlal.s16       q6,    ROW6R, XFIX_0_541196100
    371     vsub.s32        q3,    q3,    q2
    372     vrshrn.s32      ROW6R, q1,    #11
    373     vadd.s32        q1,    q3,    q5
    374     vsub.s32        q3,    q3,    q5
    375     vaddl.s16       q5,    ROW0R, ROW4R
    376     vrshrn.s32      ROW2R, q1,    #11
    377     vrshrn.s32      ROW5R, q3,    #11
    378     vshl.s32        q5,    q5,    #13
    379     vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
    380     vadd.s32        q2,    q5,    q6
    381     vsub.s32        q1,    q5,    q6
    382     vadd.s32        q6,    q2,    q7
    383     vsub.s32        q2,    q2,    q7
    384     vadd.s32        q5,    q1,    q4
    385     vsub.s32        q3,    q1,    q4
    386     vrshrn.s32      ROW7R, q2,    #11
    387     vrshrn.s32      ROW3R, q5,    #11
    388     vrshrn.s32      ROW0R, q6,    #11
    389     vrshrn.s32      ROW4R, q3,    #11
    390     /* Transpose right 4x8 half */
    391     vtrn.16         ROW6R, ROW7R
    392     vtrn.16         ROW2R, ROW3R
    393     vtrn.16         ROW0R, ROW1R
    394     vtrn.16         ROW4R, ROW5R
    395     vtrn.32         ROW1R, ROW3R
    396     vtrn.32         ROW4R, ROW6R
    397     vtrn.32         ROW0R, ROW2R
    398     vtrn.32         ROW5R, ROW7R
    399 
    400 1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
    401     vld1.s16        {d2},  [ip, :64]    /* reload constants */
    402     vmull.s16       q6,    ROW1R, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    403     vmlal.s16       q6,    ROW1L, XFIX_1_175875602
    404     vmlal.s16       q6,    ROW3R, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    405     vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
    406     vmull.s16       q7,    ROW3R, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    407     vmlal.s16       q7,    ROW3L, XFIX_1_175875602
    408     vmlal.s16       q7,    ROW1R, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    409     vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
    410     vsubl.s16       q3,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
    411     vmull.s16       q2,    ROW2L, XFIX_0_541196100
    412     vmlal.s16       q2,    ROW2R, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L <-> ROW2R */
    413     vmov            q4,    q6
    414     vmlsl.s16       q6,    ROW1R, XFIX_2_562915447 /* ROW5L <-> ROW1R */
    415     vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    416     vshl.s32        q3,    q3,    #13
    417     vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
    418     vadd.s32        q1,    q3,    q2
    419     vmov            q5,    q7
    420     vadd.s32        q1,    q1,    q6
    421     vmlsl.s16       q7,    ROW3R, XFIX_0_899976223 /* ROW7L <-> ROW3R */
    422     vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    423     vshrn.s32       ROW1L, q1,    #16
    424     vsub.s32        q1,    q1,    q6
    425     vmlal.s16       q5,    ROW1R, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L <-> ROW1R */
    426     vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
    427     vsub.s32        q1,    q1,    q6
    428     vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
    429     vmlal.s16       q6,    ROW2R, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    430     vsub.s32        q3,    q3,    q2
    431     vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
    432     vadd.s32        q1,    q3,    q5
    433     vsub.s32        q3,    q3,    q5
    434     vaddl.s16       q5,    ROW0L, ROW0R /* ROW4L <-> ROW0R */
    435     vshrn.s32       ROW2L, q1,    #16
    436     vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
    437     vshl.s32        q5,    q5,    #13
    438     vmlal.s16       q4,    ROW3R, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L <-> ROW3R */
    439     vadd.s32        q2,    q5,    q6
    440     vsub.s32        q1,    q5,    q6
    441     vadd.s32        q6,    q2,    q7
    442     vsub.s32        q2,    q2,    q7
    443     vadd.s32        q5,    q1,    q4
    444     vsub.s32        q3,    q1,    q4
    445     vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
    446     vshrn.s32       ROW3L, q5,    #16
    447     vshrn.s32       ROW0L, q6,    #16
    448     vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
    449     /* 1-D IDCT, pass 2, right 4x8 half */
    450     vld1.s16        {d2},  [ip, :64]    /* reload constants */
    451     vmull.s16       q6,    ROW5R, XFIX_1_175875602
    452     vmlal.s16       q6,    ROW5L, XFIX_1_175875602 /* ROW5L <-> ROW1R */
    453     vmlal.s16       q6,    ROW7R, XFIX_1_175875602_MINUS_1_961570560
    454     vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L <-> ROW3R */
    455     vmull.s16       q7,    ROW7R, XFIX_1_175875602
    456     vmlal.s16       q7,    ROW7L, XFIX_1_175875602 /* ROW7L <-> ROW3R */
    457     vmlal.s16       q7,    ROW5R, XFIX_1_175875602_MINUS_0_390180644
    458     vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L <-> ROW1R */
    459     vsubl.s16       q3,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
    460     vmull.s16       q2,    ROW6L, XFIX_0_541196100 /* ROW6L <-> ROW2R */
    461     vmlal.s16       q2,    ROW6R, XFIX_0_541196100_MINUS_1_847759065
    462     vmov            q4,    q6
    463     vmlsl.s16       q6,    ROW5R, XFIX_2_562915447
    464     vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L <-> ROW3R */
    465     vshl.s32        q3,    q3,    #13
    466     vmlsl.s16       q4,    ROW5L, XFIX_0_899976223 /* ROW5L <-> ROW1R */
    467     vadd.s32        q1,    q3,    q2
    468     vmov            q5,    q7
    469     vadd.s32        q1,    q1,    q6
    470     vmlsl.s16       q7,    ROW7R, XFIX_0_899976223
    471     vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L <-> ROW1R */
    472     vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
    473     vsub.s32        q1,    q1,    q6
    474     vmlal.s16       q5,    ROW5R, XFIX_2_053119869_MINUS_2_562915447
    475     vmlsl.s16       q5,    ROW7L, XFIX_2_562915447 /* ROW7L <-> ROW3R */
    476     vsub.s32        q1,    q1,    q6
    477     vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L <-> ROW2R */
    478     vmlal.s16       q6,    ROW6R, XFIX_0_541196100
    479     vsub.s32        q3,    q3,    q2
    480     vshrn.s32       ROW6R, q1,    #16
    481     vadd.s32        q1,    q3,    q5
    482     vsub.s32        q3,    q3,    q5
    483     vaddl.s16       q5,    ROW4L, ROW4R /* ROW4L <-> ROW0R */
    484     vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
    485     vshrn.s32       ROW5R, q3,    #16
    486     vshl.s32        q5,    q5,    #13
    487     vmlal.s16       q4,    ROW7R, XFIX_0_298631336_MINUS_0_899976223
    488     vadd.s32        q2,    q5,    q6
    489     vsub.s32        q1,    q5,    q6
    490     vadd.s32        q6,    q2,    q7
    491     vsub.s32        q2,    q2,    q7
    492     vadd.s32        q5,    q1,    q4
    493     vsub.s32        q3,    q1,    q4
    494     vshrn.s32       ROW7R, q2,    #16
    495     vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
    496     vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
    497     vshrn.s32       ROW4R, q3,    #16
    498 
    499 2:  /* Descale to 8-bit and range limit */
    500     vqrshrn.s16     d16,   q8,    #2
    501     vqrshrn.s16     d17,   q9,    #2
    502     vqrshrn.s16     d18,   q10,   #2
    503     vqrshrn.s16     d19,   q11,   #2
    504     vpop            {d8-d15} /* restore NEON registers */
    505     vqrshrn.s16     d20,   q12,   #2
    506       /* Transpose the final 8-bit samples and do signed->unsigned conversion */
    507       vtrn.16         q8,    q9
    508     vqrshrn.s16     d21,   q13,   #2
    509     vqrshrn.s16     d22,   q14,   #2
    510       vmov.u8         q0,    #(CENTERJSAMPLE)
    511     vqrshrn.s16     d23,   q15,   #2
    512       vtrn.8          d16,   d17
    513       vtrn.8          d18,   d19
    514       vadd.u8         q8,    q8,    q0
    515       vadd.u8         q9,    q9,    q0
    516       vtrn.16         q10,   q11
    517         /* Store results to the output buffer */
    518         ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    519         add             TMP1, TMP1, OUTPUT_COL
    520         add             TMP2, TMP2, OUTPUT_COL
    521         vst1.8          {d16}, [TMP1]
    522       vtrn.8          d20, d21
    523         vst1.8          {d17}, [TMP2]
    524         ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    525         add             TMP1, TMP1, OUTPUT_COL
    526         add             TMP2, TMP2, OUTPUT_COL
    527         vst1.8          {d18}, [TMP1]
    528       vadd.u8         q10,   q10,   q0
    529         vst1.8          {d19}, [TMP2]
    530         ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    531         add             TMP1, TMP1, OUTPUT_COL
    532         add             TMP2, TMP2, OUTPUT_COL
    533         add             TMP3, TMP3, OUTPUT_COL
    534         add             TMP4, TMP4, OUTPUT_COL
    535       vtrn.8          d22, d23
    536         vst1.8          {d20}, [TMP1]
    537       vadd.u8         q11,   q11,   q0
    538         vst1.8          {d21}, [TMP2]
    539         vst1.8          {d22}, [TMP3]
    540         vst1.8          {d23}, [TMP4]
    541     bx              lr
    542 
    543 3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
    544 
    545     /* Transpose left 4x8 half */
    546     vtrn.16         ROW6L, ROW7L
    547     vtrn.16         ROW2L, ROW3L
    548     vtrn.16         ROW0L, ROW1L
    549     vtrn.16         ROW4L, ROW5L
    550     vshl.s16        ROW0R, ROW0R, #2 /* PASS1_BITS */
    551     vtrn.32         ROW1L, ROW3L
    552     vtrn.32         ROW4L, ROW6L
    553     vtrn.32         ROW0L, ROW2L
    554     vtrn.32         ROW5L, ROW7L
    555 
    556     cmp             r0, #0
    557     beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
    558 
    559     /* Only row 0 is non-zero for the right 4x8 half  */
    560     vdup.s16        ROW1R, ROW0R[1]
    561     vdup.s16        ROW2R, ROW0R[2]
    562     vdup.s16        ROW3R, ROW0R[3]
    563     vdup.s16        ROW4R, ROW0R[0]
    564     vdup.s16        ROW5R, ROW0R[1]
    565     vdup.s16        ROW6R, ROW0R[2]
    566     vdup.s16        ROW7R, ROW0R[3]
    567     vdup.s16        ROW0R, ROW0R[0]
    568     b               1b /* Go to 'normal' second pass */
    569 
    570 4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
    571     vld1.s16        {d2},  [ip, :64]    /* reload constants */
    572     vmull.s16       q6,    ROW1L, XFIX_1_175875602
    573     vmlal.s16       q6,    ROW3L, XFIX_1_175875602_MINUS_1_961570560
    574     vmull.s16       q7,    ROW3L, XFIX_1_175875602
    575     vmlal.s16       q7,    ROW1L, XFIX_1_175875602_MINUS_0_390180644
    576     vmull.s16       q2,    ROW2L, XFIX_0_541196100
    577     vshll.s16       q3,    ROW0L, #13
    578     vmov            q4,    q6
    579     vmlal.s16       q6,    ROW3L, XFIX_3_072711026_MINUS_2_562915447
    580     vmlsl.s16       q4,    ROW1L, XFIX_0_899976223
    581     vadd.s32        q1,    q3,    q2
    582     vmov            q5,    q7
    583     vmlal.s16       q7,    ROW1L, XFIX_1_501321110_MINUS_0_899976223
    584     vadd.s32        q1,    q1,    q6
    585     vadd.s32        q6,    q6,    q6
    586     vmlsl.s16       q5,    ROW3L, XFIX_2_562915447
    587     vshrn.s32       ROW1L, q1,    #16
    588     vsub.s32        q1,    q1,    q6
    589     vmull.s16       q6,    ROW2L, XFIX_0_541196100_PLUS_0_765366865
    590     vsub.s32        q3,    q3,    q2
    591     vshrn.s32       ROW2R, q1,    #16 /* ROW6L <-> ROW2R */
    592     vadd.s32        q1,    q3,    q5
    593     vsub.s32        q3,    q3,    q5
    594     vshll.s16       q5,    ROW0L, #13
    595     vshrn.s32       ROW2L, q1,    #16
    596     vshrn.s32       ROW1R, q3,    #16 /* ROW5L <-> ROW1R */
    597     vadd.s32        q2,    q5,    q6
    598     vsub.s32        q1,    q5,    q6
    599     vadd.s32        q6,    q2,    q7
    600     vsub.s32        q2,    q2,    q7
    601     vadd.s32        q5,    q1,    q4
    602     vsub.s32        q3,    q1,    q4
    603     vshrn.s32       ROW3R, q2,    #16 /* ROW7L <-> ROW3R */
    604     vshrn.s32       ROW3L, q5,    #16
    605     vshrn.s32       ROW0L, q6,    #16
    606     vshrn.s32       ROW0R, q3,    #16 /* ROW4L <-> ROW0R */
    607     /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
    608     vld1.s16        {d2},  [ip, :64]    /* reload constants */
    609     vmull.s16       q6,    ROW5L, XFIX_1_175875602
    610     vmlal.s16       q6,    ROW7L, XFIX_1_175875602_MINUS_1_961570560
    611     vmull.s16       q7,    ROW7L, XFIX_1_175875602
    612     vmlal.s16       q7,    ROW5L, XFIX_1_175875602_MINUS_0_390180644
    613     vmull.s16       q2,    ROW6L, XFIX_0_541196100
    614     vshll.s16       q3,    ROW4L, #13
    615     vmov            q4,    q6
    616     vmlal.s16       q6,    ROW7L, XFIX_3_072711026_MINUS_2_562915447
    617     vmlsl.s16       q4,    ROW5L, XFIX_0_899976223
    618     vadd.s32        q1,    q3,    q2
    619     vmov            q5,    q7
    620     vmlal.s16       q7,    ROW5L, XFIX_1_501321110_MINUS_0_899976223
    621     vadd.s32        q1,    q1,    q6
    622     vadd.s32        q6,    q6,    q6
    623     vmlsl.s16       q5,    ROW7L, XFIX_2_562915447
    624     vshrn.s32       ROW5L, q1,    #16 /* ROW5L <-> ROW1R */
    625     vsub.s32        q1,    q1,    q6
    626     vmull.s16       q6,    ROW6L, XFIX_0_541196100_PLUS_0_765366865
    627     vsub.s32        q3,    q3,    q2
    628     vshrn.s32       ROW6R, q1,    #16
    629     vadd.s32        q1,    q3,    q5
    630     vsub.s32        q3,    q3,    q5
    631     vshll.s16       q5,    ROW4L, #13
    632     vshrn.s32       ROW6L, q1,    #16 /* ROW6L <-> ROW2R */
    633     vshrn.s32       ROW5R, q3,    #16
    634     vadd.s32        q2,    q5,    q6
    635     vsub.s32        q1,    q5,    q6
    636     vadd.s32        q6,    q2,    q7
    637     vsub.s32        q2,    q2,    q7
    638     vadd.s32        q5,    q1,    q4
    639     vsub.s32        q3,    q1,    q4
    640     vshrn.s32       ROW7R, q2,    #16
    641     vshrn.s32       ROW7L, q5,    #16 /* ROW7L <-> ROW3R */
    642     vshrn.s32       ROW4L, q6,    #16 /* ROW4L <-> ROW0R */
    643     vshrn.s32       ROW4R, q3,    #16
    644     b               2b /* Go to epilogue */
    645 
    646     .unreq          DCT_TABLE
    647     .unreq          COEF_BLOCK
    648     .unreq          OUTPUT_BUF
    649     .unreq          OUTPUT_COL
    650     .unreq          TMP1
    651     .unreq          TMP2
    652     .unreq          TMP3
    653     .unreq          TMP4
    654 
    655     .unreq          ROW0L
    656     .unreq          ROW0R
    657     .unreq          ROW1L
    658     .unreq          ROW1R
    659     .unreq          ROW2L
    660     .unreq          ROW2R
    661     .unreq          ROW3L
    662     .unreq          ROW3R
    663     .unreq          ROW4L
    664     .unreq          ROW4R
    665     .unreq          ROW5L
    666     .unreq          ROW5R
    667     .unreq          ROW6L
    668     .unreq          ROW6R
    669     .unreq          ROW7L
    670     .unreq          ROW7R
    671 
    672 
    673 /*****************************************************************************/
    674 
/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, but not so accurate, integer implementation
 * of the inverse DCT (Discrete Cosine Transform). It uses the same
 * calculations and produces exactly the same output as IJG's original
 * 'jpeg_idct_ifast' function from jidctfst.c.
 *
 * Normally a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */
    690 
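/* Illustrative sketch of the trick described above: VQDMULH.S16 computes
 * (2 * a * b) >> 16, i.e. a saturating multiply by a Q15 fraction, so only
 * the fractional part of each constant can be encoded and the integer part
 * is added back with ordinary additions.  For example, in pass 1 below:
 *
 *     vqdmulh.s16     q4,  q5,  XFIX_1_082392200    (q4 ~= x * 0.0823922)
 *     vadd.s16        q9,  q5,  q4                  (q9 ~= x * 1.0823922)
 */
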
    691 #define XFIX_1_082392200 d0[0]
    692 #define XFIX_1_414213562 d0[1]
    693 #define XFIX_1_847759065 d0[2]
    694 #define XFIX_2_613125930 d0[3]
    695 
    696 .balign 16
    697 jsimd_idct_ifast_neon_consts:
    698     .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
    699     .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
    700     .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
    701     .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
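
/* The expressions above encode only the fractional parts, in Q15, of the
 * 8-bit-precision constants used by jidctfst.c: e.g. 277/256 ~= 1.082392200
 * and (277 - 256) * 128 = 2688 = 0.08203125 * 32768; likewise 669/256 ~=
 * 2.613125930, whose integer part (2) is re-added with VADD in the code.
 */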
    702 
    703 asm_function jsimd_idct_ifast_neon
    704 
    705     DCT_TABLE       .req r0
    706     COEF_BLOCK      .req r1
    707     OUTPUT_BUF      .req r2
    708     OUTPUT_COL      .req r3
    709     TMP1            .req r0
    710     TMP2            .req r1
    711     TMP3            .req r2
    712     TMP4            .req ip
    713 
    714     /* Load and dequantize coefficients into NEON registers
    715      * with the following allocation:
    716      *       0 1 2 3 | 4 5 6 7
    717      *      ---------+--------
    718      *   0 | d16     | d17     ( q8  )
    719      *   1 | d18     | d19     ( q9  )
    720      *   2 | d20     | d21     ( q10 )
    721      *   3 | d22     | d23     ( q11 )
    722      *   4 | d24     | d25     ( q12 )
    723      *   5 | d26     | d27     ( q13 )
    724      *   6 | d28     | d29     ( q14 )
    725      *   7 | d30     | d31     ( q15 )
    726      */
    727     adr             ip, jsimd_idct_ifast_neon_consts
    728     vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
    729     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    730     vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
    731     vmul.s16        q8,  q8,  q0
    732     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    733     vmul.s16        q9,  q9,  q1
    734     vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
    735     vmul.s16        q10, q10, q2
    736     vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
    737     vmul.s16        q11, q11, q3
    738     vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
    739     vmul.s16        q12, q12, q0
    740     vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
    741     vmul.s16        q14, q14, q2
    742     vmul.s16        q13, q13, q1
    743     vld1.16         {d0}, [ip, :64] /* load constants */
    744     vmul.s16        q15, q15, q3
    745     vpush           {d8-d13}        /* save NEON registers */
    746     /* 1-D IDCT, pass 1 */
    747     vsub.s16        q2,  q10, q14
    748     vadd.s16        q14, q10, q14
    749     vsub.s16        q1,  q11, q13
    750     vadd.s16        q13, q11, q13
    751     vsub.s16        q5,  q9,  q15
    752     vadd.s16        q15, q9,  q15
    753     vqdmulh.s16     q4,  q2,  XFIX_1_414213562
    754     vqdmulh.s16     q6,  q1,  XFIX_2_613125930
    755     vadd.s16        q3,  q1,  q1
    756     vsub.s16        q1,  q5,  q1
    757     vadd.s16        q10, q2,  q4
    758     vqdmulh.s16     q4,  q1,  XFIX_1_847759065
    759     vsub.s16        q2,  q15, q13
    760     vadd.s16        q3,  q3,  q6
    761     vqdmulh.s16     q6,  q2,  XFIX_1_414213562
    762     vadd.s16        q1,  q1,  q4
    763     vqdmulh.s16     q4,  q5,  XFIX_1_082392200
    764     vsub.s16        q10, q10, q14
    765     vadd.s16        q2,  q2,  q6
    766     vsub.s16        q6,  q8,  q12
    767     vadd.s16        q12, q8,  q12
    768     vadd.s16        q9,  q5,  q4
    769     vadd.s16        q5,  q6,  q10
    770     vsub.s16        q10, q6,  q10
    771     vadd.s16        q6,  q15, q13
    772     vadd.s16        q8,  q12, q14
    773     vsub.s16        q3,  q6,  q3
    774     vsub.s16        q12, q12, q14
    775     vsub.s16        q3,  q3,  q1
    776     vsub.s16        q1,  q9,  q1
    777     vadd.s16        q2,  q3,  q2
    778     vsub.s16        q15, q8,  q6
    779     vadd.s16        q1,  q1,  q2
    780     vadd.s16        q8,  q8,  q6
    781     vadd.s16        q14, q5,  q3
    782     vsub.s16        q9,  q5,  q3
    783     vsub.s16        q13, q10, q2
    784     vadd.s16        q10, q10, q2
    785       /* Transpose */
    786       vtrn.16         q8,  q9
    787     vsub.s16        q11, q12, q1
    788       vtrn.16         q14, q15
    789     vadd.s16        q12, q12, q1
    790       vtrn.16         q10, q11
    791       vtrn.16         q12, q13
    792       vtrn.32         q9,  q11
    793       vtrn.32         q12, q14
    794       vtrn.32         q8,  q10
    795       vtrn.32         q13, q15
    796       vswp            d28, d21
    797       vswp            d26, d19
    798     /* 1-D IDCT, pass 2 */
    799     vsub.s16        q2,  q10, q14
    800       vswp            d30, d23
    801     vadd.s16        q14, q10, q14
    802       vswp            d24, d17
    803     vsub.s16        q1,  q11, q13
    804     vadd.s16        q13, q11, q13
    805     vsub.s16        q5,  q9,  q15
    806     vadd.s16        q15, q9,  q15
    807     vqdmulh.s16     q4,  q2,  XFIX_1_414213562
    808     vqdmulh.s16     q6,  q1,  XFIX_2_613125930
    809     vadd.s16        q3,  q1,  q1
    810     vsub.s16        q1,  q5,  q1
    811     vadd.s16        q10, q2,  q4
    812     vqdmulh.s16     q4,  q1,  XFIX_1_847759065
    813     vsub.s16        q2,  q15, q13
    814     vadd.s16        q3,  q3,  q6
    815     vqdmulh.s16     q6,  q2,  XFIX_1_414213562
    816     vadd.s16        q1,  q1,  q4
    817     vqdmulh.s16     q4,  q5,  XFIX_1_082392200
    818     vsub.s16        q10, q10, q14
    819     vadd.s16        q2,  q2,  q6
    820     vsub.s16        q6,  q8,  q12
    821     vadd.s16        q12, q8,  q12
    822     vadd.s16        q9,  q5,  q4
    823     vadd.s16        q5,  q6,  q10
    824     vsub.s16        q10, q6,  q10
    825     vadd.s16        q6,  q15, q13
    826     vadd.s16        q8,  q12, q14
    827     vsub.s16        q3,  q6,  q3
    828     vsub.s16        q12, q12, q14
    829     vsub.s16        q3,  q3,  q1
    830     vsub.s16        q1,  q9,  q1
    831     vadd.s16        q2,  q3,  q2
    832     vsub.s16        q15, q8,  q6
    833     vadd.s16        q1,  q1,  q2
    834     vadd.s16        q8,  q8,  q6
    835     vadd.s16        q14, q5,  q3
    836     vsub.s16        q9,  q5,  q3
    837     vsub.s16        q13, q10, q2
    838     vpop            {d8-d13}        /* restore NEON registers */
    839     vadd.s16        q10, q10, q2
    840     vsub.s16        q11, q12, q1
    841     vadd.s16        q12, q12, q1
    842     /* Descale to 8-bit and range limit */
    843     vmov.u8         q0,  #0x80
    844     vqshrn.s16      d16, q8,  #5
    845     vqshrn.s16      d17, q9,  #5
    846     vqshrn.s16      d18, q10, #5
    847     vqshrn.s16      d19, q11, #5
    848     vqshrn.s16      d20, q12, #5
    849     vqshrn.s16      d21, q13, #5
    850     vqshrn.s16      d22, q14, #5
    851     vqshrn.s16      d23, q15, #5
    852     vadd.u8         q8,  q8,  q0
    853     vadd.u8         q9,  q9,  q0
    854     vadd.u8         q10, q10, q0
    855     vadd.u8         q11, q11, q0
    856     /* Transpose the final 8-bit samples */
    857     vtrn.16         q8,  q9
    858     vtrn.16         q10, q11
    859     vtrn.32         q8,  q10
    860     vtrn.32         q9,  q11
    861     vtrn.8          d16, d17
    862     vtrn.8          d18, d19
    863       /* Store results to the output buffer */
    864       ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    865       add             TMP1, TMP1, OUTPUT_COL
    866       add             TMP2, TMP2, OUTPUT_COL
    867       vst1.8          {d16}, [TMP1]
    868       vst1.8          {d17}, [TMP2]
    869       ldmia           OUTPUT_BUF!, {TMP1, TMP2}
    870       add             TMP1, TMP1, OUTPUT_COL
    871       add             TMP2, TMP2, OUTPUT_COL
    872       vst1.8          {d18}, [TMP1]
    873     vtrn.8          d20, d21
    874       vst1.8          {d19}, [TMP2]
    875       ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
    876       add             TMP1, TMP1, OUTPUT_COL
    877       add             TMP2, TMP2, OUTPUT_COL
    878       add             TMP3, TMP3, OUTPUT_COL
    879       add             TMP4, TMP4, OUTPUT_COL
    880       vst1.8          {d20}, [TMP1]
    881     vtrn.8          d22, d23
    882       vst1.8          {d21}, [TMP2]
    883       vst1.8          {d22}, [TMP3]
    884       vst1.8          {d23}, [TMP4]
    885     bx              lr
    886 
    887     .unreq          DCT_TABLE
    888     .unreq          COEF_BLOCK
    889     .unreq          OUTPUT_BUF
    890     .unreq          OUTPUT_COL
    891     .unreq          TMP1
    892     .unreq          TMP2
    893     .unreq          TMP3
    894     .unreq          TMP4
    895 
    896 
    897 /*****************************************************************************/
    898 
/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling could be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability would suffer somewhat.
 */
    916 
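/* The shift amounts passed to idct_helper below (12 for pass 1, 19 for
 * pass 2) correspond to the DESCALE amounts used by jpeg_idct_4x4, i.e.
 * CONST_BITS - PASS1_BITS + 1 and CONST_BITS + PASS1_BITS + 3 + 1 with
 * PASS1_BITS = 2.
 */
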
    917 #define CONST_BITS  13
    918 
    919 #define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
    920 #define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
    921 #define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
    922 #define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
    923 #define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
    924 #define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
    925 #define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
    926 #define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
    927 #define FIX_1_272758580  (10426) /* FIX(1.272758580) */
    928 #define FIX_1_451774981  (11893) /* FIX(1.451774981) */
    929 #define FIX_1_847759065  (15137) /* FIX(1.847759065) */
    930 #define FIX_2_172734803  (17799) /* FIX(2.172734803) */
    931 #define FIX_2_562915447  (20995) /* FIX(2.562915447) */
    932 #define FIX_3_624509785  (29692) /* FIX(3.624509785) */
    933 
    934 .balign 16
    935 jsimd_idct_4x4_neon_consts:
    936     .short     FIX_1_847759065     /* d0[0] */
    937     .short     -FIX_0_765366865    /* d0[1] */
    938     .short     -FIX_0_211164243    /* d0[2] */
    939     .short     FIX_1_451774981     /* d0[3] */
    940     .short     -FIX_2_172734803    /* d1[0] */
    941     .short     FIX_1_061594337     /* d1[1] */
    942     .short     -FIX_0_509795579    /* d1[2] */
    943     .short     -FIX_0_601344887    /* d1[3] */
    944     .short     FIX_0_899976223     /* d2[0] */
    945     .short     FIX_2_562915447     /* d2[1] */
    946     .short     1 << (CONST_BITS+1) /* d2[2] */
    947     .short     0                   /* d2[3] */
    948 
    949 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    950     vmull.s16       q14, \x4,  d2[2]
    951     vmlal.s16       q14, \x8,  d0[0]
    952     vmlal.s16       q14, \x14, d0[1]
    953 
    954     vmull.s16       q13, \x16, d1[2]
    955     vmlal.s16       q13, \x12, d1[3]
    956     vmlal.s16       q13, \x10, d2[0]
    957     vmlal.s16       q13, \x6,  d2[1]
    958 
    959     vmull.s16       q15, \x4,  d2[2]
    960     vmlsl.s16       q15, \x8,  d0[0]
    961     vmlsl.s16       q15, \x14, d0[1]
    962 
    963     vmull.s16       q12, \x16, d0[2]
    964     vmlal.s16       q12, \x12, d0[3]
    965     vmlal.s16       q12, \x10, d1[0]
    966     vmlal.s16       q12, \x6,  d1[1]
    967 
    968     vadd.s32        q10, q14, q13
    969     vsub.s32        q14, q14, q13
    970 
    971 .if \shift > 16
    972     vrshr.s32       q10,  q10, #\shift
    973     vrshr.s32       q14,  q14, #\shift
    974     vmovn.s32       \y26, q10
    975     vmovn.s32       \y29, q14
    976 .else
    977     vrshrn.s32      \y26, q10, #\shift
    978     vrshrn.s32      \y29, q14, #\shift
    979 .endif
    980 
    981     vadd.s32        q10, q15, q12
    982     vsub.s32        q15, q15, q12
    983 
    984 .if \shift > 16
    985     vrshr.s32       q10,  q10, #\shift
    986     vrshr.s32       q15,  q15, #\shift
    987     vmovn.s32       \y27, q10
    988     vmovn.s32       \y28, q15
    989 .else
    990     vrshrn.s32      \y27, q10, #\shift
    991     vrshrn.s32      \y28, q15, #\shift
    992 .endif
    993 
    994 .endm
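
/* Note on the '.if \shift > 16' branches above: the immediate of a narrowing
 * VRSHRN.S32 is limited to the range 1..16, so the 19-bit descale of pass 2
 * has to be performed as VRSHR followed by VMOVN rather than as a single
 * narrowing shift.
 */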
    995 
    996 asm_function jsimd_idct_4x4_neon
    997 
    998     DCT_TABLE       .req r0
    999     COEF_BLOCK      .req r1
   1000     OUTPUT_BUF      .req r2
   1001     OUTPUT_COL      .req r3
   1002     TMP1            .req r0
   1003     TMP2            .req r1
   1004     TMP3            .req r2
   1005     TMP4            .req ip
   1006 
   1007     vpush           {d8-d15}
   1008 
   1009     /* Load constants (d3 is just used for padding) */
   1010     adr             TMP4, jsimd_idct_4x4_neon_consts
   1011     vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
   1012 
   1013     /* Load all COEF_BLOCK into NEON registers with the following allocation:
   1014      *       0 1 2 3 | 4 5 6 7
   1015      *      ---------+--------
   1016      *   0 | d4      | d5
   1017      *   1 | d6      | d7
   1018      *   2 | d8      | d9
   1019      *   3 | d10     | d11
   1020      *   4 | -       | -
   1021      *   5 | d12     | d13
   1022      *   6 | d14     | d15
   1023      *   7 | d16     | d17
   1024      */
   1025     vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
   1026     vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
   1027     add COEF_BLOCK, COEF_BLOCK, #16
   1028     vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
   1029     vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
   1030     /* dequantize */
   1031     vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
   1032     vmul.s16        q2, q2, q9
   1033     vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
   1034     vmul.s16        q3, q3, q10
   1035     vmul.s16        q4, q4, q11
   1036     add             DCT_TABLE, DCT_TABLE, #16
   1037     vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
   1038     vmul.s16        q5, q5, q12
   1039     vmul.s16        q6, q6, q13
   1040     vld1.16         {d30, d31}, [DCT_TABLE, :128]!
   1041     vmul.s16        q7, q7, q14
   1042     vmul.s16        q8, q8, q15
   1043 
   1044     /* Pass 1 */
   1045     idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
   1046     transpose_4x4   d4, d6, d8, d10
   1047     idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
   1048     transpose_4x4   d5, d7, d9, d11
   1049 
   1050     /* Pass 2 */
   1051     idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
   1052     transpose_4x4   d26, d27, d28, d29
   1053 
   1054     /* Range limit */
   1055     vmov.u16        q15, #0x80
   1056     vadd.s16        q13, q13, q15
   1057     vadd.s16        q14, q14, q15
   1058     vqmovun.s16     d26, q13
   1059     vqmovun.s16     d27, q14
   1060 
   1061     /* Store results to the output buffer */
   1062     ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
   1063     add             TMP1, TMP1, OUTPUT_COL
   1064     add             TMP2, TMP2, OUTPUT_COL
   1065     add             TMP3, TMP3, OUTPUT_COL
   1066     add             TMP4, TMP4, OUTPUT_COL
   1067 
   1068 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use many fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses.
     */
   1072     vst1.32         {d26[0]}, [TMP1]!
   1073     vst1.32         {d27[0]}, [TMP3]!
   1074     vst1.32         {d26[1]}, [TMP2]!
   1075     vst1.32         {d27[1]}, [TMP4]!
   1076 #else
   1077     vst1.8          {d26[0]}, [TMP1]!
   1078     vst1.8          {d27[0]}, [TMP3]!
   1079     vst1.8          {d26[1]}, [TMP1]!
   1080     vst1.8          {d27[1]}, [TMP3]!
   1081     vst1.8          {d26[2]}, [TMP1]!
   1082     vst1.8          {d27[2]}, [TMP3]!
   1083     vst1.8          {d26[3]}, [TMP1]!
   1084     vst1.8          {d27[3]}, [TMP3]!
   1085 
   1086     vst1.8          {d26[4]}, [TMP2]!
   1087     vst1.8          {d27[4]}, [TMP4]!
   1088     vst1.8          {d26[5]}, [TMP2]!
   1089     vst1.8          {d27[5]}, [TMP4]!
   1090     vst1.8          {d26[6]}, [TMP2]!
   1091     vst1.8          {d27[6]}, [TMP4]!
   1092     vst1.8          {d26[7]}, [TMP2]!
   1093     vst1.8          {d27[7]}, [TMP4]!
   1094 #endif
   1095 
   1096     vpop            {d8-d15}
   1097     bx              lr
   1098 
   1099     .unreq          DCT_TABLE
   1100     .unreq          COEF_BLOCK
   1101     .unreq          OUTPUT_BUF
   1102     .unreq          OUTPUT_COL
   1103     .unreq          TMP1
   1104     .unreq          TMP2
   1105     .unreq          TMP3
   1106     .unreq          TMP4
   1107 
   1108 .purgem idct_helper
   1109 
   1110 
   1111 /*****************************************************************************/
   1112 
/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse DCT, which
 *       requires far fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */
   1126 
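/* Note: for the 2x2 output only the DC term and the odd-numbered rows and
 * columns (1, 3, 5, 7) of the coefficient block contribute, which is why
 * rows 2, 4 and 6 are never loaded below and why the four constants in d0
 * are sufficient for both passes.
 */
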
   1127 .balign 8
   1128 jsimd_idct_2x2_neon_consts:
   1129     .short     -FIX_0_720959822    /* d0[0] */
   1130     .short     FIX_0_850430095     /* d0[1] */
   1131     .short     -FIX_1_272758580    /* d0[2] */
   1132     .short     FIX_3_624509785     /* d0[3] */
   1133 
   1134 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
   1135     vshll.s16  q14,  \x4,  #15
   1136     vmull.s16  q13,  \x6,  d0[3]
   1137     vmlal.s16  q13,  \x10, d0[2]
   1138     vmlal.s16  q13,  \x12, d0[1]
   1139     vmlal.s16  q13,  \x16, d0[0]
   1140 
   1141     vadd.s32   q10,  q14,  q13
   1142     vsub.s32   q14,  q14,  q13
   1143 
   1144 .if \shift > 16
   1145     vrshr.s32  q10,  q10,  #\shift
   1146     vrshr.s32  q14,  q14,  #\shift
   1147     vmovn.s32  \y26, q10
   1148     vmovn.s32  \y27, q14
   1149 .else
   1150     vrshrn.s32 \y26, q10,  #\shift
   1151     vrshrn.s32 \y27, q14,  #\shift
   1152 .endif
   1153 
   1154 .endm
   1155 
   1156 asm_function jsimd_idct_2x2_neon
   1157 
   1158     DCT_TABLE       .req r0
   1159     COEF_BLOCK      .req r1
   1160     OUTPUT_BUF      .req r2
   1161     OUTPUT_COL      .req r3
   1162     TMP1            .req r0
   1163     TMP2            .req ip
   1164 
   1165     vpush           {d8-d15}
   1166 
   1167     /* Load constants */
   1168     adr             TMP2, jsimd_idct_2x2_neon_consts
   1169     vld1.16         {d0}, [TMP2, :64]
   1170 
   1171     /* Load all COEF_BLOCK into NEON registers with the following allocation:
   1172      *       0 1 2 3 | 4 5 6 7
   1173      *      ---------+--------
   1174      *   0 | d4      | d5
   1175      *   1 | d6      | d7
   1176      *   2 | -       | -
   1177      *   3 | d10     | d11
   1178      *   4 | -       | -
   1179      *   5 | d12     | d13
   1180      *   6 | -       | -
   1181      *   7 | d16     | d17
   1182      */
   1183     vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
   1184     add             COEF_BLOCK, COEF_BLOCK, #16
   1185     vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
   1186     add             COEF_BLOCK, COEF_BLOCK, #16
   1187     vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
   1188     add             COEF_BLOCK, COEF_BLOCK, #16
   1189     vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
   1190     /* Dequantize */
   1191     vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
   1192     vmul.s16        q2, q2, q9
   1193     vmul.s16        q3, q3, q10
   1194     add             DCT_TABLE, DCT_TABLE, #16
   1195     vld1.16         {d24, d25}, [DCT_TABLE, :128]!
   1196     vmul.s16        q5, q5, q12
   1197     add             DCT_TABLE, DCT_TABLE, #16
   1198     vld1.16         {d26, d27}, [DCT_TABLE, :128]!
   1199     vmul.s16        q6, q6, q13
   1200     add             DCT_TABLE, DCT_TABLE, #16
   1201     vld1.16         {d30, d31}, [DCT_TABLE, :128]!
   1202     vmul.s16        q8, q8, q15
   1203 
   1204     /* Pass 1 */
   1205 #if 0
   1206     idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
   1207     transpose_4x4   d4, d6, d8,  d10
   1208     idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
   1209     transpose_4x4   d5, d7, d9,  d11
   1210 #else
   1211     vmull.s16       q13, d6,  d0[3]
   1212     vmlal.s16       q13, d10, d0[2]
   1213     vmlal.s16       q13, d12, d0[1]
   1214     vmlal.s16       q13, d16, d0[0]
   1215     vmull.s16       q12, d7,  d0[3]
   1216     vmlal.s16       q12, d11, d0[2]
   1217     vmlal.s16       q12, d13, d0[1]
   1218     vmlal.s16       q12, d17, d0[0]
   1219     vshll.s16       q14, d4,  #15
   1220     vshll.s16       q15, d5,  #15
   1221     vadd.s32        q10, q14, q13
   1222     vsub.s32        q14, q14, q13
   1223     vrshrn.s32      d4,  q10, #13
   1224     vrshrn.s32      d6,  q14, #13
   1225     vadd.s32        q10, q15, q12
   1226     vsub.s32        q14, q15, q12
   1227     vrshrn.s32      d5,  q10, #13
   1228     vrshrn.s32      d7,  q14, #13
   1229     vtrn.16         q2,  q3
   1230     vtrn.32         q3,  q5
   1231 #endif
   1232 
   1233     /* Pass 2 */
   1234     idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
   1235 
   1236     /* Range limit */
   1237     vmov.u16        q15, #0x80
   1238     vadd.s16        q13, q13, q15
   1239     vqmovun.s16     d26, q13
   1240     vqmovun.s16     d27, q13
   1241 
   1242     /* Store results to the output buffer */
   1243     ldmia           OUTPUT_BUF, {TMP1, TMP2}
   1244     add             TMP1, TMP1, OUTPUT_COL
   1245     add             TMP2, TMP2, OUTPUT_COL
   1246 
   1247     vst1.8          {d26[0]}, [TMP1]!
   1248     vst1.8          {d27[4]}, [TMP1]!
   1249     vst1.8          {d26[1]}, [TMP2]!
   1250     vst1.8          {d27[5]}, [TMP2]!
   1251 
   1252     vpop            {d8-d15}
   1253     bx              lr
   1254 
   1255     .unreq          DCT_TABLE
   1256     .unreq          COEF_BLOCK
   1257     .unreq          OUTPUT_BUF
   1258     .unreq          OUTPUT_COL
   1259     .unreq          TMP1
   1260     .unreq          TMP2
   1261 
   1262 .purgem idct_helper
   1263 
   1264 
   1265 /*****************************************************************************/
   1266 
   1267 /*
   1268  * jsimd_ycc_extrgb_convert_neon
   1269  * jsimd_ycc_extbgr_convert_neon
   1270  * jsimd_ycc_extrgbx_convert_neon
   1271  * jsimd_ycc_extbgrx_convert_neon
   1272  * jsimd_ycc_extxbgr_convert_neon
   1273  * jsimd_ycc_extxrgb_convert_neon
   1274  *
   1275  * Colorspace conversion YCbCr -> RGB
   1276  */
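
         /*
          * As a rough scalar model (a sketch, not the exact libjpeg code), each
          * pixel is converted with the Q14/Q15 fixed-point constants loaded below
          * from jsimd_ycc_*_neon_consts (approximating 1.402, 0.344, 0.714, 1.772):
          *
          *   r = clamp(y + ((22971 * (cr - 128) + 8192) >> 14));
          *   g = clamp(y + ((-11277 * (cb - 128) - 23401 * (cr - 128) + 16384) >> 15));
          *   b = clamp(y + ((29033 * (cb - 128) + 8192) >> 14));
          *
          * where clamp() saturates to the 0..255 range (VQMOVUN in the code below).
          */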
   1277 
   1278 
   1279 .macro do_load size
   1280     .if \size == 8
   1281         vld1.8  {d4}, [U, :64]!
   1282         vld1.8  {d5}, [V, :64]!
   1283         vld1.8  {d0}, [Y, :64]!
   1284         pld     [U, #64]
   1285         pld     [V, #64]
   1286         pld     [Y, #64]
   1287     .elseif \size == 4
   1288         vld1.8  {d4[0]}, [U]!
   1289         vld1.8  {d4[1]}, [U]!
   1290         vld1.8  {d4[2]}, [U]!
   1291         vld1.8  {d4[3]}, [U]!
   1292         vld1.8  {d5[0]}, [V]!
   1293         vld1.8  {d5[1]}, [V]!
   1294         vld1.8  {d5[2]}, [V]!
   1295         vld1.8  {d5[3]}, [V]!
   1296         vld1.8  {d0[0]}, [Y]!
   1297         vld1.8  {d0[1]}, [Y]!
   1298         vld1.8  {d0[2]}, [Y]!
   1299         vld1.8  {d0[3]}, [Y]!
   1300     .elseif \size == 2
   1301         vld1.8  {d4[4]}, [U]!
   1302         vld1.8  {d4[5]}, [U]!
   1303         vld1.8  {d5[4]}, [V]!
   1304         vld1.8  {d5[5]}, [V]!
   1305         vld1.8  {d0[4]}, [Y]!
   1306         vld1.8  {d0[5]}, [Y]!
   1307     .elseif \size == 1
   1308         vld1.8  {d4[6]}, [U]!
   1309         vld1.8  {d5[6]}, [V]!
   1310         vld1.8  {d0[6]}, [Y]!
   1311     .else
   1312         .error unsupported macroblock size
   1313     .endif
   1314 .endm
   1315 
   1316 .macro do_store bpp, size
   1317     .if \bpp == 24
   1318         .if \size == 8
   1319             vst3.8  {d10, d11, d12}, [RGB]!
   1320         .elseif \size == 4
   1321             vst3.8  {d10[0], d11[0], d12[0]}, [RGB]!
   1322             vst3.8  {d10[1], d11[1], d12[1]}, [RGB]!
   1323             vst3.8  {d10[2], d11[2], d12[2]}, [RGB]!
   1324             vst3.8  {d10[3], d11[3], d12[3]}, [RGB]!
   1325         .elseif \size == 2
   1326             vst3.8  {d10[4], d11[4], d12[4]}, [RGB]!
   1327             vst3.8  {d10[5], d11[5], d12[5]}, [RGB]!
   1328         .elseif \size == 1
   1329             vst3.8  {d10[6], d11[6], d12[6]}, [RGB]!
   1330         .else
   1331             .error unsupported macroblock size
   1332         .endif
   1333     .elseif \bpp == 32
   1334         .if \size == 8
   1335             vst4.8  {d10, d11, d12, d13}, [RGB]!
   1336         .elseif \size == 4
   1337             vst4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
   1338             vst4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
   1339             vst4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
   1340             vst4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
   1341         .elseif \size == 2
   1342             vst4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
   1343             vst4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
   1344         .elseif \size == 1
   1345             vst4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
   1346         .else
   1347             .error unsupported macroblock size
   1348         .endif
   1349     .else
   1350         .error unsupported bpp
   1351     .endif
   1352 .endm
   1353 
   1354 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
   1355 
   1356 /*
    1357  * 2-stage pipelined YCbCr->RGB conversion
   1358  */
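
         /*
          * Rough shape of the pipelined inner loop (a sketch of the control flow
          * in the function body below, per group of 8 pixels):
          *
          *   load(group 0);  stage1(group 0);
          *   while (another full group remains)
          *       stage2 + store of group n, overlapped with load + stage1 of group n+1;
          *   stage2(last group);  store(last group);
          */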
   1359 
   1360 .macro do_yuv_to_rgb_stage1
   1361     vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    1362     vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
   1363     vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
   1364     vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
   1365     vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
   1366     vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
   1367     vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
   1368     vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
   1369     vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
   1370     vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
   1371 .endm
   1372 
   1373 .macro do_yuv_to_rgb_stage2
   1374     vrshrn.s32      d20, q10, #15
   1375     vrshrn.s32      d21, q11, #15
   1376     vrshrn.s32      d24, q12, #14
   1377     vrshrn.s32      d25, q13, #14
   1378     vrshrn.s32      d28, q14, #14
   1379     vrshrn.s32      d29, q15, #14
   1380     vaddw.u8        q10, q10, d0
   1381     vaddw.u8        q12, q12, d0
   1382     vaddw.u8        q14, q14, d0
   1383     vqmovun.s16     d1\g_offs, q10
   1384     vqmovun.s16     d1\r_offs, q12
   1385     vqmovun.s16     d1\b_offs, q14
   1386 .endm
   1387 
   1388 .macro do_yuv_to_rgb_stage2_store_load_stage1
   1389     vld1.8          {d4}, [U, :64]!
   1390       vrshrn.s32      d20, q10, #15
   1391       vrshrn.s32      d21, q11, #15
   1392       vrshrn.s32      d24, q12, #14
   1393       vrshrn.s32      d25, q13, #14
   1394       vrshrn.s32      d28, q14, #14
   1395     vld1.8          {d5}, [V, :64]!
   1396       vrshrn.s32      d29, q15, #14
   1397       vaddw.u8        q10, q10, d0
   1398       vaddw.u8        q12, q12, d0
   1399       vaddw.u8        q14, q14, d0
   1400       vqmovun.s16     d1\g_offs, q10
   1401     vld1.8          {d0}, [Y, :64]!
   1402       vqmovun.s16     d1\r_offs, q12
   1403     pld             [U, #64]
   1404     pld             [V, #64]
   1405     pld             [Y, #64]
   1406       vqmovun.s16     d1\b_offs, q14
   1407     vaddw.u8        q3, q1, d4     /* q3 = u - 128 */
    1408     vaddw.u8        q4, q1, d5     /* q4 = v - 128 */
   1409       do_store        \bpp, 8
   1410     vmull.s16       q10, d6, d1[1] /* multiply by -11277 */
   1411     vmlal.s16       q10, d8, d1[2] /* multiply by -23401 */
   1412     vmull.s16       q11, d7, d1[1] /* multiply by -11277 */
   1413     vmlal.s16       q11, d9, d1[2] /* multiply by -23401 */
   1414     vmull.s16       q12, d8, d1[0] /* multiply by 22971 */
   1415     vmull.s16       q13, d9, d1[0] /* multiply by 22971 */
   1416     vmull.s16       q14, d6, d1[3] /* multiply by 29033 */
   1417     vmull.s16       q15, d7, d1[3] /* multiply by 29033 */
   1418 .endm
   1419 
   1420 .macro do_yuv_to_rgb
   1421     do_yuv_to_rgb_stage1
   1422     do_yuv_to_rgb_stage2
   1423 .endm
   1424 
    1425 /* Apple's gas crashes on adrl, so work around that by using adr.
    1426  * This requires a copy of these constants for each function.
    1427  */
   1428 
   1429 .balign 16
   1430 jsimd_ycc_\colorid\()_neon_consts:
   1431     .short          0,      0,     0,      0
   1432     .short          22971, -11277, -23401, 29033
   1433     .short          -128,  -128,   -128,   -128
   1434     .short          -128,  -128,   -128,   -128
   1435 
   1436 asm_function jsimd_ycc_\colorid\()_convert_neon
   1437     OUTPUT_WIDTH    .req r0
   1438     INPUT_BUF       .req r1
   1439     INPUT_ROW       .req r2
   1440     OUTPUT_BUF      .req r3
   1441     NUM_ROWS        .req r4
   1442 
   1443     INPUT_BUF0      .req r5
   1444     INPUT_BUF1      .req r6
   1445     INPUT_BUF2      .req INPUT_BUF
   1446 
   1447     RGB             .req r7
   1448     Y               .req r8
   1449     U               .req r9
   1450     V               .req r10
   1451     N               .req ip
   1452 
   1453     /* Load constants to d1, d2, d3 (d0 is just used for padding) */
   1454     adr             ip, jsimd_ycc_\colorid\()_neon_consts
   1455     vld1.16         {d0, d1, d2, d3}, [ip, :128]
   1456 
   1457     /* Save ARM registers and handle input arguments */
   1458     push            {r4, r5, r6, r7, r8, r9, r10, lr}
   1459     ldr             NUM_ROWS, [sp, #(4 * 8)]
   1460     ldr             INPUT_BUF0, [INPUT_BUF]
   1461     ldr             INPUT_BUF1, [INPUT_BUF, #4]
   1462     ldr             INPUT_BUF2, [INPUT_BUF, #8]
   1463     .unreq          INPUT_BUF
   1464 
   1465     /* Save NEON registers */
   1466     vpush           {d8-d15}
   1467 
   1468     /* Initially set d10, d11, d12, d13 to 0xFF */
   1469     vmov.u8         q5, #255
   1470     vmov.u8         q6, #255
   1471 
   1472     /* Outer loop over scanlines */
   1473     cmp             NUM_ROWS, #1
   1474     blt             9f
   1475 0:
   1476     ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
   1477     ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
   1478     mov             N, OUTPUT_WIDTH
   1479     ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
   1480     add             INPUT_ROW, INPUT_ROW, #1
   1481     ldr             RGB, [OUTPUT_BUF], #4
   1482 
   1483     /* Inner loop over pixels */
   1484     subs            N, N, #8
   1485     blt             3f
   1486     do_load         8
   1487     do_yuv_to_rgb_stage1
   1488     subs            N, N, #8
   1489     blt             2f
   1490 1:
   1491     do_yuv_to_rgb_stage2_store_load_stage1
   1492     subs            N, N, #8
   1493     bge             1b
   1494 2:
   1495     do_yuv_to_rgb_stage2
   1496     do_store        \bpp, 8
   1497     tst             N, #7
   1498     beq             8f
   1499 3:
   1500     tst             N, #4
   1501     beq             3f
   1502     do_load         4
   1503 3:
   1504     tst             N, #2
   1505     beq             4f
   1506     do_load         2
   1507 4:
   1508     tst             N, #1
   1509     beq             5f
   1510     do_load         1
   1511 5:
   1512     do_yuv_to_rgb
   1513     tst             N, #4
   1514     beq             6f
   1515     do_store        \bpp, 4
   1516 6:
   1517     tst             N, #2
   1518     beq             7f
   1519     do_store        \bpp, 2
   1520 7:
   1521     tst             N, #1
   1522     beq             8f
   1523     do_store        \bpp, 1
   1524 8:
   1525     subs            NUM_ROWS, NUM_ROWS, #1
   1526     bgt             0b
   1527 9:
   1528     /* Restore all registers and return */
   1529     vpop            {d8-d15}
   1530     pop             {r4, r5, r6, r7, r8, r9, r10, pc}
   1531 
   1532     .unreq          OUTPUT_WIDTH
   1533     .unreq          INPUT_ROW
   1534     .unreq          OUTPUT_BUF
   1535     .unreq          NUM_ROWS
   1536     .unreq          INPUT_BUF0
   1537     .unreq          INPUT_BUF1
   1538     .unreq          INPUT_BUF2
   1539     .unreq          RGB
   1540     .unreq          Y
   1541     .unreq          U
   1542     .unreq          V
   1543     .unreq          N
   1544 
   1545 .purgem do_yuv_to_rgb
   1546 .purgem do_yuv_to_rgb_stage1
   1547 .purgem do_yuv_to_rgb_stage2
   1548 .purgem do_yuv_to_rgb_stage2_store_load_stage1
   1549 
   1550 .endm
   1551 
   1552 /*--------------------------------- id ----- bpp R  G  B */
   1553 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
   1554 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
   1555 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
   1556 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
   1557 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
   1558 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
   1559 
   1560 .purgem do_load
   1561 .purgem do_store
   1562 
   1563 
   1564 /*****************************************************************************/
   1565 
   1566 /*
   1567  * jsimd_extrgb_ycc_convert_neon
   1568  * jsimd_extbgr_ycc_convert_neon
   1569  * jsimd_extrgbx_ycc_convert_neon
   1570  * jsimd_extbgrx_ycc_convert_neon
   1571  * jsimd_extxbgr_ycc_convert_neon
   1572  * jsimd_extxrgb_ycc_convert_neon
   1573  *
   1574  * Colorspace conversion RGB -> YCbCr
   1575  */
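
         /*
          * As a rough scalar model (a sketch, not the exact libjpeg code), each
          * pixel is converted with the Q16 fixed-point constants loaded below from
          * jsimd_*_ycc_neon_consts (approximating 0.299, 0.587, 0.114, 0.169,
          * 0.331, 0.5, 0.419 and 0.081):
          *
          *   y  = ( 19595 * r + 38470 * g +  7471 * b + 32768) >> 16;
          *   cb = ((-11059 * r - 21709 * g + 32768 * b) + (128 << 16) + 32767) >> 16;
          *   cr = (( 32768 * r - 27439 * g -  5329 * b) + (128 << 16) + 32767) >> 16;
          */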
   1576 
   1577 .macro do_store size
   1578     .if \size == 8
   1579         vst1.8  {d20}, [Y]!
   1580         vst1.8  {d21}, [U]!
   1581         vst1.8  {d22}, [V]!
   1582     .elseif \size == 4
   1583         vst1.8  {d20[0]}, [Y]!
   1584         vst1.8  {d20[1]}, [Y]!
   1585         vst1.8  {d20[2]}, [Y]!
   1586         vst1.8  {d20[3]}, [Y]!
   1587         vst1.8  {d21[0]}, [U]!
   1588         vst1.8  {d21[1]}, [U]!
   1589         vst1.8  {d21[2]}, [U]!
   1590         vst1.8  {d21[3]}, [U]!
   1591         vst1.8  {d22[0]}, [V]!
   1592         vst1.8  {d22[1]}, [V]!
   1593         vst1.8  {d22[2]}, [V]!
   1594         vst1.8  {d22[3]}, [V]!
   1595     .elseif \size == 2
   1596         vst1.8  {d20[4]}, [Y]!
   1597         vst1.8  {d20[5]}, [Y]!
   1598         vst1.8  {d21[4]}, [U]!
   1599         vst1.8  {d21[5]}, [U]!
   1600         vst1.8  {d22[4]}, [V]!
   1601         vst1.8  {d22[5]}, [V]!
   1602     .elseif \size == 1
   1603         vst1.8  {d20[6]}, [Y]!
   1604         vst1.8  {d21[6]}, [U]!
   1605         vst1.8  {d22[6]}, [V]!
   1606     .else
   1607         .error unsupported macroblock size
   1608     .endif
   1609 .endm
   1610 
   1611 .macro do_load bpp, size
   1612     .if \bpp == 24
   1613         .if \size == 8
   1614             vld3.8  {d10, d11, d12}, [RGB]!
   1615             pld     [RGB, #128]
   1616         .elseif \size == 4
   1617             vld3.8  {d10[0], d11[0], d12[0]}, [RGB]!
   1618             vld3.8  {d10[1], d11[1], d12[1]}, [RGB]!
   1619             vld3.8  {d10[2], d11[2], d12[2]}, [RGB]!
   1620             vld3.8  {d10[3], d11[3], d12[3]}, [RGB]!
   1621         .elseif \size == 2
   1622             vld3.8  {d10[4], d11[4], d12[4]}, [RGB]!
   1623             vld3.8  {d10[5], d11[5], d12[5]}, [RGB]!
   1624         .elseif \size == 1
   1625             vld3.8  {d10[6], d11[6], d12[6]}, [RGB]!
   1626         .else
   1627             .error unsupported macroblock size
   1628         .endif
   1629     .elseif \bpp == 32
   1630         .if \size == 8
   1631             vld4.8  {d10, d11, d12, d13}, [RGB]!
   1632             pld     [RGB, #128]
   1633         .elseif \size == 4
   1634             vld4.8  {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
   1635             vld4.8  {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
   1636             vld4.8  {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
   1637             vld4.8  {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
   1638         .elseif \size == 2
   1639             vld4.8  {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
   1640             vld4.8  {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
   1641         .elseif \size == 1
   1642             vld4.8  {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
   1643         .else
   1644             .error unsupported macroblock size
   1645         .endif
   1646     .else
   1647         .error unsupported bpp
   1648     .endif
   1649 .endm
   1650 
   1651 .macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
   1652 
   1653 /*
    1654  * 2-stage pipelined RGB->YCbCr conversion
   1655  */
   1656 
   1657 .macro do_rgb_to_yuv_stage1
   1658     vmovl.u8    q2, d1\r_offs /* r = { d4, d5 } */
   1659     vmovl.u8    q3, d1\g_offs /* g = { d6, d7 } */
   1660     vmovl.u8    q4, d1\b_offs /* b = { d8, d9 } */
   1661     vmull.u16   q7, d4, d0[0]
   1662     vmlal.u16   q7, d6, d0[1]
   1663     vmlal.u16   q7, d8, d0[2]
   1664     vmull.u16   q8, d5, d0[0]
   1665     vmlal.u16   q8, d7, d0[1]
   1666     vmlal.u16   q8, d9, d0[2]
   1667     vrev64.32   q9,  q1
   1668     vrev64.32   q13, q1
   1669     vmlsl.u16   q9,  d4, d0[3]
   1670     vmlsl.u16   q9,  d6, d1[0]
   1671     vmlal.u16   q9,  d8, d1[1]
   1672     vmlsl.u16   q13, d5, d0[3]
   1673     vmlsl.u16   q13, d7, d1[0]
   1674     vmlal.u16   q13, d9, d1[1]
   1675     vrev64.32   q14, q1
   1676     vrev64.32   q15, q1
   1677     vmlal.u16   q14, d4, d1[1]
   1678     vmlsl.u16   q14, d6, d1[2]
   1679     vmlsl.u16   q14, d8, d1[3]
   1680     vmlal.u16   q15, d5, d1[1]
   1681     vmlsl.u16   q15, d7, d1[2]
   1682     vmlsl.u16   q15, d9, d1[3]
   1683 .endm
   1684 
   1685 .macro do_rgb_to_yuv_stage2
   1686     vrshrn.u32  d20, q7,  #16
   1687     vrshrn.u32  d21, q8,  #16
   1688     vshrn.u32   d22, q9,  #16
   1689     vshrn.u32   d23, q13, #16
   1690     vshrn.u32   d24, q14, #16
   1691     vshrn.u32   d25, q15, #16
   1692     vmovn.u16   d20, q10      /* d20 = y */
   1693     vmovn.u16   d21, q11      /* d21 = u */
   1694     vmovn.u16   d22, q12      /* d22 = v */
   1695 .endm
   1696 
   1697 .macro do_rgb_to_yuv
   1698     do_rgb_to_yuv_stage1
   1699     do_rgb_to_yuv_stage2
   1700 .endm
   1701 
   1702 .macro do_rgb_to_yuv_stage2_store_load_stage1
   1703       vrshrn.u32  d20, q7,  #16
   1704       vrshrn.u32  d21, q8,  #16
   1705       vshrn.u32   d22, q9,  #16
   1706     vrev64.32   q9,  q1
   1707       vshrn.u32   d23, q13, #16
   1708     vrev64.32   q13, q1
   1709       vshrn.u32   d24, q14, #16
   1710       vshrn.u32   d25, q15, #16
   1711     do_load     \bpp, 8
   1712       vmovn.u16   d20, q10      /* d20 = y */
   1713     vmovl.u8    q2, d1\r_offs   /* r = { d4, d5 } */
   1714       vmovn.u16   d21, q11      /* d21 = u */
   1715     vmovl.u8    q3, d1\g_offs   /* g = { d6, d7 } */
   1716       vmovn.u16   d22, q12      /* d22 = v */
   1717     vmovl.u8    q4, d1\b_offs   /* b = { d8, d9 } */
   1718     vmull.u16   q7, d4, d0[0]
   1719     vmlal.u16   q7, d6, d0[1]
   1720     vmlal.u16   q7, d8, d0[2]
   1721       vst1.8      {d20}, [Y]!
   1722     vmull.u16   q8, d5, d0[0]
   1723     vmlal.u16   q8, d7, d0[1]
   1724     vmlal.u16   q8, d9, d0[2]
   1725     vmlsl.u16   q9,  d4, d0[3]
   1726     vmlsl.u16   q9,  d6, d1[0]
   1727     vmlal.u16   q9,  d8, d1[1]
   1728       vst1.8      {d21}, [U]!
   1729     vmlsl.u16   q13, d5, d0[3]
   1730     vmlsl.u16   q13, d7, d1[0]
   1731     vmlal.u16   q13, d9, d1[1]
   1732     vrev64.32   q14, q1
   1733     vrev64.32   q15, q1
   1734     vmlal.u16   q14, d4, d1[1]
   1735     vmlsl.u16   q14, d6, d1[2]
   1736     vmlsl.u16   q14, d8, d1[3]
   1737       vst1.8      {d22}, [V]!
   1738     vmlal.u16   q15, d5, d1[1]
   1739     vmlsl.u16   q15, d7, d1[2]
   1740     vmlsl.u16   q15, d9, d1[3]
   1741 .endm
   1742 
   1743 .balign 16
   1744 jsimd_\colorid\()_ycc_neon_consts:
   1745     .short          19595, 38470, 7471,  11059
   1746     .short          21709, 32768, 27439, 5329
   1747     .short          32767, 128,   32767, 128
   1748     .short          32767, 128,   32767, 128
   1749 
   1750 asm_function jsimd_\colorid\()_ycc_convert_neon
   1751     OUTPUT_WIDTH    .req r0
   1752     INPUT_BUF       .req r1
   1753     OUTPUT_BUF      .req r2
   1754     OUTPUT_ROW      .req r3
   1755     NUM_ROWS        .req r4
   1756 
   1757     OUTPUT_BUF0     .req r5
   1758     OUTPUT_BUF1     .req r6
   1759     OUTPUT_BUF2     .req OUTPUT_BUF
   1760 
   1761     RGB             .req r7
   1762     Y               .req r8
   1763     U               .req r9
   1764     V               .req r10
   1765     N               .req ip
   1766 
   1767     /* Load constants to d0, d1, d2, d3 */
   1768     adr             ip, jsimd_\colorid\()_ycc_neon_consts
   1769     vld1.16         {d0, d1, d2, d3}, [ip, :128]
   1770 
   1771     /* Save ARM registers and handle input arguments */
   1772     push            {r4, r5, r6, r7, r8, r9, r10, lr}
   1773     ldr             NUM_ROWS, [sp, #(4 * 8)]
   1774     ldr             OUTPUT_BUF0, [OUTPUT_BUF]
   1775     ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
   1776     ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
   1777     .unreq          OUTPUT_BUF
   1778 
   1779     /* Save NEON registers */
   1780     vpush           {d8-d15}
   1781 
   1782     /* Outer loop over scanlines */
   1783     cmp             NUM_ROWS, #1
   1784     blt             9f
   1785 0:
   1786     ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
   1787     ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
   1788     mov             N, OUTPUT_WIDTH
   1789     ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
   1790     add             OUTPUT_ROW, OUTPUT_ROW, #1
   1791     ldr             RGB, [INPUT_BUF], #4
   1792 
   1793     /* Inner loop over pixels */
   1794     subs            N, N, #8
   1795     blt             3f
   1796     do_load         \bpp, 8
   1797     do_rgb_to_yuv_stage1
   1798     subs            N, N, #8
   1799     blt             2f
   1800 1:
   1801     do_rgb_to_yuv_stage2_store_load_stage1
   1802     subs            N, N, #8
   1803     bge             1b
   1804 2:
   1805     do_rgb_to_yuv_stage2
   1806     do_store        8
   1807     tst             N, #7
   1808     beq             8f
   1809 3:
   1810     tst             N, #4
   1811     beq             3f
   1812     do_load         \bpp, 4
   1813 3:
   1814     tst             N, #2
   1815     beq             4f
   1816     do_load         \bpp, 2
   1817 4:
   1818     tst             N, #1
   1819     beq             5f
   1820     do_load         \bpp, 1
   1821 5:
   1822     do_rgb_to_yuv
   1823     tst             N, #4
   1824     beq             6f
   1825     do_store        4
   1826 6:
   1827     tst             N, #2
   1828     beq             7f
   1829     do_store        2
   1830 7:
   1831     tst             N, #1
   1832     beq             8f
   1833     do_store        1
   1834 8:
   1835     subs            NUM_ROWS, NUM_ROWS, #1
   1836     bgt             0b
   1837 9:
   1838     /* Restore all registers and return */
   1839     vpop            {d8-d15}
   1840     pop             {r4, r5, r6, r7, r8, r9, r10, pc}
   1841 
   1842     .unreq          OUTPUT_WIDTH
   1843     .unreq          OUTPUT_ROW
   1844     .unreq          INPUT_BUF
   1845     .unreq          NUM_ROWS
   1846     .unreq          OUTPUT_BUF0
   1847     .unreq          OUTPUT_BUF1
   1848     .unreq          OUTPUT_BUF2
   1849     .unreq          RGB
   1850     .unreq          Y
   1851     .unreq          U
   1852     .unreq          V
   1853     .unreq          N
   1854 
   1855 .purgem do_rgb_to_yuv
   1856 .purgem do_rgb_to_yuv_stage1
   1857 .purgem do_rgb_to_yuv_stage2
   1858 .purgem do_rgb_to_yuv_stage2_store_load_stage1
   1859 
   1860 .endm
   1861 
   1862 /*--------------------------------- id ----- bpp R  G  B */
   1863 generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
   1864 generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
   1865 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
   1866 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
   1867 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
   1868 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
   1869 
   1870 .purgem do_load
   1871 .purgem do_store
   1872 
   1873 
   1874 /*****************************************************************************/
   1875 
   1876 /*
   1877  * Load data into workspace, applying unsigned->signed conversion
   1878  *
   1879  * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
   1880  *       rid of VST1.16 instructions
   1881  */
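
         /*
          * Roughly equivalent scalar code (a sketch of what the NEON code below
          * computes, not the authoritative libjpeg source):
          *
          *   for (row = 0; row < 8; row++)
          *     for (col = 0; col < 8; col++)
          *       workspace[row * 8 + col] =
          *           (DCTELEM) sample_data[row][start_col + col] - CENTERJSAMPLE;
          */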
   1882 
   1883 asm_function jsimd_convsamp_neon
   1884     SAMPLE_DATA     .req r0
   1885     START_COL       .req r1
   1886     WORKSPACE       .req r2
   1887     TMP1            .req r3
   1888     TMP2            .req r4
   1889     TMP3            .req r5
   1890     TMP4            .req ip
   1891 
   1892     push            {r4, r5}
   1893     vmov.u8         d0, #128
   1894 
   1895     ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
   1896     add             TMP1, TMP1, START_COL
   1897     add             TMP2, TMP2, START_COL
   1898     add             TMP3, TMP3, START_COL
   1899     add             TMP4, TMP4, START_COL
   1900     vld1.8          {d16}, [TMP1]
   1901     vsubl.u8        q8, d16, d0
   1902     vld1.8          {d18}, [TMP2]
   1903     vsubl.u8        q9, d18, d0
   1904     vld1.8          {d20}, [TMP3]
   1905     vsubl.u8        q10, d20, d0
   1906     vld1.8          {d22}, [TMP4]
   1907     ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
   1908     vsubl.u8        q11, d22, d0
   1909     vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
   1910     add             TMP1, TMP1, START_COL
   1911     add             TMP2, TMP2, START_COL
   1912     vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
   1913     add             TMP3, TMP3, START_COL
   1914     add             TMP4, TMP4, START_COL
   1915     vld1.8          {d24}, [TMP1]
   1916     vsubl.u8        q12, d24, d0
   1917     vld1.8          {d26}, [TMP2]
   1918     vsubl.u8        q13, d26, d0
   1919     vld1.8          {d28}, [TMP3]
   1920     vsubl.u8        q14, d28, d0
   1921     vld1.8          {d30}, [TMP4]
   1922     vsubl.u8        q15, d30, d0
   1923     vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
   1924     vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
   1925     pop             {r4, r5}
   1926     bx              lr
   1927 
   1928     .unreq          SAMPLE_DATA
   1929     .unreq          START_COL
   1930     .unreq          WORKSPACE
   1931     .unreq          TMP1
   1932     .unreq          TMP2
   1933     .unreq          TMP3
   1934     .unreq          TMP4
   1935 
   1936 
   1937 /*****************************************************************************/
   1938 
   1939 /*
   1940  * jsimd_fdct_ifast_neon
   1941  *
    1942  * This function contains a fast, but not so accurate, integer implementation
    1943  * of the forward DCT (Discrete Cosine Transform). It uses the same
    1944  * calculations and produces exactly the same output as IJG's original
    1945  * 'jpeg_fdct_ifast' function from jfdctfst.c.
   1946  *
   1947  * TODO: can be combined with 'jsimd_convsamp_neon' to get
   1948  *       rid of a bunch of VLD1.16 instructions
   1949  */
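
         /*
          * For reference, one 1-D pass of the AAN "ifast" FDCT from jfdctfst.c
          * looks roughly like this (a scalar sketch; s0..s7 are the eight input
          * samples, MULTIPLY() is the fixed-point multiply that the VQDMULH
          * instructions below perform with the XFIX constants):
          *
          *   tmp0 = s0 + s7;  tmp7 = s0 - s7;
          *   tmp1 = s1 + s6;  tmp6 = s1 - s6;
          *   tmp2 = s2 + s5;  tmp5 = s2 - s5;
          *   tmp3 = s3 + s4;  tmp4 = s3 - s4;
          *
          *   Even part:
          *   tmp10 = tmp0 + tmp3;  tmp13 = tmp0 - tmp3;
          *   tmp11 = tmp1 + tmp2;  tmp12 = tmp1 - tmp2;
          *   out0 = tmp10 + tmp11;  out4 = tmp10 - tmp11;
          *   z1   = MULTIPLY(tmp12 + tmp13, 0.707106781);
          *   out2 = tmp13 + z1;  out6 = tmp13 - z1;
          *
          *   Odd part:
          *   tmp10 = tmp4 + tmp5;  tmp11 = tmp5 + tmp6;  tmp12 = tmp6 + tmp7;
          *   z5  = MULTIPLY(tmp10 - tmp12, 0.382683433);
          *   z2  = MULTIPLY(tmp10, 0.541196100) + z5;
          *   z4  = MULTIPLY(tmp12, 1.306562965) + z5;
          *   z3  = MULTIPLY(tmp11, 0.707106781);
          *   z11 = tmp7 + z3;  z13 = tmp7 - z3;
          *   out5 = z13 + z2;  out3 = z13 - z2;
          *   out1 = z11 + z4;  out7 = z11 - z4;
          */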
   1950 
   1951 #define XFIX_0_382683433 d0[0]
   1952 #define XFIX_0_541196100 d0[1]
   1953 #define XFIX_0_707106781 d0[2]
   1954 #define XFIX_1_306562965 d0[3]
   1955 
   1956 .balign 16
   1957 jsimd_fdct_ifast_neon_consts:
   1958     .short (98 * 128)              /* XFIX_0_382683433 */
   1959     .short (139 * 128)             /* XFIX_0_541196100 */
   1960     .short (181 * 128)             /* XFIX_0_707106781 */
   1961     .short (334 * 128 - 256 * 128) /* XFIX_1_306562965 */
   1962 
   1963 asm_function jsimd_fdct_ifast_neon
   1964 
   1965     DATA            .req r0
   1966     TMP             .req ip
   1967 
   1968     vpush           {d8-d15}
   1969 
   1970     /* Load constants */
   1971     adr             TMP, jsimd_fdct_ifast_neon_consts
   1972     vld1.16         {d0}, [TMP, :64]
   1973 
   1974     /* Load all DATA into NEON registers with the following allocation:
   1975      *       0 1 2 3 | 4 5 6 7
   1976      *      ---------+--------
   1977      *   0 | d16     | d17    | q8
   1978      *   1 | d18     | d19    | q9
   1979      *   2 | d20     | d21    | q10
   1980      *   3 | d22     | d23    | q11
   1981      *   4 | d24     | d25    | q12
   1982      *   5 | d26     | d27    | q13
   1983      *   6 | d28     | d29    | q14
   1984      *   7 | d30     | d31    | q15
   1985      */
   1986 
   1987     vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
   1988     vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
   1989     vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
   1990     vld1.16         {d28, d29, d30, d31}, [DATA, :128]
   1991     sub             DATA, DATA, #(128 - 32)
   1992 
   1993     mov             TMP, #2
   1994 1:
   1995     /* Transpose */
   1996     vtrn.16         q12, q13
   1997     vtrn.16         q10, q11
   1998     vtrn.16         q8,  q9
   1999     vtrn.16         q14, q15
   2000     vtrn.32         q9,  q11
   2001     vtrn.32         q13, q15
   2002     vtrn.32         q8,  q10
   2003     vtrn.32         q12, q14
   2004     vswp            d30, d23
   2005     vswp            d24, d17
   2006     vswp            d26, d19
   2007       /* 1-D FDCT */
   2008       vadd.s16        q2,  q11, q12
   2009     vswp            d28, d21
   2010       vsub.s16        q12, q11, q12
   2011       vsub.s16        q6,  q10, q13
   2012       vadd.s16        q10, q10, q13
   2013       vsub.s16        q7,  q9,  q14
   2014       vadd.s16        q9,  q9,  q14
   2015       vsub.s16        q1,  q8,  q15
   2016       vadd.s16        q8,  q8,  q15
   2017       vsub.s16        q4,  q9,  q10
   2018       vsub.s16        q5,  q8,  q2
   2019       vadd.s16        q3,  q9,  q10
   2020       vadd.s16        q4,  q4,  q5
   2021       vadd.s16        q2,  q8,  q2
   2022       vqdmulh.s16     q4,  q4,  XFIX_0_707106781
   2023       vadd.s16        q11, q12, q6
   2024       vadd.s16        q8,  q2,  q3
   2025       vsub.s16        q12, q2,  q3
   2026       vadd.s16        q3,  q6,  q7
   2027       vadd.s16        q7,  q7,  q1
   2028       vqdmulh.s16     q3,  q3,  XFIX_0_707106781
   2029       vsub.s16        q6,  q11, q7
   2030       vadd.s16        q10, q5,  q4
   2031       vqdmulh.s16     q6,  q6,  XFIX_0_382683433
   2032       vsub.s16        q14, q5,  q4
   2033       vqdmulh.s16     q11, q11, XFIX_0_541196100
   2034       vqdmulh.s16     q5,  q7,  XFIX_1_306562965
   2035       vadd.s16        q4,  q1,  q3
   2036       vsub.s16        q3,  q1,  q3
   2037       vadd.s16        q7,  q7,  q6
   2038       vadd.s16        q11, q11, q6
   2039       vadd.s16        q7,  q7,  q5
   2040       vadd.s16        q13, q3,  q11
   2041       vsub.s16        q11, q3,  q11
   2042       vadd.s16        q9,  q4,  q7
   2043       vsub.s16        q15, q4,  q7
   2044     subs            TMP, TMP, #1
   2045     bne             1b
   2046 
   2047     /* store results */
   2048     vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
   2049     vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
   2050     vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
   2051     vst1.16         {d28, d29, d30, d31}, [DATA, :128]
   2052 
   2053     vpop            {d8-d15}
   2054     bx              lr
   2055 
   2056     .unreq          DATA
   2057     .unreq          TMP
   2058 
   2059 
   2060 /*****************************************************************************/
   2061 
   2062 /*
   2063  * GLOBAL(void)
   2064  * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM * divisors,
   2065  *                      DCTELEM * workspace);
   2066  *
    2067  * Note: the code uses 2-stage pipelining in order to improve instruction
    2068  *       scheduling and eliminate stalls (this provides ~15% better
    2069  *       performance for this function on both ARM Cortex-A8 and
    2070  *       ARM Cortex-A9 when compared to the non-pipelined variant).
    2071  *       The instructions which belong to the second stage use different
    2072  *       indentation for better readability.
   2073  */
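
         /*
          * Per coefficient, the code below is roughly equivalent to this scalar
          * sketch (not the authoritative libjpeg source); DIVISORS is laid out as
          * four 64-entry sub-tables, of which this routine uses the reciprocal
          * (offset 0), correction (offset 64*2 bytes) and shift (offset 64*6
          * bytes) tables:
          *
          *   sign = workspace[i] >> 15;                   all-ones if negative
          *   x    = abs(workspace[i]) + correction[i];    pre-rounding bias
          *   x    = ((unsigned) x * reciprocal[i]) >> 16;
          *   x    = x >> shift[i];                        done via VSHL by -shift
          *   coef_block[i] = (x ^ sign) - sign;           restore the sign
          */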
   2074 asm_function jsimd_quantize_neon
   2075 
   2076     COEF_BLOCK      .req r0
   2077     DIVISORS        .req r1
   2078     WORKSPACE       .req r2
   2079 
   2080     RECIPROCAL      .req DIVISORS
   2081     CORRECTION      .req r3
   2082     SHIFT           .req ip
   2083     LOOP_COUNT      .req r4
   2084 
   2085     vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
   2086     vabs.s16        q12, q0
   2087     add             CORRECTION, DIVISORS, #(64 * 2)
   2088     add             SHIFT, DIVISORS, #(64 * 6)
   2089     vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
   2090     vabs.s16        q13, q1
   2091     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
   2092     vadd.u16        q12, q12, q10 /* add correction */
   2093     vadd.u16        q13, q13, q11
   2094     vmull.u16       q10, d24, d16 /* multiply by reciprocal */
   2095     vmull.u16       q11, d25, d17
   2096     vmull.u16       q8,  d26, d18
   2097     vmull.u16       q9,  d27, d19
   2098     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
   2099     vshrn.u32       d20, q10, #16
   2100     vshrn.u32       d21, q11, #16
   2101     vshrn.u32       d22, q8,  #16
   2102     vshrn.u32       d23, q9,  #16
   2103     vneg.s16        q12, q12
   2104     vneg.s16        q13, q13
   2105     vshr.s16        q2,  q0,  #15 /* extract sign */
   2106     vshr.s16        q3,  q1,  #15
   2107     vshl.u16        q14, q10, q12 /* shift */
   2108     vshl.u16        q15, q11, q13
   2109 
   2110     push            {r4, r5}
   2111     mov             LOOP_COUNT, #3
   2112 1:
   2113     vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
   2114       veor.u16        q14, q14, q2  /* restore sign */
   2115     vabs.s16        q12, q0
   2116     vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
   2117     vabs.s16        q13, q1
   2118       veor.u16        q15, q15, q3
   2119     vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
   2120     vadd.u16        q12, q12, q10 /* add correction */
   2121     vadd.u16        q13, q13, q11
   2122     vmull.u16       q10, d24, d16 /* multiply by reciprocal */
   2123     vmull.u16       q11, d25, d17
   2124     vmull.u16       q8,  d26, d18
   2125     vmull.u16       q9,  d27, d19
   2126       vsub.u16        q14, q14, q2
   2127     vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
   2128       vsub.u16        q15, q15, q3
   2129     vshrn.u32       d20, q10, #16
   2130     vshrn.u32       d21, q11, #16
   2131       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
   2132     vshrn.u32       d22, q8,  #16
   2133     vshrn.u32       d23, q9,  #16
   2134     vneg.s16        q12, q12
   2135     vneg.s16        q13, q13
   2136     vshr.s16        q2,  q0,  #15 /* extract sign */
   2137     vshr.s16        q3,  q1,  #15
   2138     vshl.u16        q14, q10, q12 /* shift */
   2139     vshl.u16        q15, q11, q13
   2140     subs            LOOP_COUNT, LOOP_COUNT, #1
   2141     bne             1b
   2142     pop             {r4, r5}
   2143 
   2144       veor.u16        q14, q14, q2  /* restore sign */
   2145       veor.u16        q15, q15, q3
   2146       vsub.u16        q14, q14, q2
   2147       vsub.u16        q15, q15, q3
   2148       vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
   2149 
   2150     bx              lr /* return */
   2151 
   2152     .unreq          COEF_BLOCK
   2153     .unreq          DIVISORS
   2154     .unreq          WORKSPACE
   2155     .unreq          RECIPROCAL
   2156     .unreq          CORRECTION
   2157     .unreq          SHIFT
   2158     .unreq          LOOP_COUNT
   2159 
   2160 
   2161 /*****************************************************************************/
   2162 
   2163 /*
   2164  * GLOBAL(void)
   2165  * jsimd_h2v1_fancy_upsample_neon (int          max_v_samp_factor,
   2166  *                                 JDIMENSION   downsampled_width,
   2167  *                                 JSAMPARRAY   input_data,
   2168  *                                 JSAMPARRAY * output_data_ptr);
   2169  *
    2170  * Note: the use of unaligned writes is the main remaining bottleneck in
    2171  *       this code; addressing it could potentially yield up to tens of
    2172  *       percent of additional performance on Cortex-A8/Cortex-A9.
   2173  */
   2174 
   2175 /*
   2176  * Upsample 16 source pixels to 32 destination pixels. The new 16 source
   2177  * pixels are loaded to q0. The previous 16 source pixels are in q1. The
   2178  * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
   2179  * Register d28 is used for multiplication by 3. Register q15 is used
   2180  * for adding +1 bias.
   2181  */
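
         /*
          * The arithmetic implemented here is the usual "fancy" (triangular) h2v1
          * upsampling filter; roughly, in scalar form (a sketch, not the exact
          * jdsample.c code):
          *
          *   out[2*i]     = (3 * in[i] + in[i-1] + 1) >> 2;
          *   out[2*i + 1] = (3 * in[i] + in[i+1] + 2) >> 2;
          *
          * with the first and last output pixels simply replicating the first and
          * last input samples (handled separately in upsample_row below).
          */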
   2182 .macro upsample16   OUTPTR, INPTR
   2183     vld1.8          {q0}, [\INPTR]!
   2184     vmovl.u8        q8,  d0
   2185     vext.8          q2,  q1,  q0, #15
   2186     vmovl.u8        q9,  d1
   2187     vaddw.u8        q10, q15, d4
   2188     vaddw.u8        q11, q15, d5
   2189     vmlal.u8        q8,  d4,  d28
   2190     vmlal.u8        q9,  d5,  d28
   2191     vmlal.u8        q10, d0,  d28
   2192     vmlal.u8        q11, d1,  d28
   2193     vmov            q1,  q0       /* backup source pixels to q1 */
   2194     vrshrn.u16      d6,  q8,  #2
   2195     vrshrn.u16      d7,  q9,  #2
   2196     vshrn.u16       d8,  q10, #2
   2197     vshrn.u16       d9,  q11, #2
   2198     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
   2199 .endm
   2200 
   2201 /*
    2202  * Upsample 32 source pixels to 64 destination pixels. Compared to the
    2203  * 'upsample16' macro, the roles of the q0 and q1 registers alternate between
    2204  * the even and odd groups of 16 pixels, which is why the "vmov q1, q0"
    2205  * instruction is not needed. This unrolling also allows loads and stores to
    2206  * be reordered to compensate for multiplication latency and reduce stalls.
   2207  */
   2208 .macro upsample32   OUTPTR, INPTR
   2209     /* even 16 pixels group */
   2210     vld1.8          {q0}, [\INPTR]!
   2211     vmovl.u8        q8,  d0
   2212     vext.8          q2,  q1,  q0, #15
   2213     vmovl.u8        q9,  d1
   2214     vaddw.u8        q10, q15, d4
   2215     vaddw.u8        q11, q15, d5
   2216     vmlal.u8        q8,  d4,  d28
   2217     vmlal.u8        q9,  d5,  d28
   2218     vmlal.u8        q10, d0,  d28
   2219     vmlal.u8        q11, d1,  d28
   2220         /* odd 16 pixels group */
   2221         vld1.8          {q1}, [\INPTR]!
   2222     vrshrn.u16      d6,  q8,  #2
   2223     vrshrn.u16      d7,  q9,  #2
   2224     vshrn.u16       d8,  q10, #2
   2225     vshrn.u16       d9,  q11, #2
   2226         vmovl.u8        q8,  d2
   2227         vext.8          q2,  q0,  q1, #15
   2228         vmovl.u8        q9,  d3
   2229         vaddw.u8        q10, q15, d4
   2230         vaddw.u8        q11, q15, d5
   2231         vmlal.u8        q8,  d4,  d28
   2232         vmlal.u8        q9,  d5,  d28
   2233         vmlal.u8        q10, d2,  d28
   2234         vmlal.u8        q11, d3,  d28
   2235     vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
   2236         vrshrn.u16      d6,  q8,  #2
   2237         vrshrn.u16      d7,  q9,  #2
   2238         vshrn.u16       d8,  q10, #2
   2239         vshrn.u16       d9,  q11, #2
   2240         vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
   2241 .endm
   2242 
   2243 /*
   2244  * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
   2245  */
   2246 .macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
   2247     /* special case for the first and last pixels */
   2248     sub             \WIDTH, \WIDTH, #1
   2249     add             \OUTPTR, \OUTPTR, #1
   2250     ldrb            \TMP1, [\INPTR, \WIDTH]
   2251     strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
   2252     ldrb            \TMP1, [\INPTR], #1
   2253     strb            \TMP1, [\OUTPTR, #-1]
   2254     vmov.8          d3[7], \TMP1
   2255 
   2256     subs            \WIDTH, \WIDTH, #32
   2257     blt             5f
   2258 0:  /* process 32 pixels per iteration */
   2259     upsample32      \OUTPTR, \INPTR
   2260     subs            \WIDTH, \WIDTH, #32
   2261     bge             0b
   2262 5:
   2263     adds            \WIDTH, \WIDTH, #16
   2264     blt             1f
   2265 0:  /* process 16 pixels if needed */
   2266     upsample16      \OUTPTR, \INPTR
   2267     subs            \WIDTH, \WIDTH, #16
   2268 1:
   2269     adds            \WIDTH, \WIDTH, #16
   2270     beq             9f
   2271 
   2272     /* load the remaining 1-15 pixels */
   2273     add             \INPTR, \INPTR, \WIDTH
   2274     tst             \WIDTH, #1
   2275     beq             2f
   2276     sub             \INPTR, \INPTR, #1
   2277     vld1.8          {d0[0]}, [\INPTR]
   2278 2:
   2279     tst             \WIDTH, #2
   2280     beq             2f
   2281     vext.8          d0, d0, d0, #6
   2282     sub             \INPTR, \INPTR, #1
   2283     vld1.8          {d0[1]}, [\INPTR]
   2284     sub             \INPTR, \INPTR, #1
   2285     vld1.8          {d0[0]}, [\INPTR]
   2286 2:
   2287     tst             \WIDTH, #4
   2288     beq             2f
   2289     vrev64.32       d0, d0
   2290     sub             \INPTR, \INPTR, #1
   2291     vld1.8          {d0[3]}, [\INPTR]
   2292     sub             \INPTR, \INPTR, #1
   2293     vld1.8          {d0[2]}, [\INPTR]
   2294     sub             \INPTR, \INPTR, #1
   2295     vld1.8          {d0[1]}, [\INPTR]
   2296     sub             \INPTR, \INPTR, #1
   2297     vld1.8          {d0[0]}, [\INPTR]
   2298 2:
   2299     tst             \WIDTH, #8
   2300     beq             2f
   2301     vmov            d1,  d0
   2302     sub             \INPTR, \INPTR, #8
   2303     vld1.8          {d0}, [\INPTR]
   2304 2:  /* upsample the remaining pixels */
   2305     vmovl.u8        q8,  d0
   2306     vext.8          q2,  q1,  q0, #15
   2307     vmovl.u8        q9,  d1
   2308     vaddw.u8        q10, q15, d4
   2309     vaddw.u8        q11, q15, d5
   2310     vmlal.u8        q8,  d4,  d28
   2311     vmlal.u8        q9,  d5,  d28
   2312     vmlal.u8        q10, d0,  d28
   2313     vmlal.u8        q11, d1,  d28
   2314     vrshrn.u16      d10, q8,  #2
   2315     vrshrn.u16      d12, q9,  #2
   2316     vshrn.u16       d11, q10, #2
   2317     vshrn.u16       d13, q11, #2
   2318     vzip.8          d10, d11
   2319     vzip.8          d12, d13
   2320     /* store the remaining pixels */
   2321     tst             \WIDTH, #8
   2322     beq             2f
   2323     vst1.8          {d10, d11}, [\OUTPTR]!
   2324     vmov            q5,  q6
   2325 2:
   2326     tst             \WIDTH, #4
   2327     beq             2f
   2328     vst1.8          {d10}, [\OUTPTR]!
   2329     vmov            d10,  d11
   2330 2:
   2331     tst             \WIDTH, #2
   2332     beq             2f
   2333     vst1.8          {d10[0]}, [\OUTPTR]!
   2334     vst1.8          {d10[1]}, [\OUTPTR]!
   2335     vst1.8          {d10[2]}, [\OUTPTR]!
   2336     vst1.8          {d10[3]}, [\OUTPTR]!
   2337     vext.8          d10, d10, d10, #4
   2338 2:
   2339     tst             \WIDTH, #1
   2340     beq             2f
   2341     vst1.8          {d10[0]}, [\OUTPTR]!
   2342     vst1.8          {d10[1]}, [\OUTPTR]!
   2343 2:
   2344 9:
   2345 .endm
   2346 
   2347 asm_function jsimd_h2v1_fancy_upsample_neon
   2348 
   2349     MAX_V_SAMP_FACTOR .req r0
   2350     DOWNSAMPLED_WIDTH .req r1
   2351     INPUT_DATA        .req r2
   2352     OUTPUT_DATA_PTR   .req r3
   2353     OUTPUT_DATA       .req OUTPUT_DATA_PTR
   2354 
   2355     OUTPTR            .req r4
   2356     INPTR             .req r5
   2357     WIDTH             .req ip
   2358     TMP               .req lr
   2359 
   2360     push            {r4, r5, r6, lr}
   2361     vpush           {d8-d15}
   2362 
   2363     ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
   2364     cmp             MAX_V_SAMP_FACTOR, #0
   2365     ble             99f
   2366 
   2367     /* initialize constants */
   2368     vmov.u8         d28, #3
   2369     vmov.u16        q15, #1
   2370 11:
   2371     ldr             INPTR, [INPUT_DATA], #4
   2372     ldr             OUTPTR, [OUTPUT_DATA], #4
   2373     mov             WIDTH, DOWNSAMPLED_WIDTH
   2374     upsample_row    OUTPTR, INPTR, WIDTH, TMP
   2375     subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
   2376     bgt             11b
   2377 
   2378 99:
   2379     vpop            {d8-d15}
   2380     pop             {r4, r5, r6, pc}
   2381 
   2382     .unreq          MAX_V_SAMP_FACTOR
   2383     .unreq          DOWNSAMPLED_WIDTH
   2384     .unreq          INPUT_DATA
   2385     .unreq          OUTPUT_DATA_PTR
   2386     .unreq          OUTPUT_DATA
   2387 
   2388     .unreq          OUTPTR
   2389     .unreq          INPTR
   2390     .unreq          WIDTH
   2391     .unreq          TMP
   2392 
   2393 
   2394 .purgem upsample16
   2395 .purgem upsample32
   2396 .purgem upsample_row
   2397