/*
 * ARMv8 NEON optimizations for libjpeg-turbo
 *
 * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
 * All Rights Reserved.
 * Author: Siarhei Siamashka <siarhei.siamashka (at) nokia.com>
 * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
 * Author: Ragesh Radhakrishnan <ragesh.r (at) linaro.org>
 * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
 * Copyright (C) 2015-2016, Matthieu Darbois.  All Rights Reserved.
 * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

.text


#define RESPECT_STRICT_ALIGNMENT 1


/*****************************************************************************/

/* Supplementary macro for setting function attributes */
.macro asm_function fname
#ifdef __APPLE__
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
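
/* For example, "asm_function jsimd_idct_islow_neon" expands on ELF targets to
 *
 *     .global jsimd_idct_islow_neon
 *     .hidden jsimd_idct_islow_neon
 *     .type jsimd_idct_islow_neon, %function
 *   jsimd_idct_islow_neon:
 *
 * while on Apple (Mach-O) targets it emits the underscore-prefixed
 * "_jsimd_idct_islow_neon" with no visibility attributes. */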

/* Transpose elements of a single 128-bit register */
.macro transpose_single x0, x1, xi, xilen, literal
    ins             \xi\xilen[0], \x0\xilen[0]
    ins             \x1\xilen[0], \x0\xilen[1]
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose elements of 2 different registers */
.macro transpose x0, x1, xi, xilen, literal
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm

/* Transpose a block of 4x4 coefficients in four 64-bit registers */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x2\x2len
    trn2            \x2\x2len, \xi\x0len, \x2\x2len
    mov             \xi\xilen, \x1\xilen
    trn1            \x1\x1len, \x1\x1len, \x3\x3len
    trn2            \x3\x3len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x1\x1len
    trn2            \x1\x2len, \xi\x0len, \x1\x2len
    mov             \xi\xilen, \x2\xilen
    trn1            \x2\x2len, \x2\x2len, \x3\x3len
    trn2            \x3\x2len, \xi\x1len, \x3\x3len
.endm

.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm

.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    trn1            \t0\().8h, \l0\().8h, \l1\().8h
    trn1            \t1\().8h, \l2\().8h, \l3\().8h
    trn1            \t2\().8h, \l4\().8h, \l5\().8h
    trn1            \t3\().8h, \l6\().8h, \l7\().8h
    trn2            \l1\().8h, \l0\().8h, \l1\().8h
    trn2            \l3\().8h, \l2\().8h, \l3\().8h
    trn2            \l5\().8h, \l4\().8h, \l5\().8h
    trn2            \l7\().8h, \l6\().8h, \l7\().8h

    trn1            \l4\().4s, \t2\().4s, \t3\().4s
    trn2            \t3\().4s, \t2\().4s, \t3\().4s
    trn1            \t2\().4s, \t0\().4s, \t1\().4s
    trn2            \l2\().4s, \t0\().4s, \t1\().4s
    trn1            \t0\().4s, \l1\().4s, \l3\().4s
    trn2            \l3\().4s, \l1\().4s, \l3\().4s
    trn2            \t1\().4s, \l5\().4s, \l7\().4s
    trn1            \l5\().4s, \l5\().4s, \l7\().4s

    trn2            \l6\().2d, \l2\().2d, \t3\().2d
    trn1            \l0\().2d, \t2\().2d, \l4\().2d
    trn1            \l1\().2d, \t0\().2d, \l5\().2d
    trn2            \l7\().2d, \l3\().2d, \t1\().2d
    trn1            \l2\().2d, \l2\().2d, \t3\().2d
    trn2            \l4\().2d, \t2\().2d, \l4\().2d
    trn1            \l3\().2d, \l3\().2d, \t1\().2d
    trn2            \l5\().2d, \t0\().2d, \l5\().2d
.endm
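
/* In effect, transpose_8x8 is the standard three-stage NEON transpose: the
 * .8h TRN1/TRN2 pairs exchange elements within 2x2 sub-blocks, the .4s stage
 * exchanges 2x2 sub-blocks within 4x4 quadrants, and the .2d stage exchanges
 * the 4x4 quadrants themselves.  t0-t3 are scratch registers; the transposed
 * rows are left in l0-l7. */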


#define CENTERJSAMPLE 128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon (void *dct_table, JCOEFPTR coef_block,
 *                        JSAMPARRAY output_buf, JDIMENSION output_col)
 */
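
/* For reference, the overall computation matches jpeg_idct_islow() from
 * jidctint.c (a sketch; "idct_1d", "descale", and "range_limit" are
 * illustrative names, not actual identifiers):
 *
 *   pass 1 (columns):  ws[*][c] = idct_1d(DEQUANTIZE(coef_block[*][c],
 *                                                    dct_table[*][c]))
 *   pass 2 (rows):     output_buf[r][output_col + 0..7] =
 *                          range_limit(descale(idct_1d(ws[r][*])))
 *
 * Unlike the C version, both passes are kept entirely in NEON registers,
 * with in-register transposes replacing the memory workspace.
 */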

#define CONST_BITS 13
#define PASS1_BITS 2

#define F_0_298  2446  /* FIX(0.298631336) */
#define F_0_390  3196  /* FIX(0.390180644) */
#define F_0_541  4433  /* FIX(0.541196100) */
#define F_0_765  6270  /* FIX(0.765366865) */
#define F_0_899  7373  /* FIX(0.899976223) */
#define F_1_175  9633  /* FIX(1.175875602) */
#define F_1_501 12299  /* FIX(1.501321110) */
#define F_1_847 15137  /* FIX(1.847759065) */
#define F_1_961 16069  /* FIX(1.961570560) */
#define F_2_053 16819  /* FIX(2.053119869) */
#define F_2_562 20995  /* FIX(2.562915447) */
#define F_3_072 25172  /* FIX(3.072711026) */
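
/* FIX(x) denotes the usual libjpeg fixed-point encoding,
 * (INT32)(x * (1 << CONST_BITS) + 0.5).  For example, with CONST_BITS = 13:
 *   FIX(0.298631336) = round(0.298631336 * 8192) = 2446
 *   FIX(3.072711026) = round(3.072711026 * 8192) = 25172
 */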

.balign 16
Ljsimd_idct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short -F_0_899
  .short F_1_175
  .short F_1_501
  .short -F_1_847
  .short -F_1_961
  .short F_2_053
  .short -F_2_562
  .short F_3_072
  .short 0          /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

#define XFIX_P_0_298 v0.h[0]
#define XFIX_N_0_390 v0.h[1]
#define XFIX_P_0_541 v0.h[2]
#define XFIX_P_0_765 v0.h[3]
#define XFIX_N_0_899 v0.h[4]
#define XFIX_P_1_175 v0.h[5]
#define XFIX_P_1_501 v0.h[6]
#define XFIX_N_1_847 v0.h[7]
#define XFIX_N_1_961 v1.h[0]
#define XFIX_P_2_053 v1.h[1]
#define XFIX_N_2_562 v1.h[2]
#define XFIX_P_3_072 v1.h[3]

asm_function jsimd_idct_islow_neon
    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x9
    TMP4            .req x10
    TMP5            .req x11
    TMP6            .req x12
    TMP7            .req x13
    TMP8            .req x14

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    sub             sp, sp, #64
    adr             x15, Ljsimd_idct_islow_neon_consts
    mov             x10, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
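    /* Note: AAPCS64 only requires the bottom 64 bits of v8-v15 to be
       preserved across calls, which is why just the .8b halves of those
       registers are saved here and restored before returning. */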
    ld1             {v0.8h, v1.8h}, [x15]
    ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
    ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
    ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
    ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64

    cmeq            v16.8h, v3.8h, #0
    cmeq            v26.8h, v4.8h, #0
    cmeq            v27.8h, v5.8h, #0
    cmeq            v28.8h, v6.8h, #0
    cmeq            v29.8h, v7.8h, #0
    cmeq            v30.8h, v8.8h, #0
    cmeq            v31.8h, v9.8h, #0

    and             v10.16b, v16.16b, v26.16b
    and             v11.16b, v27.16b, v28.16b
    and             v12.16b, v29.16b, v30.16b
    and             v13.16b, v31.16b, v10.16b
    and             v14.16b, v11.16b, v12.16b
    mul             v2.8h, v2.8h, v18.8h
    and             v15.16b, v13.16b, v14.16b
    shl             v10.8h, v2.8h, #(PASS1_BITS)
    sqxtn           v16.8b, v15.8h
    mov             TMP1, v16.d[0]
    mvn             TMP2, TMP1

    cbnz            TMP2, 2f
    /* case all AC coeffs are zeros */
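    /* If rows 1-7 of the coefficient block are all zero, pass 1 reduces to
       replicating the dequantized DC row, pre-scaled by PASS1_BITS (already
       computed in v10), into all eight rows. */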
    dup             v2.2d, v10.d[0]
    dup             v6.2d, v10.d[1]
    mov             v3.16b, v2.16b
    mov             v7.16b, v6.16b
    mov             v4.16b, v2.16b
    mov             v8.16b, v6.16b
    mov             v5.16b, v2.16b
    mov             v9.16b, v6.16b
1:
    /* for this transpose, we should organise data like this:
     * 00, 01, 02, 03, 40, 41, 42, 43
     * 10, 11, 12, 13, 50, 51, 52, 53
     * 20, 21, 22, 23, 60, 61, 62, 63
     * 30, 31, 32, 33, 70, 71, 72, 73
     * 04, 05, 06, 07, 44, 45, 46, 47
     * 14, 15, 16, 17, 54, 55, 56, 57
     * 24, 25, 26, 27, 64, 65, 66, 67
     * 34, 35, 36, 37, 74, 75, 76, 77
     */
    trn1            v28.8h, v2.8h, v3.8h
    trn1            v29.8h, v4.8h, v5.8h
    trn1            v30.8h, v6.8h, v7.8h
    trn1            v31.8h, v8.8h, v9.8h
    trn2            v16.8h, v2.8h, v3.8h
    trn2            v17.8h, v4.8h, v5.8h
    trn2            v18.8h, v6.8h, v7.8h
    trn2            v19.8h, v8.8h, v9.8h
    trn1            v2.4s, v28.4s, v29.4s
    trn1            v6.4s, v30.4s, v31.4s
    trn1            v3.4s, v16.4s, v17.4s
    trn1            v7.4s, v18.4s, v19.4s
    trn2            v4.4s, v28.4s, v29.4s
    trn2            v8.4s, v30.4s, v31.4s
    trn2            v5.4s, v16.4s, v17.4s
    trn2            v9.4s, v18.4s, v19.4s
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */

    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
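    /* The full descale is CONST_BITS+PASS1_BITS+3 = 18 bits: a truncating
       16-bit narrowing shift (SHRN above) followed by a rounding, saturating
       2-bit narrowing shift to bytes (SQRSHRN below). */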
    movi            v0.16b, #(CENTERJSAMPLE)
    /* Prepare pointers (dual-issue with NEON instructions) */
      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP1, TMP1, OUTPUT_COL
    sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP2, TMP2, OUTPUT_COL
    sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP3, TMP3, OUTPUT_COL
    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP4, TMP4, OUTPUT_COL
    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b
      add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
      add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
      add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    blr             x30

.balign 16
2:
    mul             v3.8h, v3.8h, v19.8h
    mul             v4.8h, v4.8h, v20.8h
    mul             v5.8h, v5.8h, v21.8h
    add             TMP4, xzr, TMP2, LSL #32
    mul             v6.8h, v6.8h, v22.8h
    mul             v7.8h, v7.8h, v23.8h
    adds            TMP3, xzr, TMP2, LSR #32
    mul             v8.8h, v8.8h, v24.8h
    mul             v9.8h, v9.8h, v25.8h
    b.ne            3f
    /* Right AC coef is zero */
    dup             v15.2d, v10.d[1]
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */

    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */

    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */

    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    mov             v6.16b, v15.16b
    mov             v7.16b, v15.16b
    mov             v8.16b, v15.16b
    mov             v9.16b, v15.16b
    b               1b

.balign 16
3:
    cbnz            TMP4, 4f
    /* Left AC coef is zero */
    dup             v14.2d, v10.d[0]
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    mov             v2.16b, v14.16b
    mov             v3.16b, v14.16b
    mov             v4.16b, v14.16b
    mov             v5.16b, v14.16b
    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b               1b

.balign 16
4:
    /* Neither half of the AC coefficients is all zero: full 8x8 IDCT */
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, - FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */

    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, - FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, - FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b               1b

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8

#undef CENTERJSAMPLE
#undef CONST_BITS
#undef PASS1_BITS
#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072


/*****************************************************************************/

/*
 * jsimd_idct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
 * function from jidctfst.c.
 *
 * Normally, a 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case some extra additions are required because the
 * VQDMULH instruction can't handle constants larger than 1. So expressions
 * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
 * which introduces an extra addition. Overall, there are 6 extra additions
 * per 1-D IDCT pass, for a total of 5 VQDMULH and 35 VADD/VSUB instructions.
 */

#define XFIX_1_082392200 v0.h[0]
#define XFIX_1_414213562 v0.h[1]
#define XFIX_1_847759065 v0.h[2]
#define XFIX_2_613125930 v0.h[3]

.balign 16
Ljsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
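
/* SQDMULH computes (2 * a * b) >> 16, i.e. it multiplies by a Q15 fraction.
 * The values above are the 8-bit fixed-point constants from jidctfst.c
 * (e.g. 277/256 ~= 1.082392200) with the integer part subtracted and the
 * remainder rescaled to Q15 by the factor of 128.  For example,
 * (277 - 256) * 128 = 2688, and sqdmulh(x, 2688) = (x * 2 * 2688) >> 16 =
 * x * 21/256 ~= x * 0.082392200; adding x back yields x * 1.082392200.
 * XFIX_2_613125930 subtracts an integer part of 2 (512/256), which the code
 * restores as "x + x". */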
    799 
    800 asm_function jsimd_idct_ifast_neon
    801 
    802     DCT_TABLE       .req x0
    803     COEF_BLOCK      .req x1
    804     OUTPUT_BUF      .req x2
    805     OUTPUT_COL      .req x3
    806     TMP1            .req x0
    807     TMP2            .req x1
    808     TMP3            .req x9
    809     TMP4            .req x10
    810     TMP5            .req x11
    811     TMP6            .req x12
    812     TMP7            .req x13
    813     TMP8            .req x14
    814 
    815     /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
    816        guarantee that the upper (unused) 32 bits of x3 are valid.  This
    817        instruction ensures that those bits are set to zero. */
    818     uxtw x3, w3
    819 
    820     /* Load and dequantize coefficients into NEON registers
    821      * with the following allocation:
    822      *       0 1 2 3 | 4 5 6 7
    823      *      ---------+--------
    824      *   0 | d16     | d17     ( v16.8h )
    825      *   1 | d18     | d19     ( v17.8h )
    826      *   2 | d20     | d21     ( v18.8h )
    827      *   3 | d22     | d23     ( v19.8h )
    828      *   4 | d24     | d25     ( v20.8h )
    829      *   5 | d26     | d27     ( v21.8h )
    830      *   6 | d28     | d29     ( v22.8h )
    831      *   7 | d30     | d31     ( v23.8h )
    832      */
    833     /* Save NEON registers used in fast IDCT */
    834     adr             TMP5, Ljsimd_idct_ifast_neon_consts
    835     ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
    836     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    837     ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
    838     mul             v16.8h, v16.8h, v0.8h
    839     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    840     mul             v17.8h, v17.8h, v1.8h
    841     ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
    842     mul             v18.8h, v18.8h, v2.8h
    843     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
    844     mul             v19.8h, v19.8h, v3.8h
    845     ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
    846     mul             v20.8h, v20.8h, v0.8h
    847     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
    848     mul             v22.8h, v22.8h, v2.8h
    849     mul             v21.8h, v21.8h, v1.8h
    850     ld1             {v0.4h}, [TMP5]        /* load constants */
    851     mul             v23.8h, v23.8h, v3.8h
    852 
    853     /* 1-D IDCT, pass 1 */
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
    /* 1-D IDCT, pass 2 */
    sub             v2.8h, v18.8h, v22.8h
    add             v22.8h, v18.8h, v22.8h
    sub             v1.8h, v19.8h, v21.8h
    add             v21.8h, v19.8h, v21.8h
    sub             v5.8h, v17.8h, v23.8h
    add             v23.8h, v17.8h, v23.8h
    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
    add             v3.8h, v1.8h, v1.8h
    sub             v1.8h, v5.8h, v1.8h
    add             v18.8h, v2.8h, v4.8h
    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
    sub             v2.8h, v23.8h, v21.8h
    add             v3.8h, v3.8h, v6.8h
    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
    add             v1.8h, v1.8h, v4.8h
    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
    sub             v18.8h, v18.8h, v22.8h
    add             v2.8h, v2.8h, v6.8h
    sub             v6.8h, v16.8h, v20.8h
    add             v20.8h, v16.8h, v20.8h
    add             v17.8h, v5.8h, v4.8h
    add             v5.8h, v6.8h, v18.8h
    sub             v18.8h, v6.8h, v18.8h
    add             v6.8h, v23.8h, v21.8h
    add             v16.8h, v20.8h, v22.8h
    sub             v3.8h, v6.8h, v3.8h
    sub             v20.8h, v20.8h, v22.8h
    sub             v3.8h, v3.8h, v1.8h
    sub             v1.8h, v17.8h, v1.8h
    add             v2.8h, v3.8h, v2.8h
    sub             v23.8h, v16.8h, v6.8h
    add             v1.8h, v1.8h, v2.8h
    add             v16.8h, v16.8h, v6.8h
    add             v22.8h, v5.8h, v3.8h
    sub             v17.8h, v5.8h, v3.8h
    sub             v21.8h, v18.8h, v2.8h
    add             v18.8h, v18.8h, v2.8h
    sub             v19.8h, v20.8h, v1.8h
    add             v20.8h, v20.8h, v1.8h
    /* Descale to 8-bit and range limit */
    movi            v0.16b, #0x80
      /* Prepare pointers (dual-issue with NEON instructions) */
      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqshrn          v28.8b, v16.8h, #5
      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqshrn          v29.8b, v17.8h, #5
      add             TMP1, TMP1, OUTPUT_COL
    sqshrn          v30.8b, v18.8h, #5
      add             TMP2, TMP2, OUTPUT_COL
    sqshrn          v31.8b, v19.8h, #5
      add             TMP3, TMP3, OUTPUT_COL
    sqshrn2         v28.16b, v20.8h, #5
      add             TMP4, TMP4, OUTPUT_COL
    sqshrn2         v29.16b, v21.8h, #5
      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqshrn2         v30.16b, v22.8h, #5
      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqshrn2         v31.16b, v23.8h, #5
      add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b
      add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
      add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
      add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    blr             x30

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8


/*****************************************************************************/

/*
 * jsimd_idct_4x4_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 4x4 pixel output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 4x4 inverse-DCT, which
 *       requires many fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
 *
 * TODO: slightly better instruction scheduling can be achieved by expanding
 *       the idct_helper/transpose_4x4 macros and reordering instructions,
 *       but readability will suffer somewhat.
 */
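
/* Judging from the register aliases used below, the C-level signature is
 * (an assumption; jsimd.h in libjpeg-turbo holds the authoritative
 * declaration):
 *
 *   void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
 *                            JSAMPARRAY output_buf, JDIMENSION output_col);
 */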

#define CONST_BITS  13

#define FIX_0_211164243 (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579 (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887 (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822 (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865 (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095 (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223 (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337 (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580 (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981 (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065 (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803 (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447 (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785 (29692)  /* FIX(3.624509785) */
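
/* As in the IJG C code, FIX(x) denotes round(x * 2^CONST_BITS), e.g.
 * FIX(1.847759065) = round(1.847759065 * 8192) = 15137. */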

.balign 16
Ljsimd_idct_4x4_neon_consts:
  .short FIX_1_847759065      /* v0.h[0] */
  .short -FIX_0_765366865     /* v0.h[1] */
  .short -FIX_0_211164243     /* v0.h[2] */
  .short FIX_1_451774981      /* v0.h[3] */
  .short -FIX_2_172734803     /* v1.h[0] */
  .short FIX_1_061594337      /* v1.h[1] */
  .short -FIX_0_509795579     /* v1.h[2] */
  .short -FIX_0_601344887     /* v1.h[3] */
  .short FIX_0_899976223      /* v2.h[0] */
  .short FIX_2_562915447      /* v2.h[1] */
  .short 1 << (CONST_BITS+1)  /* v2.h[2] */
  .short 0                    /* v2.h[3] */

.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
    smull           v28.4s, \x4, v2.h[2]
    smlal           v28.4s, \x8, v0.h[0]
    smlal           v28.4s, \x14, v0.h[1]

    smull           v26.4s, \x16, v1.h[2]
    smlal           v26.4s, \x12, v1.h[3]
    smlal           v26.4s, \x10, v2.h[0]
    smlal           v26.4s, \x6, v2.h[1]

    smull           v30.4s, \x4, v2.h[2]
    smlsl           v30.4s, \x8, v0.h[0]
    smlsl           v30.4s, \x14, v0.h[1]

    smull           v24.4s, \x16, v0.h[2]
    smlal           v24.4s, \x12, v0.h[3]
    smlal           v24.4s, \x10, v1.h[0]
    smlal           v24.4s, \x6, v1.h[1]

    add             v20.4s, v28.4s, v26.4s
    sub             v28.4s, v28.4s, v26.4s

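  /* rshrn can only encode narrowing shifts of 1..16 when producing
   * 16-bit lanes, so descale shifts larger than 16 are done with a
   * rounding srshr on the 32-bit lanes first, followed by xtn. */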
  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v28.4s, v28.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y29, v28.4s
  .else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y29, v28.4s, #\shift
  .endif

    add             v20.4s, v30.4s, v24.4s
    sub             v30.4s, v30.4s, v24.4s

  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v30.4s, v30.4s, #\shift
    xtn             \y27, v20.4s
    xtn             \y28, v30.4s
  .else
    rshrn           \y27, v20.4s, #\shift
    rshrn           \y28, v30.4s, #\shift
  .endif
.endm

asm_function jsimd_idct_4x4_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x1
    TMP3            .req x2
    TMP4            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* Save all used NEON registers */
    sub             sp, sp, 64
    mov             x9, sp
    /* Load constants (v3.4h is just used for padding) */
    adr             TMP4, Ljsimd_idct_4x4_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | v8.4h   | v9.4h
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | v14.4h  | v15.4h
     *   7 | v16.4h  | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* dequantize */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
    mul             v8.4h, v8.4h, v22.4h
    mul             v9.4h, v9.4h, v23.4h
    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v14.4h, v14.4h, v28.4h
    mul             v15.4h, v15.4h, v29.4h
    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */

    /* Pass 1 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
                    v4.4h, v6.4h, v8.4h, v10.4h
    transpose_4x4   v4, v6, v8, v10, v3
    ins             v10.d[1], v11.d[0]
    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
                    v5.4h, v7.4h, v9.4h, v11.4h
    transpose_4x4   v5, v7, v9, v11, v3
    ins             v10.d[1], v11.d[0]

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
                    v26.4h, v27.4h, v28.4h, v29.4h
    transpose_4x4   v26, v27, v28, v29, v3

    /* Range limit */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    ins             v28.d[1], v29.d[0]
    add             v26.8h, v26.8h, v30.8h
    add             v28.8h, v28.8h, v30.8h
    sqxtun          v26.8b, v26.8h
    sqxtun          v27.8b, v28.8h

    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    ldp             TMP3, TMP4, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL
    add             TMP3, TMP3, OUTPUT_COL
    add             TMP4, TMP4, OUTPUT_COL

#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
    /* We can use many fewer instructions on little-endian systems if the
     * OS kernel is not configured to trap unaligned memory accesses
     */
    st1             {v26.s}[0], [TMP1], 4
    st1             {v27.s}[0], [TMP3], 4
    st1             {v26.s}[1], [TMP2], 4
    st1             {v27.s}[1], [TMP4], 4
#else
    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[0], [TMP3], 1
    st1             {v26.b}[1], [TMP1], 1
    st1             {v27.b}[1], [TMP3], 1
    st1             {v26.b}[2], [TMP1], 1
    st1             {v27.b}[2], [TMP3], 1
    st1             {v26.b}[3], [TMP1], 1
    st1             {v27.b}[3], [TMP3], 1

    st1             {v26.b}[4], [TMP2], 1
    st1             {v27.b}[4], [TMP4], 1
    st1             {v26.b}[5], [TMP2], 1
    st1             {v27.b}[5], [TMP4], 1
    st1             {v26.b}[6], [TMP2], 1
    st1             {v27.b}[6], [TMP4], 1
    st1             {v26.b}[7], [TMP2], 1
    st1             {v27.b}[7], [TMP4], 1
#endif

    /* vpop            {v8.4h - v15.4h}  ; not available */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    blr             x30

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_idct_2x2_neon
 *
 * This function contains inverse-DCT code for getting reduced-size
 * 2x2 pixel output from an 8x8 DCT block.  It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
 * function from jpeg-6b (jidctred.c).
 *
 * NOTE: jpeg-8 has an improved implementation of the 2x2 inverse-DCT, which
 *       requires many fewer arithmetic operations and hence should be faster.
 *       The primary purpose of this particular NEON-optimized function is
 *       bit-exact compatibility with jpeg-6b.
 */
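
/* Note that rows 2, 4 and 6 of the coefficient block are never loaded:
 * as in jpeg_idct_2x2 (jidctred.c), those coefficients carry zero weight
 * in the 2-point transform, so only rows 0, 1, 3, 5 and 7 contribute
 * (see the register allocation table inside the function below). */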

.balign 8
Ljsimd_idct_2x2_neon_consts:
  .short -FIX_0_720959822  /* v14.h[0] */
  .short FIX_0_850430095   /* v14.h[1] */
  .short -FIX_1_272758580  /* v14.h[2] */
  .short FIX_3_624509785   /* v14.h[3] */

.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
    sshll           v15.4s, \x4, #15
    smull           v26.4s, \x6, v14.h[3]
    smlal           v26.4s, \x10, v14.h[2]
    smlal           v26.4s, \x12, v14.h[1]
    smlal           v26.4s, \x16, v14.h[0]

    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s

  .if \shift > 16
    srshr           v20.4s, v20.4s, #\shift
    srshr           v15.4s, v15.4s, #\shift
    xtn             \y26, v20.4s
    xtn             \y27, v15.4s
  .else
    rshrn           \y26, v20.4s, #\shift
    rshrn           \y27, v15.4s, #\shift
  .endif
.endm

asm_function jsimd_idct_2x2_neon

    DCT_TABLE       .req x0
    COEF_BLOCK      .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_COL      .req x3
    TMP1            .req x0
    TMP2            .req x15

    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x3 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x3, w3

    /* vpush           {v8.4h - v15.4h}  ; not available */
    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants */
    adr             TMP2, Ljsimd_idct_2x2_neon_consts
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v14.4h}, [TMP2]

    /* Load all COEF_BLOCK into NEON registers with the following allocation:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 | v4.4h   | v5.4h
     *   1 | v6.4h   | v7.4h
     *   2 | -       | -
     *   3 | v10.4h  | v11.4h
     *   4 | -       | -
     *   5 | v12.4h  | v13.4h
     *   6 | -       | -
     *   7 | v16.4h  | v17.4h
     */
    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
    add             COEF_BLOCK, COEF_BLOCK, #16
    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
    /* Dequantize */
    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
    mul             v4.4h, v4.4h, v18.4h
    mul             v5.4h, v5.4h, v19.4h
    ins             v4.d[1], v5.d[0]
    mul             v6.4h, v6.4h, v20.4h
    mul             v7.4h, v7.4h, v21.4h
    ins             v6.d[1], v7.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
    mul             v10.4h, v10.4h, v24.4h
    mul             v11.4h, v11.4h, v25.4h
    ins             v10.d[1], v11.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
    mul             v12.4h, v12.4h, v26.4h
    mul             v13.4h, v13.4h, v27.4h
    ins             v12.d[1], v13.d[0]
    add             DCT_TABLE, DCT_TABLE, #16
    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
    mul             v16.4h, v16.4h, v30.4h
    mul             v17.4h, v17.4h, v31.4h
    ins             v16.d[1], v17.d[0]

    /* Pass 1 */
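    /* The #if 0 branch below is apparently kept only as a readable
     * reference; the live #else branch interleaves the same arithmetic
     * by hand for better instruction scheduling. */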
#if 0
    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
#else
    smull           v26.4s, v6.4h, v14.h[3]
    smlal           v26.4s, v10.4h, v14.h[2]
    smlal           v26.4s, v12.4h, v14.h[1]
    smlal           v26.4s, v16.4h, v14.h[0]
    smull           v24.4s, v7.4h, v14.h[3]
    smlal           v24.4s, v11.4h, v14.h[2]
    smlal           v24.4s, v13.4h, v14.h[1]
    smlal           v24.4s, v17.4h, v14.h[0]
    sshll           v15.4s, v4.4h, #15
    sshll           v30.4s, v5.4h, #15
    add             v20.4s, v15.4s, v26.4s
    sub             v15.4s, v15.4s, v26.4s
    rshrn           v4.4h, v20.4s, #13
    rshrn           v6.4h, v15.4s, #13
    add             v20.4s, v30.4s, v24.4s
    sub             v15.4s, v30.4s, v24.4s
    rshrn           v5.4h, v20.4s, #13
    rshrn           v7.4h, v15.4s, #13
    ins             v4.d[1], v5.d[0]
    ins             v6.d[1], v7.d[0]
    transpose       v4, v6, v3, .16b, .8h
    transpose       v6, v10, v3, .16b, .4s
    ins             v11.d[0], v10.d[1]
    ins             v7.d[0], v6.d[1]
#endif

    /* Pass 2 */
    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h

    /* Range limit */
    movi            v30.8h, #0x80
    ins             v26.d[1], v27.d[0]
    add             v26.8h, v26.8h, v30.8h
    sqxtun          v30.8b, v26.8h
    ins             v26.d[0], v30.d[0]
    sqxtun          v27.8b, v26.8h

    /* Store results to the output buffer */
    ldp             TMP1, TMP2, [OUTPUT_BUF]
    add             TMP1, TMP1, OUTPUT_COL
    add             TMP2, TMP2, OUTPUT_COL

    st1             {v26.b}[0], [TMP1], 1
    st1             {v27.b}[4], [TMP1], 1
    st1             {v26.b}[1], [TMP2], 1
    st1             {v27.b}[5], [TMP2], 1

    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    blr             x30

    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2

.purgem idct_helper


/*****************************************************************************/

/*
 * jsimd_ycc_extrgb_convert_neon
 * jsimd_ycc_extbgr_convert_neon
 * jsimd_ycc_extrgbx_convert_neon
 * jsimd_ycc_extbgrx_convert_neon
 * jsimd_ycc_extxbgr_convert_neon
 * jsimd_ycc_extxrgb_convert_neon
 *
 * Colorspace conversion YCbCr -> RGB
 */
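
/* The multiply constants used below implement the usual libjpeg (BT.601)
 * equations, with Cb and Cr centered at 128:
 *
 *   R = Y                        + 1.40200 * (Cr - 128)
 *   G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *   B = Y + 1.77200 * (Cb - 128)
 *
 * scaled to fixed point: 22971 ~ 1.40200 * 2^14, 11277 ~ 0.34414 * 2^15,
 * 23401 ~ 0.71414 * 2^15, 29033 ~ 1.77200 * 2^14, which matches the
 * #15/#14 rshrn descales in the stage macros below. */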

.macro do_load size
  .if \size == 8
    ld1             {v4.8b}, [U], 8
    ld1             {v5.8b}, [V], 8
    ld1             {v0.8b}, [Y], 8
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
  .elseif \size == 4
    ld1             {v4.b}[0], [U], 1
    ld1             {v4.b}[1], [U], 1
    ld1             {v4.b}[2], [U], 1
    ld1             {v4.b}[3], [U], 1
    ld1             {v5.b}[0], [V], 1
    ld1             {v5.b}[1], [V], 1
    ld1             {v5.b}[2], [V], 1
    ld1             {v5.b}[3], [V], 1
    ld1             {v0.b}[0], [Y], 1
    ld1             {v0.b}[1], [Y], 1
    ld1             {v0.b}[2], [Y], 1
    ld1             {v0.b}[3], [Y], 1
  .elseif \size == 2
    ld1             {v4.b}[4], [U], 1
    ld1             {v4.b}[5], [U], 1
    ld1             {v5.b}[4], [V], 1
    ld1             {v5.b}[5], [V], 1
    ld1             {v0.b}[4], [Y], 1
    ld1             {v0.b}[5], [Y], 1
  .elseif \size == 1
    ld1             {v4.b}[6], [U], 1
    ld1             {v5.b}[6], [V], 1
    ld1             {v0.b}[6], [Y], 1
  .else
    .error unsupported macroblock size
  .endif
.endm
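
/* The size == 4/2/1 cases above fill successive lanes of v0/v4/v5, so
 * that a partially filled macroblock at the end of a row can be handled
 * by one final do_yuv_to_rgb pass covering up to 7 leftover pixels. */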

.macro do_store bpp, size, fast_st3
  .if \bpp == 24
    .if \size == 8
      .if \fast_st3 == 1
        st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
      .else
        st1         {v10.b}[0], [RGB], #1
        st1         {v11.b}[0], [RGB], #1
        st1         {v12.b}[0], [RGB], #1

        st1         {v10.b}[1], [RGB], #1
        st1         {v11.b}[1], [RGB], #1
        st1         {v12.b}[1], [RGB], #1

        st1         {v10.b}[2], [RGB], #1
        st1         {v11.b}[2], [RGB], #1
        st1         {v12.b}[2], [RGB], #1

        st1         {v10.b}[3], [RGB], #1
        st1         {v11.b}[3], [RGB], #1
        st1         {v12.b}[3], [RGB], #1

        st1         {v10.b}[4], [RGB], #1
        st1         {v11.b}[4], [RGB], #1
        st1         {v12.b}[4], [RGB], #1

        st1         {v10.b}[5], [RGB], #1
        st1         {v11.b}[5], [RGB], #1
        st1         {v12.b}[5], [RGB], #1

        st1         {v10.b}[6], [RGB], #1
        st1         {v11.b}[6], [RGB], #1
        st1         {v12.b}[6], [RGB], #1

        st1         {v10.b}[7], [RGB], #1
        st1         {v11.b}[7], [RGB], #1
        st1         {v12.b}[7], [RGB], #1
      .endif
    .elseif \size == 4
      st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
    .elseif \size == 2
      st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
      st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
    .elseif \size == 1
      st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
    .elseif \size == 4
      st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
    .elseif \size == 2
      st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
      st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
    .elseif \size == 1
      st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 16
    .if \size == 8
      st1           {v25.8h}, [RGB], 16
    .elseif \size == 4
      st1           {v25.4h}, [RGB], 8
    .elseif \size == 2
      st1           {v25.h}[4], [RGB], 2
      st1           {v25.h}[5], [RGB], 2
    .elseif \size == 1
      st1           {v25.h}[6], [RGB], 2
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
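
/* The \fast_st3 == 0 path above stores 24-bit pixels byte-by-byte rather
 * than with an interleaving st3, presumably (assumption) because st3 is
 * slow on some ARM64 cores.  Only the 24-bit formats use st3 for full
 * macroblocks, which is why only extrgb/extbgr get the separate
 * jsimd_ycc_*_convert_neon_slowst3 entry points generated below. */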

.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
                                           g_offs, gsize, b_offs, bsize, \
                                           defsize, fast_st3

/*
 * 2-stage pipelined YCbCr->RGB conversion
 */

.macro do_yuv_to_rgb_stage1
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb_stage2
    rshrn           v20.4h, v20.4s, #15
    rshrn2          v20.8h, v22.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn2          v24.8h, v26.4s, #14
    rshrn           v28.4h, v28.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16
    sqxtun          v1\g_offs\defsize, v20.8h
    sqxtun          v1\r_offs\defsize, v24.8h
    sqxtun          v1\b_offs\defsize, v28.8h
  .else
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    sri             v25.8h, v21.8h, #5
    sri             v25.8h, v29.8h, #11
  .endif
.endm

.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn           v20.4h, v20.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn           v28.4h, v28.4s, #14
    ld1             {v4.8b}, [U], 8
    rshrn2          v20.8h, v22.4s, #15
    rshrn2          v24.8h, v26.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    ld1             {v5.8b}, [V], 8
    uaddw           v20.8h, v20.8h, v0.8b
    uaddw           v24.8h, v24.8h, v0.8b
    uaddw           v28.8h, v28.8h, v0.8b
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun          v1\g_offs\defsize, v20.8h
    ld1             {v0.8b}, [Y], 8
    sqxtun          v1\r_offs\defsize, v24.8h
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sqxtun          v1\b_offs\defsize, v28.8h
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    ld1             {v0.8b}, [Y], 8
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri             v25.8h, v21.8h, #5
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sri             v25.8h, v29.8h, #11
  .endif
    do_store        \bpp, 8, \fast_st3
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm

.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm

/* Apple gas crashes on adrl, so we work around that by using adr instead.
 * This, however, requires a copy of these constants for each function.
 */

.balign 16
.if \fast_st3 == 1
Ljsimd_ycc_\colorid\()_neon_consts:
.else
Ljsimd_ycc_\colorid\()_neon_slowst3_consts:
.endif
  .short 0,      0,     0,      0
  .short 22971, -11277, -23401, 29033
  .short -128,  -128,   -128,   -128
  .short -128,  -128,   -128,   -128

.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    INPUT_ROW       .req w2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req w4

    INPUT_BUF0      .req x5
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w15

    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants into v1.4h and v2.8h (v0.4h is just used for padding) */
    .if \fast_st3 == 1
      adr           x15, Ljsimd_ycc_\colorid\()_neon_consts
    .else
      adr           x15, Ljsimd_ycc_\colorid\()_neon_slowst3_consts
    .endif

    /* Save NEON registers */
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]

    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #8]
    ldr             INPUT_BUF2, [INPUT_BUF, #16]
    .unreq          INPUT_BUF

    /* Initially set v10 and v13 to 0xFF: for the 32-bit formats, whichever
       of v10..v13 the chosen channel offsets leave unwritten supplies the
       0xFF alpha/filler byte */
    movi            v10.16b, #255
    movi            v13.16b, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs            N, N, #8
    b.ge            1b
2:
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8, \fast_st3
    tst             N, #7
    b.eq            8f
3:
    tst             N, #4
    b.eq            3f
    do_load         4
3:
    tst             N, #2
    b.eq            4f
    do_load         2
4:
    tst             N, #1
    b.eq            5f
    do_load         1
5:
    do_yuv_to_rgb
    tst             N, #4
    b.eq            6f
    do_store        \bpp, 4, \fast_st3
6:
    tst             N, #2
    b.eq            7f
    do_store        \bpp, 2, \fast_st3
7:
    tst             N, #1
    b.eq            8f
    do_store        \bpp, 1, \fast_st3
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30
    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1

generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * jsimd_extrgb_ycc_convert_neon
 * jsimd_extbgr_ycc_convert_neon
 * jsimd_extrgbx_ycc_convert_neon
 * jsimd_extbgrx_ycc_convert_neon
 * jsimd_extxbgr_ycc_convert_neon
 * jsimd_extxrgb_ycc_convert_neon
 *
 * Colorspace conversion RGB -> YCbCr
 */
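
/* The fixed-point constants loaded below implement the usual libjpeg
 * (BT.601) forward equations:
 *
 *   Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
 *   Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
 *   Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 *
 * scaled by 2^16 (e.g. 19595 ~ 0.29900 * 2^16, 32768 = 0.50000 * 2^16).
 * The 32767/128 constant rows seed the Cb/Cr accumulators with roughly
 * 128.5 * 2^16, providing both the +128 offset and the rounding for the
 * truncating shrn #16 descale (Y instead uses the rounding rshrn). */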

.macro do_store size
  .if \size == 8
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
  .elseif \size == 4
    st1             {v20.b}[0], [Y], #1
    st1             {v20.b}[1], [Y], #1
    st1             {v20.b}[2], [Y], #1
    st1             {v20.b}[3], [Y], #1
    st1             {v21.b}[0], [U], #1
    st1             {v21.b}[1], [U], #1
    st1             {v21.b}[2], [U], #1
    st1             {v21.b}[3], [U], #1
    st1             {v22.b}[0], [V], #1
    st1             {v22.b}[1], [V], #1
    st1             {v22.b}[2], [V], #1
    st1             {v22.b}[3], [V], #1
  .elseif \size == 2
    st1             {v20.b}[4], [Y], #1
    st1             {v20.b}[5], [Y], #1
    st1             {v21.b}[4], [U], #1
    st1             {v21.b}[5], [U], #1
    st1             {v22.b}[4], [V], #1
    st1             {v22.b}[5], [V], #1
  .elseif \size == 1
    st1             {v20.b}[6], [Y], #1
    st1             {v21.b}[6], [U], #1
    st1             {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm

.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        ld1         {v10.b}[0], [RGB], #1
        ld1         {v11.b}[0], [RGB], #1
        ld1         {v12.b}[0], [RGB], #1

        ld1         {v10.b}[1], [RGB], #1
        ld1         {v11.b}[1], [RGB], #1
        ld1         {v12.b}[1], [RGB], #1

        ld1         {v10.b}[2], [RGB], #1
        ld1         {v11.b}[2], [RGB], #1
        ld1         {v12.b}[2], [RGB], #1

        ld1         {v10.b}[3], [RGB], #1
        ld1         {v11.b}[3], [RGB], #1
        ld1         {v12.b}[3], [RGB], #1

        ld1         {v10.b}[4], [RGB], #1
        ld1         {v11.b}[4], [RGB], #1
        ld1         {v12.b}[4], [RGB], #1

        ld1         {v10.b}[5], [RGB], #1
        ld1         {v11.b}[5], [RGB], #1
        ld1         {v12.b}[5], [RGB], #1

        ld1         {v10.b}[6], [RGB], #1
        ld1         {v11.b}[6], [RGB], #1
        ld1         {v12.b}[6], [RGB], #1

        ld1         {v10.b}[7], [RGB], #1
        ld1         {v11.b}[7], [RGB], #1
        ld1         {v12.b}[7], [RGB], #1
      .endif
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
    .elseif \size == 2
      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
    .elseif \size == 1
      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
    .elseif \size == 2
      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
    .elseif \size == 1
      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm

.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

.macro do_rgb_to_yuv_stage1
    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64           v18.4s, v1.4s
    rev64           v26.4s, v1.4s
    rev64           v28.4s, v1.4s
    rev64           v30.4s, v1.4s
    umull           v14.4s, v4.4h, v0.h[0]
    umull2          v16.4s, v4.8h, v0.h[0]
    umlsl           v18.4s, v4.4h, v0.h[3]
    umlsl2          v26.4s, v4.8h, v0.h[3]
    umlal           v28.4s, v4.4h, v0.h[5]
    umlal2          v30.4s, v4.8h, v0.h[5]
    umlal           v14.4s, v6.4h, v0.h[1]
    umlal2          v16.4s, v6.8h, v0.h[1]
    umlsl           v18.4s, v6.4h, v0.h[4]
    umlsl2          v26.4s, v6.8h, v0.h[4]
    umlsl           v28.4s, v6.4h, v0.h[6]
    umlsl2          v30.4s, v6.8h, v0.h[6]
    umlal           v14.4s, v8.4h, v0.h[2]
    umlal2          v16.4s, v8.8h, v0.h[2]
    umlal           v18.4s, v8.4h, v0.h[5]
    umlal2          v26.4s, v8.8h, v0.h[5]
    umlsl           v28.4s, v8.4h, v0.h[7]
    umlsl2          v30.4s, v8.8h, v0.h[7]
.endm

.macro do_rgb_to_yuv_stage2
    rshrn           v20.4h, v14.4s, #16
    shrn            v22.4h, v18.4s, #16
    shrn            v24.4h, v28.4s, #16
    rshrn2          v20.8h, v16.4s, #16
    shrn2           v22.8h, v26.4s, #16
    shrn2           v24.8h, v30.4s, #16
    xtn             v20.8b, v20.8h       /* v20 = y */
    xtn             v21.8b, v22.8h       /* v21 = u */
    xtn             v22.8b, v24.8h       /* v22 = v */
.endm

.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load         \bpp, 8, \fast_ld3
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

.balign 16
.if \fast_ld3 == 1
Ljsimd_\colorid\()_ycc_neon_consts:
.else
Ljsimd_\colorid\()_ycc_neon_slowld3_consts:
.endif
  .short 19595, 38470, 7471, 11059
  .short 21709, 32768, 27439, 5329
  .short 32767, 128, 32767, 128
  .short 32767, 128, 32767, 128

.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */

    RGB             .req x7
    Y               .req x9
    U               .req x10
    V               .req x11
    N               .req w12

    /* Load constants into v0.8h and v1.8h */
    .if \fast_ld3 == 1
      adr           x13, Ljsimd_\colorid\()_ycc_neon_consts
    .else
      adr           x13, Ljsimd_\colorid\()_ycc_neon_slowld3_consts
    .endif
    ld1             {v0.8h, v1.8h}, [x13]

    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq          OUTPUT_BUF

    /* Save NEON registers */
    sub             sp, sp, #64
    mov             x9, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f
    do_load         \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    b.lt            2f
1:
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs            N, N, #8
    b.ge            1b
2:
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    b.eq            8f
3:
    tbz             N, #2, 3f
    do_load         \bpp, 4, \fast_ld3
3:
    tbz             N, #1, 4f
    do_load         \bpp, 2, \fast_ld3
4:
    tbz             N, #0, 5f
    do_load         \bpp, 1, \fast_ld3
5:
    do_rgb_to_yuv
    tbz             N, #2, 6f
    do_store        4
6:
    tbz             N, #1, 7f
    do_store        2
7:
    tbz             N, #0, 8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm

/*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1

generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0

.purgem do_load
.purgem do_store


/*****************************************************************************/

/*
 * Load data into workspace, applying unsigned->signed conversion
 *
 * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
 *       rid of the intermediate ST1 stores
 */
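
/* Roughly equivalent C (a sketch; compare convsamp() in jcdctmgr.c):
 *
 *   for (i = 0; i < 64; i++)
 *     workspace[i] = (DCTELEM)sample_data[i] - 128;   // CENTERJSAMPLE
 */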

asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req x0
    START_COL       .req x1
    WORKSPACE       .req x2
    TMP1            .req x9
    TMP2            .req x10
    TMP3            .req x11
    TMP4            .req x12
    TMP5            .req x13
    TMP6            .req x14
    TMP7            .req x15
    TMP8            .req x4
    TMPDUP          .req w3

    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x1 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x1, w1

    mov             TMPDUP, #128
    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
    dup             v0.8b, TMPDUP
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
    add             TMP5, TMP5, START_COL
    add             TMP6, TMP6, START_COL
    ld1             {v16.8b}, [TMP1]
    add             TMP7, TMP7, START_COL
    add             TMP8, TMP8, START_COL
    ld1             {v17.8b}, [TMP2]
    usubl           v16.8h, v16.8b, v0.8b
    ld1             {v18.8b}, [TMP3]
    usubl           v17.8h, v17.8b, v0.8b
    ld1             {v19.8b}, [TMP4]
    usubl           v18.8h, v18.8b, v0.8b
    ld1             {v20.8b}, [TMP5]
    usubl           v19.8h, v19.8b, v0.8b
    ld1             {v21.8b}, [TMP6]
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
    usubl           v20.8h, v20.8b, v0.8b
    ld1             {v22.8b}, [TMP7]
    usubl           v21.8h, v21.8b, v0.8b
    ld1             {v23.8b}, [TMP8]
    usubl           v22.8h, v22.8b, v0.8b
    usubl           v23.8h, v23.8b, v0.8b
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64

    br              x30

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
    .unreq          TMPDUP

/*****************************************************************************/

/*
 * jsimd_fdct_islow_neon
 *
 * This function contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).  The following code is based
 * directly on IJG's original jfdctint.c; see jfdctint.c for
 * more details.
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of LD1 instructions
 */
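
/* DESCALE_P1/P2 below correspond to jfdctint.c's DESCALE(x, n), i.e. a
 * rounding right shift: (x + (1 << (n - 1))) >> n.  On NEON this maps
 * directly onto the rounding shift instructions (rshrn/srshr). */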
   2236 
   2237 #define CONST_BITS 13
   2238 #define PASS1_BITS 2
   2239 
   2240 #define DESCALE_P1 (CONST_BITS-PASS1_BITS)
   2241 #define DESCALE_P2 (CONST_BITS+PASS1_BITS)
   2242 
   2243 #define F_0_298  2446  /* FIX(0.298631336) */
   2244 #define F_0_390  3196  /* FIX(0.390180644) */
   2245 #define F_0_541  4433  /* FIX(0.541196100) */
   2246 #define F_0_765  6270  /* FIX(0.765366865) */
   2247 #define F_0_899  7373  /* FIX(0.899976223) */
   2248 #define F_1_175  9633  /* FIX(1.175875602) */
   2249 #define F_1_501 12299  /* FIX(1.501321110) */
   2250 #define F_1_847 15137  /* FIX(1.847759065) */
   2251 #define F_1_961 16069  /* FIX(1.961570560) */
   2252 #define F_2_053 16819  /* FIX(2.053119869) */
   2253 #define F_2_562 20995  /* FIX(2.562915447) */
   2254 #define F_3_072 25172  /* FIX(3.072711026) */
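/* Each constant above is FIX(x) = (INT32)(x * 2^CONST_BITS + 0.5), e.g.
 * 0.541196100 * 8192 + 0.5 = 4433.98, truncated to 4433 (F_0_541). */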
   2255 
   2256 .balign 16
   2257 Ljsimd_fdct_islow_neon_consts:
   2258   .short F_0_298
   2259   .short -F_0_390
   2260   .short F_0_541
   2261   .short F_0_765
2262   .short -F_0_899
   2263   .short F_1_175
   2264   .short F_1_501
2265   .short -F_1_847
2266   .short -F_1_961
   2267   .short F_2_053
2268   .short -F_2_562
   2269   .short F_3_072
   2270   .short 0          /* padding */
   2271   .short 0
   2272   .short 0
   2273   .short 0
   2274 
   2275 #undef F_0_298
   2276 #undef F_0_390
   2277 #undef F_0_541
   2278 #undef F_0_765
   2279 #undef F_0_899
   2280 #undef F_1_175
   2281 #undef F_1_501
   2282 #undef F_1_847
   2283 #undef F_1_961
   2284 #undef F_2_053
   2285 #undef F_2_562
   2286 #undef F_3_072
   2287 #define XFIX_P_0_298 v0.h[0]
   2288 #define XFIX_N_0_390 v0.h[1]
   2289 #define XFIX_P_0_541 v0.h[2]
   2290 #define XFIX_P_0_765 v0.h[3]
   2291 #define XFIX_N_0_899 v0.h[4]
   2292 #define XFIX_P_1_175 v0.h[5]
   2293 #define XFIX_P_1_501 v0.h[6]
   2294 #define XFIX_N_1_847 v0.h[7]
   2295 #define XFIX_N_1_961 v1.h[0]
   2296 #define XFIX_P_2_053 v1.h[1]
   2297 #define XFIX_N_2_562 v1.h[2]
   2298 #define XFIX_P_3_072 v1.h[3]
   2299 
   2300 asm_function jsimd_fdct_islow_neon
   2301 
   2302     DATA            .req x0
   2303     TMP             .req x9
   2304 
   2305     /* Load constants */
   2306     adr             TMP, Ljsimd_fdct_islow_neon_consts
   2307     ld1             {v0.8h, v1.8h}, [TMP]
   2308 
   2309     /* Save NEON registers */
   2310     sub             sp, sp, #64
   2311     mov             x10, sp
   2312     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
   2313     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
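    /* AAPCS64 only requires the low 64 bits of v8-v15 to be preserved
     * across calls, hence the .8b (64-bit) stores here and the matching
     * loads in the epilogue. */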
   2314 
   2315     /* Load all DATA into NEON registers with the following allocation:
   2316      *       0 1 2 3 | 4 5 6 7
   2317      *      ---------+--------
   2318      *   0 | d16     | d17    | v16.8h
   2319      *   1 | d18     | d19    | v17.8h
   2320      *   2 | d20     | d21    | v18.8h
   2321      *   3 | d22     | d23    | v19.8h
   2322      *   4 | d24     | d25    | v20.8h
   2323      *   5 | d26     | d27    | v21.8h
   2324      *   6 | d28     | d29    | v22.8h
   2325      *   7 | d30     | d31    | v23.8h
   2326      */
   2327 
   2328     ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
   2329     ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
   2330     sub             DATA, DATA, #64
   2331 
   2332     /* Transpose */
   2333     transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
   2334     /* 1-D FDCT */
   2335     add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
   2336     sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
   2337     add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
   2338     sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
   2339     add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
   2340     sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
   2341     add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
   2342     sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
   2343 
   2344     /* even part */
   2345 
   2346     add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
   2347     sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
   2348     add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
   2349     sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
   2350 
   2351     add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
   2352     sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
   2353 
   2354     add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
   2355 
   2356     shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
   2357     shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
   2358 
   2359     smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   2360     smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   2361     mov             v22.16b, v18.16b
   2362     mov             v25.16b, v24.16b
   2363 
   2364     smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   2365     smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   2366     smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   2367     smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   2368 
   2369     rshrn           v18.4h, v18.4s, #DESCALE_P1
   2370     rshrn           v22.4h, v22.4s, #DESCALE_P1
   2371     rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
   2372     rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
   2373 
   2374     /* Odd part */
   2375 
   2376     add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
   2377     add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
   2378     add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
   2379     add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
   2380     smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
   2381     smull2          v5.4s, v10.8h, XFIX_P_1_175
   2382     smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
   2383     smlal2          v5.4s, v11.8h, XFIX_P_1_175
   2384 
   2385     smull2          v24.4s, v28.8h, XFIX_P_0_298
   2386     smull2          v25.4s, v29.8h, XFIX_P_2_053
   2387     smull2          v26.4s, v30.8h, XFIX_P_3_072
   2388     smull2          v27.4s, v31.8h, XFIX_P_1_501
   2389     smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
   2390     smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
   2391     smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
   2392     smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
   2393 
   2394     smull2          v12.4s, v8.8h, XFIX_N_0_899
   2395     smull2          v13.4s, v9.8h, XFIX_N_2_562
   2396     smull2          v14.4s, v10.8h, XFIX_N_1_961
   2397     smull2          v15.4s, v11.8h, XFIX_N_0_390
   2398     smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
   2399     smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
   2400     smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
   2401     smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
   2402 
   2403     add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
   2404     add             v14.4s, v14.4s, v5.4s
   2405     add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
   2406     add             v15.4s, v15.4s, v5.4s
   2407 
   2408     add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
   2409     add             v24.4s, v24.4s, v12.4s
   2410     add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
   2411     add             v25.4s, v25.4s, v13.4s
   2412     add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
   2413     add             v26.4s, v26.4s, v14.4s
   2414     add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
   2415     add             v27.4s, v27.4s, v15.4s
   2416 
   2417     add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
   2418     add             v24.4s, v24.4s, v14.4s
   2419     add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
   2420     add             v25.4s, v25.4s, v15.4s
   2421     add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
   2422     add             v26.4s, v26.4s, v13.4s
   2423     add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
   2424     add             v27.4s, v27.4s, v12.4s
   2425 
   2426     rshrn           v23.4h, v28.4s, #DESCALE_P1
   2427     rshrn           v21.4h, v29.4s, #DESCALE_P1
   2428     rshrn           v19.4h, v30.4s, #DESCALE_P1
   2429     rshrn           v17.4h, v31.4s, #DESCALE_P1
   2430     rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
   2431     rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
   2432     rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
   2433     rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
   2434 
   2435     /* Transpose */
   2436     transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
   2437 
   2438     /* 1-D FDCT */
   2439     add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
   2440     sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
   2441     add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
   2442     sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
   2443     add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
   2444     sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
   2445     add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
   2446     sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
   2447 
   2448     /* even part */
   2449     add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
   2450     sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
   2451     add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
   2452     sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
   2453 
   2454     add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
   2455     sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
   2456 
   2457     add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
   2458 
   2459     srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS); */
   2460     srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS); */
   2461 
   2462     smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   2463     smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   2464     mov             v22.16b, v18.16b
   2465     mov             v25.16b, v24.16b
   2466 
   2467     smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   2468     smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   2469     smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   2470     smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   2471 
   2472     rshrn           v18.4h, v18.4s, #DESCALE_P2
   2473     rshrn           v22.4h, v22.4s, #DESCALE_P2
2474     rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS+PASS1_BITS); */
2475     rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS+PASS1_BITS); */
   2476 
   2477     /* Odd part */
   2478     add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
   2479     add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
   2480     add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
   2481     add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
   2482 
   2483     smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
   2484     smull2          v5.4s, v10.8h, XFIX_P_1_175
   2485     smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
   2486     smlal2          v5.4s, v11.8h, XFIX_P_1_175
   2487 
   2488     smull2          v24.4s, v28.8h, XFIX_P_0_298
   2489     smull2          v25.4s, v29.8h, XFIX_P_2_053
   2490     smull2          v26.4s, v30.8h, XFIX_P_3_072
   2491     smull2          v27.4s, v31.8h, XFIX_P_1_501
   2492     smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
   2493     smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
   2494     smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
   2495     smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
   2496 
   2497     smull2          v12.4s, v8.8h, XFIX_N_0_899
   2498     smull2          v13.4s, v9.8h, XFIX_N_2_562
   2499     smull2          v14.4s, v10.8h, XFIX_N_1_961
   2500     smull2          v15.4s, v11.8h, XFIX_N_0_390
   2501     smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, - FIX_0_899976223); */
   2502     smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, - FIX_2_562915447); */
   2503     smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, - FIX_1_961570560); */
   2504     smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, - FIX_0_390180644); */
   2505 
2506     add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
2507     add             v14.4s, v14.4s, v5.4s
2508     add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
2509     add             v15.4s, v15.4s, v5.4s
   2510 
   2511     add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
   2512     add             v24.4s, v24.4s, v12.4s
   2513     add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
   2514     add             v25.4s, v25.4s, v13.4s
   2515     add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
   2516     add             v26.4s, v26.4s, v14.4s
   2517     add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
   2518     add             v27.4s, v27.4s, v15.4s
   2519 
   2520     add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
   2521     add             v24.4s, v24.4s, v14.4s
   2522     add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
   2523     add             v25.4s, v25.4s, v15.4s
   2524     add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
   2525     add             v26.4s, v26.4s, v13.4s
   2526     add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
   2527     add             v27.4s, v27.4s, v12.4s
   2528 
   2529     rshrn           v23.4h, v28.4s, #DESCALE_P2
   2530     rshrn           v21.4h, v29.4s, #DESCALE_P2
   2531     rshrn           v19.4h, v30.4s, #DESCALE_P2
   2532     rshrn           v17.4h, v31.4s, #DESCALE_P2
2533     rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS+PASS1_BITS); */
2534     rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS+PASS1_BITS); */
2535     rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS+PASS1_BITS); */
2536     rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS+PASS1_BITS); */
   2537 
   2538     /* store results */
   2539     st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
   2540     st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
   2541 
   2542     /* Restore NEON registers */
   2543     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
   2544     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
   2545 
   2546     br              x30
   2547 
   2548     .unreq          DATA
   2549     .unreq          TMP
   2550 
   2551 #undef XFIX_P_0_298
   2552 #undef XFIX_N_0_390
   2553 #undef XFIX_P_0_541
   2554 #undef XFIX_P_0_765
   2555 #undef XFIX_N_0_899
   2556 #undef XFIX_P_1_175
   2557 #undef XFIX_P_1_501
   2558 #undef XFIX_N_1_847
   2559 #undef XFIX_N_1_961
   2560 #undef XFIX_P_2_053
   2561 #undef XFIX_N_2_562
   2562 #undef XFIX_P_3_072
   2563 
   2564 
   2565 /*****************************************************************************/
   2566 
   2567 /*
   2568  * jsimd_fdct_ifast_neon
   2569  *
2570  * This function contains a fast but less accurate integer implementation
2571  * of the forward DCT (Discrete Cosine Transform). It uses the same
2572  * calculations and produces exactly the same output as IJG's original
2573  * 'jpeg_fdct_ifast' function from jfdctfst.c.
2574  *
2575  * TODO: can be combined with 'jsimd_convsamp_neon' to get
2576  *       rid of a bunch of LD1 instructions
   2577  */
   2578 
   2579 #undef XFIX_0_541196100
   2580 #define XFIX_0_382683433 v0.h[0]
   2581 #define XFIX_0_541196100 v0.h[1]
   2582 #define XFIX_0_707106781 v0.h[2]
   2583 #define XFIX_1_306562965 v0.h[3]
   2584 
   2585 .balign 16
   2586 Ljsimd_fdct_ifast_neon_consts:
   2587   .short (98 * 128)               /* XFIX_0_382683433 */
   2588   .short (139 * 128)              /* XFIX_0_541196100 */
   2589   .short (181 * 128)              /* XFIX_0_707106781 */
   2590   .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
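/* These are Q15-style fractions for sqdmulh, which computes
 * sat((2 * a * b) >> 16) per 16-bit lane, i.e. a * (b / 32768).  Each value
 * is round(x * 256) << 7 (8 fractional bits suffice for the ifast DCT).
 * 1.306562965 does not fit in Q15, so only its excess over 1.0 is stored
 * (334/256 - 1) and the 1.0 * a term is added back separately below. */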
   2591 
   2592 asm_function jsimd_fdct_ifast_neon
   2593 
   2594     DATA            .req x0
   2595     TMP             .req x9
   2596 
   2597     /* Load constants */
   2598     adr             TMP, Ljsimd_fdct_ifast_neon_consts
   2599     ld1             {v0.4h}, [TMP]
   2600 
   2601     /* Load all DATA into NEON registers with the following allocation:
   2602      *       0 1 2 3 | 4 5 6 7
   2603      *      ---------+--------
2604      *   0 | d16     | d17    | v16.8h
2605      *   1 | d18     | d19    | v17.8h
2606      *   2 | d20     | d21    | v18.8h
2607      *   3 | d22     | d23    | v19.8h
2608      *   4 | d24     | d25    | v20.8h
2609      *   5 | d26     | d27    | v21.8h
2610      *   6 | d28     | d29    | v22.8h
2611      *   7 | d30     | d31    | v23.8h
   2612      */
   2613 
   2614     ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
   2615     ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
   2616     mov             TMP, #2
   2617     sub             DATA, DATA, #64
   2618 1:
   2619     /* Transpose */
   2620     transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
   2621     subs            TMP, TMP, #1
   2622     /* 1-D FDCT */
   2623     add             v4.8h, v19.8h, v20.8h
   2624     sub             v20.8h, v19.8h, v20.8h
   2625     sub             v28.8h, v18.8h, v21.8h
   2626     add             v18.8h, v18.8h, v21.8h
   2627     sub             v29.8h, v17.8h, v22.8h
   2628     add             v17.8h, v17.8h, v22.8h
   2629     sub             v21.8h, v16.8h, v23.8h
   2630     add             v16.8h, v16.8h, v23.8h
   2631     sub             v6.8h, v17.8h, v18.8h
   2632     sub             v7.8h, v16.8h, v4.8h
   2633     add             v5.8h, v17.8h, v18.8h
   2634     add             v6.8h, v6.8h, v7.8h
   2635     add             v4.8h, v16.8h, v4.8h
   2636     sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
   2637     add             v19.8h, v20.8h, v28.8h
   2638     add             v16.8h, v4.8h, v5.8h
   2639     sub             v20.8h, v4.8h, v5.8h
   2640     add             v5.8h, v28.8h, v29.8h
   2641     add             v29.8h, v29.8h, v21.8h
   2642     sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
   2643     sub             v28.8h, v19.8h, v29.8h
   2644     add             v18.8h, v7.8h, v6.8h
   2645     sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
   2646     sub             v22.8h, v7.8h, v6.8h
   2647     sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
   2648     sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
   2649     add             v6.8h, v21.8h, v5.8h
   2650     sub             v5.8h, v21.8h, v5.8h
   2651     add             v29.8h, v29.8h, v28.8h
   2652     add             v19.8h, v19.8h, v28.8h
   2653     add             v29.8h, v29.8h, v7.8h
   2654     add             v21.8h, v5.8h, v19.8h
   2655     sub             v19.8h, v5.8h, v19.8h
   2656     add             v17.8h, v6.8h, v29.8h
   2657     sub             v23.8h, v6.8h, v29.8h
   2658 
   2659     b.ne            1b
   2660 
   2661     /* store results */
   2662     st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
   2663     st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
   2664 
   2665     br              x30
   2666 
   2667     .unreq          DATA
   2668     .unreq          TMP
   2669 #undef XFIX_0_382683433
   2670 #undef XFIX_0_541196100
   2671 #undef XFIX_0_707106781
   2672 #undef XFIX_1_306562965
   2673 
   2674 
   2675 /*****************************************************************************/
   2676 
   2677 /*
   2678  * GLOBAL(void)
   2679  * jsimd_quantize_neon (JCOEFPTR coef_block, DCTELEM *divisors,
   2680  *                      DCTELEM *workspace);
   2681  *
   2682  */
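/* Rough scalar model of the loop below, assuming the divisors layout
 * produced by libjpeg-turbo's compute_reciprocal() (four consecutive
 * 64-entry sub-tables: reciprocal, correction, scale, shift):
 *
 *   sign   = x >> 15;                         // 0 or -1       (sshr)
 *   t      = (uint16_t)(abs(x) + corr[i]);    // abs, add
 *   q      = ((uint32_t)t * recip[i]) >> 16;  // umull, shrn #16
 *   q    >>= shift[i];                        // ushl by the negated shift
 *   out[i] = (q ^ sign) - sign;               // eor, sub restore the sign
 */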
   2683 asm_function jsimd_quantize_neon
   2684 
   2685     COEF_BLOCK      .req x0
   2686     DIVISORS        .req x1
   2687     WORKSPACE       .req x2
   2688 
   2689     RECIPROCAL      .req DIVISORS
   2690     CORRECTION      .req x9
   2691     SHIFT           .req x10
   2692     LOOP_COUNT      .req x11
   2693 
   2694     mov             LOOP_COUNT, #2
   2695     add             CORRECTION, DIVISORS, #(64 * 2)
   2696     add             SHIFT, DIVISORS, #(64 * 6)
   2697 1:
   2698     subs            LOOP_COUNT, LOOP_COUNT, #1
   2699     ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
   2700     ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
   2701     abs             v20.8h, v0.8h
   2702     abs             v21.8h, v1.8h
   2703     abs             v22.8h, v2.8h
   2704     abs             v23.8h, v3.8h
   2705     ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
   2706     add             v20.8h, v20.8h, v4.8h  /* add correction */
   2707     add             v21.8h, v21.8h, v5.8h
   2708     add             v22.8h, v22.8h, v6.8h
   2709     add             v23.8h, v23.8h, v7.8h
   2710     umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
   2711     umull2          v16.4s, v20.8h, v28.8h
   2712     umull           v5.4s, v21.4h, v29.4h
   2713     umull2          v17.4s, v21.8h, v29.8h
   2714     umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
   2715     umull2          v18.4s, v22.8h, v30.8h
   2716     umull           v7.4s, v23.4h, v31.4h
   2717     umull2          v19.4s, v23.8h, v31.8h
   2718     ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
   2719     shrn            v4.4h, v4.4s, #16
   2720     shrn            v5.4h, v5.4s, #16
   2721     shrn            v6.4h, v6.4s, #16
   2722     shrn            v7.4h, v7.4s, #16
   2723     shrn2           v4.8h, v16.4s, #16
   2724     shrn2           v5.8h, v17.4s, #16
   2725     shrn2           v6.8h, v18.4s, #16
   2726     shrn2           v7.8h, v19.4s, #16
   2727     neg             v24.8h, v24.8h
   2728     neg             v25.8h, v25.8h
   2729     neg             v26.8h, v26.8h
   2730     neg             v27.8h, v27.8h
   2731     sshr            v0.8h, v0.8h, #15  /* extract sign */
   2732     sshr            v1.8h, v1.8h, #15
   2733     sshr            v2.8h, v2.8h, #15
   2734     sshr            v3.8h, v3.8h, #15
   2735     ushl            v4.8h, v4.8h, v24.8h  /* shift */
   2736     ushl            v5.8h, v5.8h, v25.8h
   2737     ushl            v6.8h, v6.8h, v26.8h
   2738     ushl            v7.8h, v7.8h, v27.8h
   2739 
   2740     eor             v4.16b, v4.16b, v0.16b  /* restore sign */
   2741     eor             v5.16b, v5.16b, v1.16b
   2742     eor             v6.16b, v6.16b, v2.16b
   2743     eor             v7.16b, v7.16b, v3.16b
   2744     sub             v4.8h, v4.8h, v0.8h
   2745     sub             v5.8h, v5.8h, v1.8h
   2746     sub             v6.8h, v6.8h, v2.8h
   2747     sub             v7.8h, v7.8h, v3.8h
   2748     st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
   2749 
   2750     b.ne            1b
   2751 
   2752     br              x30  /* return */
   2753 
   2754     .unreq          COEF_BLOCK
   2755     .unreq          DIVISORS
   2756     .unreq          WORKSPACE
   2757     .unreq          RECIPROCAL
   2758     .unreq          CORRECTION
   2759     .unreq          SHIFT
   2760     .unreq          LOOP_COUNT
   2761 
   2762 
   2763 /*****************************************************************************/
   2764 
   2765 /*
   2766  * Downsample pixel values of a single component.
   2767  * This version handles the common case of 2:1 horizontal and 1:1 vertical,
   2768  * without smoothing.
   2769  *
   2770  * GLOBAL(void)
   2771  * jsimd_h2v1_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
   2772  *                             JDIMENSION v_samp_factor,
   2773  *                             JDIMENSION width_blocks, JSAMPARRAY input_data,
   2774  *                             JSAMPARRAY output_data);
   2775  */
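/* Scalar equivalent (cf. h2v1_downsample() in jcsample.c): each output is
 * (in[2*i] + in[2*i+1] + bias) >> 1, with bias alternating 0,1,0,1,...
 * The NEON code gets the alternating bias for free by seeding the pairwise
 * accumulator with 0x00010000 per 32-bit lane ({0,1,0,1,...} as 16-bit
 * lanes) before uadalp.  The sixteen shuffle tables below implement
 * expand_right_edge(): when the image width is not a multiple of 16, the
 * last valid byte is replicated across the tail of the final vector, with
 * diff = 16 * width_blocks - image_width selecting the table. */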
   2776 
   2777 .balign 16
   2778 Ljsimd_h2_downsample_neon_consts:
   2779   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
   2780         0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
   2781   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
   2782         0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
   2783   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
   2784         0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
   2785   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
   2786         0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
   2787   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
   2788         0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
   2789   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
   2790         0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
   2791   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
   2792         0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
   2793   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
   2794         0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
   2795   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
   2796         0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
   2797   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
   2798         0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
   2799   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
   2800         0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
   2801   .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
   2802         0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
   2803   .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
   2804         0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
   2805   .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
   2806         0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
   2807   .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
   2808         0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
   2809   .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
   2810         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
   2811 
   2812 asm_function jsimd_h2v1_downsample_neon
   2813     IMAGE_WIDTH     .req x0
   2814     MAX_V_SAMP      .req x1
   2815     V_SAMP          .req x2
   2816     BLOCK_WIDTH     .req x3
   2817     INPUT_DATA      .req x4
   2818     OUTPUT_DATA     .req x5
   2819     OUTPTR          .req x9
   2820     INPTR           .req x10
   2821     TMP1            .req x11
   2822     TMP2            .req x12
   2823     TMP3            .req x13
   2824     TMPDUP          .req w15
   2825 
   2826     mov             TMPDUP, #0x10000
   2827     lsl             TMP2, BLOCK_WIDTH, #4
   2828     sub             TMP2, TMP2, IMAGE_WIDTH
   2829     adr             TMP3, Ljsimd_h2_downsample_neon_consts
   2830     add             TMP3, TMP3, TMP2, lsl #4
   2831     dup             v16.4s, TMPDUP
   2832     ld1             {v18.16b}, [TMP3]
   2833 
   2834 1:  /* row loop */
   2835     ldr             INPTR, [INPUT_DATA], #8
   2836     ldr             OUTPTR, [OUTPUT_DATA], #8
   2837     subs            TMP1, BLOCK_WIDTH, #1
   2838     b.eq            3f
   2839 2:  /* columns */
   2840     ld1             {v0.16b}, [INPTR], #16
   2841     mov             v4.16b, v16.16b
   2842     subs            TMP1, TMP1, #1
   2843     uadalp          v4.8h, v0.16b
   2844     shrn            v6.8b, v4.8h, #1
   2845     st1             {v6.8b}, [OUTPTR], #8
   2846     b.ne            2b
   2847 3:  /* last columns */
   2848     ld1             {v0.16b}, [INPTR]
   2849     mov             v4.16b, v16.16b
   2850     subs            V_SAMP, V_SAMP, #1
   2851     /* expand right */
   2852     tbl             v2.16b, {v0.16b}, v18.16b
   2853     uadalp          v4.8h, v2.16b
   2854     shrn            v6.8b, v4.8h, #1
   2855     st1             {v6.8b}, [OUTPTR], #8
   2856     b.ne            1b
   2857 
   2858     br              x30
   2859 
   2860     .unreq          IMAGE_WIDTH
   2861     .unreq          MAX_V_SAMP
   2862     .unreq          V_SAMP
   2863     .unreq          BLOCK_WIDTH
   2864     .unreq          INPUT_DATA
   2865     .unreq          OUTPUT_DATA
   2866     .unreq          OUTPTR
   2867     .unreq          INPTR
   2868     .unreq          TMP1
   2869     .unreq          TMP2
   2870     .unreq          TMP3
   2871     .unreq          TMPDUP
   2872 
   2873 
   2874 /*****************************************************************************/
   2875 
   2876 /*
   2877  * Downsample pixel values of a single component.
   2878  * This version handles the common case of 2:1 horizontal and 2:1 vertical,
   2879  * without smoothing.
   2880  *
   2881  * GLOBAL(void)
   2882  * jsimd_h2v2_downsample_neon (JDIMENSION image_width, int max_v_samp_factor,
   2883  *                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
   2884  *                             JSAMPARRAY input_data, JSAMPARRAY output_data);
   2885  */
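/* Scalar equivalent (cf. h2v2_downsample() in jcsample.c): each output
 * averages a 2x2 block, (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias)
 * >> 2, with bias alternating 1,2,1,2,...  Seeding the accumulator with
 * 0x00020001 per 32-bit lane ({1,2,1,2,...} as 16-bit lanes) provides the
 * bias ahead of the two uadalp passes. */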
   2886 
   2887 .balign 16
   2888 asm_function jsimd_h2v2_downsample_neon
   2889     IMAGE_WIDTH     .req x0
   2890     MAX_V_SAMP      .req x1
   2891     V_SAMP          .req x2
   2892     BLOCK_WIDTH     .req x3
   2893     INPUT_DATA      .req x4
   2894     OUTPUT_DATA     .req x5
   2895     OUTPTR          .req x9
   2896     INPTR0          .req x10
   2897     INPTR1          .req x14
   2898     TMP1            .req x11
   2899     TMP2            .req x12
   2900     TMP3            .req x13
   2901     TMPDUP          .req w15
   2902 
   2903     mov             TMPDUP, #1
   2904     lsl             TMP2, BLOCK_WIDTH, #4
   2905     lsl             TMPDUP, TMPDUP, #17
   2906     sub             TMP2, TMP2, IMAGE_WIDTH
   2907     adr             TMP3, Ljsimd_h2_downsample_neon_consts
   2908     orr             TMPDUP, TMPDUP, #1
   2909     add             TMP3, TMP3, TMP2, lsl #4
   2910     dup             v16.4s, TMPDUP
   2911     ld1             {v18.16b}, [TMP3]
   2912 
   2913 1:  /* row loop */
   2914     ldr             INPTR0, [INPUT_DATA], #8
   2915     ldr             OUTPTR, [OUTPUT_DATA], #8
   2916     ldr             INPTR1, [INPUT_DATA], #8
   2917     subs            TMP1, BLOCK_WIDTH, #1
   2918     b.eq            3f
   2919 2:  /* columns */
   2920     ld1             {v0.16b}, [INPTR0], #16
   2921     ld1             {v1.16b}, [INPTR1], #16
   2922     mov             v4.16b, v16.16b
   2923     subs            TMP1, TMP1, #1
   2924     uadalp          v4.8h, v0.16b
   2925     uadalp          v4.8h, v1.16b
   2926     shrn            v6.8b, v4.8h, #2
   2927     st1             {v6.8b}, [OUTPTR], #8
   2928     b.ne            2b
   2929 3:  /* last columns */
   2930     ld1             {v0.16b}, [INPTR0], #16
   2931     ld1             {v1.16b}, [INPTR1], #16
   2932     mov             v4.16b, v16.16b
   2933     subs            V_SAMP, V_SAMP, #1
   2934     /* expand right */
   2935     tbl             v2.16b, {v0.16b}, v18.16b
   2936     tbl             v3.16b, {v1.16b}, v18.16b
   2937     uadalp          v4.8h, v2.16b
   2938     uadalp          v4.8h, v3.16b
   2939     shrn            v6.8b, v4.8h, #2
   2940     st1             {v6.8b}, [OUTPTR], #8
   2941     b.ne            1b
   2942 
   2943     br              x30
   2944 
   2945     .unreq          IMAGE_WIDTH
   2946     .unreq          MAX_V_SAMP
   2947     .unreq          V_SAMP
   2948     .unreq          BLOCK_WIDTH
   2949     .unreq          INPUT_DATA
   2950     .unreq          OUTPUT_DATA
   2951     .unreq          OUTPTR
   2952     .unreq          INPTR0
   2953     .unreq          INPTR1
   2954     .unreq          TMP1
   2955     .unreq          TMP2
   2956     .unreq          TMP3
   2957     .unreq          TMPDUP
   2958 
   2959 
   2960 /*****************************************************************************/
   2961 
   2962 /*
   2963  * GLOBAL(JOCTET*)
   2964  * jsimd_huff_encode_one_block (working_state *state, JOCTET *buffer,
   2965  *                              JCOEFPTR block, int last_dc_val,
   2966  *                              c_derived_tbl *dctbl, c_derived_tbl *actbl)
   2967  *
   2968  */
   2969 
   2970     BUFFER          .req x1
   2971     PUT_BUFFER      .req x6
   2972     PUT_BITS        .req x7
   2973     PUT_BITSw       .req w7
   2974 
   2975 .macro emit_byte
   2976     sub             PUT_BITS, PUT_BITS, #0x8
   2977     lsr             x19, PUT_BUFFER, PUT_BITS
   2978     uxtb            w19, w19
   2979     strb            w19, [BUFFER, #1]!
   2980     cmp             w19, #0xff
   2981     b.ne            14f
   2982     strb            wzr, [BUFFER, #1]!
   2983 14:
   2984 .endm
   2985 .macro put_bits CODE, SIZE
   2986     lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
   2987     add             PUT_BITS, PUT_BITS, \SIZE
   2988     orr             PUT_BUFFER, PUT_BUFFER, \CODE
   2989 .endm
   2990 .macro checkbuf31
   2991     cmp             PUT_BITS, #0x20
   2992     b.lt            31f
   2993     emit_byte
   2994     emit_byte
   2995     emit_byte
   2996     emit_byte
   2997 31:
   2998 .endm
   2999 .macro checkbuf47
   3000     cmp             PUT_BITS, #0x30
   3001     b.lt            47f
   3002     emit_byte
   3003     emit_byte
   3004     emit_byte
   3005     emit_byte
   3006     emit_byte
   3007     emit_byte
   3008 47:
   3009 .endm
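/* Rough scalar model of the bit buffer maintained by the macros above
 * (cf. the bit-emission logic in jchuff.c):
 *
 *   put_buffer = (put_buffer << size) | code;  put_bits += size;
 *   // emit_byte:
 *   put_bits -= 8;
 *   c = (put_buffer >> put_bits) & 0xFF;
 *   *++buffer = c;
 *   if (c == 0xFF) *++buffer = 0;              // JPEG byte stuffing
 *
 * checkbuf31/checkbuf47 flush four or six bytes once 32 or 48 bits have
 * accumulated, keeping the 64-bit put_buffer from overflowing. */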
   3010 
   3011 .macro generate_jsimd_huff_encode_one_block fast_tbl
   3012 
   3013 .balign 16
   3014 .if \fast_tbl == 1
   3015 Ljsimd_huff_encode_one_block_neon_consts:
   3016 .else
   3017 Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
   3018 .endif
   3019     .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
   3020           0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
   3021 .if \fast_tbl == 1
   3022     .byte    0,   1,   2,   3,  16,  17,  32,  33, \
   3023             18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
   3024     .byte   34,  35,  48,  49, 255, 255,  50,  51, \
   3025             36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
   3026     .byte    8,   9,  22,  23,  36,  37,  50,  51, \
   3027            255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
   3028     .byte   54,  55,  40,  41,  26,  27,  12,  13, \
   3029             14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
   3030     .byte    6,   7,  20,  21,  34,  35,  48,  49, \
   3031             50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
   3032     .byte   42,  43,  28,  29,  14,  15,  30,  31, \
   3033             44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
   3034     .byte  255, 255, 255, 255,  56,  57,  42,  43, \
   3035             28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
   3036     .byte   26,  27,  40,  41,  42,  43,  28,  29, \
   3037             14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
   3038     .byte  255, 255, 255, 255,   0,   1, 255, 255, \
   3039            255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
   3040     .byte  255, 255, 255, 255, 255, 255, 255, 255, \
   3041              0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
   3042     .byte  255, 255, 255, 255, 255, 255, 255, 255, \
   3043            255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
   3044     .byte    4,   5,   6,   7, 255, 255, 255, 255, \
   3045            255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
   3046 .endif
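/* The twelve byte tables above drive the tbl/tbx shuffles that gather the
 * 64 coefficients (one 8-coefficient row per register, v24-v31) into JPEG
 * zigzag order.  tbl writes 0 for the out-of-range index 255; the
 * follow-up tbx instructions then fill exactly those lanes from registers
 * outside the first lookup's window, leaving all other lanes untouched. */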
   3047 
   3048 .if \fast_tbl == 1
   3049 asm_function jsimd_huff_encode_one_block_neon
   3050 .else
   3051 asm_function jsimd_huff_encode_one_block_neon_slowtbl
   3052 .endif
   3053     sub             sp, sp, 272
   3054     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
   3055     /* Save ARM registers */
   3056     stp             x19, x20, [sp]
   3057 .if \fast_tbl == 1
   3058     adr             x15, Ljsimd_huff_encode_one_block_neon_consts
   3059 .else
   3060     adr             x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
   3061 .endif
   3062     ldr             PUT_BUFFER, [x0, #0x10]
   3063     ldr             PUT_BITSw, [x0, #0x18]
   3064     ldrsh           w12, [x2]               /* load DC coeff in w12 */
   3065     /* prepare data */
   3066 .if \fast_tbl == 1
   3067     ld1             {v23.16b}, [x15], #16
   3068     ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
   3069     ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
   3070     ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
   3071     ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
   3072     ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
   3073     sub             w12, w12, w3      /* last_dc_val, not used afterwards */
   3074     /* ZigZag 8x8 */
   3075     tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
   3076     tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
   3077     tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
   3078     tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
   3079     tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
   3080     tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
   3081     tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
   3082     tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
   3083     ins             v0.h[0], w12
   3084     tbx             v1.16b, {v28.16b}, v16.16b
   3085     tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
   3086     tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
   3087     tbx             v6.16b, {v31.16b}, v19.16b
   3088 .else
   3089       add             x13, x2, #0x22
   3090       sub             w12, w12, w3    /* last_dc_val, not used afterwards */
   3091     ld1             {v23.16b}, [x15]
   3092       add             x14, x2, #0x18
   3093       add             x3, x2, #0x36
   3094     ins             v0.h[0], w12
   3095       add             x9, x2, #0x2
   3096     ld1             {v1.h}[0], [x13]
   3097       add             x15, x2, #0x30
   3098     ld1             {v2.h}[0], [x14]
   3099       add             x19, x2, #0x26
   3100     ld1             {v3.h}[0], [x3]
   3101       add             x20, x2, #0x28
   3102     ld1             {v0.h}[1], [x9]
   3103       add             x12, x2, #0x10
   3104     ld1             {v1.h}[1], [x15]
   3105       add             x13, x2, #0x40
   3106     ld1             {v2.h}[1], [x19]
   3107       add             x14, x2, #0x34
   3108     ld1             {v3.h}[1], [x20]
   3109       add             x3, x2, #0x1a
   3110     ld1             {v0.h}[2], [x12]
   3111       add             x9, x2, #0x20
   3112     ld1             {v1.h}[2], [x13]
   3113       add             x15, x2, #0x32
   3114     ld1             {v2.h}[2], [x14]
   3115       add             x19, x2, #0x42
   3116     ld1             {v3.h}[2], [x3]
   3117       add             x20, x2, #0xc
   3118     ld1             {v0.h}[3], [x9]
   3119       add             x12, x2, #0x12
   3120     ld1             {v1.h}[3], [x15]
   3121       add             x13, x2, #0x24
   3122     ld1             {v2.h}[3], [x19]
   3123       add             x14, x2, #0x50
   3124     ld1             {v3.h}[3], [x20]
   3125       add             x3, x2, #0xe
   3126     ld1             {v0.h}[4], [x12]
   3127       add             x9, x2, #0x4
   3128     ld1             {v1.h}[4], [x13]
   3129       add             x15, x2, #0x16
   3130     ld1             {v2.h}[4], [x14]
   3131       add             x19, x2, #0x60
   3132     ld1             {v3.h}[4], [x3]
   3133       add             x20, x2, #0x1c
   3134     ld1             {v0.h}[5], [x9]
   3135       add             x12, x2, #0x6
   3136     ld1             {v1.h}[5], [x15]
   3137       add             x13, x2, #0x8
   3138     ld1             {v2.h}[5], [x19]
   3139       add             x14, x2, #0x52
   3140     ld1             {v3.h}[5], [x20]
   3141       add             x3, x2, #0x2a
   3142     ld1             {v0.h}[6], [x12]
   3143       add             x9, x2, #0x14
   3144     ld1             {v1.h}[6], [x13]
   3145       add             x15, x2, #0xa
   3146     ld1             {v2.h}[6], [x14]
   3147       add             x19, x2, #0x44
   3148     ld1             {v3.h}[6], [x3]
   3149       add             x20, x2, #0x38
   3150     ld1             {v0.h}[7], [x9]
   3151       add             x12, x2, #0x46
   3152     ld1             {v1.h}[7], [x15]
   3153       add             x13, x2, #0x3a
   3154     ld1             {v2.h}[7], [x19]
   3155       add             x14, x2, #0x74
   3156     ld1             {v3.h}[7], [x20]
   3157       add             x3, x2, #0x6a
   3158     ld1             {v4.h}[0], [x12]
   3159       add             x9, x2, #0x54
   3160     ld1             {v5.h}[0], [x13]
   3161       add             x15, x2, #0x2c
   3162     ld1             {v6.h}[0], [x14]
   3163       add             x19, x2, #0x76
   3164     ld1             {v7.h}[0], [x3]
   3165       add             x20, x2, #0x78
   3166     ld1             {v4.h}[1], [x9]
   3167       add             x12, x2, #0x62
   3168     ld1             {v5.h}[1], [x15]
   3169       add             x13, x2, #0x1e
   3170     ld1             {v6.h}[1], [x19]
   3171       add             x14, x2, #0x68
   3172     ld1             {v7.h}[1], [x20]
   3173       add             x3, x2, #0x7a
   3174     ld1             {v4.h}[2], [x12]
   3175       add             x9, x2, #0x70
   3176     ld1             {v5.h}[2], [x13]
   3177       add             x15, x2, #0x2e
   3178     ld1             {v6.h}[2], [x14]
   3179       add             x19, x2, #0x5a
   3180     ld1             {v7.h}[2], [x3]
   3181       add             x20, x2, #0x6c
   3182     ld1             {v4.h}[3], [x9]
   3183       add             x12, x2, #0x72
   3184     ld1             {v5.h}[3], [x15]
   3185       add             x13, x2, #0x3c
   3186     ld1             {v6.h}[3], [x19]
   3187       add             x14, x2, #0x4c
   3188     ld1             {v7.h}[3], [x20]
   3189       add             x3, x2, #0x5e
   3190     ld1             {v4.h}[4], [x12]
   3191       add             x9, x2, #0x64
   3192     ld1             {v5.h}[4], [x13]
   3193       add             x15, x2, #0x4a
   3194     ld1             {v6.h}[4], [x14]
   3195       add             x19, x2, #0x3e
   3196     ld1             {v7.h}[4], [x3]
   3197       add             x20, x2, #0x6e
   3198     ld1             {v4.h}[5], [x9]
   3199       add             x12, x2, #0x56
   3200     ld1             {v5.h}[5], [x15]
   3201       add             x13, x2, #0x58
   3202     ld1             {v6.h}[5], [x19]
   3203       add             x14, x2, #0x4e
   3204     ld1             {v7.h}[5], [x20]
   3205       add             x3, x2, #0x7c
   3206     ld1             {v4.h}[6], [x12]
   3207       add             x9, x2, #0x48
   3208     ld1             {v5.h}[6], [x13]
   3209       add             x15, x2, #0x66
   3210     ld1             {v6.h}[6], [x14]
   3211       add             x19, x2, #0x5c
   3212     ld1             {v7.h}[6], [x3]
   3213       add             x20, x2, #0x7e
   3214     ld1             {v4.h}[7], [x9]
   3215     ld1             {v5.h}[7], [x15]
   3216     ld1             {v6.h}[7], [x19]
   3217     ld1             {v7.h}[7], [x20]
   3218 .endif
   3219     cmlt            v24.8h, v0.8h, #0
   3220     cmlt            v25.8h, v1.8h, #0
   3221     cmlt            v26.8h, v2.8h, #0
   3222     cmlt            v27.8h, v3.8h, #0
   3223     cmlt            v28.8h, v4.8h, #0
   3224     cmlt            v29.8h, v5.8h, #0
   3225     cmlt            v30.8h, v6.8h, #0
   3226     cmlt            v31.8h, v7.8h, #0
   3227     abs             v0.8h, v0.8h
   3228     abs             v1.8h, v1.8h
   3229     abs             v2.8h, v2.8h
   3230     abs             v3.8h, v3.8h
   3231     abs             v4.8h, v4.8h
   3232     abs             v5.8h, v5.8h
   3233     abs             v6.8h, v6.8h
   3234     abs             v7.8h, v7.8h
   3235     eor             v24.16b, v24.16b, v0.16b
   3236     eor             v25.16b, v25.16b, v1.16b
   3237     eor             v26.16b, v26.16b, v2.16b
   3238     eor             v27.16b, v27.16b, v3.16b
   3239     eor             v28.16b, v28.16b, v4.16b
   3240     eor             v29.16b, v29.16b, v5.16b
   3241     eor             v30.16b, v30.16b, v6.16b
   3242     eor             v31.16b, v31.16b, v7.16b
   3243     cmeq            v16.8h, v0.8h, #0
   3244     cmeq            v17.8h, v1.8h, #0
   3245     cmeq            v18.8h, v2.8h, #0
   3246     cmeq            v19.8h, v3.8h, #0
   3247     cmeq            v20.8h, v4.8h, #0
   3248     cmeq            v21.8h, v5.8h, #0
   3249     cmeq            v22.8h, v6.8h, #0
   3250     xtn             v16.8b, v16.8h
   3251     xtn             v18.8b, v18.8h
   3252     xtn             v20.8b, v20.8h
   3253     xtn             v22.8b, v22.8h
   3254       umov            w14, v0.h[0]
   3255     xtn2            v16.16b, v17.8h
   3256       umov            w13, v24.h[0]
   3257     xtn2            v18.16b, v19.8h
   3258       clz             w14, w14
   3259     xtn2            v20.16b, v21.8h
   3260       lsl             w13, w13, w14
   3261     cmeq            v17.8h, v7.8h, #0
   3262       sub             w12, w14, #32
   3263     xtn2            v22.16b, v17.8h
   3264       lsr             w13, w13, w14
   3265     and             v16.16b, v16.16b, v23.16b
   3266       neg             w12, w12
   3267     and             v18.16b, v18.16b, v23.16b
3268       add             x3, x4, #0x400           /* x3 = dctbl->ehufsi */
   3269     and             v20.16b, v20.16b, v23.16b
   3270       add             x15, sp, #0x90           /* x15 = t2 */
   3271     and             v22.16b, v22.16b, v23.16b
   3272       ldr             w10, [x4, x12, lsl #2]
   3273     addp            v16.16b, v16.16b, v18.16b
   3274       ldrb            w11, [x3, x12]
   3275     addp            v20.16b, v20.16b, v22.16b
   3276       checkbuf47
   3277     addp            v16.16b, v16.16b, v20.16b
   3278       put_bits        x10, x11
   3279     addp            v16.16b, v16.16b, v18.16b
   3280       checkbuf47
3281     umov            x9, v16.D[0]
   3282       put_bits        x13, x12
   3283     cnt             v17.8b, v16.8b
   3284       mvn             x9, x9
   3285     addv            B18, v17.8b
   3286       add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
   3287     umov            w12, v18.b[0]
   3288       lsr             x9, x9, #0x1     /* clear AC coeff */
   3289     ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
   3290     rbit            x9, x9             /* x9 = index0 */
   3291     ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
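    /* w12 = number of zero coefficients (population count of the zero
     * bitmap).  With fewer than 56 zeros (more than 8 nonzero
     * coefficients) it pays to precompute every magnitude and bit length
     * in NEON at 4: below; otherwise fall through to the sparse
     * coefficient-by-coefficient loop. */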
   3292     cmp             w12, #(64-8)
   3293     add             x11, sp, #16
   3294     b.lt            4f
   3295     cbz             x9, 6f
   3296     st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
   3297     st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
   3298     st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
   3299     st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
   3300 1:
   3301     clz             x2, x9
   3302     add             x15, x15, x2, lsl #1
   3303     lsl             x9, x9, x2
   3304     ldrh            w20, [x15, #-126]
   3305 2:
   3306     cmp             x2, #0x10
   3307     b.lt            3f
   3308     sub             x2, x2, #0x10
   3309     checkbuf47
   3310     put_bits        x13, x14
   3311     b               2b
   3312 3:
   3313     clz             w20, w20
   3314     ldrh            w3, [x15, #2]!
   3315     sub             w11, w20, #32
   3316     lsl             w3, w3, w20
   3317     neg             w11, w11
   3318     lsr             w3, w3, w20
   3319     add             x2, x11, x2, lsl #4
   3320     lsl             x9, x9, #0x1
   3321     ldr             w12, [x5, x2, lsl #2]
   3322     ldrb            w10, [x4, x2]
   3323     checkbuf31
   3324     put_bits        x12, x10
   3325     put_bits        x3, x11
   3326     cbnz            x9, 1b
   3327     b               6f
   3328 4:
   3329     movi            v21.8h, #0x0010
   3330     clz             v0.8h, v0.8h
   3331     clz             v1.8h, v1.8h
   3332     clz             v2.8h, v2.8h
   3333     clz             v3.8h, v3.8h
   3334     clz             v4.8h, v4.8h
   3335     clz             v5.8h, v5.8h
   3336     clz             v6.8h, v6.8h
   3337     clz             v7.8h, v7.8h
   3338     ushl            v24.8h, v24.8h, v0.8h
   3339     ushl            v25.8h, v25.8h, v1.8h
   3340     ushl            v26.8h, v26.8h, v2.8h
   3341     ushl            v27.8h, v27.8h, v3.8h
   3342     ushl            v28.8h, v28.8h, v4.8h
   3343     ushl            v29.8h, v29.8h, v5.8h
   3344     ushl            v30.8h, v30.8h, v6.8h
   3345     ushl            v31.8h, v31.8h, v7.8h
   3346     neg             v0.8h, v0.8h
   3347     neg             v1.8h, v1.8h
   3348     neg             v2.8h, v2.8h
   3349     neg             v3.8h, v3.8h
   3350     neg             v4.8h, v4.8h
   3351     neg             v5.8h, v5.8h
   3352     neg             v6.8h, v6.8h
   3353     neg             v7.8h, v7.8h
   3354     ushl            v24.8h, v24.8h, v0.8h
   3355     ushl            v25.8h, v25.8h, v1.8h
   3356     ushl            v26.8h, v26.8h, v2.8h
   3357     ushl            v27.8h, v27.8h, v3.8h
   3358     ushl            v28.8h, v28.8h, v4.8h
   3359     ushl            v29.8h, v29.8h, v5.8h
   3360     ushl            v30.8h, v30.8h, v6.8h
   3361     ushl            v31.8h, v31.8h, v7.8h
   3362     add             v0.8h, v21.8h, v0.8h
   3363     add             v1.8h, v21.8h, v1.8h
   3364     add             v2.8h, v21.8h, v2.8h
   3365     add             v3.8h, v21.8h, v3.8h
   3366     add             v4.8h, v21.8h, v4.8h
   3367     add             v5.8h, v21.8h, v5.8h
   3368     add             v6.8h, v21.8h, v6.8h
   3369     add             v7.8h, v21.8h, v7.8h
   3370     st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
   3371     st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
   3372     st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
   3373     st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
   3374 1:
   3375     clz             x2, x9
   3376     add             x15, x15, x2, lsl #1
   3377     lsl             x9, x9, x2
   3378     ldrh            w11, [x15, #-126]
   3379 2:
   3380     cmp             x2, #0x10
   3381     b.lt            3f
   3382     sub             x2, x2, #0x10
   3383     checkbuf47
   3384     put_bits        x13, x14
   3385     b               2b
   3386 3:
   3387     ldrh            w3, [x15, #2]!
   3388     add             x2, x11, x2, lsl #4
   3389     lsl             x9, x9, #0x1
   3390     ldr             w12, [x5, x2, lsl #2]
   3391     ldrb            w10, [x4, x2]
   3392     checkbuf31
   3393     put_bits        x12, x10
   3394     put_bits        x3, x11
   3395     cbnz            x9, 1b
   3396 6:
   3397     add             x13, sp, #0x10e
   3398     cmp             x15, x13
   3399     b.hs            1f
   3400     ldr             w12, [x5]
   3401     ldrb            w14, [x4]
   3402     checkbuf47
   3403     put_bits        x12, x14
   3404 1:
   3405     str             PUT_BUFFER, [x0, #0x10]
   3406     str             PUT_BITSw, [x0, #0x18]
   3407     ldp             x19, x20, [sp], 16
   3408     add             x0, BUFFER, #0x1
   3409     add             sp, sp, 256
   3410     br              x30
   3411 
   3412 .endm
   3413 
   3414 generate_jsimd_huff_encode_one_block 1
   3415 generate_jsimd_huff_encode_one_block 0
   3416 
   3417     .unreq          BUFFER
   3418     .unreq          PUT_BUFFER
   3419     .unreq          PUT_BITS
   3420     .unreq          PUT_BITSw
   3421 
   3422 .purgem emit_byte
   3423 .purgem put_bits
   3424 .purgem checkbuf31
   3425 .purgem checkbuf47
   3426