/* Home | History | Annotate | Download | only in arm64 -- code-browser navigation residue, kept as a comment so the file assembles */
      1 /*
      2  * ARMv8 NEON optimizations for libjpeg-turbo
      3  *
      4  * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
      5  *                          All Rights Reserved.
      6  * Author:  Siarhei Siamashka <siarhei.siamashka (at) nokia.com>
      7  * Copyright (C) 2013-2014, Linaro Limited.  All Rights Reserved.
      8  * Author:  Ragesh Radhakrishnan <ragesh.r (at) linaro.org>
      9  * Copyright (C) 2014-2016, D. R. Commander.  All Rights Reserved.
     10  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
     11  * Copyright (C) 2016, Siarhei Siamashka.  All Rights Reserved.
     12  *
     13  * This software is provided 'as-is', without any express or implied
     14  * warranty.  In no event will the authors be held liable for any damages
     15  * arising from the use of this software.
     16  *
     17  * Permission is granted to anyone to use this software for any purpose,
     18  * including commercial applications, and to alter it and redistribute it
     19  * freely, subject to the following restrictions:
     20  *
     21  * 1. The origin of this software must not be misrepresented; you must not
     22  *    claim that you wrote the original software. If you use this software
     23  *    in a product, an acknowledgment in the product documentation would be
     24  *    appreciated but is not required.
     25  * 2. Altered source versions must be plainly marked as such, and must not be
     26  *    misrepresented as being the original software.
     27  * 3. This notice may not be removed or altered from any source distribution.
     28  */
     29 
/* On Linux/ELF, emit an empty .note.GNU-stack section so the linker does not
   assume this object needs an executable stack. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
#endif

/* Place the read-only constant tables below in the platform's appropriate
   section: Mach-O has no .rodata, so __DATA,__const is used on Apple. */
#if defined(__APPLE__)
.section __DATA,__const
#else
.section .rodata, "a", %progbits
#endif
     39 
/* Fixed-point multipliers for the accurate (islow) inverse DCT.
   FIX(x) = round(x * 2^13), matching CONST_BITS == 13 used by the kernel. */
#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

/* 16 halfwords (two q-registers' worth); signs are baked into the table so
   the kernel can use plain smull/smlal with positive/negative lanes. */
.balign 16
Ljsimd_idct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short - F_0_899
  .short F_1_175
  .short F_1_501
  .short - F_1_847
  .short - F_1_961
  .short F_2_053
  .short - F_2_562
  .short F_3_072
  .short 0          /* padding */
  .short 0
  .short 0
  .short 0

/* The F_* names are reused (with the same values) for the forward DCT table
   further down, so undefine them here. */
#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072
     84 
     85 
/* Lane aliases for the ifast IDCT multipliers, loaded into v0 from the
   table below. */
#define XFIX_1_082392200  v0.h[0]
#define XFIX_1_414213562  v0.h[1]
#define XFIX_1_847759065  v0.h[2]
#define XFIX_2_613125930  v0.h[3]

/* Each value is (round(x * 256) - bias) * 128, i.e. the fractional part of
   the multiplier pre-scaled for the ifast kernel's sqdmulh-style fixed-point
   arithmetic -- presumably combined with an implicit +1.0 (or +2.0) term in
   the kernel; confirm against the ifast IDCT code. */
.balign 16
Ljsimd_idct_ifast_neon_consts:
  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
     97 
/* CONST_BITS: fractional bits of the fixed-point multipliers below.
   PASS1_BITS: extra scaling carried between the two IDCT passes. */
#define CONST_BITS  13
#define PASS1_BITS  2

/* FIX(x) = round(x * 2^CONST_BITS) = round(x * 8192). */
#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */

/* Multipliers for the reduced-size 4x4 inverse DCT; the inline comments name
   the vector lane each halfword ends up in after loading. */
.balign 16
Ljsimd_idct_4x4_neon_consts:
  .short FIX_1_847759065        /* v0.h[0] */
  .short -FIX_0_765366865       /* v0.h[1] */
  .short -FIX_0_211164243       /* v0.h[2] */
  .short FIX_1_451774981        /* v0.h[3] */
  .short -FIX_2_172734803       /* d1[0] */
  .short FIX_1_061594337        /* d1[1] */
  .short -FIX_0_509795579      /* d1[2] */
  .short -FIX_0_601344887       /* d1[3] */
  .short FIX_0_899976223        /* v2.h[0] */
  .short FIX_2_562915447        /* v2.h[1] */
  .short 1 << (CONST_BITS + 1)  /* v2.h[2] -- rounding constant, 2^14 */
  .short 0                      /* v2.h[3] */
    130 
/* Multipliers for the reduced-size 2x2 inverse DCT (one 64-bit load). */
.balign 8
Ljsimd_idct_2x2_neon_consts:
  .short -FIX_0_720959822  /* v14[0] */
  .short FIX_0_850430095   /* v14[1] */
  .short -FIX_1_272758580  /* v14[2] */
  .short FIX_3_624509785   /* v14[3] */
    137 
/* YCbCr <-> color conversion coefficient tables.
   NOTE(review): "colorid" reads like a template placeholder for the concrete
   colorspace name (e.g. rgb/bgr) -- confirm against the code that references
   these labels before renaming anything. */
.balign 16
Ljsimd_ycc_colorid_neon_consts:
  .short 0,      0,     0,      0
  .short 22971, -11277, -23401, 29033
  .short -128,  -128,   -128,   -128
  .short -128,  -128,   -128,   -128

.balign 16
Ljsimd_colorid_ycc_neon_consts:
  .short 19595, 38470, 7471, 11059
  .short 21709, 32768, 27439, 5329
  .short 32767, 128, 32767, 128
  .short 32767, 128, 32767, 128
    151 
/* Fixed-point multipliers for the accurate (islow) forward DCT.
   Same values and sign layout as the islow IDCT table above. */
#define F_0_298   2446  /* FIX(0.298631336) */
#define F_0_390   3196  /* FIX(0.390180644) */
#define F_0_541   4433  /* FIX(0.541196100) */
#define F_0_765   6270  /* FIX(0.765366865) */
#define F_0_899   7373  /* FIX(0.899976223) */
#define F_1_175   9633  /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

.balign 16
Ljsimd_fdct_islow_neon_consts:
  .short F_0_298
  .short -F_0_390
  .short F_0_541
  .short F_0_765
  .short - F_0_899
  .short F_1_175
  .short F_1_501
  .short - F_1_847
  .short - F_1_961
  .short F_2_053
  .short - F_2_562
  .short F_3_072
  .short 0          /* padding */
  .short 0
  .short 0
  .short 0

#undef F_0_298
#undef F_0_390
#undef F_0_541
#undef F_0_765
#undef F_0_899
#undef F_1_175
#undef F_1_501
#undef F_1_847
#undef F_1_961
#undef F_2_053
#undef F_2_562
#undef F_3_072

/* Multipliers for the fast forward DCT, Q7-style scaling (value * 128),
   with the final entry biased by -256*128 to fit in a signed halfword. */
.balign 16
Ljsimd_fdct_ifast_neon_consts:
  .short (98 * 128)               /* XFIX_0_382683433 */
  .short (139 * 128)              /* XFIX_0_541196100 */
  .short (181 * 128)              /* XFIX_0_707106781 */
  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
    203 
/* Byte-permutation index rows for h2 downsampling edge expansion.
   Row "diff N" maps 16 source bytes so that the last (16 - N) valid sample
   index is replicated into the trailing N lanes -- presumably consumed via
   the tbl instruction to pad rows whose width is short by N samples;
   confirm in the downsample kernels. */
.balign 16
Ljsimd_h2_downsample_neon_consts:
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
    238 
    239 Ljsimd_huff_encode_one_block_neon_consts:
    240     .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
    241           0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
    242     .byte    0,   1,   2,   3,  16,  17,  32,  33, \
    243             18,  19,   4,   5,   6,   7,  20,  21  /* L0 => L3 : 4 lines OK */
    244     .byte   34,  35,  48,  49, 255, 255,  50,  51, \
    245             36,  37,  22,  23,   8,   9,  10,  11  /* L0 => L3 : 4 lines OK */
    246     .byte    8,   9,  22,  23,  36,  37,  50,  51, \
    247            255, 255, 255, 255, 255, 255,  52,  53  /* L1 => L4 : 4 lines OK */
    248     .byte   54,  55,  40,  41,  26,  27,  12,  13, \
    249             14,  15,  28,  29,  42,  43,  56,  57  /* L0 => L3 : 4 lines OK */
    250     .byte    6,   7,  20,  21,  34,  35,  48,  49, \
    251             50,  51,  36,  37,  22,  23,   8,   9  /* L4 => L7 : 4 lines OK */
    252     .byte   42,  43,  28,  29,  14,  15,  30,  31, \
    253             44,  45,  58,  59, 255, 255, 255, 255  /* L1 => L4 : 4 lines OK */
    254     .byte  255, 255, 255, 255,  56,  57,  42,  43, \
    255             28,  29,  14,  15,  30,  31,  44,  45  /* L3 => L6 : 4 lines OK */
    256     .byte   26,  27,  40,  41,  42,  43,  28,  29, \
    257             14,  15,  30,  31,  44,  45,  46,  47  /* L5 => L7 : 3 lines OK */
    258     .byte  255, 255, 255, 255,   0,   1, 255, 255, \
    259            255, 255, 255, 255, 255, 255, 255, 255  /* L4 : 1 lines OK */
    260     .byte  255, 255, 255, 255, 255, 255, 255, 255, \
    261              0,   1,  16,  17,   2,   3, 255, 255  /* L5 => L6 : 2 lines OK */
    262     .byte  255, 255, 255, 255, 255, 255, 255, 255, \
    263            255, 255, 255, 255,   8,   9,  22,  23  /* L5 => L6 : 2 lines OK */
    264     .byte    4,   5,   6,   7, 255, 255, 255, 255, \
    265            255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
    266 Ljsimd_huff_encode_one_block_neon_slowtbl_consts:
    267     .byte 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, \
    268           0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80
    269 
/* End of constant data; executable code follows. */
.text


/* NOTE(review): not referenced within this chunk -- presumably a feature
   flag consulted by code further down; confirm before removing. */
#define RESPECT_STRICT_ALIGNMENT  1
    274 
    275 
    276 /*****************************************************************************/
    277 
/* Supplementary macro for setting function attributes.
 * Declares and defines the entry label \fname with platform-correct symbol
 * decoration: Mach-O (Apple) requires a leading underscore and uses
 * .private_extern to limit visibility; ELF instead marks the symbol hidden
 * and gives it %function type so the linker/tools treat it as code. */
.macro asm_function fname
#ifdef __APPLE__
    .private_extern _\fname
    .globl _\fname
_\fname:
#else
    .global \fname
#ifdef __ELF__
    .hidden \fname
    .type \fname, %function
#endif
\fname:
#endif
.endm
    293 
/* Load the address of \symbol into register \xi using the PC-relative
 * adrp+add pair (page address, then low 12 bits).  Mach-O and ELF spell the
 * page/page-offset relocations differently, hence the two variants. */
.macro get_symbol_loc xi, symbol
#ifdef __APPLE__
    adrp            \xi, \symbol@PAGE
    add             \xi, \xi, \symbol@PAGEOFF
#else
    adrp            \xi, \symbol
    add             \xi, \xi, :lo12:\symbol
#endif
.endm
    303 
/* Transpose elements of single 128 bit registers.
 * Uses \xi as scratch: saves x0's first two lanes, then interleaves the even
 * lanes of x0/x1 into x0 (trn1) and the odd lanes of the saved copy and x1
 * into x1 (trn2).  \literal selects the element arrangement (e.g. .8h). */
.macro transpose_single x0, x1, xi, xilen, literal
    ins             \xi\xilen[0], \x0\xilen[0]
    ins             \x1\xilen[0], \x0\xilen[1]
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm
    311 
/* Transpose elements of 2 different registers.
 * \xi holds a copy of the original x0 so trn2 can still read it after x0 is
 * overwritten: x0 <- even lanes of (x0, x1); x1 <- odd lanes of (old x0, x1). */
.macro transpose x0, x1, xi, xilen, literal
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\literal, \x0\literal, \x1\literal
    trn2            \x1\literal, \xi\literal, \x1\literal
.endm
    318 
/* Transpose a block of 4x4 coefficients in four 64-bit registers.
 * 32-bit stage: swaps the off-diagonal 2x2 sub-blocks by interleaving
 * 32-bit lanes of the register pairs (x0, x2) and (x1, x3); \xi is scratch. */
.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x2\x2len
    trn2            \x2\x2len, \xi\x0len, \x2\x2len
    mov             \xi\xilen, \x1\xilen
    trn1            \x1\x1len, \x1\x1len, \x3\x3len
    trn2            \x3\x3len, \xi\x1len, \x3\x3len
.endm
    328 
/* 16-bit stage of the 4x4 transpose: interleaves halfword lanes of the
 * register pairs (x0, x1) and (x2, x3); \xi is scratch.
 * NOTE(review): the len parameters are mixed across operands (e.g. \x2len
 * applied to \x1, \x1len to \xi) -- harmless at the only visible call site
 * (transpose_4x4 passes .4h for all four), but verify before calling this
 * macro with differing lens. */
.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
    mov             \xi\xilen, \x0\xilen
    trn1            \x0\x0len, \x0\x0len, \x1\x1len
    trn2            \x1\x2len, \xi\x0len, \x1\x2len
    mov             \xi\xilen, \x2\xilen
    trn1            \x2\x2len, \x2\x2len, \x3\x3len
    trn2            \x3\x2len, \xi\x1len, \x3\x3len
.endm
    337 
/* Full 4x4 16-bit transpose of x0..x3 (64-bit views): a halfword interleave
 * stage followed by a 32-bit sub-block swap stage; \x5 is scratch. */
.macro transpose_4x4 x0, x1, x2, x3, x5
    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
.endm
    342 
/* Transpose an 8x8 matrix of 16-bit elements held in rows l0..l7, using
 * t0..t3 as scratch.  Standard three-stage NEON transpose: interleave
 * halfwords (.8h), then 32-bit pairs (.4s), then 64-bit halves (.2d).
 * The result is left in l0..l7. */
.macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
    /* Stage 1: halfword interleave of adjacent row pairs. */
    trn1            \t0\().8h, \l0\().8h, \l1\().8h
    trn1            \t1\().8h, \l2\().8h, \l3\().8h
    trn1            \t2\().8h, \l4\().8h, \l5\().8h
    trn1            \t3\().8h, \l6\().8h, \l7\().8h
    trn2            \l1\().8h, \l0\().8h, \l1\().8h
    trn2            \l3\().8h, \l2\().8h, \l3\().8h
    trn2            \l5\().8h, \l4\().8h, \l5\().8h
    trn2            \l7\().8h, \l6\().8h, \l7\().8h

    /* Stage 2: 32-bit interleave of the stage-1 results. */
    trn1            \l4\().4s, \t2\().4s, \t3\().4s
    trn2            \t3\().4s, \t2\().4s, \t3\().4s
    trn1            \t2\().4s, \t0\().4s, \t1\().4s
    trn2            \l2\().4s, \t0\().4s, \t1\().4s
    trn1            \t0\().4s, \l1\().4s, \l3\().4s
    trn2            \l3\().4s, \l1\().4s, \l3\().4s
    trn2            \t1\().4s, \l5\().4s, \l7\().4s
    trn1            \l5\().4s, \l5\().4s, \l7\().4s

    /* Stage 3: 64-bit interleave, writing the final rows into l0..l7. */
    trn2            \l6\().2d, \l2\().2d, \t3\().2d
    trn1            \l0\().2d, \t2\().2d, \l4\().2d
    trn1            \l1\().2d, \t0\().2d, \l5\().2d
    trn2            \l7\().2d, \l3\().2d, \t1\().2d
    trn1            \l2\().2d, \l2\().2d, \t3\().2d
    trn2            \l4\().2d, \t2\().2d, \l4\().2d
    trn1            \l3\().2d, \l3\().2d, \t1\().2d
    trn2            \l5\().2d, \t0\().2d, \l5\().2d
.endm
    371 
    372 
/* Offset added when converting centered samples back to unsigned JSAMPLEs. */
#define CENTERJSAMPLE  128

/*****************************************************************************/

/*
 * Perform dequantization and inverse DCT on one block of coefficients.
 *
 * GLOBAL(void)
 * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
 *                       JSAMPARRAY output_buf, JDIMENSION output_col)
 */

/* Identical redefinition of the values defined earlier in this file; the C
   preprocessor permits redefining a macro with the same token sequence, so
   this is benign and kept for local readability. */
#define CONST_BITS  13
#define PASS1_BITS  2

/* Lane aliases for the islow IDCT multipliers, loaded into v0/v1 from
   Ljsimd_idct_islow_neon_consts.  P_/N_ encodes the sign baked into the
   table entry (e.g. XFIX_N_0_390 is -FIX(0.390180644)). */
#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]
    401 asm_function jsimd_idct_islow_neon
    402     DCT_TABLE       .req x0
    403     COEF_BLOCK      .req x1
    404     OUTPUT_BUF      .req x2
    405     OUTPUT_COL      .req x3
    406     TMP1            .req x0
    407     TMP2            .req x1
    408     TMP3            .req x9
    409     TMP4            .req x10
    410     TMP5            .req x11
    411     TMP6            .req x12
    412     TMP7            .req x13
    413     TMP8            .req x14
    414 
    415     /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
    416        guarantee that the upper (unused) 32 bits of x3 are valid.  This
    417        instruction ensures that those bits are set to zero. */
    418     uxtw x3, w3
    419 
    420     sub             sp, sp, #64
    421     get_symbol_loc  x15, Ljsimd_idct_islow_neon_consts
    422     mov             x10, sp
    423     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], #32
    424     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], #32
    425     ld1             {v0.8h, v1.8h}, [x15]
    426     ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [COEF_BLOCK], #64
    427     ld1             {v18.8h, v19.8h, v20.8h, v21.8h}, [DCT_TABLE], #64
    428     ld1             {v6.8h, v7.8h, v8.8h, v9.8h}, [COEF_BLOCK], #64
    429     ld1             {v22.8h, v23.8h, v24.8h, v25.8h}, [DCT_TABLE], #64
    430 
    431     cmeq            v16.8h, v3.8h, #0
    432     cmeq            v26.8h, v4.8h, #0
    433     cmeq            v27.8h, v5.8h, #0
    434     cmeq            v28.8h, v6.8h, #0
    435     cmeq            v29.8h, v7.8h, #0
    436     cmeq            v30.8h, v8.8h, #0
    437     cmeq            v31.8h, v9.8h, #0
    438 
    439     and             v10.16b, v16.16b, v26.16b
    440     and             v11.16b, v27.16b, v28.16b
    441     and             v12.16b, v29.16b, v30.16b
    442     and             v13.16b, v31.16b, v10.16b
    443     and             v14.16b, v11.16b, v12.16b
    444     mul             v2.8h, v2.8h, v18.8h
    445     and             v15.16b, v13.16b, v14.16b
    446     shl             v10.8h, v2.8h, #(PASS1_BITS)
    447     sqxtn           v16.8b, v15.8h
    448     mov             TMP1, v16.d[0]
    449     mvn             TMP2, TMP1
    450 
    451     cbnz            TMP2, 2f
    452     /* case all AC coeffs are zeros */
    453     dup             v2.2d, v10.d[0]
    454     dup             v6.2d, v10.d[1]
    455     mov             v3.16b, v2.16b
    456     mov             v7.16b, v6.16b
    457     mov             v4.16b, v2.16b
    458     mov             v8.16b, v6.16b
    459     mov             v5.16b, v2.16b
    460     mov             v9.16b, v6.16b
    461 1:
    462     /* for this transpose, we should organise data like this:
    463      * 00, 01, 02, 03, 40, 41, 42, 43
    464      * 10, 11, 12, 13, 50, 51, 52, 53
    465      * 20, 21, 22, 23, 60, 61, 62, 63
    466      * 30, 31, 32, 33, 70, 71, 72, 73
    467      * 04, 05, 06, 07, 44, 45, 46, 47
    468      * 14, 15, 16, 17, 54, 55, 56, 57
    469      * 24, 25, 26, 27, 64, 65, 66, 67
    470      * 34, 35, 36, 37, 74, 75, 76, 77
    471      */
    472     trn1            v28.8h, v2.8h, v3.8h
    473     trn1            v29.8h, v4.8h, v5.8h
    474     trn1            v30.8h, v6.8h, v7.8h
    475     trn1            v31.8h, v8.8h, v9.8h
    476     trn2            v16.8h, v2.8h, v3.8h
    477     trn2            v17.8h, v4.8h, v5.8h
    478     trn2            v18.8h, v6.8h, v7.8h
    479     trn2            v19.8h, v8.8h, v9.8h
    480     trn1            v2.4s, v28.4s, v29.4s
    481     trn1            v6.4s, v30.4s, v31.4s
    482     trn1            v3.4s, v16.4s, v17.4s
    483     trn1            v7.4s, v18.4s, v19.4s
    484     trn2            v4.4s, v28.4s, v29.4s
    485     trn2            v8.4s, v30.4s, v31.4s
    486     trn2            v5.4s, v16.4s, v17.4s
    487     trn2            v9.4s, v18.4s, v19.4s
    488     /* Even part: reverse the even part of the forward DCT. */
    489     add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    490     add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    491     smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    492     sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    493     smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    494     sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    495     mov             v21.16b, v19.16b               /* tmp3 = z1 */
    496     mov             v20.16b, v18.16b               /* tmp3 = z1 */
    497     smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    498     smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    499     sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    500     smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    501     smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    502     sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    503     sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    504     add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    505     sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    506     add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    507     sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
    508     add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    509     sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    510     add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    511     sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */
    512 
    513     /* Odd part per figure 8; the matrix is unitary and hence its
    514      * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    515      */
    516 
    517     add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    518     add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    519     add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    520     add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    521     add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */
    522 
    523     smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    524     smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    525     smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    526     smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    527     smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    528     smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    529     smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    530     smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    531     smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
    532 
    533     smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    534     smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    535     smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    536     smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    537     smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    538     smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    539     smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    540     smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    541     smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */
    542 
    543     add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    544     add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    545     add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    546     add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */
    547 
    548     add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    549     add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    550     add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    551     add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    552     add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    553     add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    554     add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    555     add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */
    556 
    557     add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    558     add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    559     add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    560     add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    561     add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    562     add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    563     add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    564     add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */
    565 
    566     /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    567 
    568     add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    569     add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    570     sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    571     sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    572     add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    573     add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    574     sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    575     sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    576     add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    577     add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    578     sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    579     sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    580     add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    581     add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    582     sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    583     sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */
    584 
    /* Final descale of the pass-2 column sums.  Each 32-bit lane still
     * carries CONST_BITS+PASS1_BITS+3 fraction bits: drop 16 of them here
     * with shrn/shrn2, and the remaining (CONST_BITS+PASS1_BITS+3-16) bits
     * below with sqrshrn/sqrshrn2, which also rounds and saturates each
     * sample to signed 8 bits.
     */
    shrn            v2.4h, v18.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn            v9.4h, v20.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn            v3.4h, v22.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn            v8.4h, v24.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn            v4.4h, v26.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn            v7.4h, v28.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn            v5.4h, v14.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn            v6.4h, v16.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2           v2.8h, v19.4s, #16  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2           v9.8h, v21.4s, #16  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) */
    shrn2           v3.8h, v23.4s, #16  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2           v8.8h, v25.4s, #16  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) */
    shrn2           v4.8h, v27.4s, #16  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2           v7.8h, v29.4s, #16  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) */
    shrn2           v5.8h, v15.4s, #16  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) */
    shrn2           v6.8h, v17.4s, #16  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) */
    movi            v0.16b, #(CENTERJSAMPLE)  /* bias used below to recenter samples to the unsigned range */
    /* Prepare pointers (dual-issue with NEON instructions) */
      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
    sqrshrn         v28.8b, v2.8h, #(CONST_BITS+PASS1_BITS+3-16)
      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
    sqrshrn         v29.8b, v3.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP1, TMP1, OUTPUT_COL
    sqrshrn         v30.8b, v4.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP2, TMP2, OUTPUT_COL
    sqrshrn         v31.8b, v5.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP3, TMP3, OUTPUT_COL
    sqrshrn2        v28.16b, v6.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP4, TMP4, OUTPUT_COL
    sqrshrn2        v29.16b, v7.8h, #(CONST_BITS+PASS1_BITS+3-16)
      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
    sqrshrn2        v30.16b, v8.8h, #(CONST_BITS+PASS1_BITS+3-16)
      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
    sqrshrn2        v31.16b, v9.8h, #(CONST_BITS+PASS1_BITS+3-16)
      add             TMP5, TMP5, OUTPUT_COL
    add             v16.16b, v28.16b, v0.16b  /* recenter: sample += CENTERJSAMPLE */
      add             TMP6, TMP6, OUTPUT_COL
    add             v18.16b, v29.16b, v0.16b
      add             TMP7, TMP7, OUTPUT_COL
    add             v20.16b, v30.16b, v0.16b
      add             TMP8, TMP8, OUTPUT_COL
    add             v22.16b, v31.16b, v0.16b

    /* Transpose the final 8-bit samples */
    trn1            v28.16b, v16.16b, v18.16b
    trn1            v30.16b, v20.16b, v22.16b
    trn2            v29.16b, v16.16b, v18.16b
    trn2            v31.16b, v20.16b, v22.16b

    trn1            v16.8h, v28.8h, v30.8h
    trn2            v18.8h, v28.8h, v30.8h
    trn1            v20.8h, v29.8h, v31.8h
    trn2            v22.8h, v29.8h, v31.8h

    uzp1            v28.4s, v16.4s, v18.4s
    uzp2            v30.4s, v16.4s, v18.4s
    uzp1            v29.4s, v20.4s, v22.4s
    uzp2            v31.4s, v20.4s, v22.4s

    /* Store results to the output buffer, one 8-byte row per pointer */
    st1             {v28.d}[0], [TMP1]
    st1             {v29.d}[0], [TMP2]
    st1             {v28.d}[1], [TMP3]
    st1             {v29.d}[1], [TMP4]
    st1             {v30.d}[0], [TMP5]
    st1             {v31.d}[0], [TMP6]
    st1             {v30.d}[1], [TMP7]
    st1             {v31.d}[1], [TMP8]
    /* Restore the callee-saved NEON registers d8-d15 (AAPCS64 requires the
     * low 64 bits of v8-v15 to be preserved) that were spilled on entry. */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], #32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], #32
    /* Return to caller.  Use ret (an architecturally hinted return through
     * x30) rather than "blr x30", which would clobber the link register and
     * unbalance the CPU's return-address predictor stack. */
    ret
    656 
.balign 16
2:
    /* Partial-block dispatch.  TMP2 packs two 32-bit "AC energy" flags: the
     * low word is nonzero if the left 4x8 half of the block has nonzero AC
     * coefficients, the high word likewise for the right half (computed
     * before this label -- outside this view; verify against the code
     * above).  Finish dequantizing rows 1-7 (v3-v9) while the flag tests
     * dual-issue with the multiplies.
     */
    mul             v3.8h, v3.8h, v19.8h
    mul             v4.8h, v4.8h, v20.8h
    mul             v5.8h, v5.8h, v21.8h
    add             TMP4, xzr, TMP2, LSL #32  /* TMP4 = TMP2 << 32: zero iff left-half flag (low word) is zero */
    mul             v6.8h, v6.8h, v22.8h
    mul             v7.8h, v7.8h, v23.8h
    adds            TMP3, xzr, TMP2, LSR #32  /* set NZCV from the right-half flag (high word) */
    mul             v8.8h, v8.8h, v24.8h
    mul             v9.8h, v9.8h, v25.8h
    b.ne            3f                        /* right half has AC energy -> take the other paths */
    /* Right AC coef is zero */
    /* Broadcast the high half of v10 as the right-half result.  v10 was
     * computed before this label (presumably the descaled DC-only values --
     * TODO confirm against the preceding code). */
    dup             v15.2d, v10.d[1]
    /* Even part: reverse the even part of the forward DCT.
     * Only the low (left-half, .4h/.4s) lanes are processed on this path. */
    add             v18.4h, v4.4h, v8.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.4h, v2.4h, v6.4h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sub             v26.4h, v2.4h, v6.4h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.4h, v9.4h, v5.4h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.4h, v7.4h, v3.4h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.4h, v9.4h, v3.4h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.4h, v7.4h, v5.4h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.4h, v22.4h, v24.4h  /* z5 = z3 + z4 */

    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s  /* tmp13 - tmp0 */

    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    /* The remaining output registers take the duplicated value from above */
    mov             v6.16b, v15.16b
    mov             v7.16b, v15.16b
    mov             v8.16b, v15.16b
    mov             v9.16b, v15.16b
    b               1b
    743 
.balign 16
3:
    /* Reached when the right half has AC energy; TMP4 (left-half flag,
     * shifted to bits 63:32 above) decides between the half-width and
     * full-width paths. */
    cbnz            TMP4, 4f
    /* Left AC coef is zero */
    /* Broadcast the low half of v10 as the left-half result (v10 was
     * computed before this label -- presumably the descaled DC-only values;
     * TODO confirm against the preceding code). */
    dup             v14.2d, v10.d[0]
    /* Even part: reverse the even part of the forward DCT.
     * Only the high (right-half, smull2/sshll2) lanes are processed here. */
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    /* NOTE(review): the next add (and the matching v24 one below) update
     * v22/v24, which hold leftover 16-bit sums and do not appear to be read
     * again on this path -- they look like harmless dead code mirroring the
     * full-width path at 4:.  Verify against label 1 before removing. */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    /* The left-half output registers take the duplicated value from above */
    mov             v2.16b, v14.16b
    mov             v3.16b, v14.16b
    mov             v4.16b, v14.16b
    mov             v5.16b, v14.16b
    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b               1b
    823 
.balign 16
4:
    /* "No" AC coef is zero */
    /* Full-width path: both 4x8 halves carry AC energy.  Same math as the
     * half-width paths above, with the low-half (smull/sshll/smlal,
     * even-numbered accumulators) and high-half (smull2/sshll2/smlal2,
     * odd-numbered accumulators) computations interleaved for dual-issue. */
    /* Even part: reverse the even part of the forward DCT. */
    add             v18.8h, v4.8h, v8.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*2], quantptr[DCTSIZE*2]) + DEQUANTIZE(inptr[DCTSIZE*6], quantptr[DCTSIZE*6]) */
    add             v22.8h, v2.8h, v6.8h           /* z2 + z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) + DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull2          v19.4s, v18.8h, XFIX_P_0_541   /* z1h z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sub             v26.8h, v2.8h, v6.8h           /* z2 - z3 = DEQUANTIZE(inptr[DCTSIZE*0], quantptr[DCTSIZE*0]) - DEQUANTIZE(inptr[DCTSIZE*4], quantptr[DCTSIZE*4]) */
    smull           v18.4s, v18.4h, XFIX_P_0_541   /* z1l z1 = MULTIPLY(z2 + z3, FIX_0_541196100); */
    sshll2          v23.4s, v22.8h, #(CONST_BITS)  /* tmp0h tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    mov             v21.16b, v19.16b               /* tmp3 = z1 */
    mov             v20.16b, v18.16b               /* tmp3 = z1 */
    smlal2          v19.4s, v8.8h, XFIX_N_1_847    /* tmp2h tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    smlal           v18.4s, v8.4h, XFIX_N_1_847    /* tmp2l tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065); */
    sshll2          v27.4s, v26.8h, #(CONST_BITS)  /* tmp1h tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    smlal2          v21.4s, v4.8h, XFIX_P_0_765    /* tmp3h tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    smlal           v20.4s, v4.4h, XFIX_P_0_765    /* tmp3l tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865); */
    sshll           v22.4s, v22.4h, #(CONST_BITS)  /* tmp0l tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS); */
    sshll           v26.4s, v26.4h, #(CONST_BITS)  /* tmp1l tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS); */
    add             v2.4s, v22.4s, v20.4s          /* tmp10l tmp10 = tmp0 + tmp3; */
    sub             v6.4s, v22.4s, v20.4s          /* tmp13l tmp13 = tmp0 - tmp3; */
    add             v8.4s, v26.4s, v18.4s          /* tmp11l tmp11 = tmp1 + tmp2; */
    sub             v4.4s, v26.4s, v18.4s          /* tmp12l tmp12 = tmp1 - tmp2; */
    add             v28.4s, v23.4s, v21.4s         /* tmp10h tmp10 = tmp0 + tmp3; */
    sub             v31.4s, v23.4s, v21.4s         /* tmp13h tmp13 = tmp0 - tmp3; */
    add             v29.4s, v27.4s, v19.4s         /* tmp11h tmp11 = tmp1 + tmp2; */
    sub             v30.4s, v27.4s, v19.4s         /* tmp12h tmp12 = tmp1 - tmp2; */

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    add             v22.8h, v9.8h, v5.8h    /* z3 = tmp0 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v24.8h, v7.8h, v3.8h    /* z4 = tmp1 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v18.8h, v9.8h, v3.8h    /* z1 = tmp0 + tmp3 = DEQUANTIZE(inptr[DCTSIZE*7], quantptr[DCTSIZE*7]) + DEQUANTIZE(inptr[DCTSIZE*1], quantptr[DCTSIZE*1]) */
    add             v20.8h, v7.8h, v5.8h    /* z2 = tmp1 + tmp2 = DEQUANTIZE(inptr[DCTSIZE*5], quantptr[DCTSIZE*5]) + DEQUANTIZE(inptr[DCTSIZE*3], quantptr[DCTSIZE*3]) */
    add             v26.8h, v22.8h, v24.8h  /* z5 = z3 + z4 */

    smull2          v11.4s, v9.8h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull2          v13.4s, v7.8h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull2          v15.4s, v5.8h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull2          v17.4s, v3.8h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull2          v27.4s, v26.8h, XFIX_P_1_175  /* z5h z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull2          v23.4s, v22.8h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull2          v25.4s, v24.8h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull2          v19.4s, v18.8h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull2          v21.4s, v20.8h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    smull           v10.4s, v9.4h, XFIX_P_0_298   /* tmp0 = MULTIPLY(tmp0, FIX_0_298631336) */
    smull           v12.4s, v7.4h, XFIX_P_2_053   /* tmp1 = MULTIPLY(tmp1, FIX_2_053119869) */
    smull           v14.4s, v5.4h, XFIX_P_3_072   /* tmp2 = MULTIPLY(tmp2, FIX_3_072711026) */
    smull           v16.4s, v3.4h, XFIX_P_1_501   /* tmp3 = MULTIPLY(tmp3, FIX_1_501321110) */
    smull           v26.4s, v26.4h, XFIX_P_1_175  /* z5l z5 = MULTIPLY(z3 + z4, FIX_1_175875602) */
    smull           v22.4s, v22.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560) */
    smull           v24.4s, v24.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644) */
    smull           v18.4s, v18.4h, XFIX_N_0_899  /* z1 = MULTIPLY(z1, -FIX_0_899976223) */
    smull           v20.4s, v20.4h, XFIX_N_2_562  /* z2 = MULTIPLY(z2, -FIX_2_562915447) */

    add             v23.4s, v23.4s, v27.4s  /* z3 += z5 */
    add             v22.4s, v22.4s, v26.4s  /* z3 += z5 */
    add             v25.4s, v25.4s, v27.4s  /* z4 += z5 */
    add             v24.4s, v24.4s, v26.4s  /* z4 += z5 */

    add             v11.4s, v11.4s, v19.4s  /* tmp0 += z1 */
    add             v10.4s, v10.4s, v18.4s  /* tmp0 += z1 */
    add             v13.4s, v13.4s, v21.4s  /* tmp1 += z2 */
    add             v12.4s, v12.4s, v20.4s  /* tmp1 += z2 */
    add             v15.4s, v15.4s, v21.4s  /* tmp2 += z2 */
    add             v14.4s, v14.4s, v20.4s  /* tmp2 += z2 */
    add             v17.4s, v17.4s, v19.4s  /* tmp3 += z1 */
    add             v16.4s, v16.4s, v18.4s  /* tmp3 += z1 */

    add             v11.4s, v11.4s, v23.4s  /* tmp0 += z3 */
    add             v10.4s, v10.4s, v22.4s  /* tmp0 += z3 */
    add             v13.4s, v13.4s, v25.4s  /* tmp1 += z4 */
    add             v12.4s, v12.4s, v24.4s  /* tmp1 += z4 */
    add             v17.4s, v17.4s, v25.4s  /* tmp3 += z4 */
    add             v16.4s, v16.4s, v24.4s  /* tmp3 += z4 */
    add             v15.4s, v15.4s, v23.4s  /* tmp2 += z3 */
    add             v14.4s, v14.4s, v22.4s  /* tmp2 += z3 */

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    add             v18.4s, v2.4s, v16.4s   /* tmp10 + tmp3 */
    add             v19.4s, v28.4s, v17.4s  /* tmp10 + tmp3 */
    sub             v20.4s, v2.4s, v16.4s   /* tmp10 - tmp3 */
    sub             v21.4s, v28.4s, v17.4s  /* tmp10 - tmp3 */
    add             v22.4s, v8.4s, v14.4s   /* tmp11 + tmp2 */
    add             v23.4s, v29.4s, v15.4s  /* tmp11 + tmp2 */
    sub             v24.4s, v8.4s, v14.4s   /* tmp11 - tmp2 */
    sub             v25.4s, v29.4s, v15.4s  /* tmp11 - tmp2 */
    add             v26.4s, v4.4s, v12.4s   /* tmp12 + tmp1 */
    add             v27.4s, v30.4s, v13.4s  /* tmp12 + tmp1 */
    sub             v28.4s, v4.4s, v12.4s   /* tmp12 - tmp1 */
    sub             v29.4s, v30.4s, v13.4s  /* tmp12 - tmp1 */
    add             v14.4s, v6.4s, v10.4s   /* tmp13 + tmp0 */
    add             v15.4s, v31.4s, v11.4s  /* tmp13 + tmp0 */
    sub             v16.4s, v6.4s, v10.4s   /* tmp13 - tmp0 */
    sub             v17.4s, v31.4s, v11.4s  /* tmp13 - tmp0 */

    /* Descale: v2-v5 receive the left half, v6-v9 the right half */
    rshrn           v2.4h, v18.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v3.4h, v22.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v4.4h, v26.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v5.4h, v14.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn           v6.4h, v19.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) */
    rshrn           v7.4h, v23.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) */
    rshrn           v8.4h, v27.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) */
    rshrn           v9.4h, v15.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v2.8h, v16.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v3.8h, v28.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v4.8h, v24.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v5.8h, v20.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    rshrn2          v6.8h, v17.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) */
    rshrn2          v7.8h, v29.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) */
    rshrn2          v8.8h, v25.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) */
    rshrn2          v9.8h, v21.4s, #(CONST_BITS-PASS1_BITS)  /* wsptr[DCTSIZE*7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) */
    b               1b
    941 
    /* Release the symbolic register aliases and fixed-point constant macros
     * so the same names can be redefined by the functions that follow. */
    .unreq          DCT_TABLE
    .unreq          COEF_BLOCK
    .unreq          OUTPUT_BUF
    .unreq          OUTPUT_COL
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8

#undef CENTERJSAMPLE
#undef CONST_BITS
#undef PASS1_BITS
#undef XFIX_P_0_298
#undef XFIX_N_0_390
#undef XFIX_P_0_541
#undef XFIX_P_0_765
#undef XFIX_N_0_899
#undef XFIX_P_1_175
#undef XFIX_P_1_501
#undef XFIX_N_1_847
#undef XFIX_N_1_961
#undef XFIX_P_2_053
#undef XFIX_N_2_562
#undef XFIX_P_3_072
    970 
    971 
    972 /*****************************************************************************/
    973 
    974 /*
    975  * jsimd_idct_ifast_neon
    976  *
    977  * This function contains a fast, not so accurate integer implementation of
    978  * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
    979  * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
    980  * function from jidctfst.c
    981  *
    982  * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
 * But in the ARM NEON case some extra additions are required, because the
 * VQDMULH instruction can't handle constants larger than 1.  Expressions
 * like "x * 1.082392200" therefore have to be converted to
 * "x * 0.082392200 + x", which introduces an extra addition.  Overall, there
 * are 6 extra additions per 1-D IDCT pass, totaling 5 VQDMULH and 35
 * VADD/VSUB instructions.
    988  */
    989 
    990 asm_function jsimd_idct_ifast_neon
    991 
    992     DCT_TABLE       .req x0
    993     COEF_BLOCK      .req x1
    994     OUTPUT_BUF      .req x2
    995     OUTPUT_COL      .req x3
    996     TMP1            .req x0
    997     TMP2            .req x1
    998     TMP3            .req x9
    999     TMP4            .req x10
   1000     TMP5            .req x11
   1001     TMP6            .req x12
   1002     TMP7            .req x13
   1003     TMP8            .req x14
   1004 
   1005     /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
   1006        guarantee that the upper (unused) 32 bits of x3 are valid.  This
   1007        instruction ensures that those bits are set to zero. */
   1008     uxtw x3, w3
   1009 
   1010     /* Load and dequantize coefficients into NEON registers
   1011      * with the following allocation:
   1012      *       0 1 2 3 | 4 5 6 7
   1013      *      ---------+--------
   1014      *   0 | d16     | d17     ( v16.8h )
   1015      *   1 | d18     | d19     ( v17.8h )
   1016      *   2 | d20     | d21     ( v18.8h )
   1017      *   3 | d22     | d23     ( v19.8h )
   1018      *   4 | d24     | d25     ( v20.8h )
   1019      *   5 | d26     | d27     ( v21.8h )
   1020      *   6 | d28     | d29     ( v22.8h )
   1021      *   7 | d30     | d31     ( v23.8h )
   1022      */
   1023     /* Save NEON registers used in fast IDCT */
            /* NOTE(review): this is the tail of jsimd_idct_ifast_neon; the
             * prologue and the .req register aliases (COEF_BLOCK, DCT_TABLE,
             * OUTPUT_BUF, OUTPUT_COL, TMP1-TMP8) are defined above this
             * excerpt.  Only caller-saved NEON registers appear below, so no
             * actual v8-v15 spill is visible here. */
   1024     get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
            /* Load the 8x8 coefficient block and dequantize it: each ld1 from
             * COEF_BLOCK is paired with a mul against the matching quantizer
             * row from DCT_TABLE; loads and multiplies are interleaved to
             * hide memory latency. */
   1025     ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
   1026     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
   1027     ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
   1028     mul             v16.8h, v16.8h, v0.8h
   1029     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
   1030     mul             v17.8h, v17.8h, v1.8h
   1031     ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
   1032     mul             v18.8h, v18.8h, v2.8h
   1033     ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
   1034     mul             v19.8h, v19.8h, v3.8h
   1035     ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
   1036     mul             v20.8h, v20.8h, v0.8h
   1037     ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
   1038     mul             v22.8h, v22.8h, v2.8h
   1039     mul             v21.8h, v21.8h, v1.8h
            /* XFIX_* below are presumably aliases (defined earlier in the
             * file) for lanes of v0 loaded here; sqdmulh multiplies by these
             * fixed-point constants -- TODO confirm against the consts table. */
   1040     ld1             {v0.4h}, [TMP5]        /* load constants */
   1041     mul             v23.8h, v23.8h, v3.8h
   1042 
   1043     /* 1-D IDCT, pass 1 */
   1044     sub             v2.8h, v18.8h, v22.8h
   1045     add             v22.8h, v18.8h, v22.8h
   1046     sub             v1.8h, v19.8h, v21.8h
   1047     add             v21.8h, v19.8h, v21.8h
   1048     sub             v5.8h, v17.8h, v23.8h
   1049     add             v23.8h, v17.8h, v23.8h
   1050     sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
   1051     sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
   1052     add             v3.8h, v1.8h, v1.8h
   1053     sub             v1.8h, v5.8h, v1.8h
   1054     add             v18.8h, v2.8h, v4.8h
   1055     sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
   1056     sub             v2.8h, v23.8h, v21.8h
   1057     add             v3.8h, v3.8h, v6.8h
   1058     sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
   1059     add             v1.8h, v1.8h, v4.8h
   1060     sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
   1061     sub             v18.8h, v18.8h, v22.8h
   1062     add             v2.8h, v2.8h, v6.8h
   1063     sub             v6.8h, v16.8h, v20.8h
   1064     add             v20.8h, v16.8h, v20.8h
   1065     add             v17.8h, v5.8h, v4.8h
   1066     add             v5.8h, v6.8h, v18.8h
   1067     sub             v18.8h, v6.8h, v18.8h
   1068     add             v6.8h, v23.8h, v21.8h
   1069     add             v16.8h, v20.8h, v22.8h
   1070     sub             v3.8h, v6.8h, v3.8h
   1071     sub             v20.8h, v20.8h, v22.8h
   1072     sub             v3.8h, v3.8h, v1.8h
   1073     sub             v1.8h, v17.8h, v1.8h
   1074     add             v2.8h, v3.8h, v2.8h
   1075     sub             v23.8h, v16.8h, v6.8h
   1076     add             v1.8h, v1.8h, v2.8h
   1077     add             v16.8h, v16.8h, v6.8h
   1078     add             v22.8h, v5.8h, v3.8h
   1079     sub             v17.8h, v5.8h, v3.8h
   1080     sub             v21.8h, v18.8h, v2.8h
   1081     add             v18.8h, v18.8h, v2.8h
   1082     sub             v19.8h, v20.8h, v1.8h
   1083     add             v20.8h, v20.8h, v1.8h
            /* Transpose rows<->columns so pass 2 processes the columns with
             * the identical butterfly sequence. */
   1084     transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
   1085     /* 1-D IDCT, pass 2 */
   1086     sub             v2.8h, v18.8h, v22.8h
   1087     add             v22.8h, v18.8h, v22.8h
   1088     sub             v1.8h, v19.8h, v21.8h
   1089     add             v21.8h, v19.8h, v21.8h
   1090     sub             v5.8h, v17.8h, v23.8h
   1091     add             v23.8h, v17.8h, v23.8h
   1092     sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
   1093     sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
   1094     add             v3.8h, v1.8h, v1.8h
   1095     sub             v1.8h, v5.8h, v1.8h
   1096     add             v18.8h, v2.8h, v4.8h
   1097     sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
   1098     sub             v2.8h, v23.8h, v21.8h
   1099     add             v3.8h, v3.8h, v6.8h
   1100     sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
   1101     add             v1.8h, v1.8h, v4.8h
   1102     sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
   1103     sub             v18.8h, v18.8h, v22.8h
   1104     add             v2.8h, v2.8h, v6.8h
   1105     sub             v6.8h, v16.8h, v20.8h
   1106     add             v20.8h, v16.8h, v20.8h
   1107     add             v17.8h, v5.8h, v4.8h
   1108     add             v5.8h, v6.8h, v18.8h
   1109     sub             v18.8h, v6.8h, v18.8h
   1110     add             v6.8h, v23.8h, v21.8h
   1111     add             v16.8h, v20.8h, v22.8h
   1112     sub             v3.8h, v6.8h, v3.8h
   1113     sub             v20.8h, v20.8h, v22.8h
   1114     sub             v3.8h, v3.8h, v1.8h
   1115     sub             v1.8h, v17.8h, v1.8h
   1116     add             v2.8h, v3.8h, v2.8h
   1117     sub             v23.8h, v16.8h, v6.8h
   1118     add             v1.8h, v1.8h, v2.8h
   1119     add             v16.8h, v16.8h, v6.8h
   1120     add             v22.8h, v5.8h, v3.8h
   1121     sub             v17.8h, v5.8h, v3.8h
   1122     sub             v21.8h, v18.8h, v2.8h
   1123     add             v18.8h, v18.8h, v2.8h
   1124     sub             v19.8h, v20.8h, v1.8h
   1125     add             v20.8h, v20.8h, v1.8h
   1126     /* Descale to 8-bit and range limit */
            /* sqshrn/sqshrn2 #5 narrow each row to signed bytes with
             * saturation; adding 0x80 then recenters the signed result into
             * the unsigned 0..255 sample range.  The double-indented scalar
             * pointer setup is interleaved so it can dual-issue with the
             * NEON instructions. */
   1127     movi            v0.16b, #0x80
   1128       /* Prepare pointers (dual-issue with NEON instructions) */
   1129       ldp             TMP1, TMP2, [OUTPUT_BUF], 16
   1130     sqshrn          v28.8b, v16.8h, #5
   1131       ldp             TMP3, TMP4, [OUTPUT_BUF], 16
   1132     sqshrn          v29.8b, v17.8h, #5
   1133       add             TMP1, TMP1, OUTPUT_COL
   1134     sqshrn          v30.8b, v18.8h, #5
   1135       add             TMP2, TMP2, OUTPUT_COL
   1136     sqshrn          v31.8b, v19.8h, #5
   1137       add             TMP3, TMP3, OUTPUT_COL
   1138     sqshrn2         v28.16b, v20.8h, #5
   1139       add             TMP4, TMP4, OUTPUT_COL
   1140     sqshrn2         v29.16b, v21.8h, #5
   1141       ldp             TMP5, TMP6, [OUTPUT_BUF], 16
   1142     sqshrn2         v30.16b, v22.8h, #5
   1143       ldp             TMP7, TMP8, [OUTPUT_BUF], 16
   1144     sqshrn2         v31.16b, v23.8h, #5
   1145       add             TMP5, TMP5, OUTPUT_COL
   1146     add             v16.16b, v28.16b, v0.16b
   1147       add             TMP6, TMP6, OUTPUT_COL
   1148     add             v18.16b, v29.16b, v0.16b
   1149       add             TMP7, TMP7, OUTPUT_COL
   1150     add             v20.16b, v30.16b, v0.16b
   1151       add             TMP8, TMP8, OUTPUT_COL
   1152     add             v22.16b, v31.16b, v0.16b
   1153 
   1154     /* Transpose the final 8-bit samples */
   1155     trn1            v28.16b, v16.16b, v18.16b
   1156     trn1            v30.16b, v20.16b, v22.16b
   1157     trn2            v29.16b, v16.16b, v18.16b
   1158     trn2            v31.16b, v20.16b, v22.16b
   1159 
   1160     trn1            v16.8h, v28.8h, v30.8h
   1161     trn2            v18.8h, v28.8h, v30.8h
   1162     trn1            v20.8h, v29.8h, v31.8h
   1163     trn2            v22.8h, v29.8h, v31.8h
   1164 
   1165     uzp1            v28.4s, v16.4s, v18.4s
   1166     uzp2            v30.4s, v16.4s, v18.4s
   1167     uzp1            v29.4s, v20.4s, v22.4s
   1168     uzp2            v31.4s, v20.4s, v22.4s
   1169 
   1170     /* Store results to the output buffer */
            /* Each 64-bit d-lane of v28-v31 holds one 8-pixel output row;
             * TMP1-TMP8 point at the 8 destination rows. */
   1171     st1             {v28.d}[0], [TMP1]
   1172     st1             {v29.d}[0], [TMP2]
   1173     st1             {v28.d}[1], [TMP3]
   1174     st1             {v29.d}[1], [TMP4]
   1175     st1             {v30.d}[0], [TMP5]
   1176     st1             {v31.d}[0], [TMP6]
   1177     st1             {v30.d}[1], [TMP7]
   1178     st1             {v31.d}[1], [TMP8]
            /* Return to caller (x30 = link register). */
   1179     blr             x30
   1180 
   1181     .unreq          DCT_TABLE
   1182     .unreq          COEF_BLOCK
   1183     .unreq          OUTPUT_BUF
   1184     .unreq          OUTPUT_COL
   1185     .unreq          TMP1
   1186     .unreq          TMP2
   1187     .unreq          TMP3
   1188     .unreq          TMP4
   1189     .unreq          TMP5
   1190     .unreq          TMP6
   1191     .unreq          TMP7
   1192     .unreq          TMP8
   1193 
   1194 
   1195 /*****************************************************************************/
   1196 
   1197 /*
   1198  * jsimd_idct_4x4_neon
   1199  *
   1200  * This function contains inverse-DCT code for getting reduced-size
   1201  * 4x4 pixels output from an 8x8 DCT block. It uses the same calculations
   1202  * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
   1203  * function from jpeg-6b (jidctred.c).
   1204  *
   1205  * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
   1206  *       requires much less arithmetic operations and hence should be faster.
   1207  *       The primary purpose of this particular NEON optimized function is
   1208  *       bit exact compatibility with jpeg-6b.
   1209  *
   1210  * TODO: slightly better instruction scheduling can be achieved by expanding
   1211  *       idct_helper/transpose_4x4 macros and reordering instructions,
   1212  *       but readability will suffer somewhat.
   1213  */
   1214 
   1215 #define CONST_BITS  13
   1216 
   1217 .macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
        /* 1-D 4-point inverse DCT helper for the reduced 4x4 IDCT.
         * Inputs \x4, \x6, \x8, \x10, \x12, \x14, \x16 are coefficient rows
         * 0, 1, 2, 3, 5, 6, 7 (row 4 is not used by the 4-point IDCT).
         * The smull/smlal multipliers come from lanes of v0-v2, which the
         * caller loads from Ljsimd_idct_4x4_neon_consts.
         * \shift is the descaling amount; \y26..\y29 receive the four
         * 16-bit output rows.  Clobbers v20, v24, v26, v28, v30.
         * NOTE: rshrn can only encode shift amounts 1..16, so larger
         * descales are done as srshr (rounding shift) + xtn (narrow). */
   1218     smull           v28.4s, \x4, v2.h[2]
   1219     smlal           v28.4s, \x8, v0.h[0]
   1220     smlal           v28.4s, \x14, v0.h[1]
   1221 
   1222     smull           v26.4s, \x16, v1.h[2]
   1223     smlal           v26.4s, \x12, v1.h[3]
   1224     smlal           v26.4s, \x10, v2.h[0]
   1225     smlal           v26.4s, \x6, v2.h[1]
   1226 
   1227     smull           v30.4s, \x4, v2.h[2]
   1228     smlsl           v30.4s, \x8, v0.h[0]
   1229     smlsl           v30.4s, \x14, v0.h[1]
   1230 
   1231     smull           v24.4s, \x16, v0.h[2]
   1232     smlal           v24.4s, \x12, v0.h[3]
   1233     smlal           v24.4s, \x10, v1.h[0]
   1234     smlal           v24.4s, \x6, v1.h[1]
   1235 
        /* Butterfly: outputs 0/3 come from the sum/difference of the even
         * and odd parts accumulated above. */
   1236     add             v20.4s, v28.4s, v26.4s
   1237     sub             v28.4s, v28.4s, v26.4s
   1238 
   1239   .if \shift > 16
   1240     srshr           v20.4s, v20.4s, #\shift
   1241     srshr           v28.4s, v28.4s, #\shift
   1242     xtn             \y26, v20.4s
   1243     xtn             \y29, v28.4s
   1244   .else
   1245     rshrn           \y26, v20.4s, #\shift
   1246     rshrn           \y29, v28.4s, #\shift
   1247   .endif
   1248 
        /* Butterfly for outputs 1/2. */
   1249     add             v20.4s, v30.4s, v24.4s
   1250     sub             v30.4s, v30.4s, v24.4s
   1251 
   1252   .if \shift > 16
   1253     srshr           v20.4s, v20.4s, #\shift
   1254     srshr           v30.4s, v30.4s, #\shift
   1255     xtn             \y27, v20.4s
   1256     xtn             \y28, v30.4s
   1257   .else
   1258     rshrn           \y27, v20.4s, #\shift
   1259     rshrn           \y28, v30.4s, #\shift
   1260   .endif
   1261 .endm
   1262 
   1263 asm_function jsimd_idct_4x4_neon
        /* Reduced-size 4x4 IDCT (bit-exact with jpeg-6b jpeg_idct_4x4).
         * NOTE(review): C signature inferred from register usage --
         * roughly (dct_table, coef_block, output_buf, output_col); confirm
         * against the libjpeg-turbo SIMD interface header. */
   1264 
   1265     DCT_TABLE       .req x0
   1266     COEF_BLOCK      .req x1
   1267     OUTPUT_BUF      .req x2
   1268     OUTPUT_COL      .req x3
        /* x0-x2 are recycled as output-row pointers once their argument
         * values have been fully consumed; TMP4 uses the scratch reg x15. */
   1269     TMP1            .req x0
   1270     TMP2            .req x1
   1271     TMP3            .req x2
   1272     TMP4            .req x15
   1273 
   1274     /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
   1275        guarantee that the upper (unused) 32 bits of x3 are valid.  This
   1276        instruction ensures that those bits are set to zero. */
   1277     uxtw x3, w3
   1278 
   1279     /* Save all used NEON registers */
        /* AAPCS64: the low 64 bits of v8-v15 are callee-saved, hence the
         * 64-byte spill area. */
   1280     sub             sp, sp, 64
   1281     mov             x9, sp
   1282     /* Load constants (v3.4h is just used for padding) */
   1283     get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
   1284     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
   1285     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
   1286     ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
   1287 
   1288     /* Load all COEF_BLOCK into NEON registers with the following allocation:
   1289      *       0 1 2 3 | 4 5 6 7
   1290      *      ---------+--------
   1291      *   0 | v4.4h   | v5.4h
   1292      *   1 | v6.4h   | v7.4h
   1293      *   2 | v8.4h   | v9.4h
   1294      *   3 | v10.4h  | v11.4h
   1295      *   4 | -       | -
   1296      *   5 | v12.4h  | v13.4h
   1297      *   6 | v14.4h  | v15.4h
   1298      *   7 | v16.4h  | v17.4h
   1299      */
   1300     ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
   1301     ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
        /* Skip coefficient row 4: it is unused by the 4-point IDCT. */
   1302     add             COEF_BLOCK, COEF_BLOCK, #16
   1303     ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
   1304     ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
   1305     /* dequantize */
   1306     ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
   1307     mul             v4.4h, v4.4h, v18.4h
   1308     mul             v5.4h, v5.4h, v19.4h
   1309     ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
   1310     ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
   1311     mul             v6.4h, v6.4h, v20.4h
   1312     mul             v7.4h, v7.4h, v21.4h
   1313     ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
   1314     mul             v8.4h, v8.4h, v22.4h
   1315     mul             v9.4h, v9.4h, v23.4h
   1316     ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
        /* Skip quantization-table row 4 to stay in step with the
         * coefficient rows loaded above. */
   1317     add             DCT_TABLE, DCT_TABLE, #16
   1318     ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
   1319     mul             v10.4h, v10.4h, v24.4h
   1320     mul             v11.4h, v11.4h, v25.4h
   1321     ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
   1322     mul             v12.4h, v12.4h, v26.4h
   1323     mul             v13.4h, v13.4h, v27.4h
   1324     ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
   1325     ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
   1326     mul             v14.4h, v14.4h, v28.4h
   1327     mul             v15.4h, v15.4h, v29.4h
   1328     ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
   1329     mul             v16.4h, v16.4h, v30.4h
   1330     mul             v17.4h, v17.4h, v31.4h
   1331     ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
   1332 
   1333     /* Pass 1 */
        /* Left 4 columns (lanes 0-3 of each row). */
   1334     idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
   1335                     v4.4h, v6.4h, v8.4h, v10.4h
   1336     transpose_4x4   v4, v6, v8, v10, v3
   1337     ins             v10.d[1], v11.d[0]
        /* Right 4 columns (lanes 4-7 of each row). */
   1338     idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
   1339                     v5.4h, v7.4h, v9.4h, v11.4h
   1340     transpose_4x4   v5, v7, v9, v11, v3
   1341     ins             v10.d[1], v11.d[0]
   1342 
   1343     /* Pass 2 */
   1344     idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
   1345                     v26.4h, v27.4h, v28.4h, v29.4h
   1346     transpose_4x4   v26, v27, v28, v29, v3
   1347 
   1348     /* Range limit */
        /* Add the 0x80 center bias, then saturate-narrow to unsigned bytes. */
   1349     movi            v30.8h, #0x80
   1350     ins             v26.d[1], v27.d[0]
   1351     ins             v28.d[1], v29.d[0]
   1352     add             v26.8h, v26.8h, v30.8h
   1353     add             v28.8h, v28.8h, v30.8h
   1354     sqxtun          v26.8b, v26.8h
   1355     sqxtun          v27.8b, v28.8h
   1356 
   1357     /* Store results to the output buffer */
   1358     ldp             TMP1, TMP2, [OUTPUT_BUF], 16
   1359     ldp             TMP3, TMP4, [OUTPUT_BUF]
   1360     add             TMP1, TMP1, OUTPUT_COL
   1361     add             TMP2, TMP2, OUTPUT_COL
   1362     add             TMP3, TMP3, OUTPUT_COL
   1363     add             TMP4, TMP4, OUTPUT_COL
   1364 
   1365 #if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
   1366     /* We can use much less instructions on little endian systems if the
   1367      * OS kernel is not configured to trap unaligned memory accesses
   1368      */
   1369     st1             {v26.s}[0], [TMP1], 4
   1370     st1             {v27.s}[0], [TMP3], 4
   1371     st1             {v26.s}[1], [TMP2], 4
   1372     st1             {v27.s}[1], [TMP4], 4
   1373 #else
        /* Byte-at-a-time fallback: 4 pixels per output row, alignment-safe. */
   1374     st1             {v26.b}[0], [TMP1], 1
   1375     st1             {v27.b}[0], [TMP3], 1
   1376     st1             {v26.b}[1], [TMP1], 1
   1377     st1             {v27.b}[1], [TMP3], 1
   1378     st1             {v26.b}[2], [TMP1], 1
   1379     st1             {v27.b}[2], [TMP3], 1
   1380     st1             {v26.b}[3], [TMP1], 1
   1381     st1             {v27.b}[3], [TMP3], 1
   1382 
   1383     st1             {v26.b}[4], [TMP2], 1
   1384     st1             {v27.b}[4], [TMP4], 1
   1385     st1             {v26.b}[5], [TMP2], 1
   1386     st1             {v27.b}[5], [TMP4], 1
   1387     st1             {v26.b}[6], [TMP2], 1
   1388     st1             {v27.b}[6], [TMP4], 1
   1389     st1             {v26.b}[7], [TMP2], 1
   1390     st1             {v27.b}[7], [TMP4], 1
   1391 #endif
   1392 
   1393     /* vpop            {v8.4h - v15.4h}    ;not available */
        /* Restore the callee-saved low halves of v8-v15 and return. */
   1394     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
   1395     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
   1396     blr             x30
   1397 
   1398     .unreq          DCT_TABLE
   1399     .unreq          COEF_BLOCK
   1400     .unreq          OUTPUT_BUF
   1401     .unreq          OUTPUT_COL
   1402     .unreq          TMP1
   1403     .unreq          TMP2
   1404     .unreq          TMP3
   1405     .unreq          TMP4
   1406 
   1407 .purgem idct_helper
   1408 
   1409 
   1410 /*****************************************************************************/
   1411 
   1412 /*
   1413  * jsimd_idct_2x2_neon
   1414  *
   1415  * This function contains inverse-DCT code for getting reduced-size
   1416  * 2x2 pixels output from an 8x8 DCT block. It uses the same calculations
   1417  * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
   1418  * function from jpeg-6b (jidctred.c).
   1419  *
   1420  * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
   1421  *       requires much less arithmetic operations and hence should be faster.
   1422  *       The primary purpose of this particular NEON optimized function is
   1423  *       bit exact compatibility with jpeg-6b.
   1424  */
   1425 
   1426 .macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
        /* 1-D 2-point inverse DCT helper for the reduced 2x2 IDCT.
         * Inputs \x4, \x6, \x10, \x12, \x16 are coefficient rows 0, 1, 3,
         * 5, 7 (the even rows 2/4/6 are unused by the 2-point IDCT).
         * Multipliers come from the lanes of v14, loaded by the caller from
         * Ljsimd_idct_2x2_neon_consts.  \shift descales; \y26/\y27 receive
         * the two outputs.  Clobbers v15, v20, v26.
         * NOTE: rshrn only encodes shifts 1..16, hence the srshr + xtn
         * fallback for larger descales. */
   1427     sshll           v15.4s, \x4, #15
   1428     smull           v26.4s, \x6, v14.h[3]
   1429     smlal           v26.4s, \x10, v14.h[2]
   1430     smlal           v26.4s, \x12, v14.h[1]
   1431     smlal           v26.4s, \x16, v14.h[0]
   1432 
        /* Butterfly: sum/difference of the DC term and the odd-row part. */
   1433     add             v20.4s, v15.4s, v26.4s
   1434     sub             v15.4s, v15.4s, v26.4s
   1435 
   1436   .if \shift > 16
   1437     srshr           v20.4s, v20.4s, #\shift
   1438     srshr           v15.4s, v15.4s, #\shift
   1439     xtn             \y26, v20.4s
   1440     xtn             \y27, v15.4s
   1441   .else
   1442     rshrn           \y26, v20.4s, #\shift
   1443     rshrn           \y27, v15.4s, #\shift
   1444   .endif
   1445 .endm
   1446 
   1447 asm_function jsimd_idct_2x2_neon
        /* Reduced-size 2x2 IDCT (bit-exact with jpeg-6b jpeg_idct_2x2).
         * NOTE(review): C signature inferred from register usage --
         * roughly (dct_table, coef_block, output_buf, output_col); confirm
         * against the libjpeg-turbo SIMD interface header. */
   1448 
   1449     DCT_TABLE       .req x0
   1450     COEF_BLOCK      .req x1
   1451     OUTPUT_BUF      .req x2
   1452     OUTPUT_COL      .req x3
        /* x0 is recycled as an output pointer once DCT_TABLE is consumed. */
   1453     TMP1            .req x0
   1454     TMP2            .req x15
   1455 
   1456     /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
   1457        guarantee that the upper (unused) 32 bits of x3 are valid.  This
   1458        instruction ensures that those bits are set to zero. */
   1459     uxtw x3, w3
   1460 
   1461     /* vpush           {v8.4h - v15.4h}            ; not available */
        /* AAPCS64: the low 64 bits of v8-v15 are callee-saved; spill them. */
   1462     sub             sp, sp, 64
   1463     mov             x9, sp
   1464 
   1465     /* Load constants */
   1466     get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
   1467     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
   1468     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
   1469     ld1             {v14.4h}, [TMP2]
   1470 
   1471     /* Load all COEF_BLOCK into NEON registers with the following allocation:
   1472      *       0 1 2 3 | 4 5 6 7
   1473      *      ---------+--------
   1474      *   0 | v4.4h   | v5.4h
   1475      *   1 | v6.4h   | v7.4h
   1476      *   2 | -       | -
   1477      *   3 | v10.4h  | v11.4h
   1478      *   4 | -       | -
   1479      *   5 | v12.4h  | v13.4h
   1480      *   6 | -       | -
   1481      *   7 | v16.4h  | v17.4h
   1482      */
        /* The even rows 2/4/6 are skipped (the 16-byte pointer bumps): the
         * 2-point IDCT does not use them. */
   1483     ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
   1484     add             COEF_BLOCK, COEF_BLOCK, #16
   1485     ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
   1486     add             COEF_BLOCK, COEF_BLOCK, #16
   1487     ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
   1488     add             COEF_BLOCK, COEF_BLOCK, #16
   1489     ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
   1490     /* Dequantize */
   1491     ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
   1492     mul             v4.4h, v4.4h, v18.4h
   1493     mul             v5.4h, v5.4h, v19.4h
   1494     ins             v4.d[1], v5.d[0]
   1495     mul             v6.4h, v6.4h, v20.4h
   1496     mul             v7.4h, v7.4h, v21.4h
   1497     ins             v6.d[1], v7.d[0]
   1498     add             DCT_TABLE, DCT_TABLE, #16
   1499     ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
   1500     mul             v10.4h, v10.4h, v24.4h
   1501     mul             v11.4h, v11.4h, v25.4h
   1502     ins             v10.d[1], v11.d[0]
   1503     add             DCT_TABLE, DCT_TABLE, #16
   1504     ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
   1505     mul             v12.4h, v12.4h, v26.4h
   1506     mul             v13.4h, v13.4h, v27.4h
   1507     ins             v12.d[1], v13.d[0]
   1508     add             DCT_TABLE, DCT_TABLE, #16
   1509     ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
   1510     mul             v16.4h, v16.4h, v30.4h
   1511     mul             v17.4h, v17.4h, v31.4h
   1512     ins             v16.d[1], v17.d[0]
   1513 
   1514     /* Pass 1 */
        /* The #if 0 branch is the disabled, readable reference form; the
         * #else branch actually assembled below is its hand-scheduled
         * expansion. */
   1515 #if 0
   1516     idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
   1517     transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
   1518     idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
   1519     transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
   1520 #else
   1521     smull           v26.4s, v6.4h, v14.h[3]
   1522     smlal           v26.4s, v10.4h, v14.h[2]
   1523     smlal           v26.4s, v12.4h, v14.h[1]
   1524     smlal           v26.4s, v16.4h, v14.h[0]
   1525     smull           v24.4s, v7.4h, v14.h[3]
   1526     smlal           v24.4s, v11.4h, v14.h[2]
   1527     smlal           v24.4s, v13.4h, v14.h[1]
   1528     smlal           v24.4s, v17.4h, v14.h[0]
   1529     sshll           v15.4s, v4.4h, #15
   1530     sshll           v30.4s, v5.4h, #15
   1531     add             v20.4s, v15.4s, v26.4s
   1532     sub             v15.4s, v15.4s, v26.4s
   1533     rshrn           v4.4h, v20.4s, #13
   1534     rshrn           v6.4h, v15.4s, #13
   1535     add             v20.4s, v30.4s, v24.4s
   1536     sub             v15.4s, v30.4s, v24.4s
   1537     rshrn           v5.4h, v20.4s, #13
   1538     rshrn           v7.4h, v15.4s, #13
   1539     ins             v4.d[1], v5.d[0]
   1540     ins             v6.d[1], v7.d[0]
   1541     transpose       v4, v6, v3, .16b, .8h
   1542     transpose       v6, v10, v3, .16b, .4s
   1543     ins             v11.d[0], v10.d[1]
   1544     ins             v7.d[0], v6.d[1]
   1545 #endif
   1546 
   1547     /* Pass 2 */
   1548     idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
   1549 
   1550     /* Range limit */
        /* Add the 0x80 center bias, then saturate-narrow to unsigned bytes. */
   1551     movi            v30.8h, #0x80
   1552     ins             v26.d[1], v27.d[0]
   1553     add             v26.8h, v26.8h, v30.8h
   1554     sqxtun          v30.8b, v26.8h
   1555     ins             v26.d[0], v30.d[0]
   1556     sqxtun          v27.8b, v26.8h
   1557 
   1558     /* Store results to the output buffer */
        /* Two pixels per output row: row 0 = {v26.b[0], v27.b[4]},
         * row 1 = {v26.b[1], v27.b[5]}. */
   1559     ldp             TMP1, TMP2, [OUTPUT_BUF]
   1560     add             TMP1, TMP1, OUTPUT_COL
   1561     add             TMP2, TMP2, OUTPUT_COL
   1562 
   1563     st1             {v26.b}[0], [TMP1], 1
   1564     st1             {v27.b}[4], [TMP1], 1
   1565     st1             {v26.b}[1], [TMP2], 1
   1566     st1             {v27.b}[5], [TMP2], 1
   1567 
        /* Restore the callee-saved low halves of v8-v15 and return. */
   1568     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
   1569     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
   1570     blr             x30
   1571 
   1572     .unreq          DCT_TABLE
   1573     .unreq          COEF_BLOCK
   1574     .unreq          OUTPUT_BUF
   1575     .unreq          OUTPUT_COL
   1576     .unreq          TMP1
   1577     .unreq          TMP2
   1578 
   1579 .purgem idct_helper
   1580 
   1581 
   1582 /*****************************************************************************/
   1583 
   1584 /*
   1585  * jsimd_ycc_extrgb_convert_neon
   1586  * jsimd_ycc_extbgr_convert_neon
   1587  * jsimd_ycc_extrgbx_convert_neon
   1588  * jsimd_ycc_extbgrx_convert_neon
   1589  * jsimd_ycc_extxbgr_convert_neon
   1590  * jsimd_ycc_extxrgb_convert_neon
   1591  *
   1592  * Colorspace conversion YCbCr -> RGB
   1593  */
   1594 
   1595 .macro do_load size
        /* Load \size pixels of Y/U/V input into v0/v4/v5.
         * Lane placement encodes the pixel position within an 8-pixel
         * group, so partial widths compose: size 4 fills lanes 0-3,
         * size 2 fills lanes 4-5, size 1 fills lane 6 (max 4+2+1 = 7
         * remaining pixels after the full 8-pixel loop).
         * Only the full-width path prefetches the next cache lines. */
   1596   .if \size == 8
   1597     ld1             {v4.8b}, [U], 8
   1598     ld1             {v5.8b}, [V], 8
   1599     ld1             {v0.8b}, [Y], 8
   1600     prfm            pldl1keep, [U, #64]
   1601     prfm            pldl1keep, [V, #64]
   1602     prfm            pldl1keep, [Y, #64]
   1603   .elseif \size == 4
   1604     ld1             {v4.b}[0], [U], 1
   1605     ld1             {v4.b}[1], [U], 1
   1606     ld1             {v4.b}[2], [U], 1
   1607     ld1             {v4.b}[3], [U], 1
   1608     ld1             {v5.b}[0], [V], 1
   1609     ld1             {v5.b}[1], [V], 1
   1610     ld1             {v5.b}[2], [V], 1
   1611     ld1             {v5.b}[3], [V], 1
   1612     ld1             {v0.b}[0], [Y], 1
   1613     ld1             {v0.b}[1], [Y], 1
   1614     ld1             {v0.b}[2], [Y], 1
   1615     ld1             {v0.b}[3], [Y], 1
   1616   .elseif \size == 2
   1617     ld1             {v4.b}[4], [U], 1
   1618     ld1             {v4.b}[5], [U], 1
   1619     ld1             {v5.b}[4], [V], 1
   1620     ld1             {v5.b}[5], [V], 1
   1621     ld1             {v0.b}[4], [Y], 1
   1622     ld1             {v0.b}[5], [Y], 1
   1623   .elseif \size == 1
   1624     ld1             {v4.b}[6], [U], 1
   1625     ld1             {v5.b}[6], [V], 1
   1626     ld1             {v0.b}[6], [Y], 1
   1627   .else
   1628     .error unsupported macroblock size
   1629   .endif
   1630 .endm
   1631 
   1632 .macro do_store bpp, size, fast_st3
        /* Store \size converted pixels to [RGB], advancing the pointer.
         * \bpp selects the layout: 24 = 3 interleaved byte channels from
         * v10/v11/v12; 32 = 4 channels from v10-v13; 16 = pre-packed
         * RGB565 halfwords in v25.  Lane indices mirror do_load (partial
         * widths use lanes 0-3, 4-5, 6).
         * \fast_st3 == 1 uses a single interleaving st3 for the 8-pixel
         * 24-bpp case; otherwise the bytes are stored one at a time --
         * presumably a workaround for targets where st3 is slow or
         * problematic (TODO confirm rationale at the macro's callers). */
   1633   .if \bpp == 24
   1634     .if \size == 8
   1635       .if \fast_st3 == 1
   1636         st3         {v10.8b, v11.8b, v12.8b}, [RGB], 24
   1637       .else
   1638         st1         {v10.b}[0], [RGB], #1
   1639         st1         {v11.b}[0], [RGB], #1
   1640         st1         {v12.b}[0], [RGB], #1
   1641 
   1642         st1         {v10.b}[1], [RGB], #1
   1643         st1         {v11.b}[1], [RGB], #1
   1644         st1         {v12.b}[1], [RGB], #1
   1645 
   1646         st1         {v10.b}[2], [RGB], #1
   1647         st1         {v11.b}[2], [RGB], #1
   1648         st1         {v12.b}[2], [RGB], #1
   1649 
   1650         st1         {v10.b}[3], [RGB], #1
   1651         st1         {v11.b}[3], [RGB], #1
   1652         st1         {v12.b}[3], [RGB], #1
   1653 
   1654         st1         {v10.b}[4], [RGB], #1
   1655         st1         {v11.b}[4], [RGB], #1
   1656         st1         {v12.b}[4], [RGB], #1
   1657 
   1658         st1         {v10.b}[5], [RGB], #1
   1659         st1         {v11.b}[5], [RGB], #1
   1660         st1         {v12.b}[5], [RGB], #1
   1661 
   1662         st1         {v10.b}[6], [RGB], #1
   1663         st1         {v11.b}[6], [RGB], #1
   1664         st1         {v12.b}[6], [RGB], #1
   1665 
   1666         st1         {v10.b}[7], [RGB], #1
   1667         st1         {v11.b}[7], [RGB], #1
   1668         st1         {v12.b}[7], [RGB], #1
   1669       .endif
   1670     .elseif \size == 4
   1671       st3           {v10.b, v11.b, v12.b}[0], [RGB], 3
   1672       st3           {v10.b, v11.b, v12.b}[1], [RGB], 3
   1673       st3           {v10.b, v11.b, v12.b}[2], [RGB], 3
   1674       st3           {v10.b, v11.b, v12.b}[3], [RGB], 3
   1675     .elseif \size == 2
   1676       st3           {v10.b, v11.b, v12.b}[4], [RGB], 3
   1677       st3           {v10.b, v11.b, v12.b}[5], [RGB], 3
   1678     .elseif \size == 1
   1679       st3           {v10.b, v11.b, v12.b}[6], [RGB], 3
   1680     .else
   1681      .error unsupported macroblock size
   1682     .endif
   1683   .elseif \bpp == 32
   1684     .if \size == 8
   1685       st4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
   1686     .elseif \size == 4
   1687       st4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
   1688       st4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
   1689       st4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
   1690       st4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
   1691     .elseif \size == 2
   1692       st4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
   1693       st4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
   1694     .elseif \size == 1
   1695       st4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
   1696     .else
   1697       .error unsupported macroblock size
   1698     .endif
   1699   .elseif \bpp == 16
   1700     .if \size == 8
   1701       st1           {v25.8h}, [RGB], 16
   1702     .elseif \size == 4
   1703       st1           {v25.4h}, [RGB], 8
   1704     .elseif \size == 2
   1705       st1           {v25.h}[4], [RGB], 2
   1706       st1           {v25.h}[5], [RGB], 2
   1707     .elseif \size == 1
   1708       st1           {v25.h}[6], [RGB], 2
   1709     .else
   1710       .error unsupported macroblock size
   1711     .endif
   1712   .else
   1713     .error unsupported bpp
   1714   .endif
   1715 .endm
   1716 
   1717 .macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, \
   1718                                            g_offs, gsize, b_offs, bsize, \
   1719                                            defsize, fast_st3
   1720 
   1721 /*
   1722  * 2-stage pipelined YCbCr->RGB conversion
   1723  */
   1724 
   1725 .macro do_yuv_to_rgb_stage1
        /* Stage 1 of the 2-stage pipelined YCbCr->RGB conversion: widen the
         * U/V bytes, add the bias held in v2.8h, and accumulate the chroma
         * products into v20-v31.  The per-lane multipliers in v1 are the
         * fixed-point coefficients named in the end-of-line comments
         * (loaded outside this excerpt -- TODO confirm the consts table). */
   1726     uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
   1727     uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
   1728     smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
   1729     smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
   1730     smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
   1731     smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
   1732     smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
   1733     smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
   1734     smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
   1735     smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
   1736 .endm
   1737 
   1738 .macro do_yuv_to_rgb_stage2
        /* Stage 2: round/narrow the stage-1 accumulators, add the widened
         * Y samples (v0.8b), and saturate into the output channel
         * registers selected by the enclosing macro's \r_offs/\g_offs/
         * \b_offs arguments (v1<offs> with element size \defsize).
         * For \bpp == 16 the three channels are instead packed into RGB565
         * halfwords in v25 via sqshlu (saturating shift) + sri (insert the
         * 5-bit and 11-bit fields). */
   1739     rshrn           v20.4h, v20.4s, #15
   1740     rshrn2          v20.8h, v22.4s, #15
   1741     rshrn           v24.4h, v24.4s, #14
   1742     rshrn2          v24.8h, v26.4s, #14
   1743     rshrn           v28.4h, v28.4s, #14
   1744     rshrn2          v28.8h, v30.4s, #14
   1745     uaddw           v20.8h, v20.8h, v0.8b
   1746     uaddw           v24.8h, v24.8h, v0.8b
   1747     uaddw           v28.8h, v28.8h, v0.8b
   1748   .if \bpp != 16
   1749     sqxtun          v1\g_offs\defsize, v20.8h
   1750     sqxtun          v1\r_offs\defsize, v24.8h
   1751     sqxtun          v1\b_offs\defsize, v28.8h
   1752   .else
   1753     sqshlu          v21.8h, v20.8h, #8
   1754     sqshlu          v25.8h, v24.8h, #8
   1755     sqshlu          v29.8h, v28.8h, #8
   1756     sri             v25.8h, v21.8h, #5
   1757     sri             v25.8h, v29.8h, #11
   1758   .endif
   1759 .endm
   1760 
/*
 * Fused steady-state loop body: performs the same arithmetic as
 * do_yuv_to_rgb_stage2 + do_store for the current 8 pixels, while
 * interleaving the Y/U/V loads, prefetches, and the first multiplies
 * (stage 1) for the NEXT 8 pixels to hide memory latency.  The
 * instruction ordering is deliberate -- do not reorder.
 */
.macro do_yuv_to_rgb_stage2_store_load_stage1 fast_st3
    rshrn           v20.4h, v20.4s, #15
    rshrn           v24.4h, v24.4s, #14
    rshrn           v28.4h, v28.4s, #14
    ld1             {v4.8b}, [U], 8         /* load next 8 Cb samples */
    rshrn2          v20.8h, v22.4s, #15
    rshrn2          v24.8h, v26.4s, #14
    rshrn2          v28.8h, v30.4s, #14
    ld1             {v5.8b}, [V], 8         /* load next 8 Cr samples */
    uaddw           v20.8h, v20.8h, v0.8b   /* G += Y */
    uaddw           v24.8h, v24.8h, v0.8b   /* R += Y */
    uaddw           v28.8h, v28.8h, v0.8b   /* B += Y */
  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
    sqxtun          v1\g_offs\defsize, v20.8h
    ld1             {v0.8b}, [Y], 8         /* load next 8 Y samples */
    sqxtun          v1\r_offs\defsize, v24.8h
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sqxtun          v1\b_offs\defsize, v28.8h
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
  .else  /**************************** rgb565 ********************************/
    sqshlu          v21.8h, v20.8h, #8
    sqshlu          v25.8h, v24.8h, #8
    sqshlu          v29.8h, v28.8h, #8
    uaddw           v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
    uaddw           v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
    ld1             {v0.8b}, [Y], 8         /* load next 8 Y samples */
    smull           v20.4s, v6.4h, v1.h[1]  /* multiply by -11277 */
    smlal           v20.4s, v8.4h, v1.h[2]  /* multiply by -23401 */
    smull2          v22.4s, v6.8h, v1.h[1]  /* multiply by -11277 */
    smlal2          v22.4s, v8.8h, v1.h[2]  /* multiply by -23401 */
    sri             v25.8h, v21.8h, #5      /* merge G into RGB565 word */
    smull           v24.4s, v8.4h, v1.h[0]  /* multiply by 22971 */
    smull2          v26.4s, v8.8h, v1.h[0]  /* multiply by 22971 */
    prfm            pldl1keep, [U, #64]
    prfm            pldl1keep, [V, #64]
    prfm            pldl1keep, [Y, #64]
    sri             v25.8h, v29.8h, #11     /* merge B into RGB565 word */
  .endif
    do_store        \bpp, 8, \fast_st3
    smull           v28.4s, v6.4h, v1.h[3]  /* multiply by 29033 */
    smull2          v30.4s, v6.8h, v1.h[3]  /* multiply by 29033 */
.endm
   1812 
/* One complete (non-pipelined) conversion step: stage 1 then stage 2.
   Used outside the steady-state loop (first/partial iterations). */
.macro do_yuv_to_rgb
    do_yuv_to_rgb_stage1
    do_yuv_to_rgb_stage2
.endm
   1817 
   1818 /* Apple gas crashes on adrl, work around that by using adr.
   1819  * But this requires a copy of these constants for each function.
   1820  */
   1821 
/*
 * Generated entry point (one per colorid/fast_st3 combination):
 *   jsimd_ycc_<colorid>_convert_neon[_slowst3](output_width, input_buf,
 *                                              input_row, output_buf,
 *                                              num_rows)
 * NOTE(review): argument meanings inferred from the register aliases
 * below -- confirm against the C prototypes.  The _slowst3 variant
 * presumably avoids ST3 in do_store (slow on some cores) -- do_store is
 * defined elsewhere in this file; verify there.
 */
.if \fast_st3 == 1
asm_function jsimd_ycc_\colorid\()_convert_neon
.else
asm_function jsimd_ycc_\colorid\()_convert_neon_slowst3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    INPUT_ROW       .req w2
    OUTPUT_BUF      .req x3
    NUM_ROWS        .req w4

    INPUT_BUF0      .req x5     /* row-pointer arrays for Y, Cb, Cr planes */
    INPUT_BUF1      .req x6
    INPUT_BUF2      .req x1     /* reuses x1 after INPUT_BUF is consumed */

    RGB             .req x7     /* output write pointer */
    Y               .req x9     /* per-row Y/U/V read pointers */
    U               .req x10
    V               .req x11
    N               .req w15    /* pixels remaining in the current row */

    /* Reserve a 64-byte spill area for the callee-saved NEON registers
       (AAPCS64: the low 64 bits of v8-v15 must be preserved). */
    sub             sp, sp, 64
    mov             x9, sp

    /* Load constants to d1, d2, d3 (v0.4h is just used for padding) */
    get_symbol_loc x15, Ljsimd_ycc_colorid_neon_consts

    /* Save NEON registers */
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
    /* v0.4h/v1.4h = conversion coefficients; v2.8h is added to the widened
       chroma bytes below ("u - 128"), so it presumably holds -128 -- confirm
       against the consts table. */
    ld1             {v0.4h, v1.4h}, [x15], 16
    ld1             {v2.8h}, [x15]

    ldr             INPUT_BUF0, [INPUT_BUF]
    ldr             INPUT_BUF1, [INPUT_BUF, #8]
    ldr             INPUT_BUF2, [INPUT_BUF, #16]
    .unreq          INPUT_BUF

    /* Fill v10 and v13 with 0xFF -- presumably the opaque alpha byte for
       the 4-byte pixel formats (confirm against do_store). */
    movi            v10.16b, #255
    movi            v13.16b, #255

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [INPUT_BUF0, INPUT_ROW, uxtw #3]
    ldr             U, [INPUT_BUF1, INPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [INPUT_BUF2, INPUT_ROW, uxtw #3]
    add             INPUT_ROW, INPUT_ROW, #1
    ldr             RGB, [OUTPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f                      /* fewer than 8 pixels in row */
    do_load         8
    do_yuv_to_rgb_stage1
    subs            N, N, #8
    b.lt            2f
1:  /* Steady state: fused stage2/store/load/stage1, 8 pixels per pass */
    do_yuv_to_rgb_stage2_store_load_stage1 \fast_st3
    subs            N, N, #8
    b.ge            1b
2:  /* Drain the pipeline for the last full group of 8 */
    do_yuv_to_rgb_stage2
    do_store        \bpp, 8, \fast_st3
    tst             N, #7
    b.eq            8f
3:  /* Remainder: load 4, 2, then 1 leftover pixels as indicated by the
       low bits of N (note the reused numeric label "3" below). */
    tst             N, #4
    b.eq            3f
    do_load         4
3:
    tst             N, #2
    b.eq            4f
    do_load         2
4:
    tst             N, #1
    b.eq            5f
    do_load         1
5:  /* Convert and store the remainder in matching 4/2/1 chunks */
    do_yuv_to_rgb
    tst             N, #4
    b.eq            6f
    do_store        \bpp, 4, \fast_st3
6:
    tst             N, #2
    b.eq            7f
    do_store        \bpp, 2, \fast_st3
7:
    tst             N, #1
    b.eq            8f
    do_store        \bpp, 1, \fast_st3
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30
    .unreq          OUTPUT_WIDTH
    .unreq          INPUT_ROW
    .unreq          OUTPUT_BUF
    .unreq          NUM_ROWS
    .unreq          INPUT_BUF0
    .unreq          INPUT_BUF1
    .unreq          INPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

/* Discard the per-instantiation helper macros so the next instantiation
   can redefine them. */
.purgem do_yuv_to_rgb
.purgem do_yuv_to_rgb_stage1
.purgem do_yuv_to_rgb_stage2
.purgem do_yuv_to_rgb_stage2_store_load_stage1
   1941 
   1942 .endm
   1943 
   1944 /*--------------------------------- id ----- bpp R  rsize G  gsize B  bsize defsize fast_st3*/
   1945 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
   1946 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
   1947 generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,  1, .4h,  2, .4h,  .8b,    1
   1948 generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,  1, .4h,  0, .4h,  .8b,    1
   1949 generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,  2, .4h,  1, .4h,  .8b,    1
   1950 generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,  2, .4h,  3, .4h,  .8b,    1
   1951 generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,  0, .4h,  0, .4h,  .8b,    1
   1952 
   1953 generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,  1, .4h,  2, .4h,  .8b,    0
   1954 generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,  1, .4h,  0, .4h,  .8b,    0
   1955 
   1956 .purgem do_load
   1957 .purgem do_store
   1958 
   1959 
   1960 /*****************************************************************************/
   1961 
   1962 /*
   1963  * jsimd_extrgb_ycc_convert_neon
   1964  * jsimd_extbgr_ycc_convert_neon
   1965  * jsimd_extrgbx_ycc_convert_neon
   1966  * jsimd_extbgrx_ycc_convert_neon
   1967  * jsimd_extxbgr_ycc_convert_neon
   1968  * jsimd_extxrgb_ycc_convert_neon
   1969  *
   1970  * Colorspace conversion RGB -> YCbCr
   1971  */
   1972 
/*
 * Store \size converted Y/Cb/Cr bytes (in v20/v21/v22 respectively) to the
 * Y, U and V output pointers, post-incrementing each.  Partial sizes store
 * individual byte lanes -- size 4 uses lanes 0-3, size 2 lanes 4-5, size 1
 * lane 6 -- matching the lanes filled by the corresponding partial do_load.
 */
.macro do_store size
  .if \size == 8
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
  .elseif \size == 4
    st1             {v20.b}[0], [Y], #1
    st1             {v20.b}[1], [Y], #1
    st1             {v20.b}[2], [Y], #1
    st1             {v20.b}[3], [Y], #1
    st1             {v21.b}[0], [U], #1
    st1             {v21.b}[1], [U], #1
    st1             {v21.b}[2], [U], #1
    st1             {v21.b}[3], [U], #1
    st1             {v22.b}[0], [V], #1
    st1             {v22.b}[1], [V], #1
    st1             {v22.b}[2], [V], #1
    st1             {v22.b}[3], [V], #1
  .elseif \size == 2
    st1             {v20.b}[4], [Y], #1
    st1             {v20.b}[5], [Y], #1
    st1             {v21.b}[4], [U], #1
    st1             {v21.b}[5], [U], #1
    st1             {v22.b}[4], [V], #1
    st1             {v22.b}[5], [V], #1
  .elseif \size == 1
    st1             {v20.b}[6], [Y], #1
    st1             {v21.b}[6], [U], #1
    st1             {v22.b}[6], [V], #1
  .else
    .error unsupported macroblock size
  .endif
.endm
   2006 
/*
 * Load \size pixels from [RGB] (post-incremented), deinterleaving the
 * components in memory order into v10/v11/v12 (and v13 for 32-bit
 * formats).  Which register holds R/G/B is decided later by the
 * \r_offs/\g_offs/\b_offs arguments of the enclosing macro.
 * \fast_ld3 selects between a single LD3 and a byte-by-byte fallback for
 * the 8-pixel/24-bpp case (LD3 is slow on some cores; see the _slowld3
 * function variants).  Partial sizes fill lanes 0-3 / 4-5 / 6, matching
 * the lanes consumed by the partial do_store above.
 */
.macro do_load bpp, size, fast_ld3
  .if \bpp == 24
    .if \size == 8
      .if \fast_ld3 == 1
        ld3         {v10.8b, v11.8b, v12.8b}, [RGB], #24
      .else
        /* Scalar fallback: one byte per component per pixel */
        ld1         {v10.b}[0], [RGB], #1
        ld1         {v11.b}[0], [RGB], #1
        ld1         {v12.b}[0], [RGB], #1

        ld1         {v10.b}[1], [RGB], #1
        ld1         {v11.b}[1], [RGB], #1
        ld1         {v12.b}[1], [RGB], #1

        ld1         {v10.b}[2], [RGB], #1
        ld1         {v11.b}[2], [RGB], #1
        ld1         {v12.b}[2], [RGB], #1

        ld1         {v10.b}[3], [RGB], #1
        ld1         {v11.b}[3], [RGB], #1
        ld1         {v12.b}[3], [RGB], #1

        ld1         {v10.b}[4], [RGB], #1
        ld1         {v11.b}[4], [RGB], #1
        ld1         {v12.b}[4], [RGB], #1

        ld1         {v10.b}[5], [RGB], #1
        ld1         {v11.b}[5], [RGB], #1
        ld1         {v12.b}[5], [RGB], #1

        ld1         {v10.b}[6], [RGB], #1
        ld1         {v11.b}[6], [RGB], #1
        ld1         {v12.b}[6], [RGB], #1

        ld1         {v10.b}[7], [RGB], #1
        ld1         {v11.b}[7], [RGB], #1
        ld1         {v12.b}[7], [RGB], #1
      .endif
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld3           {v10.b, v11.b, v12.b}[0], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[1], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[2], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[3], [RGB], #3
    .elseif \size == 2
      ld3           {v10.b, v11.b, v12.b}[4], [RGB], #3
      ld3           {v10.b, v11.b, v12.b}[5], [RGB], #3
    .elseif \size == 1
      ld3           {v10.b, v11.b, v12.b}[6], [RGB], #3
    .else
      .error unsupported macroblock size
    .endif
  .elseif \bpp == 32
    .if \size == 8
      ld4           {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], #32
      prfm          pldl1keep, [RGB, #128]
    .elseif \size == 4
      ld4           {v10.b, v11.b, v12.b, v13.b}[0], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[1], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[2], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[3], [RGB], #4
    .elseif \size == 2
      ld4           {v10.b, v11.b, v12.b, v13.b}[4], [RGB], #4
      ld4           {v10.b, v11.b, v12.b, v13.b}[5], [RGB], #4
    .elseif \size == 1
      ld4           {v10.b, v11.b, v12.b, v13.b}[6], [RGB], #4
    .else
      .error unsupported macroblock size
    .endif
  .else
    .error unsupported bpp
  .endif
.endm
   2080 
/*
 * Instantiates one RGB->YCbCr conversion function for a given extended
 * RGB pixel layout.  \r_offs/\g_offs/\b_offs select which of v10..v13
 * (filled by do_load in memory order) holds each component; \fast_ld3
 * selects the LD3-based or byte-by-byte load for 24-bpp input.
 */
.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, \
                                           b_offs, fast_ld3

/*
 * 2-stage pipelined RGB->YCbCr conversion
 */

/* Stage 1: widen R/G/B to 16 bits and compute the 32-bit Y (v14/v16),
 * Cb (v18/v26) and Cr (v28/v30) accumulators with coefficients from the
 * lanes of v0.  The REV64s seed the Cb/Cr accumulators from v1 -- the
 * bias/rounding constants loaded from the consts table (word-swapped;
 * confirm against Ljsimd_<colorid>_ycc_neon_consts). */
.macro do_rgb_to_yuv_stage1
    ushll           v4.8h, v1\r_offs\().8b, #0  /* r = v4 */
    ushll           v6.8h, v1\g_offs\().8b, #0  /* g = v6 */
    ushll           v8.8h, v1\b_offs\().8b, #0  /* b = v8 */
    rev64           v18.4s, v1.4s
    rev64           v26.4s, v1.4s
    rev64           v28.4s, v1.4s
    rev64           v30.4s, v1.4s
    umull           v14.4s, v4.4h, v0.h[0]
    umull2          v16.4s, v4.8h, v0.h[0]
    umlsl           v18.4s, v4.4h, v0.h[3]
    umlsl2          v26.4s, v4.8h, v0.h[3]
    umlal           v28.4s, v4.4h, v0.h[5]
    umlal2          v30.4s, v4.8h, v0.h[5]
    umlal           v14.4s, v6.4h, v0.h[1]
    umlal2          v16.4s, v6.8h, v0.h[1]
    umlsl           v18.4s, v6.4h, v0.h[4]
    umlsl2          v26.4s, v6.8h, v0.h[4]
    umlsl           v28.4s, v6.4h, v0.h[6]
    umlsl2          v30.4s, v6.8h, v0.h[6]
    umlal           v14.4s, v8.4h, v0.h[2]
    umlal2          v16.4s, v8.8h, v0.h[2]
    umlal           v18.4s, v8.4h, v0.h[5]
    umlal2          v26.4s, v8.8h, v0.h[5]
    umlsl           v28.4s, v8.4h, v0.h[7]
    umlsl2          v30.4s, v8.8h, v0.h[7]
.endm

/* Stage 2: descale the accumulators by 16 bits and narrow to the 8-bit
 * Y/Cb/Cr results in v20/v21/v22.  Y uses rounding narrows (RSHRN) while
 * Cb/Cr use plain SHRN -- presumably because their bias constant already
 * includes the rounding term (confirm against the consts table). */
.macro do_rgb_to_yuv_stage2
    rshrn           v20.4h, v14.4s, #16
    shrn            v22.4h, v18.4s, #16
    shrn            v24.4h, v28.4s, #16
    rshrn2          v20.8h, v16.4s, #16
    shrn2           v22.8h, v26.4s, #16
    shrn2           v24.8h, v30.4s, #16
    xtn             v20.8b, v20.8h       /* v20 = y */
    xtn             v21.8b, v22.8h       /* v21 = u */
    xtn             v22.8b, v24.8h       /* v22 = v */
.endm

/* One complete (non-pipelined) conversion: stage 1 then stage 2. */
.macro do_rgb_to_yuv
    do_rgb_to_yuv_stage1
    do_rgb_to_yuv_stage2
.endm

/* TODO: expand macros and interleave instructions if some in-order
 *       ARM64 processor actually can dual-issue LOAD/STORE with ALU */
.macro do_rgb_to_yuv_stage2_store_load_stage1 fast_ld3
    do_rgb_to_yuv_stage2
    do_load         \bpp, 8, \fast_ld3
    st1             {v20.8b}, [Y], #8
    st1             {v21.8b}, [U], #8
    st1             {v22.8b}, [V], #8
    do_rgb_to_yuv_stage1
.endm

/*
 * Generated entry point:
 *   jsimd_<colorid>_ycc_convert_neon[_slowld3](output_width, input_buf,
 *                                              output_buf, output_row,
 *                                              num_rows)
 * NOTE(review): argument meanings inferred from the register aliases
 * below -- confirm against the C prototypes.
 */
.if \fast_ld3 == 1
asm_function jsimd_\colorid\()_ycc_convert_neon
.else
asm_function jsimd_\colorid\()_ycc_convert_neon_slowld3
.endif
    OUTPUT_WIDTH    .req w0
    INPUT_BUF       .req x1
    OUTPUT_BUF      .req x2
    OUTPUT_ROW      .req w3
    NUM_ROWS        .req w4

    OUTPUT_BUF0     .req x5     /* row-pointer arrays for Y, Cb, Cr planes */
    OUTPUT_BUF1     .req x6
    OUTPUT_BUF2     .req x2  /* OUTPUT_BUF */

    RGB             .req x7     /* input read pointer */
    Y               .req x9     /* per-row Y/U/V write pointers */
    U               .req x10
    V               .req x11
    N               .req w12    /* pixels remaining in the current row */

    /* Load constants to d0, d1, d2, d3 */
    get_symbol_loc x13, Ljsimd_colorid_ycc_neon_consts

    ld1             {v0.8h, v1.8h}, [x13]

    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #8]
    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #16]
    .unreq          OUTPUT_BUF

    /* Save NEON registers (AAPCS64 callee-saved low halves of v8-v15) */
    sub             sp, sp, #64
    mov             x9, sp
    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32

    /* Outer loop over scanlines */
    cmp             NUM_ROWS, #1
    b.lt            9f
0:
    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, uxtw #3]
    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, uxtw #3]
    mov             N, OUTPUT_WIDTH
    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, uxtw #3]
    add             OUTPUT_ROW, OUTPUT_ROW, #1
    ldr             RGB, [INPUT_BUF], #8

    /* Inner loop over pixels */
    subs            N, N, #8
    b.lt            3f                      /* fewer than 8 pixels in row */
    do_load         \bpp, 8, \fast_ld3
    do_rgb_to_yuv_stage1
    subs            N, N, #8
    b.lt            2f
1:  /* Steady state: fused stage2/store/load/stage1, 8 pixels per pass */
    do_rgb_to_yuv_stage2_store_load_stage1 \fast_ld3
    subs            N, N, #8
    b.ge            1b
2:  /* Drain the pipeline for the last full group of 8 */
    do_rgb_to_yuv_stage2
    do_store        8
    tst             N, #7
    b.eq            8f
3:  /* Remainder: load 4, 2, then 1 leftover pixels per the low bits of N
       (note the reused numeric label "3" below) */
    tbz             N, #2, 3f
    do_load         \bpp, 4, \fast_ld3
3:
    tbz             N, #1, 4f
    do_load         \bpp, 2, \fast_ld3
4:
    tbz             N, #0, 5f
    do_load         \bpp, 1, \fast_ld3
5:  /* Convert and store the remainder in matching 4/2/1 chunks */
    do_rgb_to_yuv
    tbz             N, #2, 6f
    do_store        4
6:
    tbz             N, #1, 7f
    do_store        2
7:
    tbz             N, #0, 8f
    do_store        1
8:
    subs            NUM_ROWS, NUM_ROWS, #1
    b.gt            0b
9:
    /* Restore all registers and return */
    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
    br              x30

    .unreq          OUTPUT_WIDTH
    .unreq          OUTPUT_ROW
    .unreq          INPUT_BUF
    .unreq          NUM_ROWS
    .unreq          OUTPUT_BUF0
    .unreq          OUTPUT_BUF1
    .unreq          OUTPUT_BUF2
    .unreq          RGB
    .unreq          Y
    .unreq          U
    .unreq          V
    .unreq          N

/* Discard the per-instantiation helper macros so the next instantiation
   can redefine them. */
.purgem do_rgb_to_yuv
.purgem do_rgb_to_yuv_stage1
.purgem do_rgb_to_yuv_stage2
.purgem do_rgb_to_yuv_stage2_store_load_stage1

.endm
   2256 
   2257 /*--------------------------------- id ----- bpp R  G  B  Fast LD3 */
   2258 generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 1
   2259 generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 1
   2260 generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2, 1
   2261 generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0, 1
   2262 generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1, 1
   2263 generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3, 1
   2264 
   2265 generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2, 0
   2266 generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0
   2267 
   2268 .purgem do_load
   2269 .purgem do_store
   2270 
   2271 
   2272 /*****************************************************************************/
   2273 
   2274 /*
   2275  * Load data into workspace, applying unsigned->signed conversion
   2276  *
   2277  * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
   2278  *       rid of VST1.16 instructions
   2279  */
   2280 
/*
 * jsimd_convsamp_neon(sample_data, start_col, workspace)
 *
 * Copies an 8x8 block of samples into the DCT workspace: reads 8 row
 * pointers from SAMPLE_DATA, offsets each by START_COL, loads 8 unsigned
 * bytes per row, subtracts 128 (USUBL against v0 = 128) to center the
 * samples, and stores the 64 resulting 16-bit values to WORKSPACE.
 * Loads and arithmetic are interleaved to hide memory latency.
 * Only caller-saved NEON registers (v0, v16-v23) are used, so no NEON
 * save/restore is needed.
 */
asm_function jsimd_convsamp_neon
    SAMPLE_DATA     .req x0
    START_COL       .req x1
    WORKSPACE       .req x2
    TMP1            .req x9     /* TMP1-TMP8: the 8 row base pointers */
    TMP2            .req x10
    TMP3            .req x11
    TMP4            .req x12
    TMP5            .req x13
    TMP6            .req x14
    TMP7            .req x15
    TMP8            .req x4
    TMPDUP          .req w3

    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
       guarantee that the upper (unused) 32 bits of x1 are valid.  This
       instruction ensures that those bits are set to zero. */
    uxtw x1, w1

    mov             TMPDUP, #128
    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
    dup             v0.8b, TMPDUP               /* v0 = centering bias 128 */
    add             TMP1, TMP1, START_COL
    add             TMP2, TMP2, START_COL
    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
    add             TMP3, TMP3, START_COL
    add             TMP4, TMP4, START_COL
    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
    add             TMP5, TMP5, START_COL
    add             TMP6, TMP6, START_COL
    ld1             {v16.8b}, [TMP1]
    add             TMP7, TMP7, START_COL
    add             TMP8, TMP8, START_COL
    ld1             {v17.8b}, [TMP2]
    usubl           v16.8h, v16.8b, v0.8b       /* row 0: sample - 128 */
    ld1             {v18.8b}, [TMP3]
    usubl           v17.8h, v17.8b, v0.8b
    ld1             {v19.8b}, [TMP4]
    usubl           v18.8h, v18.8b, v0.8b
    ld1             {v20.8b}, [TMP5]
    usubl           v19.8h, v19.8b, v0.8b
    ld1             {v21.8b}, [TMP6]
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
    usubl           v20.8h, v20.8b, v0.8b
    ld1             {v22.8b}, [TMP7]
    usubl           v21.8h, v21.8b, v0.8b
    ld1             {v23.8b}, [TMP8]
    usubl           v22.8h, v22.8b, v0.8b
    usubl           v23.8h, v23.8b, v0.8b
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64

    br              x30

    .unreq          SAMPLE_DATA
    .unreq          START_COL
    .unreq          WORKSPACE
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMP4
    .unreq          TMP5
    .unreq          TMP6
    .unreq          TMP7
    .unreq          TMP8
    .unreq          TMPDUP
   2347 
   2348 /*****************************************************************************/
   2349 
   2350 /*
   2351  * jsimd_fdct_islow_neon
   2352  *
   2353  * This file contains a slow-but-accurate integer implementation of the
   2354  * forward DCT (Discrete Cosine Transform). The following code is based
 * directly on the IJG's original jfdctint.c; see jfdctint.c for
 * more details.
   2357  *
   2358  * TODO: can be combined with 'jsimd_convsamp_neon' to get
   2359  *       rid of a bunch of VLD1.16 instructions
   2360  */
   2361 
/* Fixed-point parameters of the accurate forward DCT (same scheme as the
   C jfdctint.c): CONST_BITS fraction bits for the multiplies, PASS1_BITS
   of extra scaling carried between the two passes. */
#define CONST_BITS  13
#define PASS1_BITS  2

#define DESCALE_P1  (CONST_BITS - PASS1_BITS)  /* descale shift after pass 1 */
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)  /* descale shift after pass 2 */

/* Lane aliases for the DCT constants loaded from
   Ljsimd_fdct_islow_neon_consts into v0/v1; XFIX_P_x_yyy / XFIX_N_x_yyy
   name the fixed-point value +/- x.yyy... held in that lane. */
#define XFIX_P_0_298  v0.h[0]
#define XFIX_N_0_390  v0.h[1]
#define XFIX_P_0_541  v0.h[2]
#define XFIX_P_0_765  v0.h[3]
#define XFIX_N_0_899  v0.h[4]
#define XFIX_P_1_175  v0.h[5]
#define XFIX_P_1_501  v0.h[6]
#define XFIX_N_1_847  v0.h[7]
#define XFIX_N_1_961  v1.h[0]
#define XFIX_P_2_053  v1.h[1]
#define XFIX_N_2_562  v1.h[2]
#define XFIX_P_3_072  v1.h[3]
   2379 #define XFIX_P_3_072  v1.h[3]
   2380 
   2381 asm_function jsimd_fdct_islow_neon
   2382 
   2383     DATA            .req x0
   2384     TMP             .req x9
   2385 
   2386     /* Load constants */
   2387     get_symbol_loc  TMP, Ljsimd_fdct_islow_neon_consts
   2388     ld1             {v0.8h, v1.8h}, [TMP]
   2389 
   2390     /* Save NEON registers */
   2391     sub             sp, sp, #64
   2392     mov             x10, sp
   2393     st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x10], 32
   2394     st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x10], 32
   2395 
   2396     /* Load all DATA into NEON registers with the following allocation:
   2397      *       0 1 2 3 | 4 5 6 7
   2398      *      ---------+--------
   2399      *   0 | d16     | d17    | v16.8h
   2400      *   1 | d18     | d19    | v17.8h
   2401      *   2 | d20     | d21    | v18.8h
   2402      *   3 | d22     | d23    | v19.8h
   2403      *   4 | d24     | d25    | v20.8h
   2404      *   5 | d26     | d27    | v21.8h
   2405      *   6 | d28     | d29    | v22.8h
   2406      *   7 | d30     | d31    | v23.8h
   2407      */
   2408 
   2409     ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
   2410     ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
   2411     sub             DATA, DATA, #64
   2412 
   2413     /* Transpose */
   2414     transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
   2415     /* 1-D FDCT */
   2416     add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
   2417     sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
   2418     add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
   2419     sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
   2420     add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
   2421     sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
   2422     add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
   2423     sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
   2424 
   2425     /* even part */
   2426 
   2427     add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
   2428     sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
   2429     add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
   2430     sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
   2431 
   2432     add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
   2433     sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
   2434 
   2435     add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
   2436 
   2437     shl             v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)LEFT_SHIFT(tmp10 + tmp11, PASS1_BITS); */
   2438     shl             v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)LEFT_SHIFT(tmp10 - tmp11, PASS1_BITS); */
   2439 
   2440     smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   2441     smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   2442     mov             v22.16b, v18.16b
   2443     mov             v25.16b, v24.16b
   2444 
   2445     smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   2446     smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   2447     smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   2448     smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   2449 
   2450     rshrn           v18.4h, v18.4s, #DESCALE_P1
   2451     rshrn           v22.4h, v22.4s, #DESCALE_P1
   2452     rshrn2          v18.8h, v24.4s, #DESCALE_P1  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
   2453     rshrn2          v22.8h, v25.4s, #DESCALE_P1  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
   2454 
   2455     /* Odd part */
   2456 
   2457     add             v8.8h, v28.8h, v31.8h        /* z1 = tmp4 + tmp7; */
   2458     add             v9.8h, v29.8h, v30.8h        /* z2 = tmp5 + tmp6; */
   2459     add             v10.8h, v28.8h, v30.8h       /* z3 = tmp4 + tmp6; */
   2460     add             v11.8h, v29.8h, v31.8h       /* z4 = tmp5 + tmp7; */
   2461     smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
   2462     smull2          v5.4s, v10.8h, XFIX_P_1_175
   2463     smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
   2464     smlal2          v5.4s, v11.8h, XFIX_P_1_175
   2465 
   2466     smull2          v24.4s, v28.8h, XFIX_P_0_298
   2467     smull2          v25.4s, v29.8h, XFIX_P_2_053
   2468     smull2          v26.4s, v30.8h, XFIX_P_3_072
   2469     smull2          v27.4s, v31.8h, XFIX_P_1_501
   2470     smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
   2471     smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
   2472     smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
   2473     smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
   2474 
   2475     smull2          v12.4s, v8.8h, XFIX_N_0_899
   2476     smull2          v13.4s, v9.8h, XFIX_N_2_562
   2477     smull2          v14.4s, v10.8h, XFIX_N_1_961
   2478     smull2          v15.4s, v11.8h, XFIX_N_0_390
   2479     smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
   2480     smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
   2481     smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
   2482     smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
   2483 
   2484     add             v10.4s, v10.4s, v4.4s  /* z3 += z5 */
   2485     add             v14.4s, v14.4s, v5.4s
   2486     add             v11.4s, v11.4s, v4.4s  /* z4 += z5 */
   2487     add             v15.4s, v15.4s, v5.4s
   2488 
   2489     add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
   2490     add             v24.4s, v24.4s, v12.4s
   2491     add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
   2492     add             v25.4s, v25.4s, v13.4s
   2493     add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
   2494     add             v26.4s, v26.4s, v14.4s
   2495     add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
   2496     add             v27.4s, v27.4s, v15.4s
   2497 
   2498     add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
   2499     add             v24.4s, v24.4s, v14.4s
   2500     add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
   2501     add             v25.4s, v25.4s, v15.4s
   2502     add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
   2503     add             v26.4s, v26.4s, v13.4s
   2504     add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
   2505     add             v27.4s, v27.4s, v12.4s
   2506 
   2507     rshrn           v23.4h, v28.4s, #DESCALE_P1
   2508     rshrn           v21.4h, v29.4s, #DESCALE_P1
   2509     rshrn           v19.4h, v30.4s, #DESCALE_P1
   2510     rshrn           v17.4h, v31.4s, #DESCALE_P1
   2511     rshrn2          v23.8h, v24.4s, #DESCALE_P1  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
   2512     rshrn2          v21.8h, v25.4s, #DESCALE_P1  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
   2513     rshrn2          v19.8h, v26.4s, #DESCALE_P1  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
   2514     rshrn2          v17.8h, v27.4s, #DESCALE_P1  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
   2515 
   2516     /* Transpose */
   2517     transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v31, v2, v3, v4
   2518 
   2519     /* 1-D FDCT */
   2520     add             v24.8h, v16.8h, v23.8h  /* tmp0 = dataptr[0] + dataptr[7]; */
   2521     sub             v31.8h, v16.8h, v23.8h  /* tmp7 = dataptr[0] - dataptr[7]; */
   2522     add             v25.8h, v17.8h, v22.8h  /* tmp1 = dataptr[1] + dataptr[6]; */
   2523     sub             v30.8h, v17.8h, v22.8h  /* tmp6 = dataptr[1] - dataptr[6]; */
   2524     add             v26.8h, v18.8h, v21.8h  /* tmp2 = dataptr[2] + dataptr[5]; */
   2525     sub             v29.8h, v18.8h, v21.8h  /* tmp5 = dataptr[2] - dataptr[5]; */
   2526     add             v27.8h, v19.8h, v20.8h  /* tmp3 = dataptr[3] + dataptr[4]; */
   2527     sub             v28.8h, v19.8h, v20.8h  /* tmp4 = dataptr[3] - dataptr[4]; */
   2528 
   2529     /* even part */
   2530     add             v8.8h, v24.8h, v27.8h   /* tmp10 = tmp0 + tmp3; */
   2531     sub             v9.8h, v24.8h, v27.8h   /* tmp13 = tmp0 - tmp3; */
   2532     add             v10.8h, v25.8h, v26.8h  /* tmp11 = tmp1 + tmp2; */
   2533     sub             v11.8h, v25.8h, v26.8h  /* tmp12 = tmp1 - tmp2; */
   2534 
   2535     add             v16.8h, v8.8h, v10.8h  /* tmp10 + tmp11 */
   2536     sub             v20.8h, v8.8h, v10.8h  /* tmp10 - tmp11 */
   2537 
   2538     add             v18.8h, v11.8h, v9.8h  /* tmp12 + tmp13 */
   2539 
   2540     srshr           v16.8h, v16.8h, #PASS1_BITS  /* dataptr[0] = (DCTELEM)DESCALE(tmp10 + tmp11, PASS1_BITS); */
   2541     srshr           v20.8h, v20.8h, #PASS1_BITS  /* dataptr[4] = (DCTELEM)DESCALE(tmp10 - tmp11, PASS1_BITS); */
   2542 
   2543     smull2          v24.4s, v18.8h, XFIX_P_0_541  /* z1 hi = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   2544     smull           v18.4s, v18.4h, XFIX_P_0_541  /* z1 lo = MULTIPLY(tmp12 + tmp13, XFIX_P_0_541); */
   2545     mov             v22.16b, v18.16b
   2546     mov             v25.16b, v24.16b
   2547 
   2548     smlal           v18.4s, v9.4h, XFIX_P_0_765   /* lo z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   2549     smlal2          v24.4s, v9.8h, XFIX_P_0_765   /* hi z1 + MULTIPLY(tmp13, XFIX_P_0_765) */
   2550     smlal           v22.4s, v11.4h, XFIX_N_1_847  /* lo z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   2551     smlal2          v25.4s, v11.8h, XFIX_N_1_847  /* hi z1 + MULTIPLY(tmp12, XFIX_N_1_847) */
   2552 
   2553     rshrn           v18.4h, v18.4s, #DESCALE_P2
   2554     rshrn           v22.4h, v22.4s, #DESCALE_P2
   2555     rshrn2          v18.8h, v24.4s, #DESCALE_P2  /* dataptr[2] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp13, XFIX_P_0_765), CONST_BITS-PASS1_BITS); */
   2556     rshrn2          v22.8h, v25.4s, #DESCALE_P2  /* dataptr[6] = (DCTELEM)DESCALE(z1 + MULTIPLY(tmp12, XFIX_N_1_847), CONST_BITS-PASS1_BITS); */
   2557 
   2558     /* Odd part */
   2559     add             v8.8h, v28.8h, v31.8h   /* z1 = tmp4 + tmp7; */
   2560     add             v9.8h, v29.8h, v30.8h   /* z2 = tmp5 + tmp6; */
   2561     add             v10.8h, v28.8h, v30.8h  /* z3 = tmp4 + tmp6; */
   2562     add             v11.8h, v29.8h, v31.8h  /* z4 = tmp5 + tmp7; */
   2563 
   2564     smull           v4.4s, v10.4h, XFIX_P_1_175  /* z5 lo = z3 lo * XFIX_P_1_175 */
   2565     smull2          v5.4s, v10.8h, XFIX_P_1_175
   2566     smlal           v4.4s, v11.4h, XFIX_P_1_175  /* z5 = MULTIPLY(z3 + z4, FIX_1_175875602); */
   2567     smlal2          v5.4s, v11.8h, XFIX_P_1_175
   2568 
   2569     smull2          v24.4s, v28.8h, XFIX_P_0_298
   2570     smull2          v25.4s, v29.8h, XFIX_P_2_053
   2571     smull2          v26.4s, v30.8h, XFIX_P_3_072
   2572     smull2          v27.4s, v31.8h, XFIX_P_1_501
   2573     smull           v28.4s, v28.4h, XFIX_P_0_298  /* tmp4 = MULTIPLY(tmp4, FIX_0_298631336); */
   2574     smull           v29.4s, v29.4h, XFIX_P_2_053  /* tmp5 = MULTIPLY(tmp5, FIX_2_053119869); */
   2575     smull           v30.4s, v30.4h, XFIX_P_3_072  /* tmp6 = MULTIPLY(tmp6, FIX_3_072711026); */
   2576     smull           v31.4s, v31.4h, XFIX_P_1_501  /* tmp7 = MULTIPLY(tmp7, FIX_1_501321110); */
   2577 
   2578     smull2          v12.4s, v8.8h, XFIX_N_0_899
   2579     smull2          v13.4s, v9.8h, XFIX_N_2_562
   2580     smull2          v14.4s, v10.8h, XFIX_N_1_961
   2581     smull2          v15.4s, v11.8h, XFIX_N_0_390
   2582     smull           v8.4s, v8.4h, XFIX_N_0_899    /* z1 = MULTIPLY(z1, -FIX_0_899976223); */
   2583     smull           v9.4s, v9.4h, XFIX_N_2_562    /* z2 = MULTIPLY(z2, -FIX_2_562915447); */
   2584     smull           v10.4s, v10.4h, XFIX_N_1_961  /* z3 = MULTIPLY(z3, -FIX_1_961570560); */
   2585     smull           v11.4s, v11.4h, XFIX_N_0_390  /* z4 = MULTIPLY(z4, -FIX_0_390180644); */
   2586 
   2587     add             v10.4s, v10.4s, v4.4s
   2588     add             v14.4s, v14.4s, v5.4s
   2589     add             v11.4s, v11.4s, v4.4s
   2590     add             v15.4s, v15.4s, v5.4s
   2591 
   2592     add             v28.4s, v28.4s, v8.4s   /* tmp4 += z1 */
   2593     add             v24.4s, v24.4s, v12.4s
   2594     add             v29.4s, v29.4s, v9.4s   /* tmp5 += z2 */
   2595     add             v25.4s, v25.4s, v13.4s
   2596     add             v30.4s, v30.4s, v10.4s  /* tmp6 += z3 */
   2597     add             v26.4s, v26.4s, v14.4s
   2598     add             v31.4s, v31.4s, v11.4s  /* tmp7 += z4 */
   2599     add             v27.4s, v27.4s, v15.4s
   2600 
   2601     add             v28.4s, v28.4s, v10.4s  /* tmp4 += z3 */
   2602     add             v24.4s, v24.4s, v14.4s
   2603     add             v29.4s, v29.4s, v11.4s  /* tmp5 += z4 */
   2604     add             v25.4s, v25.4s, v15.4s
   2605     add             v30.4s, v30.4s, v9.4s   /* tmp6 += z2 */
   2606     add             v26.4s, v26.4s, v13.4s
   2607     add             v31.4s, v31.4s, v8.4s   /* tmp7 += z1 */
   2608     add             v27.4s, v27.4s, v12.4s
   2609 
   2610     rshrn           v23.4h, v28.4s, #DESCALE_P2
   2611     rshrn           v21.4h, v29.4s, #DESCALE_P2
   2612     rshrn           v19.4h, v30.4s, #DESCALE_P2
   2613     rshrn           v17.4h, v31.4s, #DESCALE_P2
   2614     rshrn2          v23.8h, v24.4s, #DESCALE_P2  /* dataptr[7] = (DCTELEM)DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); */
   2615     rshrn2          v21.8h, v25.4s, #DESCALE_P2  /* dataptr[5] = (DCTELEM)DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); */
   2616     rshrn2          v19.8h, v26.4s, #DESCALE_P2  /* dataptr[3] = (DCTELEM)DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); */
   2617     rshrn2          v17.8h, v27.4s, #DESCALE_P2  /* dataptr[1] = (DCTELEM)DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); */
   2618 
   2619     /* store results */
   2620     st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
   2621     st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
   2622 
   2623     /* Restore NEON registers */
   2624     ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
   2625     ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
   2626 
   2627     br              x30
   2628 
   2629     .unreq          DATA
   2630     .unreq          TMP
   2631 
   2632 #undef XFIX_P_0_298
   2633 #undef XFIX_N_0_390
   2634 #undef XFIX_P_0_541
   2635 #undef XFIX_P_0_765
   2636 #undef XFIX_N_0_899
   2637 #undef XFIX_P_1_175
   2638 #undef XFIX_P_1_501
   2639 #undef XFIX_N_1_847
   2640 #undef XFIX_N_1_961
   2641 #undef XFIX_P_2_053
   2642 #undef XFIX_N_2_562
   2643 #undef XFIX_P_3_072
   2644 
   2645 
   2646 /*****************************************************************************/
   2647 
/*
 * jsimd_fdct_ifast_neon
 *
 * This function contains a fast, not so accurate integer implementation of
 * the forward DCT (Discrete Cosine Transform). It uses the same calculations
 * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
 * function from jfdctfst.c
 *
 * TODO: can be combined with 'jsimd_convsamp_neon' to get
 *       rid of a bunch of LD1 (AArch32: VLD1.16) instructions
 */

/* Fixed-point multipliers, loaded from Ljsimd_fdct_ifast_neon_consts into
 * v0.4h and applied with SQDMULH.  NOTE(review): SQDMULH computes
 * (2*a*b) >> 16, so the stored constants are presumably pre-scaled Q15
 * values; XFIX_1_306562965 appears to encode only the fractional part
 * (the full x1.306 product is completed by an extra add below) —
 * confirm against the consts table, which is outside this view. */
#undef XFIX_0_541196100
#define XFIX_0_382683433  v0.h[0]
#define XFIX_0_707106781  v0.h[2]
#define XFIX_0_541196100  v0.h[1]
#define XFIX_1_306562965  v0.h[3]

asm_function jsimd_fdct_ifast_neon

    DATA            .req x0    /* in/out: pointer to 8x8 block of DCTELEMs */
    TMP             .req x9    /* scratch: consts address, then pass counter */

    /* Load constants */
    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
    ld1             {v0.4h}, [TMP]

    /* Load all DATA (an 8x8 block of 16-bit coefficients) into NEON
     * registers, one row per register:
     *       0 1 2 3 | 4 5 6 7
     *      ---------+--------
     *   0 |      v16.8h     |
     *   1 |      v17.8h     |
     *   2 |      v18.8h     |
     *   3 |      v19.8h     |
     *   4 |      v20.8h     |
     *   5 |      v21.8h     |
     *   6 |      v22.8h     |
     *   7 |      v23.8h     |
     * (An earlier revision of this table listed AArch32-style d/q register
     * names; the code below actually uses v16-v23.)
     */

    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
    mov             TMP, #2                 /* two 1-D passes: rows, then columns */
    sub             DATA, DATA, #64
1:
    /* Transpose */
    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
    subs            TMP, TMP, #1
    /* 1-D FDCT (variable names follow jfdctfst.c) */
    add             v4.8h, v19.8h, v20.8h   /* tmp3 = data3 + data4 */
    sub             v20.8h, v19.8h, v20.8h  /* tmp4 = data3 - data4 */
    sub             v28.8h, v18.8h, v21.8h  /* tmp5 = data2 - data5 */
    add             v18.8h, v18.8h, v21.8h  /* tmp2 = data2 + data5 */
    sub             v29.8h, v17.8h, v22.8h  /* tmp6 = data1 - data6 */
    add             v17.8h, v17.8h, v22.8h  /* tmp1 = data1 + data6 */
    sub             v21.8h, v16.8h, v23.8h  /* tmp7 = data0 - data7 */
    add             v16.8h, v16.8h, v23.8h  /* tmp0 = data0 + data7 */
    sub             v6.8h, v17.8h, v18.8h   /* tmp12 = tmp1 - tmp2 */
    sub             v7.8h, v16.8h, v4.8h    /* tmp13 = tmp0 - tmp3 */
    add             v5.8h, v17.8h, v18.8h   /* tmp11 = tmp1 + tmp2 */
    add             v6.8h, v6.8h, v7.8h     /* tmp12 + tmp13 */
    add             v4.8h, v16.8h, v4.8h    /* tmp10 = tmp0 + tmp3 */
    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781  /* z1 = (tmp12 + tmp13) * 0.707 */
    add             v19.8h, v20.8h, v28.8h  /* odd tmp10 = tmp4 + tmp5 */
    add             v16.8h, v4.8h, v5.8h    /* data0 = tmp10 + tmp11 */
    sub             v20.8h, v4.8h, v5.8h    /* data4 = tmp10 - tmp11 */
    add             v5.8h, v28.8h, v29.8h   /* odd tmp11 = tmp5 + tmp6 */
    add             v29.8h, v29.8h, v21.8h  /* odd tmp12 = tmp6 + tmp7 */
    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781  /* z3 = odd tmp11 * 0.707 */
    sub             v28.8h, v19.8h, v29.8h  /* odd tmp10 - odd tmp12 */
    add             v18.8h, v7.8h, v6.8h    /* data2 = tmp13 + z1 */
    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433  /* z5 = (odd tmp10 - odd tmp12) * 0.382 */
    sub             v22.8h, v7.8h, v6.8h    /* data6 = tmp13 - z1 */
    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100  /* odd tmp10 * 0.541 */
    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965   /* odd tmp12 * 0.306 (rest of x1.306 added below) */
    add             v6.8h, v21.8h, v5.8h    /* z11 = tmp7 + z3 */
    sub             v5.8h, v21.8h, v5.8h    /* z13 = tmp7 - z3 */
    add             v29.8h, v29.8h, v28.8h  /* odd tmp12 + z5 */
    add             v19.8h, v19.8h, v28.8h  /* z2 = odd tmp10 * 0.541 + z5 */
    add             v29.8h, v29.8h, v7.8h   /* z4 = odd tmp12 * 1.306 + z5 */
    add             v21.8h, v5.8h, v19.8h   /* data5 = z13 + z2 */
    sub             v19.8h, v5.8h, v19.8h   /* data3 = z13 - z2 */
    add             v17.8h, v6.8h, v29.8h   /* data1 = z11 + z4 */
    sub             v23.8h, v6.8h, v29.8h   /* data7 = z11 - z4 */

    b.ne            1b                      /* run the second (column) pass */

    /* store results */
    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]

    br              x30

    .unreq          DATA
    .unreq          TMP
#undef XFIX_0_382683433
#undef XFIX_0_541196100
#undef XFIX_0_707106781
#undef XFIX_1_306562965
   2747 
   2748 
   2749 /*****************************************************************************/
   2750 
/*
 * GLOBAL(void)
 * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
 *                     DCTELEM *workspace);
 *
 * Quantizes the 64 DCT coefficients in workspace and writes the results to
 * coef_block, 32 coefficients per loop iteration (two iterations total).
 * The divisors table holds parallel 64-entry DCTELEM arrays: reciprocals at
 * offset 0, corrections at +64*2 bytes, and shift counts at +64*6 bytes
 * (the array at +64*4 is not used by this routine).  Each coefficient c is
 * quantized branch-free as:
 *     sign = c >> 15                            (all-ones if negative)
 *     q    = (((|c| + correction) * reciprocal) >> 16) >> shift
 *     out  = (q ^ sign) - sign                  (reapply the sign)
 */
asm_function jsimd_quantize_neon

    COEF_BLOCK      .req x0    /* out: quantized coefficients */
    DIVISORS        .req x1    /* in:  divisors table (see layout above) */
    WORKSPACE       .req x2    /* in:  raw DCT coefficients */

    RECIPROCAL      .req DIVISORS
    CORRECTION      .req x9
    SHIFT           .req x10
    LOOP_COUNT      .req x11

    mov             LOOP_COUNT, #2
    add             CORRECTION, DIVISORS, #(64 * 2)  /* corrections sub-array */
    add             SHIFT, DIVISORS, #(64 * 6)       /* shift-count sub-array */
1:
    subs            LOOP_COUNT, LOOP_COUNT, #1
    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
    abs             v20.8h, v0.8h          /* |c| */
    abs             v21.8h, v1.8h
    abs             v22.8h, v2.8h
    abs             v23.8h, v3.8h
    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
    add             v20.8h, v20.8h, v4.8h  /* add correction */
    add             v21.8h, v21.8h, v5.8h
    add             v22.8h, v22.8h, v6.8h
    add             v23.8h, v23.8h, v7.8h
    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
    umull2          v16.4s, v20.8h, v28.8h
    umull           v5.4s, v21.4h, v29.4h
    umull2          v17.4s, v21.8h, v29.8h
    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
    umull2          v18.4s, v22.8h, v30.8h
    umull           v7.4s, v23.4h, v31.4h
    umull2          v19.4s, v23.8h, v31.8h
    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
    shrn            v4.4h, v4.4s, #16      /* keep the high halfword of each product */
    shrn            v5.4h, v5.4s, #16
    shrn            v6.4h, v6.4s, #16
    shrn            v7.4h, v7.4s, #16
    shrn2           v4.8h, v16.4s, #16
    shrn2           v5.8h, v17.4s, #16
    shrn2           v6.8h, v18.4s, #16
    shrn2           v7.8h, v19.4s, #16
    neg             v24.8h, v24.8h         /* negate so USHL shifts right */
    neg             v25.8h, v25.8h
    neg             v26.8h, v26.8h
    neg             v27.8h, v27.8h
    sshr            v0.8h, v0.8h, #15  /* extract sign */
    sshr            v1.8h, v1.8h, #15
    sshr            v2.8h, v2.8h, #15
    sshr            v3.8h, v3.8h, #15
    ushl            v4.8h, v4.8h, v24.8h  /* shift (per-element right shift) */
    ushl            v5.8h, v5.8h, v25.8h
    ushl            v6.8h, v6.8h, v26.8h
    ushl            v7.8h, v7.8h, v27.8h

    eor             v4.16b, v4.16b, v0.16b  /* restore sign: (q ^ sign) - sign */
    eor             v5.16b, v5.16b, v1.16b
    eor             v6.16b, v6.16b, v2.16b
    eor             v7.16b, v7.16b, v3.16b
    sub             v4.8h, v4.8h, v0.8h
    sub             v5.8h, v5.8h, v1.8h
    sub             v6.8h, v6.8h, v2.8h
    sub             v7.8h, v7.8h, v3.8h
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64

    b.ne            1b

    br              x30  /* return */

    .unreq          COEF_BLOCK
    .unreq          DIVISORS
    .unreq          WORKSPACE
    .unreq          RECIPROCAL
    .unreq          CORRECTION
    .unreq          SHIFT
    .unreq          LOOP_COUNT
   2835 
   2836 
   2837 /*****************************************************************************/
   2838 
/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 1:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 *
 * Each output pixel is (in0 + in1 + bias) >> 1, where the bias alternates
 * 0, 1, 0, 1, ... across the row (matching jcsample.c's ordered-dither
 * rounding).  The last 16-input-pixel group is edge-expanded with a TBL
 * lookup so that rows narrower than width_in_blocks * 16 replicate their
 * rightmost valid pixel.
 */

asm_function jsimd_h2v1_downsample_neon
    IMAGE_WIDTH     .req x0    /* in: width of the image in pixels */
    MAX_V_SAMP      .req x1    /* in: unused here */
    V_SAMP          .req x2    /* in: number of rows to process */
    BLOCK_WIDTH     .req x3    /* in: output width in 8-byte blocks */
    INPUT_DATA      .req x4    /* in: array of input row pointers */
    OUTPUT_DATA     .req x5    /* in: array of output row pointers */
    OUTPTR          .req x9    /* current output row */
    INPTR           .req x10   /* current input row */
    TMP1            .req x11   /* column counter */
    TMP2            .req x12   /* padding amount = 16*BLOCK_WIDTH - IMAGE_WIDTH */
    TMP3            .req x13   /* address of the TBL expansion index vector */
    TMPDUP          .req w15   /* bias seed */

    mov             TMPDUP, #0x10000       /* per 32-bit lane: halfwords 0,1 -> alternating bias */
    lsl             TMP2, BLOCK_WIDTH, #4
    sub             TMP2, TMP2, IMAGE_WIDTH
    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
    add             TMP3, TMP3, TMP2, lsl #4  /* select the 16-byte index row for this padding */
    dup             v16.4s, TMPDUP
    ld1             {v18.16b}, [TMP3]      /* v18 = edge-expansion TBL indices */

1:  /* row loop */
    ldr             INPTR, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f                     /* only the (possibly partial) last group */
2:  /* columns */
    ld1             {v0.16b}, [INPTR], #16
    mov             v4.16b, v16.16b        /* start from the bias */
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b          /* bias + pairwise sums of adjacent pixels */
    shrn            v6.8b, v4.8h, #1       /* (in0 + in1 + bias) >> 1 */
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR]
    mov             v4.16b, v16.16b
    subs            V_SAMP, V_SAMP, #1
    /* expand right */
    tbl             v2.16b, {v0.16b}, v18.16b  /* replicate rightmost valid pixel */
    uadalp          v4.8h, v2.16b
    shrn            v6.8b, v4.8h, #1
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    br              x30

    .unreq          IMAGE_WIDTH
    .unreq          MAX_V_SAMP
    .unreq          V_SAMP
    .unreq          BLOCK_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA
    .unreq          OUTPTR
    .unreq          INPTR
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMPDUP
   2911 
   2912 
   2913 /*****************************************************************************/
   2914 
/*
 * Downsample pixel values of a single component.
 * This version handles the common case of 2:1 horizontal and 2:1 vertical,
 * without smoothing.
 *
 * GLOBAL(void)
 * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
 *                            JDIMENSION v_samp_factor,
 *                            JDIMENSION width_in_blocks,
 *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
 *
 * Each output pixel averages a 2x2 input block:
 * (a0 + a1 + b0 + b1 + bias) >> 2, with the bias alternating 1, 2, 1, 2, ...
 * across the row (matching jcsample.c).  The last group of each row pair is
 * edge-expanded with TBL lookups, as in the h2v1 version above.
 */

.balign 16
asm_function jsimd_h2v2_downsample_neon
    IMAGE_WIDTH     .req x0    /* in: width of the image in pixels */
    MAX_V_SAMP      .req x1    /* in: unused here */
    V_SAMP          .req x2    /* in: number of output rows to produce */
    BLOCK_WIDTH     .req x3    /* in: output width in 8-byte blocks */
    INPUT_DATA      .req x4    /* in: array of input row pointers */
    OUTPUT_DATA     .req x5    /* in: array of output row pointers */
    OUTPTR          .req x9    /* current output row */
    INPTR0          .req x10   /* upper input row of the pair */
    INPTR1          .req x14   /* lower input row of the pair */
    TMP1            .req x11   /* column counter */
    TMP2            .req x12   /* padding amount = 16*BLOCK_WIDTH - IMAGE_WIDTH */
    TMP3            .req x13   /* address of the TBL expansion index vector */
    TMPDUP          .req w15   /* bias seed */

    mov             TMPDUP, #1
    lsl             TMP2, BLOCK_WIDTH, #4
    lsl             TMPDUP, TMPDUP, #17
    sub             TMP2, TMP2, IMAGE_WIDTH
    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
    orr             TMPDUP, TMPDUP, #1     /* 0x20001: halfword lanes alternate 1, 2 */
    add             TMP3, TMP3, TMP2, lsl #4  /* select the 16-byte index row for this padding */
    dup             v16.4s, TMPDUP
    ld1             {v18.16b}, [TMP3]      /* v18 = edge-expansion TBL indices */

1:  /* row loop */
    ldr             INPTR0, [INPUT_DATA], #8
    ldr             OUTPTR, [OUTPUT_DATA], #8
    ldr             INPTR1, [INPUT_DATA], #8
    subs            TMP1, BLOCK_WIDTH, #1
    b.eq            3f                     /* only the (possibly partial) last group */
2:  /* columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b        /* start from the bias */
    subs            TMP1, TMP1, #1
    uadalp          v4.8h, v0.16b          /* + pairwise sums of the upper row */
    uadalp          v4.8h, v1.16b          /* + pairwise sums of the lower row */
    shrn            v6.8b, v4.8h, #2       /* (sum of 4 + bias) >> 2 */
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            2b
3:  /* last columns */
    ld1             {v0.16b}, [INPTR0], #16
    ld1             {v1.16b}, [INPTR1], #16
    mov             v4.16b, v16.16b
    subs            V_SAMP, V_SAMP, #1
    /* expand right */
    tbl             v2.16b, {v0.16b}, v18.16b  /* replicate rightmost valid pixel */
    tbl             v3.16b, {v1.16b}, v18.16b
    uadalp          v4.8h, v2.16b
    uadalp          v4.8h, v3.16b
    shrn            v6.8b, v4.8h, #2
    st1             {v6.8b}, [OUTPTR], #8
    b.ne            1b

    br              x30

    .unreq          IMAGE_WIDTH
    .unreq          MAX_V_SAMP
    .unreq          V_SAMP
    .unreq          BLOCK_WIDTH
    .unreq          INPUT_DATA
    .unreq          OUTPUT_DATA
    .unreq          OUTPTR
    .unreq          INPTR0
    .unreq          INPTR1
    .unreq          TMP1
    .unreq          TMP2
    .unreq          TMP3
    .unreq          TMPDUP
   2998 
   2999 
   3000 /*****************************************************************************/
   3001 
   3002 /*
   3003  * GLOBAL(JOCTET *)
   3004  * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
   3005  *                             JCOEFPTR block, int last_dc_val,
   3006  *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
   3007  *
   3008  */
   3009 
    /* Register aliases shared by the Huffman bit-buffer macros below and by
     * the jsimd_huff_encode_one_block* functions they are used in. */
    BUFFER          .req x1    /* output byte pointer (kept one byte behind; stores pre-increment) */
    PUT_BUFFER      .req x6    /* bit accumulator, most recent bits in the low end */
    PUT_BITS        .req x7    /* number of valid bits currently in PUT_BUFFER */
    PUT_BITSw       .req w7    /* 32-bit view of PUT_BITS */

/* Emit the most significant pending byte of PUT_BUFFER.  Implements JPEG
 * byte stuffing: a 0xFF data byte is followed by a 0x00 byte so it cannot
 * be mistaken for a marker.  Clobbers x19 and flags. */
.macro emit_byte
    sub             PUT_BITS, PUT_BITS, #0x8
    lsr             x19, PUT_BUFFER, PUT_BITS  /* byte now at bit offset PUT_BITS */
    uxtb            w19, w19
    strb            w19, [BUFFER, #1]!
    cmp             w19, #0xff
    b.ne            14f
    strb            wzr, [BUFFER, #1]!         /* stuff 0x00 after 0xFF */
14:
.endm
/* Append the SIZE low bits of CODE to the bit accumulator. */
.macro put_bits CODE, SIZE
    lsl             PUT_BUFFER, PUT_BUFFER, \SIZE
    add             PUT_BITS, PUT_BITS, \SIZE
    orr             PUT_BUFFER, PUT_BUFFER, \CODE
.endm
/* Flush four bytes if 32 or more bits are pending (keeps the 64-bit
 * accumulator from overflowing).  Clobbers x19 and flags. */
.macro checkbuf31
    cmp             PUT_BITS, #0x20
    b.lt            31f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
31:
.endm
/* Flush six bytes if 48 or more bits are pending.  Clobbers x19 and flags. */
.macro checkbuf47
    cmp             PUT_BITS, #0x30
    b.lt            47f
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
    emit_byte
47:
.endm
   3050 
   3051 .macro generate_jsimd_huff_encode_one_block fast_tbl
   3052 
   3053 .balign 16
   3054 
   3055 .if \fast_tbl == 1
   3056 asm_function jsimd_huff_encode_one_block_neon
   3057 .else
   3058 asm_function jsimd_huff_encode_one_block_neon_slowtbl
   3059 .endif
   3060     sub             sp, sp, 272
   3061     sub             BUFFER, BUFFER, #0x1    /* BUFFER=buffer-- */
   3062     /* Save ARM registers */
   3063     stp             x19, x20, [sp]
   3064 .if \fast_tbl == 1
   3065     get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_consts
   3066 .else
   3067     get_symbol_loc  x15, Ljsimd_huff_encode_one_block_neon_slowtbl_consts
   3068 .endif
   3069     ldr             PUT_BUFFER, [x0, #0x10]
   3070     ldr             PUT_BITSw, [x0, #0x18]
   3071     ldrsh           w12, [x2]               /* load DC coeff in w12 */
   3072     /* prepare data */
   3073 .if \fast_tbl == 1
   3074     ld1             {v23.16b}, [x15], #16
   3075     ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x15], #64
   3076     ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x15], #64
   3077     ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x15], #64
   3078     ld1             {v24.16b, v25.16b, v26.16b, v27.16b}, [x2], #64
   3079     ld1             {v28.16b, v29.16b, v30.16b, v31.16b}, [x2], #64
   3080     sub             w12, w12, w3      /* last_dc_val, not used afterwards */
   3081     /* ZigZag 8x8 */
   3082     tbl             v0.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v0.16b
   3083     tbl             v1.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v1.16b
   3084     tbl             v2.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v2.16b
   3085     tbl             v3.16b, {v24.16b, v25.16b, v26.16b, v27.16b}, v3.16b
   3086     tbl             v4.16b, {v28.16b, v29.16b, v30.16b, v31.16b}, v4.16b
   3087     tbl             v5.16b, {v25.16b, v26.16b, v27.16b, v28.16b}, v5.16b
   3088     tbl             v6.16b, {v27.16b, v28.16b, v29.16b, v30.16b}, v6.16b
   3089     tbl             v7.16b, {v29.16b, v30.16b, v31.16b}, v7.16b
   3090     ins             v0.h[0], w12
   3091     tbx             v1.16b, {v28.16b}, v16.16b
   3092     tbx             v2.16b, {v29.16b, v30.16b}, v17.16b
   3093     tbx             v5.16b, {v29.16b, v30.16b}, v18.16b
   3094     tbx             v6.16b, {v31.16b}, v19.16b
   3095 .else
   3096       add             x13, x2, #0x22
   3097       sub             w12, w12, w3    /* last_dc_val, not used afterwards */
   3098     ld1             {v23.16b}, [x15]
   3099       add             x14, x2, #0x18
   3100       add             x3, x2, #0x36
   3101     ins             v0.h[0], w12
   3102       add             x9, x2, #0x2
   3103     ld1             {v1.h}[0], [x13]
   3104       add             x15, x2, #0x30
   3105     ld1             {v2.h}[0], [x14]
   3106       add             x19, x2, #0x26
   3107     ld1             {v3.h}[0], [x3]
   3108       add             x20, x2, #0x28
   3109     ld1             {v0.h}[1], [x9]
   3110       add             x12, x2, #0x10
   3111     ld1             {v1.h}[1], [x15]
   3112       add             x13, x2, #0x40
   3113     ld1             {v2.h}[1], [x19]
   3114       add             x14, x2, #0x34
   3115     ld1             {v3.h}[1], [x20]
   3116       add             x3, x2, #0x1a
   3117     ld1             {v0.h}[2], [x12]
   3118       add             x9, x2, #0x20
   3119     ld1             {v1.h}[2], [x13]
   3120       add             x15, x2, #0x32
   3121     ld1             {v2.h}[2], [x14]
   3122       add             x19, x2, #0x42
   3123     ld1             {v3.h}[2], [x3]
   3124       add             x20, x2, #0xc
   3125     ld1             {v0.h}[3], [x9]
   3126       add             x12, x2, #0x12
   3127     ld1             {v1.h}[3], [x15]
   3128       add             x13, x2, #0x24
   3129     ld1             {v2.h}[3], [x19]
   3130       add             x14, x2, #0x50
   3131     ld1             {v3.h}[3], [x20]
   3132       add             x3, x2, #0xe
   3133     ld1             {v0.h}[4], [x12]
   3134       add             x9, x2, #0x4
   3135     ld1             {v1.h}[4], [x13]
   3136       add             x15, x2, #0x16
   3137     ld1             {v2.h}[4], [x14]
   3138       add             x19, x2, #0x60
   3139     ld1             {v3.h}[4], [x3]
   3140       add             x20, x2, #0x1c
   3141     ld1             {v0.h}[5], [x9]
   3142       add             x12, x2, #0x6
   3143     ld1             {v1.h}[5], [x15]
   3144       add             x13, x2, #0x8
   3145     ld1             {v2.h}[5], [x19]
   3146       add             x14, x2, #0x52
   3147     ld1             {v3.h}[5], [x20]
   3148       add             x3, x2, #0x2a
   3149     ld1             {v0.h}[6], [x12]
   3150       add             x9, x2, #0x14
   3151     ld1             {v1.h}[6], [x13]
   3152       add             x15, x2, #0xa
   3153     ld1             {v2.h}[6], [x14]
   3154       add             x19, x2, #0x44
   3155     ld1             {v3.h}[6], [x3]
   3156       add             x20, x2, #0x38
   3157     ld1             {v0.h}[7], [x9]
   3158       add             x12, x2, #0x46
   3159     ld1             {v1.h}[7], [x15]
   3160       add             x13, x2, #0x3a
   3161     ld1             {v2.h}[7], [x19]
   3162       add             x14, x2, #0x74
   3163     ld1             {v3.h}[7], [x20]
   3164       add             x3, x2, #0x6a
   3165     ld1             {v4.h}[0], [x12]
   3166       add             x9, x2, #0x54
   3167     ld1             {v5.h}[0], [x13]
   3168       add             x15, x2, #0x2c
   3169     ld1             {v6.h}[0], [x14]
   3170       add             x19, x2, #0x76
    /*
     * Second half of the zigzag-order coefficient load (the matching .if
     * and the first half of the sequence are above this excerpt).  Each
     * ld1 inserts one 16-bit coefficient into a lane of v4-v7 while the
     * interleaved adds (extra-indented) precompute the address of a later
     * zigzag position (x2 = coefficient block base + byte offset), hiding
     * address-generation latency behind the loads.
     */
    ld1             {v7.h}[0], [x3]
      add             x20, x2, #0x78
    ld1             {v4.h}[1], [x9]
      add             x12, x2, #0x62
    ld1             {v5.h}[1], [x15]
      add             x13, x2, #0x1e
    ld1             {v6.h}[1], [x19]
      add             x14, x2, #0x68
    ld1             {v7.h}[1], [x20]
      add             x3, x2, #0x7a
    ld1             {v4.h}[2], [x12]
      add             x9, x2, #0x70
    ld1             {v5.h}[2], [x13]
      add             x15, x2, #0x2e
    ld1             {v6.h}[2], [x14]
      add             x19, x2, #0x5a
    ld1             {v7.h}[2], [x3]
      add             x20, x2, #0x6c
    ld1             {v4.h}[3], [x9]
      add             x12, x2, #0x72
    ld1             {v5.h}[3], [x15]
      add             x13, x2, #0x3c
    ld1             {v6.h}[3], [x19]
      add             x14, x2, #0x4c
    ld1             {v7.h}[3], [x20]
      add             x3, x2, #0x5e
    ld1             {v4.h}[4], [x12]
      add             x9, x2, #0x64
    ld1             {v5.h}[4], [x13]
      add             x15, x2, #0x4a
    ld1             {v6.h}[4], [x14]
      add             x19, x2, #0x3e
    ld1             {v7.h}[4], [x3]
      add             x20, x2, #0x6e
    ld1             {v4.h}[5], [x9]
      add             x12, x2, #0x56
    ld1             {v5.h}[5], [x15]
      add             x13, x2, #0x58
    ld1             {v6.h}[5], [x19]
      add             x14, x2, #0x4e
    ld1             {v7.h}[5], [x20]
      add             x3, x2, #0x7c
    ld1             {v4.h}[6], [x12]
      add             x9, x2, #0x48
    ld1             {v5.h}[6], [x13]
      add             x15, x2, #0x66
    ld1             {v6.h}[6], [x14]
      add             x19, x2, #0x5c
    ld1             {v7.h}[6], [x3]
      add             x20, x2, #0x7e
    ld1             {v4.h}[7], [x9]
    ld1             {v5.h}[7], [x15]
    ld1             {v6.h}[7], [x19]
    ld1             {v7.h}[7], [x20]
.endif  /* closes a conditional opened above this excerpt */
    /*
     * Convert all 64 coefficients to JPEG magnitude/bit-pattern form.
     * JPEG emits a negative coefficient as the low bits of (value - 1),
     * which equals the bitwise NOT of |value|.  cmlt builds an all-ones
     * mask for negative lanes, so (mask EOR |value|) yields |value| for
     * non-negative lanes and ~|value| for negative ones:
     *   v0-v7   = |coef|        (used later to derive nbits)
     *   v24-v31 = bits to emit  (mask ^ |coef|)
     */
    cmlt            v24.8h, v0.8h, #0          /* mask = (coef < 0) ? -1 : 0 */
    cmlt            v25.8h, v1.8h, #0
    cmlt            v26.8h, v2.8h, #0
    cmlt            v27.8h, v3.8h, #0
    cmlt            v28.8h, v4.8h, #0
    cmlt            v29.8h, v5.8h, #0
    cmlt            v30.8h, v6.8h, #0
    cmlt            v31.8h, v7.8h, #0
    abs             v0.8h, v0.8h
    abs             v1.8h, v1.8h
    abs             v2.8h, v2.8h
    abs             v3.8h, v3.8h
    abs             v4.8h, v4.8h
    abs             v5.8h, v5.8h
    abs             v6.8h, v6.8h
    abs             v7.8h, v7.8h
    eor             v24.16b, v24.16b, v0.16b   /* neg lanes become ~|coef| */
    eor             v25.16b, v25.16b, v1.16b
    eor             v26.16b, v26.16b, v2.16b
    eor             v27.16b, v27.16b, v3.16b
    eor             v28.16b, v28.16b, v4.16b
    eor             v29.16b, v29.16b, v5.16b
    eor             v30.16b, v30.16b, v6.16b
    eor             v31.16b, v31.16b, v7.16b
    /*
     * Vector pipe: build a 64-bit "coefficient is zero" bitmap.  The 8x8h
     * compare results are narrowed to bytes, masked with the per-lane bit
     * weights in v23 (set up above this excerpt), and folded with addp
     * until all 64 flags live in v16.D[0].
     * Scalar pipe (interleaved, extra-indented): Huffman-encode the DC
     * coefficient while the vector work is in flight.
     */
    cmeq            v16.8h, v0.8h, #0
    cmeq            v17.8h, v1.8h, #0
    cmeq            v18.8h, v2.8h, #0
    cmeq            v19.8h, v3.8h, #0
    cmeq            v20.8h, v4.8h, #0
    cmeq            v21.8h, v5.8h, #0
    cmeq            v22.8h, v6.8h, #0
    xtn             v16.8b, v16.8h
    xtn             v18.8b, v18.8h
    xtn             v20.8b, v20.8h
    xtn             v22.8b, v22.8h
      umov            w14, v0.h[0]           /* w14 = |DC diff| */
    xtn2            v16.16b, v17.8h
      umov            w13, v24.h[0]          /* w13 = DC diff bit pattern */
    xtn2            v18.16b, v19.8h
      clz             w14, w14
    xtn2            v20.16b, v21.8h
      lsl             w13, w13, w14          /* shift out leading zero bits */
    cmeq            v17.8h, v7.8h, #0
      sub             w12, w14, #32          /* w12 = clz - 32 = -nbits */
    xtn2            v22.16b, v17.8h
      lsr             w13, w13, w14          /* keep only low nbits of pattern */
    and             v16.16b, v16.16b, v23.16b
      neg             w12, w12               /* w12 = nbits = 32 - clz */
    and             v18.16b, v18.16b, v23.16b
      add             x3, x4, #0x400           /* x3 = dctbl->ehufsi */
    and             v20.16b, v20.16b, v23.16b
      add             x15, sp, #0x90           /* x15 = t2 */
    and             v22.16b, v22.16b, v23.16b
      ldr             w10, [x4, x12, lsl #2]   /* w10 = dctbl->ehufco[nbits] */
    addp            v16.16b, v16.16b, v18.16b
      ldrb            w11, [x3, x12]           /* w11 = dctbl->ehufsi[nbits] */
    addp            v20.16b, v20.16b, v22.16b
      checkbuf47
    addp            v16.16b, v16.16b, v20.16b
      put_bits        x10, x11                 /* emit DC Huffman code */
    addp            v16.16b, v16.16b, v18.16b  /* only D[0] read below; 2nd
                                                  operand is a don't-care */
      checkbuf47
    umov            x9, v16.D[0]
      put_bits        x13, x12                 /* emit DC diff bits */
    cnt             v17.8b, v16.8b             /* per-byte popcount of flags */
      mvn             x9, x9                   /* invert: 1 = nonzero coef */
    addv            B18, v17.8b
      add             x4, x5, #0x400   /* x4 = actbl->ehufsi */
    umov            w12, v18.b[0]      /* w12 = number of zero coefficients */
      lsr             x9, x9, #0x1     /* drop DC bit (AC-only bitmap) */
    ldr             w13, [x5, #0x3c0]  /* x13 = actbl->ehufco[0xf0] */
    rbit            x9, x9             /* x9 = index0 */
    ldrb            w14, [x4, #0xf0]   /* x14 = actbl->ehufsi[0xf0] */
    cmp             w12, #(64-8)       /* fewer than 56 zeros (> 8 nonzero)? */
    add             x11, sp, #16       /* x11 = t1 (spill area base) */
    b.lt            4f                 /* dense block: vector-nbits path */
    cbz             x9, 6f             /* no nonzero AC at all: go emit EOB */
    /*
     * Sparse-block path (at most 8 nonzero AC coefficients): spill |coef|
     * values to t1 and the bit patterns to t2, then walk the nonzero
     * bitmap in x9 with clz, computing nbits in scalar code for each
     * coefficient as it is reached.
     */
    st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
    st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
    st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
1:  /* per-nonzero-coefficient loop */
    clz             x2, x9                   /* x2 = run of zeros before it */
    add             x15, x15, x2, lsl #1     /* advance t2 cursor */
    lsl             x9, x9, x2
    ldrh            w20, [x15, #-126]        /* w20 = |coef| (from t1) */
2:  /* emit ZRL (symbol 0xf0) for each full run of 16 zeros */
    cmp             x2, #0x10
    b.lt            3f
    sub             x2, x2, #0x10
    checkbuf47
    put_bits        x13, x14
    b               2b
3:
    clz             w20, w20
    ldrh            w3, [x15, #2]!           /* w3 = bit pattern (from t2) */
    sub             w11, w20, #32
    lsl             w3, w3, w20              /* shift out leading zero bits */
    neg             w11, w11                 /* w11 = nbits = 32 - clz */
    lsr             w3, w3, w20              /* keep only low nbits */
    add             x2, x11, x2, lsl #4      /* symbol = (run << 4) | nbits */
    lsl             x9, x9, #0x1             /* consume this coef's bit */
    ldr             w12, [x5, x2, lsl #2]    /* w12 = actbl->ehufco[symbol] */
    ldrb            w10, [x4, x2]            /* w10 = actbl->ehufsi[symbol] */
    checkbuf31
    put_bits        x12, x10                 /* emit run/size Huffman code */
    put_bits        x3, x11                  /* emit coefficient bits */
    cbnz            x9, 1b
    b               6f
   3335 4:
   3336     movi            v21.8h, #0x0010
   3337     clz             v0.8h, v0.8h
   3338     clz             v1.8h, v1.8h
   3339     clz             v2.8h, v2.8h
   3340     clz             v3.8h, v3.8h
   3341     clz             v4.8h, v4.8h
   3342     clz             v5.8h, v5.8h
   3343     clz             v6.8h, v6.8h
   3344     clz             v7.8h, v7.8h
   3345     ushl            v24.8h, v24.8h, v0.8h
   3346     ushl            v25.8h, v25.8h, v1.8h
   3347     ushl            v26.8h, v26.8h, v2.8h
   3348     ushl            v27.8h, v27.8h, v3.8h
   3349     ushl            v28.8h, v28.8h, v4.8h
   3350     ushl            v29.8h, v29.8h, v5.8h
   3351     ushl            v30.8h, v30.8h, v6.8h
   3352     ushl            v31.8h, v31.8h, v7.8h
   3353     neg             v0.8h, v0.8h
   3354     neg             v1.8h, v1.8h
   3355     neg             v2.8h, v2.8h
   3356     neg             v3.8h, v3.8h
   3357     neg             v4.8h, v4.8h
   3358     neg             v5.8h, v5.8h
   3359     neg             v6.8h, v6.8h
   3360     neg             v7.8h, v7.8h
   3361     ushl            v24.8h, v24.8h, v0.8h
   3362     ushl            v25.8h, v25.8h, v1.8h
   3363     ushl            v26.8h, v26.8h, v2.8h
   3364     ushl            v27.8h, v27.8h, v3.8h
   3365     ushl            v28.8h, v28.8h, v4.8h
   3366     ushl            v29.8h, v29.8h, v5.8h
   3367     ushl            v30.8h, v30.8h, v6.8h
   3368     ushl            v31.8h, v31.8h, v7.8h
   3369     add             v0.8h, v21.8h, v0.8h
   3370     add             v1.8h, v21.8h, v1.8h
   3371     add             v2.8h, v21.8h, v2.8h
   3372     add             v3.8h, v21.8h, v3.8h
   3373     add             v4.8h, v21.8h, v4.8h
   3374     add             v5.8h, v21.8h, v5.8h
   3375     add             v6.8h, v21.8h, v6.8h
   3376     add             v7.8h, v21.8h, v7.8h
   3377     st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x11], #64
   3378     st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x11], #64
   3379     st1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x11], #64
   3380     st1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x11], #64
   3381 1:
   3382     clz             x2, x9
   3383     add             x15, x15, x2, lsl #1
   3384     lsl             x9, x9, x2
   3385     ldrh            w11, [x15, #-126]
   3386 2:
   3387     cmp             x2, #0x10
   3388     b.lt            3f
   3389     sub             x2, x2, #0x10
   3390     checkbuf47
   3391     put_bits        x13, x14
   3392     b               2b
   3393 3:
   3394     ldrh            w3, [x15, #2]!
   3395     add             x2, x11, x2, lsl #4
   3396     lsl             x9, x9, #0x1
   3397     ldr             w12, [x5, x2, lsl #2]
   3398     ldrb            w10, [x4, x2]
   3399     checkbuf31
   3400     put_bits        x12, x10
   3401     put_bits        x3, x11
   3402     cbnz            x9, 1b
6:  /*
     * If the last nonzero coefficient was not #63, emit the end-of-block
     * symbol (actbl code 0x00); then flush the bit-buffer state and
     * return.
     */
    add             x13, sp, #0x10e    /* t2 cursor value after coef 63 */
    cmp             x15, x13
    b.hs            1f                 /* coef 63 was nonzero: no EOB needed */
    ldr             w12, [x5]          /* w12 = actbl->ehufco[0x00] (EOB) */
    ldrb            w14, [x4]          /* w14 = actbl->ehufsi[0x00] */
    checkbuf47
    put_bits        x12, x14
1:  /* save bit buffer and bit count back into the state struct (x0);
       offsets 0x10/0x18 assumed to match the C state layout — TODO confirm */
    str             PUT_BUFFER, [x0, #0x10]
    str             PUT_BITSw, [x0, #0x18]
    ldp             x19, x20, [sp], 16 /* restore callee-saved x19/x20 */
    add             x0, BUFFER, #0x1   /* return pointer past last byte written */
    add             sp, sp, 256        /* release t1/t2 workspace */
    br              x30                /* return (x30 = LR); NOTE(review):
                                          `ret` would aid return prediction */
   3418 
   3419 .endm
   3420 
/*
 * Instantiate both variants of the Huffman encoder; the macro argument
 * selects between two code paths (semantics defined at the macro header,
 * above this excerpt — presumably fast-table vs. portable lookup; confirm
 * there).
 */
generate_jsimd_huff_encode_one_block 1
generate_jsimd_huff_encode_one_block 0

    /* Release the symbolic register aliases used by the macro. */
    .unreq          BUFFER
    .unreq          PUT_BUFFER
    .unreq          PUT_BITS
    .unreq          PUT_BITSw

/* Discard the helper macros now that both instantiations are emitted. */
.purgem emit_byte
.purgem put_bits
.purgem checkbuf31
.purgem checkbuf47
   3433