Home | History | Annotate | Download | only in libjpeg_turbo
      1 Index: jdmarker.c
      2 ===================================================================
      3 --- jdmarker.c	(revision 829)
      4 +++ jdmarker.c	(working copy)
      5 @@ -910,7 +910,7 @@
      6    }
      7  
      8    if (cinfo->marker->discarded_bytes != 0) {
      9 -    WARNMS2(cinfo, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
     10 +    TRACEMS2(cinfo, 1, JWRN_EXTRANEOUS_DATA, cinfo->marker->discarded_bytes, c);
     11      cinfo->marker->discarded_bytes = 0;
     12    }
     13  
     14 @@ -944,7 +944,144 @@
     15    return TRUE;
     16  }
     17  
     18 +#ifdef MOTION_JPEG_SUPPORTED
     19  
     20 +/* The default Huffman tables used by motion JPEG frames. When a motion JPEG
     21 + * frame does not have DHT tables, we should use the Huffman tables suggested by
     22 + * the JPEG standard. Each of these arrays holds the data for one member of the
     23 + * JHUFF_TBL struct, so we can just copy it into the corresponding member.
     24 + */
     25 +/* DC table 0 */
     26 +LOCAL(const unsigned char) mjpg_dc0_bits[] = {
     27 +  0x00, 0x01, 0x05, 0x01, 0x01, 0x01, 0x01, 0x01,
     28 +  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
     29 +};
     30 +
     31 +LOCAL(const unsigned char) mjpg_dc0_huffval[] = {
     32 +  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
     33 +  0x08, 0x09, 0x0A, 0x0B
     34 +};
     35 +
     36 +/* DC table 1 */
     37 +LOCAL(const unsigned char) mjpg_dc1_bits[] = {
     38 +  0x00, 0x03, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
     39 +  0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00
     40 +};
     41 +
     42 +LOCAL(const unsigned char) mjpg_dc1_huffval[] = {
     43 +  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
     44 +  0x08, 0x09, 0x0A, 0x0B
     45 +};
     46 +  
     47 +/* AC table 0 */
     48 +LOCAL(const unsigned char) mjpg_ac0_bits[] = {
     49 +  0x00, 0x02, 0x01, 0x03, 0x03, 0x02, 0x04, 0x03,
     50 +  0x05, 0x05, 0x04, 0x04, 0x00, 0x00, 0x01, 0x7D
     51 +};
     52 +
     53 +LOCAL(const unsigned char) mjpg_ac0_huffval[] = {
     54 +  0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
     55 +  0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
     56 +  0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xA1, 0x08,
     57 +  0x23, 0x42, 0xB1, 0xC1, 0x15, 0x52, 0xD1, 0xF0,
     58 +  0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0A, 0x16,
     59 +  0x17, 0x18, 0x19, 0x1A, 0x25, 0x26, 0x27, 0x28,
     60 +  0x29, 0x2A, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
     61 +  0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
     62 +  0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
     63 +  0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
     64 +  0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
     65 +  0x7A, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
     66 +  0x8A, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
     67 +  0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
     68 +  0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6,
     69 +  0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3, 0xC4, 0xC5,
     70 +  0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2, 0xD3, 0xD4,
     71 +  0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xE1, 0xE2,
     72 +  0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA,
     73 +  0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
     74 +  0xF9, 0xFA
     75 +};
     76 +
     77 +/* AC table 1 */
     78 +LOCAL(const unsigned char) mjpg_ac1_bits[] = {
     79 +  0x00, 0x02, 0x01, 0x02, 0x04, 0x04, 0x03, 0x04,
     80 +  0x07, 0x05, 0x04, 0x04, 0x00, 0x01, 0x02, 0x77
     81 +};
     82 +
     83 +LOCAL(const unsigned char) mjpg_ac1_huffval[] = {
     84 +  0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
     85 +  0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
     86 +  0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
     87 +  0xA1, 0xB1, 0xC1, 0x09, 0x23, 0x33, 0x52, 0xF0,
     88 +  0x15, 0x62, 0x72, 0xD1, 0x0A, 0x16, 0x24, 0x34,
     89 +  0xE1, 0x25, 0xF1, 0x17, 0x18, 0x19, 0x1A, 0x26,
     90 +  0x27, 0x28, 0x29, 0x2A, 0x35, 0x36, 0x37, 0x38,
     91 +  0x39, 0x3A, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
     92 +  0x49, 0x4A, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
     93 +  0x59, 0x5A, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
     94 +  0x69, 0x6A, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
     95 +  0x79, 0x7A, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
     96 +  0x88, 0x89, 0x8A, 0x92, 0x93, 0x94, 0x95, 0x96,
     97 +  0x97, 0x98, 0x99, 0x9A, 0xA2, 0xA3, 0xA4, 0xA5,
     98 +  0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xB2, 0xB3, 0xB4,
     99 +  0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xC2, 0xC3,
    100 +  0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xD2,
    101 +  0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA,
    102 +  0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9,
    103 +  0xEA, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
    104 +  0xF9, 0xFA
    105 +};
    106 +
    107 +/* Loads the default Huffman tables used by motion JPEG frames. This function
    108 + * just copies the Huffman tables suggested in the JPEG standard into any
    109 + * table slot that has not been loaded yet.
    110 + */
    111 +LOCAL(void)
    112 +mjpg_load_huff_tables (j_decompress_ptr cinfo)
    113 +{
    114 +  JHUFF_TBL *htblptr;
    115 +
    116 +  if (! cinfo->dc_huff_tbl_ptrs[0]) {
    117 +    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
    118 +    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
    119 +    MEMCOPY(&htblptr->bits[1], mjpg_dc0_bits, SIZEOF(mjpg_dc0_bits));
    120 +    MEMCOPY(&htblptr->huffval[0], mjpg_dc0_huffval, SIZEOF(mjpg_dc0_huffval));
    121 +    cinfo->dc_huff_tbl_ptrs[0] = htblptr;
    122 +  }
    123 +
    124 +  if (! cinfo->dc_huff_tbl_ptrs[1]) {
    125 +    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
    126 +    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
    127 +    MEMCOPY(&htblptr->bits[1], mjpg_dc1_bits, SIZEOF(mjpg_dc1_bits));
    128 +    MEMCOPY(&htblptr->huffval[0], mjpg_dc1_huffval, SIZEOF(mjpg_dc1_huffval));
    129 +    cinfo->dc_huff_tbl_ptrs[1] = htblptr;
    130 +  }
    131 +
    132 +  if (! cinfo->ac_huff_tbl_ptrs[0]) {
    133 +    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
    134 +    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
    135 +    MEMCOPY(&htblptr->bits[1], mjpg_ac0_bits, SIZEOF(mjpg_ac0_bits));
    136 +    MEMCOPY(&htblptr->huffval[0], mjpg_ac0_huffval, SIZEOF(mjpg_ac0_huffval));
    137 +    cinfo->ac_huff_tbl_ptrs[0] = htblptr;
    138 +  }
    139 +
    140 +  if (! cinfo->ac_huff_tbl_ptrs[1]) {
    141 +    htblptr = jpeg_alloc_huff_table((j_common_ptr) cinfo);
    142 +    MEMZERO(htblptr, SIZEOF(JHUFF_TBL));
    143 +    MEMCOPY(&htblptr->bits[1], mjpg_ac1_bits, SIZEOF(mjpg_ac1_bits));
    144 +    MEMCOPY(&htblptr->huffval[0], mjpg_ac1_huffval, SIZEOF(mjpg_ac1_huffval));
    145 +    cinfo->ac_huff_tbl_ptrs[1] = htblptr;
    146 +  }
    147 +}
    148 +
    149 +#else
    150 +
    151 +#define mjpg_load_huff_tables(cinfo)
    152 +
    153 +#endif /* MOTION_JPEG_SUPPORTED */
    154 +
    155 +
    156  /*
    157   * Read markers until SOS or EOI.
    158   *
    159 @@ -1013,6 +1150,7 @@
    160        break;
    161  
    162      case M_SOS:
    163 +      mjpg_load_huff_tables(cinfo);
    164        if (! get_sos(cinfo))
    165  	return JPEG_SUSPENDED;
    166        cinfo->unread_marker = 0;	/* processed the marker */
    167 Index: jmorecfg.h
    168 ===================================================================
    169 --- jmorecfg.h	(revision 829)
    170 +++ jmorecfg.h	(working copy)
    171 @@ -153,14 +153,18 @@
    172  /* INT16 must hold at least the values -32768..32767. */
    173  
    174  #ifndef XMD_H			/* X11/xmd.h correctly defines INT16 */
    175 +#ifndef _BASETSD_H_		/* basetsd.h correctly defines INT16 */
    176  typedef short INT16;
    177  #endif
    178 +#endif
    179  
    180  /* INT32 must hold at least signed 32-bit values. */
    181  
    182  #ifndef XMD_H			/* X11/xmd.h correctly defines INT32 */
    183 +#ifndef _BASETSD_H_		/* basetsd.h correctly defines INT32 */
    184  typedef long INT32;
    185  #endif
    186 +#endif
    187  
    188  /* Datatype used for image dimensions.  The JPEG standard only supports
    189   * images up to 64K*64K due to 16-bit fields in SOF markers.  Therefore
    190 @@ -210,11 +214,13 @@
    191   * explicit coding is needed; see uses of the NEED_FAR_POINTERS symbol.
    192   */
    193  
    194 +#ifndef FAR
    195  #ifdef NEED_FAR_POINTERS
    196  #define FAR  far
    197  #else
    198  #define FAR
    199  #endif
    200 +#endif
    201  
    202  
    203  /*
    204 Index: jpeglib.h
    205 ===================================================================
    206 --- jpeglib.h	(revision 829)
    207 +++ jpeglib.h	(working copy)
    208 @@ -15,6 +15,10 @@
    209  #ifndef JPEGLIB_H
    210  #define JPEGLIB_H
    211  
    212 +/* Begin chromium edits */
    213 +#include "jpeglibmangler.h"
    214 +/* End chromium edits */
    215 +
    216  /*
    217   * First we include the configuration files that record how this
    218   * installation of the JPEG library is set up.  jconfig.h can be
    219 Index: jpeglibmangler.h
    220 ===================================================================
    221 --- jpeglibmangler.h	(revision 0)
    222 +++ jpeglibmangler.h	(revision 0)
    223 @@ -0,0 +1,113 @@
    224 +// Copyright (c) 2009 The Chromium Authors. All rights reserved.
    225 +// Use of this source code is governed by a BSD-style license that can be
    226 +// found in the LICENSE file.
    227 +
    228 +#ifndef THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
    229 +#define THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
    230 +
    231 +// Mangle all externally visible function names so we can build our own libjpeg
    232 +// without system libraries trying to use it.
    233 +
    234 +#define jpeg_make_c_derived_tbl chromium_jpeg_make_c_derived_tbl
    235 +#define jpeg_gen_optimal_table chromium_jpeg_gen_optimal_table
    236 +#define jpeg_make_d_derived_tbl chromium_jpeg_make_d_derived_tbl
    237 +#define jpeg_fill_bit_buffer chromium_jpeg_fill_bit_buffer
    238 +#define jpeg_huff_decode chromium_jpeg_huff_decode
    239 +#define jpeg_fdct_islow chromium_jpeg_fdct_islow
    240 +#define jpeg_fdct_ifast chromium_jpeg_fdct_ifast
    241 +#define jpeg_fdct_float chromium_jpeg_fdct_float
    242 +#define jpeg_idct_islow chromium_jpeg_idct_islow
    243 +#define jpeg_idct_ifast chromium_jpeg_idct_ifast
    244 +#define jpeg_idct_float chromium_jpeg_idct_float
    245 +#define jpeg_idct_4x4 chromium_jpeg_idct_4x4
    246 +#define jpeg_idct_2x2 chromium_jpeg_idct_2x2
    247 +#define jpeg_idct_1x1 chromium_jpeg_idct_1x1
    248 +#define jinit_compress_master chromium_jinit_compress_master
    249 +#define jinit_c_master_control chromium_jinit_c_master_control
    250 +#define jinit_c_main_controller chromium_jinit_c_main_controller
    251 +#define jinit_c_prep_controller chromium_jinit_c_prep_controller
    252 +#define jinit_c_coef_controller chromium_jinit_c_coef_controller
    253 +#define jinit_color_converter chromium_jinit_color_converter
    254 +#define jinit_downsampler chromium_jinit_downsampler
    255 +#define jinit_forward_dct chromium_jinit_forward_dct
    256 +#define jinit_huff_encoder chromium_jinit_huff_encoder
    257 +#define jinit_phuff_encoder chromium_jinit_phuff_encoder
    258 +#define jinit_marker_writer chromium_jinit_marker_writer
    259 +#define jinit_master_decompress chromium_jinit_master_decompress
    260 +#define jinit_d_main_controller chromium_jinit_d_main_controller
    261 +#define jinit_d_coef_controller chromium_jinit_d_coef_controller
    262 +#define jinit_d_post_controller chromium_jinit_d_post_controller
    263 +#define jinit_input_controller chromium_jinit_input_controller
    264 +#define jinit_marker_reader chromium_jinit_marker_reader
    265 +#define jinit_huff_decoder chromium_jinit_huff_decoder
    266 +#define jinit_phuff_decoder chromium_jinit_phuff_decoder
    267 +#define jinit_inverse_dct chromium_jinit_inverse_dct
    268 +#define jinit_upsampler chromium_jinit_upsampler
    269 +#define jinit_color_deconverter chromium_jinit_color_deconverter
    270 +#define jinit_1pass_quantizer chromium_jinit_1pass_quantizer
    271 +#define jinit_2pass_quantizer chromium_jinit_2pass_quantizer
    272 +#define jinit_merged_upsampler chromium_jinit_merged_upsampler
    273 +#define jinit_memory_mgr chromium_jinit_memory_mgr
    274 +#define jdiv_round_up chromium_jdiv_round_up
    275 +#define jround_up chromium_jround_up
    276 +#define jcopy_sample_rows chromium_jcopy_sample_rows
    277 +#define jcopy_block_row chromium_jcopy_block_row
    278 +#define jzero_far chromium_jzero_far
    279 +#define jpeg_std_error chromium_jpeg_std_error
    280 +#define jpeg_CreateCompress chromium_jpeg_CreateCompress
    281 +#define jpeg_CreateDecompress chromium_jpeg_CreateDecompress
    282 +#define jpeg_destroy_compress chromium_jpeg_destroy_compress
    283 +#define jpeg_destroy_decompress chromium_jpeg_destroy_decompress
    284 +#define jpeg_stdio_dest chromium_jpeg_stdio_dest
    285 +#define jpeg_stdio_src chromium_jpeg_stdio_src
    286 +#define jpeg_set_defaults chromium_jpeg_set_defaults
    287 +#define jpeg_set_colorspace chromium_jpeg_set_colorspace
    288 +#define jpeg_default_colorspace chromium_jpeg_default_colorspace
    289 +#define jpeg_set_quality chromium_jpeg_set_quality
    290 +#define jpeg_set_linear_quality chromium_jpeg_set_linear_quality
    291 +#define jpeg_add_quant_table chromium_jpeg_add_quant_table
    292 +#define jpeg_quality_scaling chromium_jpeg_quality_scaling
    293 +#define jpeg_simple_progression chromium_jpeg_simple_progression
    294 +#define jpeg_suppress_tables chromium_jpeg_suppress_tables
    295 +#define jpeg_alloc_quant_table chromium_jpeg_alloc_quant_table
    296 +#define jpeg_alloc_huff_table chromium_jpeg_alloc_huff_table
    297 +#define jpeg_start_compress chromium_jpeg_start_compress
    298 +#define jpeg_write_scanlines chromium_jpeg_write_scanlines
    299 +#define jpeg_finish_compress chromium_jpeg_finish_compress
    300 +#define jpeg_write_raw_data chromium_jpeg_write_raw_data
    301 +#define jpeg_write_marker chromium_jpeg_write_marker
    302 +#define jpeg_write_m_header chromium_jpeg_write_m_header
    303 +#define jpeg_write_m_byte chromium_jpeg_write_m_byte
    304 +#define jpeg_write_tables chromium_jpeg_write_tables
    305 +#define jpeg_read_header chromium_jpeg_read_header
    306 +#define jpeg_start_decompress chromium_jpeg_start_decompress
    307 +#define jpeg_read_scanlines chromium_jpeg_read_scanlines
    308 +#define jpeg_finish_decompress chromium_jpeg_finish_decompress
    309 +#define jpeg_read_raw_data chromium_jpeg_read_raw_data
    310 +#define jpeg_has_multiple_scans chromium_jpeg_has_multiple_scans
    311 +#define jpeg_start_output chromium_jpeg_start_output
    312 +#define jpeg_finish_output chromium_jpeg_finish_output
    313 +#define jpeg_input_complete chromium_jpeg_input_complete
    314 +#define jpeg_new_colormap chromium_jpeg_new_colormap
    315 +#define jpeg_consume_input chromium_jpeg_consume_input
    316 +#define jpeg_calc_output_dimensions chromium_jpeg_calc_output_dimensions
    317 +#define jpeg_save_markers chromium_jpeg_save_markers
    318 +#define jpeg_set_marker_processor chromium_jpeg_set_marker_processor
    319 +#define jpeg_read_coefficients chromium_jpeg_read_coefficients
    320 +#define jpeg_write_coefficients chromium_jpeg_write_coefficients
    321 +#define jpeg_copy_critical_parameters chromium_jpeg_copy_critical_parameters
    322 +#define jpeg_abort_compress chromium_jpeg_abort_compress
    323 +#define jpeg_abort_decompress chromium_jpeg_abort_decompress
    324 +#define jpeg_abort chromium_jpeg_abort
    325 +#define jpeg_destroy chromium_jpeg_destroy
    326 +#define jpeg_resync_to_restart chromium_jpeg_resync_to_restart
    327 +#define jpeg_get_small chromium_jpeg_get_small
    328 +#define jpeg_free_small chromium_jpeg_free_small
    329 +#define jpeg_get_large chromium_jpeg_get_large
    330 +#define jpeg_free_large chromium_jpeg_free_large
    331 +#define jpeg_mem_available chromium_jpeg_mem_available
    332 +#define jpeg_open_backing_store chromium_jpeg_open_backing_store
    333 +#define jpeg_mem_init chromium_jpeg_mem_init
    334 +#define jpeg_mem_term chromium_jpeg_mem_term
    335 +
    336 +#endif  // THIRD_PARTY_LIBJPEG_TURBO_JPEGLIBMANGLER_H_
    337 Index: simd/jcgrass2-64.asm
    338 ===================================================================
    339 --- simd/jcgrass2-64.asm	(revision 829)
    340 +++ simd/jcgrass2-64.asm	(working copy)
    341 @@ -30,7 +30,7 @@
    342  	SECTION	SEG_CONST
    343  
    344  	alignz	16
    345 -	global	EXTN(jconst_rgb_gray_convert_sse2)
    346 +	global	EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
    347  
    348  EXTN(jconst_rgb_gray_convert_sse2):
    349  
    350 Index: simd/jiss2fst.asm
    351 ===================================================================
    352 --- simd/jiss2fst.asm	(revision 829)
    353 +++ simd/jiss2fst.asm	(working copy)
    354 @@ -59,7 +59,7 @@
    355  %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
    356  
    357  	alignz	16
    358 -	global	EXTN(jconst_idct_ifast_sse2)
    359 +	global	EXTN(jconst_idct_ifast_sse2) PRIVATE
    360  
    361  EXTN(jconst_idct_ifast_sse2):
    362  
    363 @@ -92,7 +92,7 @@
    364  %define WK_NUM		2
    365  
    366  	align	16
    367 -	global	EXTN(jsimd_idct_ifast_sse2)
    368 +	global	EXTN(jsimd_idct_ifast_sse2) PRIVATE
    369  
    370  EXTN(jsimd_idct_ifast_sse2):
    371  	push	ebp
    372 Index: simd/jcclrss2-64.asm
    373 ===================================================================
    374 --- simd/jcclrss2-64.asm	(revision 829)
    375 +++ simd/jcclrss2-64.asm	(working copy)
    376 @@ -37,7 +37,7 @@
    377  
    378  	align	16
    379  
    380 -	global	EXTN(jsimd_rgb_ycc_convert_sse2)
    381 +	global	EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
    382  
    383  EXTN(jsimd_rgb_ycc_convert_sse2):
    384  	push	rbp
    385 Index: simd/jiss2red-64.asm
    386 ===================================================================
    387 --- simd/jiss2red-64.asm	(revision 829)
    388 +++ simd/jiss2red-64.asm	(working copy)
    389 @@ -73,7 +73,7 @@
    390  	SECTION	SEG_CONST
    391  
    392  	alignz	16
    393 -	global	EXTN(jconst_idct_red_sse2)
    394 +	global	EXTN(jconst_idct_red_sse2) PRIVATE
    395  
    396  EXTN(jconst_idct_red_sse2):
    397  
    398 @@ -114,7 +114,7 @@
    399  %define WK_NUM		2
    400  
    401  	align	16
    402 -	global	EXTN(jsimd_idct_4x4_sse2)
    403 +	global	EXTN(jsimd_idct_4x4_sse2) PRIVATE
    404  
    405  EXTN(jsimd_idct_4x4_sse2):
    406  	push	rbp
    407 @@ -413,7 +413,7 @@
    408  ; r13 = JDIMENSION output_col
    409  
    410  	align	16
    411 -	global	EXTN(jsimd_idct_2x2_sse2)
    412 +	global	EXTN(jsimd_idct_2x2_sse2) PRIVATE
    413  
    414  EXTN(jsimd_idct_2x2_sse2):
    415  	push	rbp
    416 Index: simd/ji3dnflt.asm
    417 ===================================================================
    418 --- simd/ji3dnflt.asm	(revision 829)
    419 +++ simd/ji3dnflt.asm	(working copy)
    420 @@ -27,7 +27,7 @@
    421  	SECTION	SEG_CONST
    422  
    423  	alignz	16
    424 -	global	EXTN(jconst_idct_float_3dnow)
    425 +	global	EXTN(jconst_idct_float_3dnow) PRIVATE
    426  
    427  EXTN(jconst_idct_float_3dnow):
    428  
    429 @@ -63,7 +63,7 @@
    430  					; FAST_FLOAT workspace[DCTSIZE2]
    431  
    432  	align	16
    433 -	global	EXTN(jsimd_idct_float_3dnow)
    434 +	global	EXTN(jsimd_idct_float_3dnow) PRIVATE
    435  
    436  EXTN(jsimd_idct_float_3dnow):
    437  	push	ebp
    438 Index: simd/jsimdcpu.asm
    439 ===================================================================
    440 --- simd/jsimdcpu.asm	(revision 829)
    441 +++ simd/jsimdcpu.asm	(working copy)
    442 @@ -29,7 +29,7 @@
    443  ;
    444  
    445  	align	16
    446 -	global	EXTN(jpeg_simd_cpu_support)
    447 +	global	EXTN(jpeg_simd_cpu_support) PRIVATE
    448  
    449  EXTN(jpeg_simd_cpu_support):
    450  	push	ebx
    451 Index: simd/jdmerss2-64.asm
    452 ===================================================================
    453 --- simd/jdmerss2-64.asm	(revision 829)
    454 +++ simd/jdmerss2-64.asm	(working copy)
    455 @@ -35,7 +35,7 @@
    456  	SECTION	SEG_CONST
    457  
    458  	alignz	16
    459 -	global	EXTN(jconst_merged_upsample_sse2)
    460 +	global	EXTN(jconst_merged_upsample_sse2) PRIVATE
    461  
    462  EXTN(jconst_merged_upsample_sse2):
    463  
    464 Index: simd/jdsammmx.asm
    465 ===================================================================
    466 --- simd/jdsammmx.asm	(revision 829)
    467 +++ simd/jdsammmx.asm	(working copy)
    468 @@ -22,7 +22,7 @@
    469  	SECTION	SEG_CONST
    470  
    471  	alignz	16
    472 -	global	EXTN(jconst_fancy_upsample_mmx)
    473 +	global	EXTN(jconst_fancy_upsample_mmx) PRIVATE
    474  
    475  EXTN(jconst_fancy_upsample_mmx):
    476  
    477 @@ -58,7 +58,7 @@
    478  %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    479  
    480  	align	16
    481 -	global	EXTN(jsimd_h2v1_fancy_upsample_mmx)
    482 +	global	EXTN(jsimd_h2v1_fancy_upsample_mmx) PRIVATE
    483  
    484  EXTN(jsimd_h2v1_fancy_upsample_mmx):
    485  	push	ebp
    486 @@ -216,7 +216,7 @@
    487  %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
    488  
    489  	align	16
    490 -	global	EXTN(jsimd_h2v2_fancy_upsample_mmx)
    491 +	global	EXTN(jsimd_h2v2_fancy_upsample_mmx) PRIVATE
    492  
    493  EXTN(jsimd_h2v2_fancy_upsample_mmx):
    494  	push	ebp
    495 @@ -542,7 +542,7 @@
    496  %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    497  
    498  	align	16
    499 -	global	EXTN(jsimd_h2v1_upsample_mmx)
    500 +	global	EXTN(jsimd_h2v1_upsample_mmx) PRIVATE
    501  
    502  EXTN(jsimd_h2v1_upsample_mmx):
    503  	push	ebp
    504 @@ -643,7 +643,7 @@
    505  %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    506  
    507  	align	16
    508 -	global	EXTN(jsimd_h2v2_upsample_mmx)
    509 +	global	EXTN(jsimd_h2v2_upsample_mmx) PRIVATE
    510  
    511  EXTN(jsimd_h2v2_upsample_mmx):
    512  	push	ebp
    513 Index: simd/jdmrgmmx.asm
    514 ===================================================================
    515 --- simd/jdmrgmmx.asm	(revision 829)
    516 +++ simd/jdmrgmmx.asm	(working copy)
    517 @@ -40,7 +40,7 @@
    518  %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
    519  
    520  	align	16
    521 -	global	EXTN(jsimd_h2v1_merged_upsample_mmx)
    522 +	global	EXTN(jsimd_h2v1_merged_upsample_mmx) PRIVATE
    523  
    524  EXTN(jsimd_h2v1_merged_upsample_mmx):
    525  	push	ebp
    526 @@ -409,7 +409,7 @@
    527  %define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
    528  
    529  	align	16
    530 -	global	EXTN(jsimd_h2v2_merged_upsample_mmx)
    531 +	global	EXTN(jsimd_h2v2_merged_upsample_mmx) PRIVATE
    532  
    533  EXTN(jsimd_h2v2_merged_upsample_mmx):
    534  	push	ebp
    535 Index: simd/jdsamss2.asm
    536 ===================================================================
    537 --- simd/jdsamss2.asm	(revision 829)
    538 +++ simd/jdsamss2.asm	(working copy)
    539 @@ -22,7 +22,7 @@
    540  	SECTION	SEG_CONST
    541  
    542  	alignz	16
    543 -	global	EXTN(jconst_fancy_upsample_sse2)
    544 +	global	EXTN(jconst_fancy_upsample_sse2) PRIVATE
    545  
    546  EXTN(jconst_fancy_upsample_sse2):
    547  
    548 @@ -58,7 +58,7 @@
    549  %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    550  
    551  	align	16
    552 -	global	EXTN(jsimd_h2v1_fancy_upsample_sse2)
    553 +	global	EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
    554  
    555  EXTN(jsimd_h2v1_fancy_upsample_sse2):
    556  	push	ebp
    557 @@ -214,7 +214,7 @@
    558  %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
    559  
    560  	align	16
    561 -	global	EXTN(jsimd_h2v2_fancy_upsample_sse2)
    562 +	global	EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
    563  
    564  EXTN(jsimd_h2v2_fancy_upsample_sse2):
    565  	push	ebp
    566 @@ -538,7 +538,7 @@
    567  %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    568  
    569  	align	16
    570 -	global	EXTN(jsimd_h2v1_upsample_sse2)
    571 +	global	EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
    572  
    573  EXTN(jsimd_h2v1_upsample_sse2):
    574  	push	ebp
    575 @@ -637,7 +637,7 @@
    576  %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    577  
    578  	align	16
    579 -	global	EXTN(jsimd_h2v2_upsample_sse2)
    580 +	global	EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
    581  
    582  EXTN(jsimd_h2v2_upsample_sse2):
    583  	push	ebp
    584 Index: simd/jiss2flt-64.asm
    585 ===================================================================
    586 --- simd/jiss2flt-64.asm	(revision 829)
    587 +++ simd/jiss2flt-64.asm	(working copy)
    588 @@ -38,7 +38,7 @@
    589  	SECTION	SEG_CONST
    590  
    591  	alignz	16
    592 -	global	EXTN(jconst_idct_float_sse2)
    593 +	global	EXTN(jconst_idct_float_sse2) PRIVATE
    594  
    595  EXTN(jconst_idct_float_sse2):
    596  
    597 @@ -74,7 +74,7 @@
    598  					; FAST_FLOAT workspace[DCTSIZE2]
    599  
    600  	align	16
    601 -	global	EXTN(jsimd_idct_float_sse2)
    602 +	global	EXTN(jsimd_idct_float_sse2) PRIVATE
    603  
    604  EXTN(jsimd_idct_float_sse2):
    605  	push	rbp
    606 Index: simd/jfss2int-64.asm
    607 ===================================================================
    608 --- simd/jfss2int-64.asm	(revision 829)
    609 +++ simd/jfss2int-64.asm	(working copy)
    610 @@ -67,7 +67,7 @@
    611  	SECTION	SEG_CONST
    612  
    613  	alignz	16
    614 -	global	EXTN(jconst_fdct_islow_sse2)
    615 +	global	EXTN(jconst_fdct_islow_sse2) PRIVATE
    616  
    617  EXTN(jconst_fdct_islow_sse2):
    618  
    619 @@ -101,7 +101,7 @@
    620  %define WK_NUM		6
    621  
    622  	align	16
    623 -	global	EXTN(jsimd_fdct_islow_sse2)
    624 +	global	EXTN(jsimd_fdct_islow_sse2) PRIVATE
    625  
    626  EXTN(jsimd_fdct_islow_sse2):
    627  	push	rbp
    628 Index: simd/jcqnts2f.asm
    629 ===================================================================
    630 --- simd/jcqnts2f.asm	(revision 829)
    631 +++ simd/jcqnts2f.asm	(working copy)
    632 @@ -35,7 +35,7 @@
    633  %define workspace	ebp+16		; FAST_FLOAT * workspace
    634  
    635  	align	16
    636 -	global	EXTN(jsimd_convsamp_float_sse2)
    637 +	global	EXTN(jsimd_convsamp_float_sse2) PRIVATE
    638  
    639  EXTN(jsimd_convsamp_float_sse2):
    640  	push	ebp
    641 @@ -115,7 +115,7 @@
    642  %define workspace	ebp+16		; FAST_FLOAT * workspace
    643  
    644  	align	16
    645 -	global	EXTN(jsimd_quantize_float_sse2)
    646 +	global	EXTN(jsimd_quantize_float_sse2) PRIVATE
    647  
    648  EXTN(jsimd_quantize_float_sse2):
    649  	push	ebp
    650 Index: simd/jdmrgss2.asm
    651 ===================================================================
    652 --- simd/jdmrgss2.asm	(revision 829)
    653 +++ simd/jdmrgss2.asm	(working copy)
    654 @@ -40,7 +40,7 @@
    655  %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
    656  
    657  	align	16
    658 -	global	EXTN(jsimd_h2v1_merged_upsample_sse2)
    659 +	global	EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
    660  
    661  EXTN(jsimd_h2v1_merged_upsample_sse2):
    662  	push	ebp
    663 @@ -560,7 +560,7 @@
    664  %define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
    665  
    666  	align	16
    667 -	global	EXTN(jsimd_h2v2_merged_upsample_sse2)
    668 +	global	EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
    669  
    670  EXTN(jsimd_h2v2_merged_upsample_sse2):
    671  	push	ebp
    672 Index: simd/jfmmxint.asm
    673 ===================================================================
    674 --- simd/jfmmxint.asm	(revision 829)
    675 +++ simd/jfmmxint.asm	(working copy)
    676 @@ -66,7 +66,7 @@
    677  	SECTION	SEG_CONST
    678  
    679  	alignz	16
    680 -	global	EXTN(jconst_fdct_islow_mmx)
    681 +	global	EXTN(jconst_fdct_islow_mmx) PRIVATE
    682  
    683  EXTN(jconst_fdct_islow_mmx):
    684  
    685 @@ -101,7 +101,7 @@
    686  %define WK_NUM		2
    687  
    688  	align	16
    689 -	global	EXTN(jsimd_fdct_islow_mmx)
    690 +	global	EXTN(jsimd_fdct_islow_mmx) PRIVATE
    691  
    692  EXTN(jsimd_fdct_islow_mmx):
    693  	push	ebp
    694 Index: simd/jcgryss2-64.asm
    695 ===================================================================
    696 --- simd/jcgryss2-64.asm	(revision 829)
    697 +++ simd/jcgryss2-64.asm	(working copy)
    698 @@ -37,7 +37,7 @@
    699  
    700  	align	16
    701  
    702 -	global	EXTN(jsimd_rgb_gray_convert_sse2)
    703 +	global	EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
    704  
    705  EXTN(jsimd_rgb_gray_convert_sse2):
    706  	push	rbp
    707 Index: simd/jcqnts2i.asm
    708 ===================================================================
    709 --- simd/jcqnts2i.asm	(revision 829)
    710 +++ simd/jcqnts2i.asm	(working copy)
    711 @@ -35,7 +35,7 @@
    712  %define workspace	ebp+16		; DCTELEM * workspace
    713  
    714  	align	16
    715 -	global	EXTN(jsimd_convsamp_sse2)
    716 +	global	EXTN(jsimd_convsamp_sse2) PRIVATE
    717  
    718  EXTN(jsimd_convsamp_sse2):
    719  	push	ebp
    720 @@ -117,7 +117,7 @@
    721  %define workspace	ebp+16		; DCTELEM * workspace
    722  
    723  	align	16
    724 -	global	EXTN(jsimd_quantize_sse2)
    725 +	global	EXTN(jsimd_quantize_sse2) PRIVATE
    726  
    727  EXTN(jsimd_quantize_sse2):
    728  	push	ebp
    729 Index: simd/jiss2fst-64.asm
    730 ===================================================================
    731 --- simd/jiss2fst-64.asm	(revision 829)
    732 +++ simd/jiss2fst-64.asm	(working copy)
    733 @@ -60,7 +60,7 @@
    734  %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
    735  
    736  	alignz	16
    737 -	global	EXTN(jconst_idct_ifast_sse2)
    738 +	global	EXTN(jconst_idct_ifast_sse2) PRIVATE
    739  
    740  EXTN(jconst_idct_ifast_sse2):
    741  
    742 @@ -93,7 +93,7 @@
    743  %define WK_NUM		2
    744  
    745  	align	16
    746 -	global	EXTN(jsimd_idct_ifast_sse2)
    747 +	global	EXTN(jsimd_idct_ifast_sse2) PRIVATE
    748  
    749  EXTN(jsimd_idct_ifast_sse2):
    750  	push	rbp
    751 Index: simd/jiss2flt.asm
    752 ===================================================================
    753 --- simd/jiss2flt.asm	(revision 829)
    754 +++ simd/jiss2flt.asm	(working copy)
    755 @@ -37,7 +37,7 @@
    756  	SECTION	SEG_CONST
    757  
    758  	alignz	16
    759 -	global	EXTN(jconst_idct_float_sse2)
    760 +	global	EXTN(jconst_idct_float_sse2) PRIVATE
    761  
    762  EXTN(jconst_idct_float_sse2):
    763  
    764 @@ -73,7 +73,7 @@
    765  					; FAST_FLOAT workspace[DCTSIZE2]
    766  
    767  	align	16
    768 -	global	EXTN(jsimd_idct_float_sse2)
    769 +	global	EXTN(jsimd_idct_float_sse2) PRIVATE
    770  
    771  EXTN(jsimd_idct_float_sse2):
    772  	push	ebp
    773 Index: simd/jiss2int.asm
    774 ===================================================================
    775 --- simd/jiss2int.asm	(revision 829)
    776 +++ simd/jiss2int.asm	(working copy)
    777 @@ -66,7 +66,7 @@
    778  	SECTION	SEG_CONST
    779  
    780  	alignz	16
    781 -	global	EXTN(jconst_idct_islow_sse2)
    782 +	global	EXTN(jconst_idct_islow_sse2) PRIVATE
    783  
    784  EXTN(jconst_idct_islow_sse2):
    785  
    786 @@ -105,7 +105,7 @@
    787  %define WK_NUM		12
    788  
    789  	align	16
    790 -	global	EXTN(jsimd_idct_islow_sse2)
    791 +	global	EXTN(jsimd_idct_islow_sse2) PRIVATE
    792  
    793  EXTN(jsimd_idct_islow_sse2):
    794  	push	ebp
    795 Index: simd/jfsseflt-64.asm
    796 ===================================================================
    797 --- simd/jfsseflt-64.asm	(revision 829)
    798 +++ simd/jfsseflt-64.asm	(working copy)
    799 @@ -38,7 +38,7 @@
    800  	SECTION	SEG_CONST
    801  
    802  	alignz	16
    803 -	global	EXTN(jconst_fdct_float_sse)
    804 +	global	EXTN(jconst_fdct_float_sse) PRIVATE
    805  
    806  EXTN(jconst_fdct_float_sse):
    807  
    808 @@ -65,7 +65,7 @@
    809  %define WK_NUM		2
    810  
    811  	align	16
    812 -	global	EXTN(jsimd_fdct_float_sse)
    813 +	global	EXTN(jsimd_fdct_float_sse) PRIVATE
    814  
    815  EXTN(jsimd_fdct_float_sse):
    816  	push	rbp
    817 Index: simd/jccolss2-64.asm
    818 ===================================================================
    819 --- simd/jccolss2-64.asm	(revision 829)
    820 +++ simd/jccolss2-64.asm	(working copy)
    821 @@ -34,7 +34,7 @@
    822  	SECTION	SEG_CONST
    823  
    824  	alignz	16
    825 -	global	EXTN(jconst_rgb_ycc_convert_sse2)
    826 +	global	EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
    827  
    828  EXTN(jconst_rgb_ycc_convert_sse2):
    829  
    830 Index: simd/jcsamss2-64.asm
    831 ===================================================================
    832 --- simd/jcsamss2-64.asm	(revision 829)
    833 +++ simd/jcsamss2-64.asm	(working copy)
    834 @@ -41,7 +41,7 @@
    835  ; r15 = JSAMPARRAY output_data
    836  
    837  	align	16
    838 -	global	EXTN(jsimd_h2v1_downsample_sse2)
    839 +	global	EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
    840  
    841  EXTN(jsimd_h2v1_downsample_sse2):
    842  	push	rbp
    843 @@ -185,7 +185,7 @@
    844  ; r15 = JSAMPARRAY output_data
    845  
    846  	align	16
    847 -	global	EXTN(jsimd_h2v2_downsample_sse2)
    848 +	global	EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
    849  
    850  EXTN(jsimd_h2v2_downsample_sse2):
    851  	push	rbp
    852 Index: simd/jdclrss2-64.asm
    853 ===================================================================
    854 --- simd/jdclrss2-64.asm	(revision 829)
    855 +++ simd/jdclrss2-64.asm	(working copy)
    856 @@ -39,7 +39,7 @@
    857  %define WK_NUM		2
    858  
    859  	align	16
    860 -	global	EXTN(jsimd_ycc_rgb_convert_sse2)
    861 +	global	EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
    862  
    863  EXTN(jsimd_ycc_rgb_convert_sse2):
    864  	push	rbp
    865 Index: simd/jdcolmmx.asm
    866 ===================================================================
    867 --- simd/jdcolmmx.asm	(revision 829)
    868 +++ simd/jdcolmmx.asm	(working copy)
    869 @@ -35,7 +35,7 @@
    870  	SECTION	SEG_CONST
    871  
    872  	alignz	16
    873 -	global	EXTN(jconst_ycc_rgb_convert_mmx)
    874 +	global	EXTN(jconst_ycc_rgb_convert_mmx) PRIVATE
    875  
    876  EXTN(jconst_ycc_rgb_convert_mmx):
    877  
    878 Index: simd/jcclrmmx.asm
    879 ===================================================================
    880 --- simd/jcclrmmx.asm	(revision 829)
    881 +++ simd/jcclrmmx.asm	(working copy)
    882 @@ -40,7 +40,7 @@
    883  %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
    884  
    885  	align	16
    886 -	global	EXTN(jsimd_rgb_ycc_convert_mmx)
    887 +	global	EXTN(jsimd_rgb_ycc_convert_mmx) PRIVATE
    888  
    889  EXTN(jsimd_rgb_ycc_convert_mmx):
    890  	push	ebp
    891 Index: simd/jfsseflt.asm
    892 ===================================================================
    893 --- simd/jfsseflt.asm	(revision 829)
    894 +++ simd/jfsseflt.asm	(working copy)
    895 @@ -37,7 +37,7 @@
    896  	SECTION	SEG_CONST
    897  
    898  	alignz	16
    899 -	global	EXTN(jconst_fdct_float_sse)
    900 +	global	EXTN(jconst_fdct_float_sse) PRIVATE
    901  
    902  EXTN(jconst_fdct_float_sse):
    903  
    904 @@ -65,7 +65,7 @@
    905  %define WK_NUM		2
    906  
    907  	align	16
    908 -	global	EXTN(jsimd_fdct_float_sse)
    909 +	global	EXTN(jsimd_fdct_float_sse) PRIVATE
    910  
    911  EXTN(jsimd_fdct_float_sse):
    912  	push	ebp
    913 Index: simd/jdmrgss2-64.asm
    914 ===================================================================
    915 --- simd/jdmrgss2-64.asm	(revision 829)
    916 +++ simd/jdmrgss2-64.asm	(working copy)
    917 @@ -39,7 +39,7 @@
    918  %define WK_NUM		3
    919  
    920  	align	16
    921 -	global	EXTN(jsimd_h2v1_merged_upsample_sse2)
    922 +	global	EXTN(jsimd_h2v1_merged_upsample_sse2) PRIVATE
    923  
    924  EXTN(jsimd_h2v1_merged_upsample_sse2):
    925  	push	rbp
    926 @@ -543,7 +543,7 @@
    927  ; r13 = JSAMPARRAY output_buf
    928  
    929  	align	16
    930 -	global	EXTN(jsimd_h2v2_merged_upsample_sse2)
    931 +	global	EXTN(jsimd_h2v2_merged_upsample_sse2) PRIVATE
    932  
    933  EXTN(jsimd_h2v2_merged_upsample_sse2):
    934  	push	rbp
    935 Index: simd/jdcolss2.asm
    936 ===================================================================
    937 --- simd/jdcolss2.asm	(revision 829)
    938 +++ simd/jdcolss2.asm	(working copy)
    939 @@ -35,7 +35,7 @@
    940  	SECTION	SEG_CONST
    941  
    942  	alignz	16
    943 -	global	EXTN(jconst_ycc_rgb_convert_sse2)
    944 +	global	EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
    945  
    946  EXTN(jconst_ycc_rgb_convert_sse2):
    947  
    948 Index: simd/jdmermmx.asm
    949 ===================================================================
    950 --- simd/jdmermmx.asm	(revision 829)
    951 +++ simd/jdmermmx.asm	(working copy)
    952 @@ -35,7 +35,7 @@
    953  	SECTION	SEG_CONST
    954  
    955  	alignz	16
    956 -	global	EXTN(jconst_merged_upsample_mmx)
    957 +	global	EXTN(jconst_merged_upsample_mmx) PRIVATE
    958  
    959  EXTN(jconst_merged_upsample_mmx):
    960  
    961 Index: simd/jcclrss2.asm
    962 ===================================================================
    963 --- simd/jcclrss2.asm	(revision 829)
    964 +++ simd/jcclrss2.asm	(working copy)
    965 @@ -38,7 +38,7 @@
    966  
    967  	align	16
    968  
    969 -	global	EXTN(jsimd_rgb_ycc_convert_sse2)
    970 +	global	EXTN(jsimd_rgb_ycc_convert_sse2) PRIVATE
    971  
    972  EXTN(jsimd_rgb_ycc_convert_sse2):
    973  	push	ebp
    974 Index: simd/jiss2red.asm
    975 ===================================================================
    976 --- simd/jiss2red.asm	(revision 829)
    977 +++ simd/jiss2red.asm	(working copy)
    978 @@ -72,7 +72,7 @@
    979  	SECTION	SEG_CONST
    980  
    981  	alignz	16
    982 -	global	EXTN(jconst_idct_red_sse2)
    983 +	global	EXTN(jconst_idct_red_sse2) PRIVATE
    984  
    985  EXTN(jconst_idct_red_sse2):
    986  
    987 @@ -113,7 +113,7 @@
    988  %define WK_NUM		2
    989  
    990  	align	16
    991 -	global	EXTN(jsimd_idct_4x4_sse2)
    992 +	global	EXTN(jsimd_idct_4x4_sse2) PRIVATE
    993  
    994  EXTN(jsimd_idct_4x4_sse2):
    995  	push	ebp
    996 @@ -424,7 +424,7 @@
    997  %define output_col(b)	(b)+20		; JDIMENSION output_col
    998  
    999  	align	16
   1000 -	global	EXTN(jsimd_idct_2x2_sse2)
   1001 +	global	EXTN(jsimd_idct_2x2_sse2) PRIVATE
   1002  
   1003  EXTN(jsimd_idct_2x2_sse2):
   1004  	push	ebp
   1005 Index: simd/jdmerss2.asm
   1006 ===================================================================
   1007 --- simd/jdmerss2.asm	(revision 829)
   1008 +++ simd/jdmerss2.asm	(working copy)
   1009 @@ -35,7 +35,7 @@
   1010  	SECTION	SEG_CONST
   1011  
   1012  	alignz	16
   1013 -	global	EXTN(jconst_merged_upsample_sse2)
   1014 +	global	EXTN(jconst_merged_upsample_sse2) PRIVATE
   1015  
   1016  EXTN(jconst_merged_upsample_sse2):
   1017  
   1018 Index: simd/jfss2fst-64.asm
   1019 ===================================================================
   1020 --- simd/jfss2fst-64.asm	(revision 829)
   1021 +++ simd/jfss2fst-64.asm	(working copy)
   1022 @@ -53,7 +53,7 @@
   1023  %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
   1024  
   1025  	alignz	16
   1026 -	global	EXTN(jconst_fdct_ifast_sse2)
   1027 +	global	EXTN(jconst_fdct_ifast_sse2) PRIVATE
   1028  
   1029  EXTN(jconst_fdct_ifast_sse2):
   1030  
   1031 @@ -80,7 +80,7 @@
   1032  %define WK_NUM		2
   1033  
   1034  	align	16
   1035 -	global	EXTN(jsimd_fdct_ifast_sse2)
   1036 +	global	EXTN(jsimd_fdct_ifast_sse2) PRIVATE
   1037  
   1038  EXTN(jsimd_fdct_ifast_sse2):
   1039  	push	rbp
   1040 Index: simd/jcqntmmx.asm
   1041 ===================================================================
   1042 --- simd/jcqntmmx.asm	(revision 829)
   1043 +++ simd/jcqntmmx.asm	(working copy)
   1044 @@ -35,7 +35,7 @@
   1045  %define workspace	ebp+16		; DCTELEM * workspace
   1046  
   1047  	align	16
   1048 -	global	EXTN(jsimd_convsamp_mmx)
   1049 +	global	EXTN(jsimd_convsamp_mmx) PRIVATE
   1050  
   1051  EXTN(jsimd_convsamp_mmx):
   1052  	push	ebp
   1053 @@ -140,7 +140,7 @@
   1054  %define workspace	ebp+16		; DCTELEM * workspace
   1055  
   1056  	align	16
   1057 -	global	EXTN(jsimd_quantize_mmx)
   1058 +	global	EXTN(jsimd_quantize_mmx) PRIVATE
   1059  
   1060  EXTN(jsimd_quantize_mmx):
   1061  	push	ebp
   1062 Index: simd/jimmxfst.asm
   1063 ===================================================================
   1064 --- simd/jimmxfst.asm	(revision 829)
   1065 +++ simd/jimmxfst.asm	(working copy)
   1066 @@ -59,7 +59,7 @@
   1067  %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
   1068  
   1069  	alignz	16
   1070 -	global	EXTN(jconst_idct_ifast_mmx)
   1071 +	global	EXTN(jconst_idct_ifast_mmx) PRIVATE
   1072  
   1073  EXTN(jconst_idct_ifast_mmx):
   1074  
   1075 @@ -94,7 +94,7 @@
   1076  					; JCOEF workspace[DCTSIZE2]
   1077  
   1078  	align	16
   1079 -	global	EXTN(jsimd_idct_ifast_mmx)
   1080 +	global	EXTN(jsimd_idct_ifast_mmx) PRIVATE
   1081  
   1082  EXTN(jsimd_idct_ifast_mmx):
   1083  	push	ebp
   1084 Index: simd/jfss2fst.asm
   1085 ===================================================================
   1086 --- simd/jfss2fst.asm	(revision 829)
   1087 +++ simd/jfss2fst.asm	(working copy)
   1088 @@ -52,7 +52,7 @@
   1089  %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
   1090  
   1091  	alignz	16
   1092 -	global	EXTN(jconst_fdct_ifast_sse2)
   1093 +	global	EXTN(jconst_fdct_ifast_sse2) PRIVATE
   1094  
   1095  EXTN(jconst_fdct_ifast_sse2):
   1096  
   1097 @@ -80,7 +80,7 @@
   1098  %define WK_NUM		2
   1099  
   1100  	align	16
   1101 -	global	EXTN(jsimd_fdct_ifast_sse2)
   1102 +	global	EXTN(jsimd_fdct_ifast_sse2) PRIVATE
   1103  
   1104  EXTN(jsimd_fdct_ifast_sse2):
   1105  	push	ebp
   1106 Index: simd/jcgrammx.asm
   1107 ===================================================================
   1108 --- simd/jcgrammx.asm	(revision 829)
   1109 +++ simd/jcgrammx.asm	(working copy)
   1110 @@ -33,7 +33,7 @@
   1111  	SECTION	SEG_CONST
   1112  
   1113  	alignz	16
   1114 -	global	EXTN(jconst_rgb_gray_convert_mmx)
   1115 +	global	EXTN(jconst_rgb_gray_convert_mmx) PRIVATE
   1116  
   1117  EXTN(jconst_rgb_gray_convert_mmx):
   1118  
   1119 Index: simd/jdcolss2-64.asm
   1120 ===================================================================
   1121 --- simd/jdcolss2-64.asm	(revision 829)
   1122 +++ simd/jdcolss2-64.asm	(working copy)
   1123 @@ -35,7 +35,7 @@
   1124  	SECTION	SEG_CONST
   1125  
   1126  	alignz	16
   1127 -	global	EXTN(jconst_ycc_rgb_convert_sse2)
   1128 +	global	EXTN(jconst_ycc_rgb_convert_sse2) PRIVATE
   1129  
   1130  EXTN(jconst_ycc_rgb_convert_sse2):
   1131  
   1132 Index: simd/jf3dnflt.asm
   1133 ===================================================================
   1134 --- simd/jf3dnflt.asm	(revision 829)
   1135 +++ simd/jf3dnflt.asm	(working copy)
   1136 @@ -27,7 +27,7 @@
   1137  	SECTION	SEG_CONST
   1138  
   1139  	alignz	16
   1140 -	global	EXTN(jconst_fdct_float_3dnow)
   1141 +	global	EXTN(jconst_fdct_float_3dnow) PRIVATE
   1142  
   1143  EXTN(jconst_fdct_float_3dnow):
   1144  
   1145 @@ -55,7 +55,7 @@
   1146  %define WK_NUM		2
   1147  
   1148  	align	16
   1149 -	global	EXTN(jsimd_fdct_float_3dnow)
   1150 +	global	EXTN(jsimd_fdct_float_3dnow) PRIVATE
   1151  
   1152  EXTN(jsimd_fdct_float_3dnow):
   1153  	push	ebp
   1154 Index: simd/jdsamss2-64.asm
   1155 ===================================================================
   1156 --- simd/jdsamss2-64.asm	(revision 829)
   1157 +++ simd/jdsamss2-64.asm	(working copy)
   1158 @@ -23,7 +23,7 @@
   1159  	SECTION	SEG_CONST
   1160  
   1161  	alignz	16
   1162 -	global	EXTN(jconst_fancy_upsample_sse2)
   1163 +	global	EXTN(jconst_fancy_upsample_sse2) PRIVATE
   1164  
   1165  EXTN(jconst_fancy_upsample_sse2):
   1166  
   1167 @@ -59,7 +59,7 @@
   1168  ; r13 = JSAMPARRAY * output_data_ptr
   1169  
   1170  	align	16
   1171 -	global	EXTN(jsimd_h2v1_fancy_upsample_sse2)
   1172 +	global	EXTN(jsimd_h2v1_fancy_upsample_sse2) PRIVATE
   1173  
   1174  EXTN(jsimd_h2v1_fancy_upsample_sse2):
   1175  	push	rbp
   1176 @@ -201,7 +201,7 @@
   1177  %define WK_NUM		4
   1178  
   1179  	align	16
   1180 -	global	EXTN(jsimd_h2v2_fancy_upsample_sse2)
   1181 +	global	EXTN(jsimd_h2v2_fancy_upsample_sse2) PRIVATE
   1182  
   1183  EXTN(jsimd_h2v2_fancy_upsample_sse2):
   1184  	push	rbp
   1185 @@ -498,7 +498,7 @@
   1186  ; r13 = JSAMPARRAY * output_data_ptr
   1187  
   1188  	align	16
   1189 -	global	EXTN(jsimd_h2v1_upsample_sse2)
   1190 +	global	EXTN(jsimd_h2v1_upsample_sse2) PRIVATE
   1191  
   1192  EXTN(jsimd_h2v1_upsample_sse2):
   1193  	push	rbp
   1194 @@ -587,7 +587,7 @@
   1195  ; r13 = JSAMPARRAY * output_data_ptr
   1196  
   1197  	align	16
   1198 -	global	EXTN(jsimd_h2v2_upsample_sse2)
   1199 +	global	EXTN(jsimd_h2v2_upsample_sse2) PRIVATE
   1200  
   1201  EXTN(jsimd_h2v2_upsample_sse2):
   1202  	push	rbp
   1203 Index: simd/jcgrass2.asm
   1204 ===================================================================
   1205 --- simd/jcgrass2.asm	(revision 829)
   1206 +++ simd/jcgrass2.asm	(working copy)
   1207 @@ -30,7 +30,7 @@
   1208  	SECTION	SEG_CONST
   1209  
   1210  	alignz	16
   1211 -	global	EXTN(jconst_rgb_gray_convert_sse2)
   1212 +	global	EXTN(jconst_rgb_gray_convert_sse2) PRIVATE
   1213  
   1214  EXTN(jconst_rgb_gray_convert_sse2):
   1215  
   1216 Index: simd/jcsammmx.asm
   1217 ===================================================================
   1218 --- simd/jcsammmx.asm	(revision 829)
   1219 +++ simd/jcsammmx.asm	(working copy)
   1220 @@ -40,7 +40,7 @@
   1221  %define output_data(b)	(b)+28	; JSAMPARRAY output_data
   1222  
   1223  	align	16
   1224 -	global	EXTN(jsimd_h2v1_downsample_mmx)
   1225 +	global	EXTN(jsimd_h2v1_downsample_mmx) PRIVATE
   1226  
   1227  EXTN(jsimd_h2v1_downsample_mmx):
   1228  	push	ebp
   1229 @@ -182,7 +182,7 @@
   1230  %define output_data(b)	(b)+28	; JSAMPARRAY output_data
   1231  
   1232  	align	16
   1233 -	global	EXTN(jsimd_h2v2_downsample_mmx)
   1234 +	global	EXTN(jsimd_h2v2_downsample_mmx) PRIVATE
   1235  
   1236  EXTN(jsimd_h2v2_downsample_mmx):
   1237  	push	ebp
   1238 Index: simd/jsimd_arm_neon.S
   1239 ===================================================================
   1240 --- simd/jsimd_arm_neon.S	(revision 272637)
   1241 +++ simd/jsimd_arm_neon.S	(working copy)
   1242 @@ -41,11 +41,9 @@
   1243  /* Supplementary macro for setting function attributes */
   1244  .macro asm_function fname
   1245  #ifdef __APPLE__
   1246 -    .func _\fname
   1247      .globl _\fname
   1248  _\fname:
   1249  #else
   1250 -    .func \fname
   1251      .global \fname
   1252  #ifdef __ELF__
   1253      .hidden \fname
   1254 @@ -670,7 +668,6 @@
   1255      .unreq          ROW6R
   1256      .unreq          ROW7L
   1257      .unreq          ROW7R
   1258 -.endfunc
   1259  
   1260  
   1261  /*****************************************************************************/
   1262 @@ -895,7 +892,6 @@
   1263      .unreq          TMP2
   1264      .unreq          TMP3
   1265      .unreq          TMP4
   1266 -.endfunc
   1267  
   1268  
   1269  /*****************************************************************************/
   1270 @@ -1108,7 +1104,6 @@
   1271      .unreq          TMP2
   1272      .unreq          TMP3
   1273      .unreq          TMP4
   1274 -.endfunc
   1275  
   1276  .purgem idct_helper
   1277  
   1278 @@ -1263,7 +1258,6 @@
   1279      .unreq          OUTPUT_COL
   1280      .unreq          TMP1
   1281      .unreq          TMP2
   1282 -.endfunc
   1283  
   1284  .purgem idct_helper
   1285  
   1286 @@ -1547,7 +1541,6 @@
   1287      .unreq          U
   1288      .unreq          V
   1289      .unreq          N
   1290 -.endfunc
   1291  
   1292  .purgem do_yuv_to_rgb
   1293  .purgem do_yuv_to_rgb_stage1
   1294 @@ -1858,7 +1851,6 @@
   1295      .unreq          U
   1296      .unreq          V
   1297      .unreq          N
   1298 -.endfunc
   1299  
   1300  .purgem do_rgb_to_yuv
   1301  .purgem do_rgb_to_yuv_stage1
   1302 @@ -1940,7 +1932,6 @@
   1303      .unreq          TMP2
   1304      .unreq          TMP3
   1305      .unreq          TMP4
   1306 -.endfunc
   1307  
   1308  
   1309  /*****************************************************************************/
   1310 @@ -2064,7 +2055,6 @@
   1311  
   1312      .unreq          DATA
   1313      .unreq          TMP
   1314 -.endfunc
   1315  
   1316  
   1317  /*****************************************************************************/
   1318 @@ -2166,7 +2156,6 @@
   1319      .unreq          CORRECTION
   1320      .unreq          SHIFT
   1321      .unreq          LOOP_COUNT
   1322 -.endfunc
   1323  
   1324  
   1325  /*****************************************************************************/
   1326 @@ -2401,7 +2390,6 @@
   1327      .unreq          WIDTH
   1328      .unreq          TMP
   1329  
   1330 -.endfunc
   1331  
   1332  .purgem upsample16
   1333  .purgem upsample32
   1334 Index: simd/jsimd_i386.c
   1335 ===================================================================
   1336 --- simd/jsimd_i386.c	(revision 829)
   1337 +++ simd/jsimd_i386.c	(working copy)
   1338 @@ -61,6 +61,7 @@
   1339      simd_support &= JSIMD_SSE2;
   1340  }
   1341  
   1342 +#ifndef JPEG_DECODE_ONLY
   1343  GLOBAL(int)
   1344  jsimd_can_rgb_ycc (void)
   1345  {
   1346 @@ -82,6 +83,7 @@
   1347  
   1348    return 0;
   1349  }
   1350 +#endif
   1351  
   1352  GLOBAL(int)
   1353  jsimd_can_rgb_gray (void)
   1354 @@ -127,6 +129,7 @@
   1355    return 0;
   1356  }
   1357  
   1358 +#ifndef JPEG_DECODE_ONLY
   1359  GLOBAL(void)
   1360  jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
   1361                         JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
   1362 @@ -179,6 +182,7 @@
   1363      mmxfct(cinfo->image_width, input_buf,
   1364          output_buf, output_row, num_rows);
   1365  }
   1366 +#endif
   1367  
   1368  GLOBAL(void)
   1369  jsimd_rgb_gray_convert (j_compress_ptr cinfo,
   1370 @@ -286,6 +290,7 @@
   1371          input_row, output_buf, num_rows);
   1372  }
   1373  
   1374 +#ifndef JPEG_DECODE_ONLY
   1375  GLOBAL(int)
   1376  jsimd_can_h2v2_downsample (void)
   1377  {
   1378 @@ -351,6 +356,7 @@
   1379          compptr->v_samp_factor, compptr->width_in_blocks,
   1380          input_data, output_data);
   1381  }
   1382 +#endif
   1383  
   1384  GLOBAL(int)
   1385  jsimd_can_h2v2_upsample (void)
   1386 @@ -636,6 +642,7 @@
   1387          in_row_group_ctr, output_buf);
   1388  }
   1389  
   1390 +#ifndef JPEG_DECODE_ONLY
   1391  GLOBAL(int)
   1392  jsimd_can_convsamp (void)
   1393  {
   1394 @@ -855,6 +862,7 @@
   1395    else if (simd_support & JSIMD_3DNOW)
   1396      jsimd_quantize_float_3dnow(coef_block, divisors, workspace);
   1397  }
   1398 +#endif
   1399  
   1400  GLOBAL(int)
   1401  jsimd_can_idct_2x2 (void)
   1402 @@ -1045,4 +1053,3 @@
   1403      jsimd_idct_float_3dnow(compptr->dct_table, coef_block,
   1404          output_buf, output_col);
   1405  }
   1406 -
   1407 Index: simd/jcqnts2f-64.asm
   1408 ===================================================================
   1409 --- simd/jcqnts2f-64.asm	(revision 829)
   1410 +++ simd/jcqnts2f-64.asm	(working copy)
   1411 @@ -36,7 +36,7 @@
   1412  ; r12 = FAST_FLOAT * workspace
   1413  
   1414  	align	16
   1415 -	global	EXTN(jsimd_convsamp_float_sse2)
   1416 +	global	EXTN(jsimd_convsamp_float_sse2) PRIVATE
   1417  
   1418  EXTN(jsimd_convsamp_float_sse2):
   1419  	push	rbp
   1420 @@ -110,7 +110,7 @@
   1421  ; r12 = FAST_FLOAT * workspace
   1422  
   1423  	align	16
   1424 -	global	EXTN(jsimd_quantize_float_sse2)
   1425 +	global	EXTN(jsimd_quantize_float_sse2) PRIVATE
   1426  
   1427  EXTN(jsimd_quantize_float_sse2):
   1428  	push	rbp
   1429 Index: simd/jcqnt3dn.asm
   1430 ===================================================================
   1431 --- simd/jcqnt3dn.asm	(revision 829)
   1432 +++ simd/jcqnt3dn.asm	(working copy)
   1433 @@ -35,7 +35,7 @@
   1434  %define workspace	ebp+16		; FAST_FLOAT * workspace
   1435  
   1436  	align	16
   1437 -	global	EXTN(jsimd_convsamp_float_3dnow)
   1438 +	global	EXTN(jsimd_convsamp_float_3dnow) PRIVATE
   1439  
   1440  EXTN(jsimd_convsamp_float_3dnow):
   1441  	push	ebp
   1442 @@ -138,7 +138,7 @@
   1443  %define workspace	ebp+16		; FAST_FLOAT * workspace
   1444  
   1445  	align	16
   1446 -	global	EXTN(jsimd_quantize_float_3dnow)
   1447 +	global	EXTN(jsimd_quantize_float_3dnow) PRIVATE
   1448  
   1449  EXTN(jsimd_quantize_float_3dnow):
   1450  	push	ebp
   1451 Index: simd/jcsamss2.asm
   1452 ===================================================================
   1453 --- simd/jcsamss2.asm	(revision 829)
   1454 +++ simd/jcsamss2.asm	(working copy)
   1455 @@ -40,7 +40,7 @@
   1456  %define output_data(b)	(b)+28		; JSAMPARRAY output_data
   1457  
   1458  	align	16
   1459 -	global	EXTN(jsimd_h2v1_downsample_sse2)
   1460 +	global	EXTN(jsimd_h2v1_downsample_sse2) PRIVATE
   1461  
   1462  EXTN(jsimd_h2v1_downsample_sse2):
   1463  	push	ebp
   1464 @@ -195,7 +195,7 @@
   1465  %define output_data(b)	(b)+28	; JSAMPARRAY output_data
   1466  
   1467  	align	16
   1468 -	global	EXTN(jsimd_h2v2_downsample_sse2)
   1469 +	global	EXTN(jsimd_h2v2_downsample_sse2) PRIVATE
   1470  
   1471  EXTN(jsimd_h2v2_downsample_sse2):
   1472  	push	ebp
   1473 Index: simd/jsimd_x86_64.c
   1474 ===================================================================
   1475 --- simd/jsimd_x86_64.c	(revision 829)
   1476 +++ simd/jsimd_x86_64.c	(working copy)
   1477 @@ -29,6 +29,7 @@
   1478  
   1479  #define IS_ALIGNED_SSE(ptr) (IS_ALIGNED(ptr, 4)) /* 16 byte alignment */
   1480  
   1481 +#ifndef JPEG_DECODE_ONLY
   1482  GLOBAL(int)
   1483  jsimd_can_rgb_ycc (void)
   1484  {
   1485 @@ -45,6 +46,7 @@
   1486  
   1487    return 1;
   1488  }
   1489 +#endif
   1490  
   1491  GLOBAL(int)
   1492  jsimd_can_rgb_gray (void)
   1493 @@ -80,6 +82,7 @@
   1494    return 1;
   1495  }
   1496  
   1497 +#ifndef JPEG_DECODE_ONLY
   1498  GLOBAL(void)
   1499  jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
   1500                         JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
   1501 @@ -118,6 +121,7 @@
   1502  
   1503    sse2fct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
   1504  }
   1505 +#endif
   1506  
   1507  GLOBAL(void)
   1508  jsimd_rgb_gray_convert (j_compress_ptr cinfo,
   1509 @@ -197,6 +201,7 @@
   1510    sse2fct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
   1511  }
   1512  
   1513 +#ifndef JPEG_DECODE_ONLY
   1514  GLOBAL(int)
   1515  jsimd_can_h2v2_downsample (void)
   1516  {
   1517 @@ -242,6 +247,7 @@
   1518                               compptr->width_in_blocks,
   1519                               input_data, output_data);
   1520  }
   1521 +#endif
   1522  
   1523  GLOBAL(int)
   1524  jsimd_can_h2v2_upsample (void)
   1525 @@ -451,6 +457,7 @@
   1526    sse2fct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
   1527  }
   1528  
   1529 +#ifndef JPEG_DECODE_ONLY
   1530  GLOBAL(int)
   1531  jsimd_can_convsamp (void)
   1532  {
   1533 @@ -601,6 +608,7 @@
   1534  {
   1535    jsimd_quantize_float_sse2(coef_block, divisors, workspace);
   1536  }
   1537 +#endif
   1538  
   1539  GLOBAL(int)
   1540  jsimd_can_idct_2x2 (void)
   1541 @@ -750,4 +758,3 @@
   1542    jsimd_idct_float_sse2(compptr->dct_table, coef_block,
   1543                          output_buf, output_col);
   1544  }
   1545 -
   1546 Index: simd/jimmxint.asm
   1547 ===================================================================
   1548 --- simd/jimmxint.asm	(revision 829)
   1549 +++ simd/jimmxint.asm	(working copy)
   1550 @@ -66,7 +66,7 @@
   1551  	SECTION	SEG_CONST
   1552  
   1553  	alignz	16
   1554 -	global	EXTN(jconst_idct_islow_mmx)
   1555 +	global	EXTN(jconst_idct_islow_mmx) PRIVATE
   1556  
   1557  EXTN(jconst_idct_islow_mmx):
   1558  
   1559 @@ -107,7 +107,7 @@
   1560  					; JCOEF workspace[DCTSIZE2]
   1561  
   1562  	align	16
   1563 -	global	EXTN(jsimd_idct_islow_mmx)
   1564 +	global	EXTN(jsimd_idct_islow_mmx) PRIVATE
   1565  
   1566  EXTN(jsimd_idct_islow_mmx):
   1567  	push	ebp
   1568 Index: simd/jcgrymmx.asm
   1569 ===================================================================
   1570 --- simd/jcgrymmx.asm	(revision 829)
   1571 +++ simd/jcgrymmx.asm	(working copy)
   1572 @@ -41,7 +41,7 @@
   1573  %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
   1574  
   1575  	align	16
   1576 -	global	EXTN(jsimd_rgb_gray_convert_mmx)
   1577 +	global	EXTN(jsimd_rgb_gray_convert_mmx) PRIVATE
   1578  
   1579  EXTN(jsimd_rgb_gray_convert_mmx):
   1580  	push	ebp
   1581 Index: simd/jfss2int.asm
   1582 ===================================================================
   1583 --- simd/jfss2int.asm	(revision 829)
   1584 +++ simd/jfss2int.asm	(working copy)
   1585 @@ -66,7 +66,7 @@
   1586  	SECTION	SEG_CONST
   1587  
   1588  	alignz	16
   1589 -	global	EXTN(jconst_fdct_islow_sse2)
   1590 +	global	EXTN(jconst_fdct_islow_sse2) PRIVATE
   1591  
   1592  EXTN(jconst_fdct_islow_sse2):
   1593  
   1594 @@ -101,7 +101,7 @@
   1595  %define WK_NUM		6
   1596  
   1597  	align	16
   1598 -	global	EXTN(jsimd_fdct_islow_sse2)
   1599 +	global	EXTN(jsimd_fdct_islow_sse2) PRIVATE
   1600  
   1601  EXTN(jsimd_fdct_islow_sse2):
   1602  	push	ebp
   1603 Index: simd/jcgryss2.asm
   1604 ===================================================================
   1605 --- simd/jcgryss2.asm	(revision 829)
   1606 +++ simd/jcgryss2.asm	(working copy)
   1607 @@ -39,7 +39,7 @@
   1608  
   1609  	align	16
   1610  
   1611 -	global	EXTN(jsimd_rgb_gray_convert_sse2)
   1612 +	global	EXTN(jsimd_rgb_gray_convert_sse2) PRIVATE
   1613  
   1614  EXTN(jsimd_rgb_gray_convert_sse2):
   1615  	push	ebp
   1616 Index: simd/jccolmmx.asm
   1617 ===================================================================
   1618 --- simd/jccolmmx.asm	(revision 829)
   1619 +++ simd/jccolmmx.asm	(working copy)
   1620 @@ -37,7 +37,7 @@
   1621  	SECTION	SEG_CONST
   1622  
   1623  	alignz	16
   1624 -	global	EXTN(jconst_rgb_ycc_convert_mmx)
   1625 +	global	EXTN(jconst_rgb_ycc_convert_mmx) PRIVATE
   1626  
   1627  EXTN(jconst_rgb_ycc_convert_mmx):
   1628  
   1629 Index: simd/jimmxred.asm
   1630 ===================================================================
   1631 --- simd/jimmxred.asm	(revision 829)
   1632 +++ simd/jimmxred.asm	(working copy)
   1633 @@ -72,7 +72,7 @@
   1634  	SECTION	SEG_CONST
   1635  
   1636  	alignz	16
   1637 -	global	EXTN(jconst_idct_red_mmx)
   1638 +	global	EXTN(jconst_idct_red_mmx) PRIVATE
   1639  
   1640  EXTN(jconst_idct_red_mmx):
   1641  
   1642 @@ -115,7 +115,7 @@
   1643  					; JCOEF workspace[DCTSIZE2]
   1644  
   1645  	align	16
   1646 -	global	EXTN(jsimd_idct_4x4_mmx)
   1647 +	global	EXTN(jsimd_idct_4x4_mmx) PRIVATE
   1648  
   1649  EXTN(jsimd_idct_4x4_mmx):
   1650  	push	ebp
   1651 @@ -503,7 +503,7 @@
   1652  %define output_col(b)	(b)+20		; JDIMENSION output_col
   1653  
   1654  	align	16
   1655 -	global	EXTN(jsimd_idct_2x2_mmx)
   1656 +	global	EXTN(jsimd_idct_2x2_mmx) PRIVATE
   1657  
   1658  EXTN(jsimd_idct_2x2_mmx):
   1659  	push	ebp
   1660 Index: simd/jsimdext.inc
   1661 ===================================================================
   1662 --- simd/jsimdext.inc	(revision 829)
   1663 +++ simd/jsimdext.inc	(working copy)
   1664 @@ -73,6 +73,9 @@
   1665  ; * *BSD family Unix using elf format
   1666  ; * Unix System V, including Solaris x86, UnixWare and SCO Unix
   1667  
   1668 +; PIC is the default on Linux
   1669 +%define PIC
   1670 +
   1671  ; mark stack as non-executable
   1672  section .note.GNU-stack noalloc noexec nowrite progbits
   1673  
   1674 @@ -375,4 +378,14 @@
   1675  ;
   1676  %include "jsimdcfg.inc"
   1677  
   1678 +; Begin chromium edits
   1679 +%ifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
   1680 +%define PRIVATE :private_extern
   1681 +%elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
   1682 +%define PRIVATE :hidden
   1683 +%else
   1684 +%define PRIVATE
   1685 +%endif
   1686 +; End chromium edits
   1687 +
   1688  ; --------------------------------------------------------------------------
   1689 Index: simd/jdclrmmx.asm
   1690 ===================================================================
   1691 --- simd/jdclrmmx.asm	(revision 829)
   1692 +++ simd/jdclrmmx.asm	(working copy)
   1693 @@ -40,7 +40,7 @@
   1694  %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
   1695  
   1696  	align	16
   1697 -	global	EXTN(jsimd_ycc_rgb_convert_mmx)
   1698 +	global	EXTN(jsimd_ycc_rgb_convert_mmx) PRIVATE
   1699  
   1700  EXTN(jsimd_ycc_rgb_convert_mmx):
   1701  	push	ebp
   1702 Index: simd/jccolss2.asm
   1703 ===================================================================
   1704 --- simd/jccolss2.asm	(revision 829)
   1705 +++ simd/jccolss2.asm	(working copy)
   1706 @@ -34,7 +34,7 @@
   1707  	SECTION	SEG_CONST
   1708  
   1709  	alignz	16
   1710 -	global	EXTN(jconst_rgb_ycc_convert_sse2)
   1711 +	global	EXTN(jconst_rgb_ycc_convert_sse2) PRIVATE
   1712  
   1713  EXTN(jconst_rgb_ycc_convert_sse2):
   1714  
   1715 Index: simd/jisseflt.asm
   1716 ===================================================================
   1717 --- simd/jisseflt.asm	(revision 829)
   1718 +++ simd/jisseflt.asm	(working copy)
   1719 @@ -37,7 +37,7 @@
   1720  	SECTION	SEG_CONST
   1721  
   1722  	alignz	16
   1723 -	global	EXTN(jconst_idct_float_sse)
   1724 +	global	EXTN(jconst_idct_float_sse) PRIVATE
   1725  
   1726  EXTN(jconst_idct_float_sse):
   1727  
   1728 @@ -73,7 +73,7 @@
   1729  					; FAST_FLOAT workspace[DCTSIZE2]
   1730  
   1731  	align	16
   1732 -	global	EXTN(jsimd_idct_float_sse)
   1733 +	global	EXTN(jsimd_idct_float_sse) PRIVATE
   1734  
   1735  EXTN(jsimd_idct_float_sse):
   1736  	push	ebp
   1737 Index: simd/jcqnts2i-64.asm
   1738 ===================================================================
   1739 --- simd/jcqnts2i-64.asm	(revision 829)
   1740 +++ simd/jcqnts2i-64.asm	(working copy)
   1741 @@ -36,7 +36,7 @@
   1742  ; r12 = DCTELEM * workspace
   1743  
   1744  	align	16
   1745 -	global	EXTN(jsimd_convsamp_sse2)
   1746 +	global	EXTN(jsimd_convsamp_sse2) PRIVATE
   1747  
   1748  EXTN(jsimd_convsamp_sse2):
   1749  	push	rbp
   1750 @@ -112,7 +112,7 @@
   1751  ; r12 = DCTELEM * workspace
   1752  
   1753  	align	16
   1754 -	global	EXTN(jsimd_quantize_sse2)
   1755 +	global	EXTN(jsimd_quantize_sse2) PRIVATE
   1756  
   1757  EXTN(jsimd_quantize_sse2):
   1758  	push	rbp
   1759 Index: simd/jdclrss2.asm
   1760 ===================================================================
   1761 --- simd/jdclrss2.asm	(revision 829)
   1762 +++ simd/jdclrss2.asm	(working copy)
   1763 @@ -40,7 +40,7 @@
   1764  %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
   1765  
   1766  	align	16
   1767 -	global	EXTN(jsimd_ycc_rgb_convert_sse2)
   1768 +	global	EXTN(jsimd_ycc_rgb_convert_sse2) PRIVATE
   1769  
   1770  EXTN(jsimd_ycc_rgb_convert_sse2):
   1771  	push	ebp
   1772 Index: simd/jcqntsse.asm
   1773 ===================================================================
   1774 --- simd/jcqntsse.asm	(revision 829)
   1775 +++ simd/jcqntsse.asm	(working copy)
   1776 @@ -35,7 +35,7 @@
   1777  %define workspace	ebp+16		; FAST_FLOAT * workspace
   1778  
   1779  	align	16
   1780 -	global	EXTN(jsimd_convsamp_float_sse)
   1781 +	global	EXTN(jsimd_convsamp_float_sse) PRIVATE
   1782  
   1783  EXTN(jsimd_convsamp_float_sse):
   1784  	push	ebp
   1785 @@ -138,7 +138,7 @@
   1786  %define workspace	ebp+16		; FAST_FLOAT * workspace
   1787  
   1788  	align	16
   1789 -	global	EXTN(jsimd_quantize_float_sse)
   1790 +	global	EXTN(jsimd_quantize_float_sse) PRIVATE
   1791  
   1792  EXTN(jsimd_quantize_float_sse):
   1793  	push	ebp
   1794 Index: simd/jiss2int-64.asm
   1795 ===================================================================
   1796 --- simd/jiss2int-64.asm	(revision 829)
   1797 +++ simd/jiss2int-64.asm	(working copy)
   1798 @@ -67,7 +67,7 @@
   1799  	SECTION	SEG_CONST
   1800  
   1801  	alignz	16
   1802 -	global	EXTN(jconst_idct_islow_sse2)
   1803 +	global	EXTN(jconst_idct_islow_sse2) PRIVATE
   1804  
   1805  EXTN(jconst_idct_islow_sse2):
   1806  
   1807 @@ -106,7 +106,7 @@
   1808  %define WK_NUM		12
   1809  
   1810  	align	16
   1811 -	global	EXTN(jsimd_idct_islow_sse2)
   1812 +	global	EXTN(jsimd_idct_islow_sse2) PRIVATE
   1813  
   1814  EXTN(jsimd_idct_islow_sse2):
   1815  	push	rbp
   1816 Index: simd/jfmmxfst.asm
   1817 ===================================================================
   1818 --- simd/jfmmxfst.asm	(revision 829)
   1819 +++ simd/jfmmxfst.asm	(working copy)
   1820 @@ -52,7 +52,7 @@
   1821  %define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
   1822  
   1823  	alignz	16
   1824 -	global	EXTN(jconst_fdct_ifast_mmx)
   1825 +	global	EXTN(jconst_fdct_ifast_mmx) PRIVATE
   1826  
   1827  EXTN(jconst_fdct_ifast_mmx):
   1828  
   1829 @@ -80,7 +80,7 @@
   1830  %define WK_NUM		2
   1831  
   1832  	align	16
   1833 -	global	EXTN(jsimd_fdct_ifast_mmx)
   1834 +	global	EXTN(jsimd_fdct_ifast_mmx) PRIVATE
   1835  
   1836  EXTN(jsimd_fdct_ifast_mmx):
   1837  	push	ebp
   1838 Index: jdarith.c
   1839 ===================================================================
   1840 --- jdarith.c	(revision 829)
   1841 +++ jdarith.c	(working copy)
   1842 @@ -150,8 +150,8 @@
   1843     */
   1844    sv = *st;
   1845    qe = jpeg_aritab[sv & 0x7F];	/* => Qe_Value */
   1846 -  nl = qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
   1847 -  nm = qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
   1848 +  nl = (unsigned char) qe & 0xFF; qe >>= 8;	/* Next_Index_LPS + Switch_MPS */
   1849 +  nm = (unsigned char) qe & 0xFF; qe >>= 8;	/* Next_Index_MPS */
   1850  
   1851    /* Decode & estimation procedures per sections D.2.4 & D.2.5 */
   1852    temp = e->a - qe;
   1853 Index: jdhuff.c
   1854 ===================================================================
   1855 --- jdhuff.c	(revision 829)
   1856 +++ jdhuff.c	(working copy)
   1857 @@ -742,7 +742,7 @@
   1858   * this module, since we'll just re-assign them on the next call.)
   1859   */
   1860  
   1861 -#define BUFSIZE (DCTSIZE2 * 2)
   1862 +#define BUFSIZE (DCTSIZE2 * 2u)
   1863  
   1864  METHODDEF(boolean)
   1865  decode_mcu (j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   1866 Index: jchuff.c
   1867 ===================================================================
   1868 --- jchuff.c	(revision 1219)
   1869 +++ jchuff.c	(revision 1220)
   1870 @@ -22,8 +22,36 @@
   1871  #include "jchuff.h"		/* Declarations shared with jcphuff.c */
   1872  #include <limits.h>
   1873  
   1874 +/*
   1875 + * NOTE: If USE_CLZ_INTRINSIC is defined, then clz/bsr instructions will be
   1876 + * used for bit counting rather than the lookup table.  This will reduce the
   1877 + * memory footprint by 64k, which is important for some mobile applications
   1878 + * that create many isolated instances of libjpeg-turbo (web browsers, for
   1879 + * instance.)  This may improve performance on some mobile platforms as well.
   1880 + * This feature is enabled by default only on ARM processors, because some x86
   1881 + * chips have a slow implementation of bsr, and the use of clz/bsr cannot be
   1882 + * shown to have a significant performance impact even on the x86 chips that
   1883 + * have a fast implementation of it.  When building for ARMv6, you can
   1884 + * explicitly disable the use of clz/bsr by adding -mthumb to the compiler
   1885 + * flags (this defines __thumb__).
   1886 + */
   1887 +
   1888 +/* NOTE: Both GCC and Clang define __GNUC__ */
   1889 +#if defined __GNUC__ && defined __arm__
   1890 +#if !defined __thumb__ || defined __thumb2__
   1891 +#define USE_CLZ_INTRINSIC
   1892 +#endif
   1893 +#endif
   1894 +
   1895 +#ifdef USE_CLZ_INTRINSIC
   1896 +#define JPEG_NBITS_NONZERO(x) (32 - __builtin_clz(x))
   1897 +#define JPEG_NBITS(x) (x ? JPEG_NBITS_NONZERO(x) : 0)
   1898 +#else
   1899  static unsigned char jpeg_nbits_table[65536];
   1900  static int jpeg_nbits_table_init = 0;
   1901 +#define JPEG_NBITS(x) (jpeg_nbits_table[x])
   1902 +#define JPEG_NBITS_NONZERO(x) JPEG_NBITS(x)
   1903 +#endif
   1904  
   1905  #ifndef min
   1906   #define min(a,b) ((a)<(b)?(a):(b))
   1907 @@ -272,6 +300,7 @@
   1908      dtbl->ehufsi[i] = huffsize[p];
   1909    }
   1910  
   1911 +#ifndef USE_CLZ_INTRINSIC
   1912    if(!jpeg_nbits_table_init) {
   1913      for(i = 0; i < 65536; i++) {
   1914        int nbits = 0, temp = i;
   1915 @@ -280,6 +309,7 @@
   1916      }
   1917      jpeg_nbits_table_init = 1;
   1918    }
   1919 +#endif
   1920  }
   1921  
   1922  
   1923 @@ -482,7 +512,7 @@
   1924    temp2 += temp3;
   1925  
   1926    /* Find the number of bits needed for the magnitude of the coefficient */
   1927 -  nbits = jpeg_nbits_table[temp];
   1928 +  nbits = JPEG_NBITS(temp);
   1929  
   1930    /* Emit the Huffman-coded symbol for the number of bits */
   1931    code = dctbl->ehufco[nbits];
   1932 @@ -516,7 +546,7 @@
   1933      temp ^= temp3; \
   1934      temp -= temp3; \
   1935      temp2 += temp3; \
   1936 -    nbits = jpeg_nbits_table[temp]; \
   1937 +    nbits = JPEG_NBITS_NONZERO(temp); \
   1938      /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
   1939      while (r > 15) { \
   1940        EMIT_BITS(code_0xf0, size_0xf0) \
   1941 Index: simd/jsimd_arm64.c
   1942 ===================================================================
   1943 --- /dev/null
   1944 +++ simd/jsimd_arm64.c
   1945 @@ -0,0 +1,544 @@
   1946 +/*
   1947 + * jsimd_arm64.c
   1948 + *
   1949 + * Copyright 2009 Pierre Ossman <ossman (a] cendio.se> for Cendio AB
   1950 + * Copyright 2009-2011, 2013-2014 D. R. Commander
   1951 + *
   1952 + * Based on the x86 SIMD extension for IJG JPEG library,
   1953 + * Copyright (C) 1999-2006, MIYASAKA Masaru.
   1954 + * For conditions of distribution and use, see copyright notice in jsimdext.inc
   1955 + *
   1956 + * This file contains the interface between the "normal" portions
   1957 + * of the library and the SIMD implementations when running on a
   1958 + * 64-bit ARM architecture.
   1959 + */
   1960 +
   1961 +#define JPEG_INTERNALS
   1962 +#include "../jinclude.h"
   1963 +#include "../jpeglib.h"
   1964 +#include "../jsimd.h"
   1965 +#include "../jdct.h"
   1966 +#include "../jsimddct.h"
   1967 +#include "jsimd.h"
   1968 +
   1969 +#include <stdio.h>
   1970 +#include <string.h>
   1971 +#include <ctype.h>
   1972 +
   1973 +static unsigned int simd_support = ~0;
   1974 +
   1975 +/*
   1976 + * Check what SIMD accelerations are supported.
   1977 + *
   1978 + * FIXME: This code is racy under a multi-threaded environment.
   1979 + */
   1980 +
   1981 +/* 
   1982 + * ARMv8 architectures support NEON extensions by default.
   1983 + * It is no longer optional as it was with ARMv7.
   1984 + */ 
   1985 +
   1986 +
   1987 +LOCAL(void)
   1988 +init_simd (void)
   1989 +{
   1990 +  char *env = NULL;
   1991 +
   1992 +  if (simd_support != ~0U)
   1993 +    return;
   1994 +
   1995 +  simd_support = 0;
   1996 +
   1997 +  simd_support |= JSIMD_ARM_NEON;
   1998 +
   1999 +  /* Force different settings through environment variables */
   2000 +  env = getenv("JSIMD_FORCENEON");
   2001 +  if ((env != NULL) && (strcmp(env, "1") == 0))
   2002 +    simd_support &= JSIMD_ARM_NEON;
   2003 +  env = getenv("JSIMD_FORCENONE");
   2004 +  if ((env != NULL) && (strcmp(env, "1") == 0))
   2005 +    simd_support = 0;
   2006 +}
   2007 +
   2008 +GLOBAL(int)
   2009 +jsimd_can_rgb_ycc (void)
   2010 +{
   2011 +  init_simd();
   2012 +
   2013 +  return 0;
   2014 +}
   2015 +
   2016 +GLOBAL(int)
   2017 +jsimd_can_rgb_gray (void)
   2018 +{
   2019 +  init_simd();
   2020 +
   2021 +  return 0;
   2022 +}
   2023 +
   2024 +GLOBAL(int)
   2025 +jsimd_can_ycc_rgb (void)
   2026 +{
   2027 +  init_simd();
   2028 +
   2029 +  /* The code is optimised for these values only */
   2030 +  if (BITS_IN_JSAMPLE != 8)
   2031 +    return 0;
   2032 +  if (sizeof(JDIMENSION) != 4)
   2033 +    return 0;
   2034 +  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
   2035 +    return 0;
   2036 +
   2037 +  if (simd_support & JSIMD_ARM_NEON)
   2038 +    return 1;
   2039 +
   2040 +  return 0;
   2041 +}
   2042 +
   2043 +GLOBAL(int)
   2044 +jsimd_can_ycc_rgb565 (void)
   2045 +{
   2046 +  init_simd();
   2047 +
   2048 +  /* The code is optimised for these values only */
   2049 +  if (BITS_IN_JSAMPLE != 8)
   2050 +    return 0;
   2051 +  if (sizeof(JDIMENSION) != 4)
   2052 +    return 0;
   2053 +
   2054 +  if (simd_support & JSIMD_ARM_NEON)
   2055 +    return 1;
   2056 +
   2057 +  return 0;
   2058 +}
   2059 +
   2060 +GLOBAL(void)
   2061 +jsimd_rgb_ycc_convert (j_compress_ptr cinfo,
   2062 +                       JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
   2063 +                       JDIMENSION output_row, int num_rows)
   2064 +{
   2065 +}
   2066 +
   2067 +GLOBAL(void)
   2068 +jsimd_rgb_gray_convert (j_compress_ptr cinfo,
   2069 +                        JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
   2070 +                        JDIMENSION output_row, int num_rows)
   2071 +{
   2072 +}
   2073 +
   2074 +GLOBAL(void)
   2075 +jsimd_ycc_rgb_convert (j_decompress_ptr cinfo,
   2076 +                       JSAMPIMAGE input_buf, JDIMENSION input_row,
   2077 +                       JSAMPARRAY output_buf, int num_rows)
   2078 +{
   2079 +  void (*neonfct)(JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY, int);
   2080 +
   2081 +  switch(cinfo->out_color_space) {
   2082 +    case JCS_EXT_RGB:
   2083 +      neonfct=jsimd_ycc_extrgb_convert_neon;
   2084 +      break;
   2085 +    case JCS_EXT_RGBX:
   2086 +    case JCS_EXT_RGBA:
   2087 +      neonfct=jsimd_ycc_extrgbx_convert_neon;
   2088 +      break;
   2089 +    case JCS_EXT_BGR:
   2090 +      neonfct=jsimd_ycc_extbgr_convert_neon;
   2091 +      break;
   2092 +    case JCS_EXT_BGRX:
   2093 +    case JCS_EXT_BGRA:
   2094 +      neonfct=jsimd_ycc_extbgrx_convert_neon;
   2095 +      break;
   2096 +    case JCS_EXT_XBGR:
   2097 +    case JCS_EXT_ABGR:
   2098 +      neonfct=jsimd_ycc_extxbgr_convert_neon;
   2099 +      break;
   2100 +    case JCS_EXT_XRGB:
   2101 +    case JCS_EXT_ARGB:
   2102 +      neonfct=jsimd_ycc_extxrgb_convert_neon;
   2103 +      break;
   2104 +    default:
   2105 +      neonfct=jsimd_ycc_extrgb_convert_neon;
   2106 +      break;
   2107 +  }
   2108 +
   2109 +  if (simd_support & JSIMD_ARM_NEON)
   2110 +    neonfct(cinfo->output_width, input_buf, input_row, output_buf, num_rows);
   2111 +}
   2112 +
   2113 +GLOBAL(void)
   2114 +jsimd_ycc_rgb565_convert (j_decompress_ptr cinfo,
   2115 +                          JSAMPIMAGE input_buf, JDIMENSION input_row,
   2116 +                          JSAMPARRAY output_buf, int num_rows)
   2117 +{
   2118 +  if (simd_support & JSIMD_ARM_NEON)
   2119 +    jsimd_ycc_rgb565_convert_neon(cinfo->output_width, input_buf, input_row,
   2120 +                                  output_buf, num_rows);
   2121 +}
   2122 +
   2123 +GLOBAL(int)
   2124 +jsimd_can_h2v2_downsample (void)
   2125 +{
   2126 +  init_simd();
   2127 +
   2128 +  return 0;
   2129 +}
   2130 +
   2131 +GLOBAL(int)
   2132 +jsimd_can_h2v1_downsample (void)
   2133 +{
   2134 +  init_simd();
   2135 +
   2136 +  return 0;
   2137 +}
   2138 +
   2139 +GLOBAL(void)
   2140 +jsimd_h2v2_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   2141 +                       JSAMPARRAY input_data, JSAMPARRAY output_data)
   2142 +{
   2143 +}
   2144 +
   2145 +GLOBAL(void)
   2146 +jsimd_h2v1_downsample (j_compress_ptr cinfo, jpeg_component_info * compptr,
   2147 +                       JSAMPARRAY input_data, JSAMPARRAY output_data)
   2148 +{
   2149 +}
   2150 +
   2151 +GLOBAL(int)
   2152 +jsimd_can_h2v2_upsample (void)
   2153 +{
   2154 +  init_simd();
   2155 +
   2156 +  return 0;
   2157 +}
   2158 +
   2159 +GLOBAL(int)
   2160 +jsimd_can_h2v1_upsample (void)
   2161 +{
   2162 +  init_simd();
   2163 +
   2164 +  return 0;
   2165 +}
   2166 +
   2167 +GLOBAL(void)
   2168 +jsimd_h2v2_upsample (j_decompress_ptr cinfo,
   2169 +                     jpeg_component_info * compptr,
   2170 +                     JSAMPARRAY input_data,
   2171 +                     JSAMPARRAY * output_data_ptr)
   2172 +{
   2173 +}
   2174 +
   2175 +GLOBAL(void)
   2176 +jsimd_h2v1_upsample (j_decompress_ptr cinfo,
   2177 +                     jpeg_component_info * compptr,
   2178 +                     JSAMPARRAY input_data,
   2179 +                     JSAMPARRAY * output_data_ptr)
   2180 +{
   2181 +}
   2182 +
   2183 +GLOBAL(int)
   2184 +jsimd_can_h2v2_fancy_upsample (void)
   2185 +{
   2186 +  init_simd();
   2187 +
   2188 +  return 0;
   2189 +}
   2190 +
   2191 +GLOBAL(int)
   2192 +jsimd_can_h2v1_fancy_upsample (void)
   2193 +{
   2194 +  init_simd();
   2195 +
   2196 +  return 0;
   2197 +}
   2198 +
   2199 +GLOBAL(void)
   2200 +jsimd_h2v2_fancy_upsample (j_decompress_ptr cinfo,
   2201 +                           jpeg_component_info * compptr,
   2202 +                           JSAMPARRAY input_data,
   2203 +                           JSAMPARRAY * output_data_ptr)
   2204 +{
   2205 +}
   2206 +
   2207 +GLOBAL(void)
   2208 +jsimd_h2v1_fancy_upsample (j_decompress_ptr cinfo,
   2209 +                           jpeg_component_info * compptr,
   2210 +                           JSAMPARRAY input_data,
   2211 +                           JSAMPARRAY * output_data_ptr)
   2212 +{
   2213 +}
   2214 +
   2215 +GLOBAL(int)
   2216 +jsimd_can_h2v2_merged_upsample (void)
   2217 +{
   2218 +  init_simd();
   2219 +
   2220 +  return 0;
   2221 +}
   2222 +
   2223 +GLOBAL(int)
   2224 +jsimd_can_h2v1_merged_upsample (void)
   2225 +{
   2226 +  init_simd();
   2227 +
   2228 +  return 0;
   2229 +}
   2230 +
   2231 +GLOBAL(void)
   2232 +jsimd_h2v2_merged_upsample (j_decompress_ptr cinfo,
   2233 +                            JSAMPIMAGE input_buf,
   2234 +                            JDIMENSION in_row_group_ctr,
   2235 +                            JSAMPARRAY output_buf)
   2236 +{
   2237 +}
   2238 +
   2239 +GLOBAL(void)
   2240 +jsimd_h2v1_merged_upsample (j_decompress_ptr cinfo,
   2241 +                            JSAMPIMAGE input_buf,
   2242 +                            JDIMENSION in_row_group_ctr,
   2243 +                            JSAMPARRAY output_buf)
   2244 +{
   2245 +}
   2246 +
   2247 +GLOBAL(int)
   2248 +jsimd_can_convsamp (void)
   2249 +{
   2250 +  init_simd();
   2251 +
   2252 +  return 0;
   2253 +}
   2254 +
   2255 +GLOBAL(int)
   2256 +jsimd_can_convsamp_float (void)
   2257 +{
   2258 +  init_simd();
   2259 +
   2260 +  return 0;
   2261 +}
   2262 +
   2263 +GLOBAL(void)
   2264 +jsimd_convsamp (JSAMPARRAY sample_data, JDIMENSION start_col,
   2265 +                DCTELEM * workspace)
   2266 +{
   2267 +}
   2268 +
   2269 +GLOBAL(void)
   2270 +jsimd_convsamp_float (JSAMPARRAY sample_data, JDIMENSION start_col,
   2271 +                      FAST_FLOAT * workspace)
   2272 +{
   2273 +}
   2274 +
   2275 +GLOBAL(int)
   2276 +jsimd_can_fdct_islow (void)
   2277 +{
   2278 +  init_simd();
   2279 +
   2280 +  return 0;
   2281 +}
   2282 +
   2283 +GLOBAL(int)
   2284 +jsimd_can_fdct_ifast (void)
   2285 +{
   2286 +  init_simd();
   2287 +
   2288 +  return 0;
   2289 +}
   2290 +
   2291 +GLOBAL(int)
   2292 +jsimd_can_fdct_float (void)
   2293 +{
   2294 +  init_simd();
   2295 +
   2296 +  return 0;
   2297 +}
   2298 +
   2299 +GLOBAL(void)
   2300 +jsimd_fdct_islow (DCTELEM * data)
   2301 +{
   2302 +}
   2303 +
   2304 +GLOBAL(void)
   2305 +jsimd_fdct_ifast (DCTELEM * data)
   2306 +{
   2307 +}
   2308 +
   2309 +GLOBAL(void)
   2310 +jsimd_fdct_float (FAST_FLOAT * data)
   2311 +{
   2312 +}
   2313 +
   2314 +GLOBAL(int)
   2315 +jsimd_can_quantize (void)
   2316 +{
   2317 +  init_simd();
   2318 +
   2319 +  return 0;
   2320 +}
   2321 +
   2322 +GLOBAL(int)
   2323 +jsimd_can_quantize_float (void)
   2324 +{
   2325 +  init_simd();
   2326 +
   2327 +  return 0;
   2328 +}
   2329 +
   2330 +GLOBAL(void)
   2331 +jsimd_quantize (JCOEFPTR coef_block, DCTELEM * divisors,
   2332 +                DCTELEM * workspace)
   2333 +{
   2334 +}
   2335 +
   2336 +GLOBAL(void)
   2337 +jsimd_quantize_float (JCOEFPTR coef_block, FAST_FLOAT * divisors,
   2338 +                      FAST_FLOAT * workspace)
   2339 +{
   2340 +}
   2341 +
   2342 +GLOBAL(int)
   2343 +jsimd_can_idct_2x2 (void)
   2344 +{
   2345 +  init_simd();
   2346 +
   2347 +  /* The code is optimised for these values only */
   2348 +  if (DCTSIZE != 8)
   2349 +    return 0;
   2350 +  if (sizeof(JCOEF) != 2)
   2351 +    return 0;
   2352 +  if (BITS_IN_JSAMPLE != 8)
   2353 +    return 0;
   2354 +  if (sizeof(JDIMENSION) != 4)
   2355 +    return 0;
   2356 +  if (sizeof(ISLOW_MULT_TYPE) != 2)
   2357 +    return 0;
   2358 +
   2359 +  if (simd_support & JSIMD_ARM_NEON)
   2360 +    return 1;
   2361 +
   2362 +  return 0;
   2363 +}
   2364 +
   2365 +GLOBAL(int)
   2366 +jsimd_can_idct_4x4 (void)
   2367 +{
   2368 +  init_simd();
   2369 +
   2370 +  /* The code is optimised for these values only */
   2371 +  if (DCTSIZE != 8)
   2372 +    return 0;
   2373 +  if (sizeof(JCOEF) != 2)
   2374 +    return 0;
   2375 +  if (BITS_IN_JSAMPLE != 8)
   2376 +    return 0;
   2377 +  if (sizeof(JDIMENSION) != 4)
   2378 +    return 0;
   2379 +  if (sizeof(ISLOW_MULT_TYPE) != 2)
   2380 +    return 0;
   2381 +
   2382 +  if (simd_support & JSIMD_ARM_NEON)
   2383 +    return 1;
   2384 +
   2385 +  return 0;
   2386 +}
   2387 +
   2388 +GLOBAL(void)
   2389 +jsimd_idct_2x2 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2390 +                JCOEFPTR coef_block, JSAMPARRAY output_buf,
   2391 +                JDIMENSION output_col)
   2392 +{
   2393 +  if (simd_support & JSIMD_ARM_NEON)
   2394 +    jsimd_idct_2x2_neon(compptr->dct_table, coef_block, output_buf,
   2395 +                        output_col);
   2396 +}
   2397 +
   2398 +GLOBAL(void)
   2399 +jsimd_idct_4x4 (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2400 +                JCOEFPTR coef_block, JSAMPARRAY output_buf,
   2401 +                JDIMENSION output_col)
   2402 +{
   2403 +  if (simd_support & JSIMD_ARM_NEON)
   2404 +    jsimd_idct_4x4_neon(compptr->dct_table, coef_block, output_buf,
   2405 +                        output_col);
   2406 +}
   2407 +
   2408 +GLOBAL(int)
   2409 +jsimd_can_idct_islow (void)
   2410 +{
   2411 +  init_simd();
   2412 +
   2413 +  /* The code is optimised for these values only */
   2414 +  if (DCTSIZE != 8)
   2415 +    return 0;
   2416 +  if (sizeof(JCOEF) != 2)
   2417 +    return 0;
   2418 +  if (BITS_IN_JSAMPLE != 8)
   2419 +    return 0;
   2420 +  if (sizeof(JDIMENSION) != 4)
   2421 +    return 0;
   2422 +  if (sizeof(ISLOW_MULT_TYPE) != 2)
   2423 +    return 0;
   2424 +
   2425 +  if (simd_support & JSIMD_ARM_NEON)
   2426 +    return 1;
   2427 +
   2428 +  return 0;
   2429 +}
   2430 +
   2431 +GLOBAL(int)
   2432 +jsimd_can_idct_ifast (void)
   2433 +{
   2434 +  init_simd();
   2435 +
   2436 +  /* The code is optimised for these values only */
   2437 +  if (DCTSIZE != 8)
   2438 +    return 0;
   2439 +  if (sizeof(JCOEF) != 2)
   2440 +    return 0;
   2441 +  if (BITS_IN_JSAMPLE != 8)
   2442 +    return 0;
   2443 +  if (sizeof(JDIMENSION) != 4)
   2444 +    return 0;
   2445 +  if (sizeof(IFAST_MULT_TYPE) != 2)
   2446 +    return 0;
   2447 +  if (IFAST_SCALE_BITS != 2)
   2448 +    return 0;
   2449 +
   2450 +  if (simd_support & JSIMD_ARM_NEON)
   2451 +    return 1;
   2452 +
   2453 +  return 0;
   2454 +}
   2455 +
   2456 +GLOBAL(int)
   2457 +jsimd_can_idct_float (void)
   2458 +{
   2459 +  init_simd();
   2460 +
   2461 +  return 0;
   2462 +}
   2463 +
   2464 +GLOBAL(void)
   2465 +jsimd_idct_islow (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2466 +                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
   2467 +                  JDIMENSION output_col)
   2468 +{
   2469 +  if (simd_support & JSIMD_ARM_NEON)
   2470 +    jsimd_idct_islow_neon(compptr->dct_table, coef_block, output_buf,
   2471 +                          output_col);
   2472 +}
   2473 +
   2474 +GLOBAL(void)
   2475 +jsimd_idct_ifast (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2476 +                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
   2477 +                  JDIMENSION output_col)
   2478 +{
   2479 +  if (simd_support & JSIMD_ARM_NEON)
   2480 +    jsimd_idct_ifast_neon(compptr->dct_table, coef_block, output_buf,
   2481 +                          output_col);
   2482 +}
   2483 +
   2484 +GLOBAL(void)
   2485 +jsimd_idct_float (j_decompress_ptr cinfo, jpeg_component_info * compptr,
   2486 +                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
   2487 +                  JDIMENSION output_col)
   2488 +{
   2489 +}
   2490 Index: simd/jsimd_arm64_neon.S
   2491 new file mode 100644
   2492 ===================================================================
   2493 --- /dev/null
   2494 +++ simd/jsimd_arm64_neon.S
   2495 @@ -0,0 +1,1861 @@
   2496 +/*
   2497 + * ARMv8 NEON optimizations for libjpeg-turbo
   2498 + *
   2499 + * Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies).
   2500 + * All rights reserved.
   2501 + * Author: Siarhei Siamashka <siarhei.siamashka (a] nokia.com>
   2502 + * Copyright (C) 2013-2014, Linaro Limited
   2503 + * Author: Ragesh Radhakrishnan <ragesh.r (a] linaro.org>
   2504 + *
   2505 + * This software is provided 'as-is', without any express or implied
   2506 + * warranty.  In no event will the authors be held liable for any damages
   2507 + * arising from the use of this software.
   2508 + *
   2509 + * Permission is granted to anyone to use this software for any purpose,
   2510 + * including commercial applications, and to alter it and redistribute it
   2511 + * freely, subject to the following restrictions:
   2512 + *
   2513 + * 1. The origin of this software must not be misrepresented; you must not
   2514 + *    claim that you wrote the original software. If you use this software
   2515 + *    in a product, an acknowledgment in the product documentation would be
   2516 + *    appreciated but is not required.
   2517 + * 2. Altered source versions must be plainly marked as such, and must not be
   2518 + *    misrepresented as being the original software.
   2519 + * 3. This notice may not be removed or altered from any source distribution.
   2520 + */
   2521 +
   2522 +#if defined(__linux__) && defined(__ELF__)
   2523 +.section .note.GNU-stack,"",%progbits /* mark stack as non-executable */
   2524 +#endif
   2525 +
   2526 +.text
   2527 +.arch armv8-a+fp+simd
   2528 +
   2529 +
   2530 +#define RESPECT_STRICT_ALIGNMENT 1
   2531 +
   2532 +
   2533 +/*****************************************************************************/
   2534 +
   2535 +/* Supplementary macro for setting function attributes */
   2536 +.macro asm_function fname
   2537 +#ifdef __APPLE__
   2538 +    .globl _\fname
   2539 +_\fname:
   2540 +#else
   2541 +    .global \fname
   2542 +#ifdef __ELF__
   2543 +    .hidden \fname
   2544 +    .type \fname, %function
   2545 +#endif
   2546 +\fname:
   2547 +#endif
   2548 +.endm
   2549 +
   2550 +/* Transpose elements of single 128 bit registers */
   2551 +.macro transpose_single x0,x1,xi,xilen,literal
   2552 +    ins  \xi\xilen[0],  \x0\xilen[0]
   2553 +    ins  \x1\xilen[0],  \x0\xilen[1]
   2554 +    trn1 \x0\literal,   \x0\literal, \x1\literal
   2555 +    trn2 \x1\literal,   \xi\literal, \x1\literal
   2556 +.endm
   2557 +
   2558 +/* Transpose elements of 2 different registers */
   2559 +.macro transpose x0,x1,xi,xilen,literal
   2560 +    mov  \xi\xilen,     \x0\xilen
   2561 +    trn1 \x0\literal,   \x0\literal, \x1\literal
   2562 +    trn2 \x1\literal,   \xi\literal, \x1\literal
   2563 +.endm
   2564 +
   2565 +/* Transpose a block of 4x4 coefficients in four 64-bit registers */
   2566 +.macro transpose_4x4_32 x0,x0len x1,x1len x2,x2len x3,x3len,xi,xilen
   2567 +    mov  \xi\xilen, \x0\xilen
   2568 +    trn1 \x0\x0len, \x0\x0len, \x2\x2len
   2569 +    trn2 \x2\x2len, \xi\x0len, \x2\x2len
   2570 +    mov  \xi\xilen, \x1\xilen
   2571 +    trn1 \x1\x1len, \x1\x1len, \x3\x3len
   2572 +    trn2 \x3\x3len, \xi\x1len, \x3\x3len
   2573 +.endm
   2574 +
   2575 +.macro transpose_4x4_16 x0,x0len x1,x1len, x2,x2len, x3,x3len,xi,xilen
   2576 +    mov  \xi\xilen, \x0\xilen
   2577 +    trn1 \x0\x0len, \x0\x0len, \x1\x1len
   2578 +    trn2 \x1\x2len, \xi\x0len, \x1\x2len
   2579 +    mov  \xi\xilen, \x2\xilen
   2580 +    trn1 \x2\x2len, \x2\x2len, \x3\x3len
   2581 +    trn2 \x3\x2len, \xi\x1len, \x3\x3len
   2582 +.endm
   2583 +
   2584 +.macro transpose_4x4 x0, x1, x2, x3,x5
   2585 +    transpose_4x4_16 \x0,.4h, \x1,.4h, \x2,.4h,\x3,.4h,\x5,.16b
   2586 +    transpose_4x4_32 \x0,.2s, \x1,.2s, \x2,.2s,\x3,.2s,\x5,.16b
   2587 +.endm
   2588 +
   2589 +
   2590 +#define CENTERJSAMPLE 128
   2591 +
   2592 +/*****************************************************************************/
   2593 +
   2594 +/*
   2595 + * Perform dequantization and inverse DCT on one block of coefficients.
   2596 + *
   2597 + * GLOBAL(void)
   2598 + * jsimd_idct_islow_neon (void * dct_table, JCOEFPTR coef_block,
   2599 + *                        JSAMPARRAY output_buf, JDIMENSION output_col)
   2600 + */
   2601 +
   2602 +#define FIX_0_298631336  (2446)
   2603 +#define FIX_0_390180644  (3196)
   2604 +#define FIX_0_541196100  (4433)
   2605 +#define FIX_0_765366865  (6270)
   2606 +#define FIX_0_899976223  (7373)
   2607 +#define FIX_1_175875602  (9633)
   2608 +#define FIX_1_501321110  (12299)
   2609 +#define FIX_1_847759065  (15137)
   2610 +#define FIX_1_961570560  (16069)
   2611 +#define FIX_2_053119869  (16819)
   2612 +#define FIX_2_562915447  (20995)
   2613 +#define FIX_3_072711026  (25172)
   2614 +
   2615 +#define FIX_1_175875602_MINUS_1_961570560 (FIX_1_175875602 - FIX_1_961570560)
   2616 +#define FIX_1_175875602_MINUS_0_390180644 (FIX_1_175875602 - FIX_0_390180644)
   2617 +#define FIX_0_541196100_MINUS_1_847759065 (FIX_0_541196100 - FIX_1_847759065)
   2618 +#define FIX_3_072711026_MINUS_2_562915447 (FIX_3_072711026 - FIX_2_562915447)
   2619 +#define FIX_0_298631336_MINUS_0_899976223 (FIX_0_298631336 - FIX_0_899976223)
   2620 +#define FIX_1_501321110_MINUS_0_899976223 (FIX_1_501321110 - FIX_0_899976223)
   2621 +#define FIX_2_053119869_MINUS_2_562915447 (FIX_2_053119869 - FIX_2_562915447)
   2622 +#define FIX_0_541196100_PLUS_0_765366865  (FIX_0_541196100 + FIX_0_765366865)
   2623 +
   2624 +/*
   2625 + * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
   2626 + * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
   2627 + */
   2628 +#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7)   \
   2629 +{                                                                             \
   2630 +    DCTELEM row0, row1, row2, row3, row4, row5, row6, row7;                   \
   2631 +    INT32   q1, q2, q3, q4, q5, q6, q7;                                       \
   2632 +    INT32   tmp11_plus_tmp2, tmp11_minus_tmp2;                                \
   2633 +                                                                              \
   2634 +    /* 1-D iDCT input data */                                                 \
   2635 +    row0 = xrow0;                                                             \
   2636 +    row1 = xrow1;                                                             \
   2637 +    row2 = xrow2;                                                             \
   2638 +    row3 = xrow3;                                                             \
   2639 +    row4 = xrow4;                                                             \
   2640 +    row5 = xrow5;                                                             \
   2641 +    row6 = xrow6;                                                             \
   2642 +    row7 = xrow7;                                                             \
   2643 +                                                                              \
   2644 +    q5 = row7 + row3;                                                         \
   2645 +    q4 = row5 + row1;                                                         \
   2646 +    q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) +                    \
   2647 +         MULTIPLY(q4, FIX_1_175875602);                                       \
   2648 +    q7 = MULTIPLY(q5, FIX_1_175875602) +                                      \
   2649 +         MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644);                     \
   2650 +    q2 = MULTIPLY(row2, FIX_0_541196100) +                                    \
   2651 +         MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065);                   \
   2652 +    q4 = q6;                                                                  \
   2653 +    q3 = ((INT32) row0 - (INT32) row4) << 13;                                 \
   2654 +    q6 += MULTIPLY(row5, -FIX_2_562915447) +                                  \
   2655 +          MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447);                  \
   2656 +    /* now we can use q1 (reloadable constants have been used up) */          \
   2657 +    q1 = q3 + q2;                                                             \
   2658 +    q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) +                 \
   2659 +          MULTIPLY(row1, -FIX_0_899976223);                                   \
   2660 +    q5 = q7;                                                                  \
   2661 +    q1 = q1 + q6;                                                             \
   2662 +    q7 += MULTIPLY(row7, -FIX_0_899976223) +                                  \
   2663 +          MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223);                  \
   2664 +                                                                              \
   2665 +    /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */        \
   2666 +    tmp11_plus_tmp2 = q1;                                                     \
   2667 +    row1 = 0;                                                                 \
   2668 +                                                                              \
   2669 +    q1 = q1 - q6;                                                             \
   2670 +    q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) +                 \
   2671 +          MULTIPLY(row3, -FIX_2_562915447);                                   \
   2672 +    q1 = q1 - q6;                                                             \
   2673 +    q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) +                   \
   2674 +         MULTIPLY(row6, FIX_0_541196100);                                     \
   2675 +    q3 = q3 - q2;                                                             \
   2676 +                                                                              \
   2677 +    /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */        \
   2678 +    tmp11_minus_tmp2 = q1;                                                    \
   2679 +                                                                              \
   2680 +    q1 = ((INT32) row0 + (INT32) row4) << 13;                                 \
   2681 +    q2 = q1 + q6;                                                             \
   2682 +    q1 = q1 - q6;                                                             \
   2683 +                                                                              \
   2684 +    /* pick up the results */                                                 \
   2685 +    tmp0  = q4;                                                               \
   2686 +    tmp1  = q5;                                                               \
   2687 +    tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2;                         \
   2688 +    tmp3  = q7;                                                               \
   2689 +    tmp10 = q2;                                                               \
   2690 +    tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2;                         \
   2691 +    tmp12 = q3;                                                               \
   2692 +    tmp13 = q1;                                                               \
   2693 +}
   2694 +
   2695 +#define XFIX_0_899976223                    v0.4h[0]  /* v0-v2 lanes follow the entry order of jsimd_idct_islow_neon_consts */
   2696 +#define XFIX_0_541196100                    v0.4h[1]
   2697 +#define XFIX_2_562915447                    v0.4h[2]
   2698 +#define XFIX_0_298631336_MINUS_0_899976223  v0.4h[3]
   2699 +#define XFIX_1_501321110_MINUS_0_899976223  v1.4h[0]
   2700 +#define XFIX_2_053119869_MINUS_2_562915447  v1.4h[1]
   2701 +#define XFIX_0_541196100_PLUS_0_765366865   v1.4h[2]
   2702 +#define XFIX_1_175875602                    v1.4h[3]
   2703 +#define XFIX_1_175875602_MINUS_0_390180644  v2.4h[0]  /* v2 is also used as scratch by the IDCT code and re-read from the table afterwards */
   2704 +#define XFIX_0_541196100_MINUS_1_847759065  v2.4h[1]
   2705 +#define XFIX_3_072711026_MINUS_2_562915447  v2.4h[2]
   2706 +#define XFIX_1_175875602_MINUS_1_961570560  v2.4h[3]
   2707 +
   2708 +.balign 16
   2709 +jsimd_idct_islow_neon_consts:
   2710 +    .short FIX_0_899976223                    /* v0.4h[0] */
   2711 +    .short FIX_0_541196100                    /* v0.4h[1] */
   2712 +    .short FIX_2_562915447                    /* v0.4h[2] */
   2713 +    .short FIX_0_298631336_MINUS_0_899976223  /* v0.4h[3] */
   2714 +    .short FIX_1_501321110_MINUS_0_899976223  /* v1.4h[0] */
   2715 +    .short FIX_2_053119869_MINUS_2_562915447  /* v1.4h[1] */
   2716 +    .short FIX_0_541196100_PLUS_0_765366865   /* v1.4h[2] */
   2717 +    .short FIX_1_175875602                    /* v1.4h[3] */
   2718 +    /* reloadable constants: v2 gets clobbered as scratch and is re-read from here (table start + 16 bytes) */
   2719 +    .short FIX_1_175875602_MINUS_0_390180644  /* v2.4h[0] */
   2720 +    .short FIX_0_541196100_MINUS_1_847759065  /* v2.4h[1] */
   2721 +    .short FIX_3_072711026_MINUS_2_562915447  /* v2.4h[2] */
   2722 +    .short FIX_1_175875602_MINUS_1_961570560  /* v2.4h[3] */
   2723 +
   2724 +asm_function jsimd_idct_islow_neon
   2725 +    /* 8x8 IDCT: multiplies COEF_BLOCK by DCT_TABLE, runs two 1-D IDCT passes and stores 8 rows of 8 bytes through the pointers in OUTPUT_BUF at offset OUTPUT_COL.  TMP1-TMP3 reuse x0-x2. */
   2726 +    DCT_TABLE       .req x0
   2727 +    COEF_BLOCK      .req x1
   2728 +    OUTPUT_BUF      .req x2
   2729 +    OUTPUT_COL      .req x3
   2730 +    TMP1            .req x0
   2731 +    TMP2            .req x1
   2732 +    TMP3            .req x2
   2733 +    TMP4            .req x15
   2734 +    /* ROWnL / ROWnR: the first / last four 16-bit coefficients of row n, each held in the low 64 bits of its register */
   2735 +    ROW0L           .req v16
   2736 +    ROW0R           .req v17
   2737 +    ROW1L           .req v18
   2738 +    ROW1R           .req v19
   2739 +    ROW2L           .req v20
   2740 +    ROW2R           .req v21
   2741 +    ROW3L           .req v22
   2742 +    ROW3R           .req v23
   2743 +    ROW4L           .req v24
   2744 +    ROW4R           .req v25
   2745 +    ROW5L           .req v26
   2746 +    ROW5R           .req v27
   2747 +    ROW6L           .req v28
   2748 +    ROW6R           .req v29
   2749 +    ROW7L           .req v30
   2750 +    ROW7R           .req v31
   2751 +    /* Save x15 and the low 64 bits of v0-v31 (16 + 32 * 8 = 272 bytes).  NOTE(review): the post-indexed stores below walk sp back up to its entry value, so this save area ends up below sp -- confirm nothing can write below sp asynchronously on the target. */
   2752 +    sub             sp, sp, 272
   2753 +    str             x15, [sp], 16
   2754 +    adr             x15, jsimd_idct_islow_neon_consts
   2755 +    st1             {v0.8b - v3.8b}, [sp], 32
   2756 +    st1             {v4.8b - v7.8b}, [sp], 32
   2757 +    st1             {v8.8b - v11.8b}, [sp], 32
   2758 +    st1             {v12.8b - v15.8b}, [sp], 32
   2759 +    st1             {v16.8b - v19.8b}, [sp], 32
   2760 +    st1             {v20.8b - v23.8b}, [sp], 32
   2761 +    st1             {v24.8b - v27.8b}, [sp], 32
   2762 +    st1             {v28.8b - v31.8b}, [sp], 32
   2763 +    ld1             {v16.4h, v17.4h, v18.4h, v19.4h}, [COEF_BLOCK], 32
   2764 +    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
   2765 +    ld1             {v20.4h, v21.4h, v22.4h, v23.4h}, [COEF_BLOCK], 32
   2766 +    mul             v16.4h, v16.4h, v0.4h
   2767 +    mul             v17.4h, v17.4h, v1.4h
   2768 +    ins             v16.2d[1], v17.2d[0]  /* v16.2d[1] = ROW0R; the "q8".."q15" names in these comments are not AArch64 registers -- presumably carried over from a 32-bit ARM version (TODO confirm) */
   2769 +    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
   2770 +    mul             v18.4h, v18.4h, v2.4h
   2771 +    mul             v19.4h, v19.4h, v3.4h
   2772 +    ins             v18.2d[1], v19.2d[0]  /* 128 bit q9 */
   2773 +    ld1             {v24.4h, v25.4h, v26.4h, v27.4h}, [COEF_BLOCK], 32
   2774 +    mul             v20.4h, v20.4h, v4.4h
   2775 +    mul             v21.4h, v21.4h, v5.4h
   2776 +    ins             v20.2d[1], v21.2d[0]  /* 128 bit q10 */
   2777 +    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [DCT_TABLE], 32
   2778 +    mul             v22.4h, v22.4h, v6.4h
   2779 +    mul             v23.4h, v23.4h, v7.4h
   2780 +    ins             v22.2d[1], v23.2d[0]  /* 128 bit q11 */
   2781 +    ld1             {v28.4h, v29.4h, v30.4h, v31.4h}, [COEF_BLOCK]
   2782 +    mul             v24.4h, v24.4h, v0.4h
   2783 +    mul             v25.4h, v25.4h, v1.4h
   2784 +    ins             v24.2d[1], v25.2d[0]  /* 128 bit q12 */
   2785 +    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [DCT_TABLE], 32
   2786 +    mul             v28.4h, v28.4h, v4.4h
   2787 +    mul             v29.4h, v29.4h, v5.4h
   2788 +    ins             v28.2d[1], v29.2d[0]  /* 128 bit q14 */
   2789 +    mul             v26.4h, v26.4h, v2.4h
   2790 +    mul             v27.4h, v27.4h, v3.4h
   2791 +    ins             v26.2d[1], v27.2d[0]  /* 128 bit q13 */
   2792 +    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x15]  /* load constants (reads 32 bytes; the table defines 24, so v3 receives whatever follows it) */
   2793 +    add             x15, x15, #16  /* x15 -> the reloadable constants (the four entries that live in v2) */
   2794 +    mul             v30.4h, v30.4h, v6.4h
   2795 +    mul             v31.4h, v31.4h, v7.4h
   2796 +    ins             v30.2d[1], v31.2d[0]  /* 128 bit q15 */
   2797 +    /* Move sp to 352 bytes below its entry value: 272 (save area above) + 80 (x4/x5 and v8-v15 stored next) */
   2798 +    sub             sp, sp, 352
   2799 +    stp             x4, x5, [sp], 16
   2800 +    st1             {v8.4h - v11.4h}, [sp], 32  /* save NEON registers (second copy of the low halves of v8-v15) */
   2801 +    st1             {v12.4h - v15.4h}, [sp], 32
   2802 +    /* 1-D IDCT, pass 1, left 4x8 half */
   2803 +    add             v4.4h,    ROW7L.4h, ROW3L.4h
   2804 +    add             v5.4h,    ROW5L.4h, ROW1L.4h
   2805 +    smull           v12.4s,   v4.4h,    XFIX_1_175875602_MINUS_1_961570560
   2806 +    smlal           v12.4s,   v5.4h,    XFIX_1_175875602
   2807 +    smull           v14.4s,   v4.4h,    XFIX_1_175875602
   2808 +    /* Check for the zero coefficients in the right 4x8 half (the scalar ldp/orr lines are interleaved with the NEON math; x0 accumulates the OR of rows 1-7) */
   2809 +    smlal           v14.4s,   v5.4h,    XFIX_1_175875602_MINUS_0_390180644
   2810 +    ssubl           v6.4s,    ROW0L.4h, ROW4L.4h
   2811 +      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]  /* row 1, elements 4-7, as stored in memory; -96 undoes the three post-increments of COEF_BLOCK */
   2812 +    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
   2813 +    smlal           v4.4s,    ROW6L.4h, XFIX_0_541196100_MINUS_1_847759065
   2814 +      orr           x0,       x4,       x5
   2815 +    mov             v8.16b,   v12.16b
   2816 +    smlsl           v12.4s,   ROW5L.4h, XFIX_2_562915447
   2817 +      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
   2818 +    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
   2819 +    shl             v6.4s,    v6.4s,    #13
   2820 +      orr           x0,       x0,       x4
   2821 +    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
   2822 +      orr           x0,       x0 ,      x5
   2823 +    add             v2.4s,    v6.4s,    v4.4s
   2824 +      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
   2825 +    mov             v10.16b,  v14.16b
   2826 +    add             v2.4s,    v2.4s,    v12.4s
   2827 +      orr           x0,       x0,       x4
   2828 +    smlsl           v14.4s,   ROW7L.4h, XFIX_0_899976223
   2829 +      orr           x0,       x0,       x5
   2830 +    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
   2831 +    rshrn           ROW1L.4h, v2.4s,    #11
   2832 +      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
   2833 +    sub             v2.4s,    v2.4s,    v12.4s
   2834 +    smlal           v10.4s,   ROW5L.4h, XFIX_2_053119869_MINUS_2_562915447
   2835 +      orr           x0,       x0,       x4
   2836 +    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
   2837 +      orr           x0,       x0,       x5
   2838 +    sub             v2.4s,    v2.4s,    v12.4s
   2839 +    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
   2840 +      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
   2841 +    smlal           v12.4s,   ROW6L.4h, XFIX_0_541196100
   2842 +    sub             v6.4s,    v6.4s,    v4.4s
   2843 +      orr           x0,       x0,       x4
   2844 +    rshrn           ROW6L.4h, v2.4s,    #11
   2845 +      orr           x0,       x0,       x5
   2846 +    add             v2.4s,    v6.4s,    v10.4s
   2847 +      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
   2848 +    sub             v6.4s,    v6.4s,    v10.4s
   2849 +    saddl           v10.4s,   ROW0L.4h, ROW4L.4h
   2850 +      orr           x0,       x0,       x4
   2851 +    rshrn           ROW2L.4h, v2.4s,    #11
   2852 +      orr           x0,       x0,       x5
   2853 +    rshrn           ROW5L.4h, v6.4s,    #11
   2854 +      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
   2855 +    shl             v10.4s,   v10.4s,   #13
   2856 +    smlal           v8.4s,    ROW7L.4h, XFIX_0_298631336_MINUS_0_899976223
   2857 +      orr           x0,       x0,       x4
   2858 +    add             v4.4s,    v10.4s,   v12.4s
   2859 +      orr           x0,       x0,       x5
   2860 +    cmp             x0, #0 /* Z set when rows 1-7 of the right half are all zero; consumed by the "beq 3f" below (nothing in between writes the flags) */
   2861 +    sub             v2.4s,    v10.4s,   v12.4s
   2862 +    add             v12.4s,   v4.4s,    v14.4s
   2863 +      ldp           w4,       w5,       [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]  /* row 0, elements 4-7 */
   2864 +    sub             v4.4s,    v4.4s,    v14.4s
   2865 +    add             v10.4s,   v2.4s,    v8.4s
   2866 +      orr           x0,       x4,       x5  /* x0 now covers row 0 only; it is tested again at label 3 */
   2867 +    sub             v6.4s,    v2.4s,    v8.4s
   2868 +      /* restore x4, x5 from 80 bytes below the current sp; sp then stays at the saved v8-v15 copy until the epilogue reloads it */
   2869 +      sub           sp, sp, 80
   2870 +      ldp           x4, x5, [sp], 16
   2871 +    rshrn           ROW7L.4h, v4.4s,    #11
   2872 +    rshrn           ROW3L.4h, v10.4s,   #11
   2873 +    rshrn           ROW0L.4h, v12.4s,   #11
   2874 +    rshrn           ROW4L.4h, v6.4s,    #11
   2875 +
   2876 +      beq             3f /* Go to do some special handling for the sparse right 4x8 half */
   2877 +
   2878 +    /* 1-D IDCT, pass 1, right 4x8 half (same arithmetic as the left half, on the ROWnR registers) */
   2879 +    ld1             {v2.4h},  [x15]    /* reload constants (v2 was used as scratch in the left half) */
   2880 +    add             v10.4h,   ROW7R.4h, ROW3R.4h
   2881 +    add             v8.4h,    ROW5R.4h, ROW1R.4h
   2882 +    /* Transpose ROW6L <-> ROW7L   (v3 available free register); the transpose of the finished left half is interleaved with the right-half arithmetic */
   2883 +    transpose       ROW6L, ROW7L, v3, .16b, .4h
   2884 +    smull           v12.4s,   v10.4h,   XFIX_1_175875602_MINUS_1_961570560
   2885 +    smlal           v12.4s,   v8.4h,    XFIX_1_175875602
   2886 +    /* Transpose ROW2L <-> ROW3L   (v3 available free register) */
   2887 +    transpose       ROW2L, ROW3L, v3, .16b, .4h
   2888 +    smull           v14.4s,   v10.4h,   XFIX_1_175875602
   2889 +    smlal           v14.4s,   v8.4h,    XFIX_1_175875602_MINUS_0_390180644
   2890 +    /* Transpose ROW0L <-> ROW1L   (v3 available free register) */
   2891 +    transpose       ROW0L, ROW1L, v3, .16b, .4h
   2892 +    ssubl           v6.4s,    ROW0R.4h, ROW4R.4h
   2893 +    smull           v4.4s,    ROW2R.4h, XFIX_0_541196100
   2894 +    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
   2895 +    /* Transpose ROW4L <-> ROW5L   (v3 available free register) */
   2896 +    transpose       ROW4L, ROW5L, v3, .16b, .4h
   2897 +    mov             v8.16b,   v12.16b
   2898 +    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
   2899 +    smlal           v12.4s,   ROW3R.4h, XFIX_3_072711026_MINUS_2_562915447
   2900 +    /* Transpose ROW1L <-> ROW3L   (v3 available free register) */
   2901 +    transpose       ROW1L, ROW3L, v3, .16b, .2s
   2902 +    shl             v6.4s,    v6.4s,    #13
   2903 +    smlsl           v8.4s,    ROW1R.4h, XFIX_0_899976223
   2904 +    /* Transpose ROW4L <-> ROW6L   (v3 available free register) */
   2905 +    transpose       ROW4L, ROW6L, v3, .16b, .2s
   2906 +    add             v2.4s,    v6.4s,    v4.4s
   2907 +    mov             v10.16b,  v14.16b
   2908 +    add             v2.4s,    v2.4s,    v12.4s
   2909 +    /* Transpose ROW0L <-> ROW2L   (v3 available free register) */
   2910 +    transpose       ROW0L, ROW2L, v3, .16b, .2s
   2911 +    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
   2912 +    smlal           v14.4s,   ROW1R.4h, XFIX_1_501321110_MINUS_0_899976223
   2913 +    rshrn           ROW1R.4h, v2.4s,    #11
   2914 +    /* Transpose ROW5L <-> ROW7L   (v3 available free register) */
   2915 +    transpose       ROW5L, ROW7L, v3, .16b, .2s
   2916 +    sub             v2.4s,    v2.4s,    v12.4s
   2917 +    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
   2918 +    smlsl           v10.4s,   ROW3R.4h, XFIX_2_562915447
   2919 +    sub             v2.4s,    v2.4s,    v12.4s
   2920 +    smull           v12.4s,   ROW2R.4h, XFIX_0_541196100_PLUS_0_765366865
   2921 +    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
   2922 +    sub             v6.4s,    v6.4s,    v4.4s
   2923 +    rshrn           ROW6R.4h, v2.4s,    #11
   2924 +    add             v2.4s,    v6.4s,    v10.4s
   2925 +    sub             v6.4s,    v6.4s,    v10.4s
   2926 +    saddl           v10.4s,   ROW0R.4h, ROW4R.4h
   2927 +    rshrn           ROW2R.4h, v2.4s,    #11
   2928 +    rshrn           ROW5R.4h, v6.4s,    #11
   2929 +    shl             v10.4s,   v10.4s,   #13
   2930 +    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
   2931 +    add             v4.4s,    v10.4s,   v12.4s
   2932 +    sub             v2.4s,    v10.4s,   v12.4s
   2933 +    add             v12.4s,   v4.4s,    v14.4s
   2934 +    sub             v4.4s,    v4.4s,    v14.4s
   2935 +    add             v10.4s,   v2.4s,    v8.4s
   2936 +    sub             v6.4s,    v2.4s,    v8.4s
   2937 +    rshrn           ROW7R.4h, v4.4s,    #11
   2938 +    rshrn           ROW3R.4h, v10.4s,   #11
   2939 +    rshrn           ROW0R.4h, v12.4s,   #11
   2940 +    rshrn           ROW4R.4h, v6.4s,    #11
   2941 +    /* Transpose right 4x8 half (same macro sequence as applied to the left half above) */
   2942 +    transpose       ROW6R, ROW7R, v3, .16b, .4h
   2943 +    transpose       ROW2R, ROW3R, v3, .16b, .4h
   2944 +    transpose       ROW0R, ROW1R, v3, .16b, .4h
   2945 +    transpose       ROW4R, ROW5R, v3, .16b, .4h
   2946 +    transpose       ROW1R, ROW3R, v3, .16b, .2s
   2947 +    transpose       ROW4R, ROW6R, v3, .16b, .2s
   2948 +    transpose       ROW0R, ROW2R, v3, .16b, .2s
   2949 +    transpose       ROW5R, ROW7R, v3, .16b, .2s
   2950 +
   2951 +1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half.  Operands tagged "ROWxL <-> ROWyR" use register ROWyR in the role ROWxL has in pass 1 (and vice versa). */
   2952 +    ld1             {v2.4h},  [x15]    /* reload constants (v2 was used as scratch in pass 1) */
   2953 +    smull           v12.4S,   ROW1R.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
   2954 +    smlal           v12.4s,   ROW1L.4h, XFIX_1_175875602
   2955 +    smlal           v12.4s,   ROW3R.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
   2956 +    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
   2957 +    smull           v14.4s,   ROW3R.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
   2958 +    smlal           v14.4s,   ROW3L.4h, XFIX_1_175875602
   2959 +    smlal           v14.4s,   ROW1R.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
   2960 +    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
   2961 +    ssubl           v6.4s,    ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
   2962 +    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
   2963 +    smlal           v4.4s,    ROW2R.4h, XFIX_0_541196100_MINUS_1_847759065 /* ROW6L.4h <-> ROW2R.4h */
   2964 +    mov             v8.16b,   v12.16b
   2965 +    smlsl           v12.4s,   ROW1R.4h, XFIX_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
   2966 +    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
   2967 +    shl             v6.4s,    v6.4s,    #13
   2968 +    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
   2969 +    add             v2.4s,    v6.4s,    v4.4s
   2970 +    mov             v10.16b,  v14.16b
   2971 +    add             v2.4s,    v2.4s,    v12.4s
   2972 +    smlsl           v14.4s,   ROW3R.4h, XFIX_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
   2973 +    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
   2974 +    shrn            ROW1L.4h, v2.4s,    #16  /* pass 2 keeps the high 16 bits (no rounding); the remaining rounding shift by 2 is done at label 2 */
   2975 +    sub             v2.4s,    v2.4s,    v12.4s
   2976 +    smlal           v10.4s,   ROW1R.4h, XFIX_2_053119869_MINUS_2_562915447 /* ROW5L.4h <-> ROW1R.4h */
   2977 +    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
   2978 +    sub             v2.4s,    v2.4s,    v12.4s
   2979 +    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
   2980 +    smlal           v12.4s,   ROW2R.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
   2981 +    sub             v6.4s,    v6.4s,    v4.4s
   2982 +    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
   2983 +    add             v2.4s,    v6.4s,    v10.4s
   2984 +    sub             v6.4s,    v6.4s,    v10.4s
   2985 +    saddl           v10.4s,   ROW0L.4h, ROW0R.4h /* ROW4L.4h <-> ROW0R.4h */
   2986 +    shrn            ROW2L.4h, v2.4s,    #16
   2987 +    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
   2988 +    shl             v10.4s,   v10.4s,   #13
   2989 +    smlal           v8.4s,    ROW3R.4h, XFIX_0_298631336_MINUS_0_899976223 /* ROW7L.4h <-> ROW3R.4h */
   2990 +    add             v4.4s,    v10.4s,   v12.4s
   2991 +    sub             v2.4s,    v10.4s,   v12.4s
   2992 +    add             v12.4s,   v4.4s,    v14.4s
   2993 +    sub             v4.4s,    v4.4s,    v14.4s
   2994 +    add             v10.4s,   v2.4s,    v8.4s
   2995 +    sub             v6.4s,    v2.4s,    v8.4s
   2996 +    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
   2997 +    shrn            ROW3L.4h, v10.4s,   #16
   2998 +    shrn            ROW0L.4h, v12.4s,   #16
   2999 +    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
   3000 +    /* 1-D IDCT, pass 2, right 4x8 half (inputs are ROW4L-ROW7L and ROW4R-ROW7R) */
   3001 +    ld1             {v2.4h},  [x15]    /* reload constants */
   3002 +    smull           v12.4s,   ROW5R.4h, XFIX_1_175875602
   3003 +    smlal           v12.4s,   ROW5L.4h, XFIX_1_175875602 /* ROW5L.4h <-> ROW1R.4h */
   3004 +    smlal           v12.4s,   ROW7R.4h, XFIX_1_175875602_MINUS_1_961570560
   3005 +    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560 /* ROW7L.4h <-> ROW3R.4h */
   3006 +    smull           v14.4s,   ROW7R.4h, XFIX_1_175875602
   3007 +    smlal           v14.4s,   ROW7L.4h, XFIX_1_175875602 /* ROW7L.4h <-> ROW3R.4h */
   3008 +    smlal           v14.4s,   ROW5R.4h, XFIX_1_175875602_MINUS_0_390180644
   3009 +    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644 /* ROW5L.4h <-> ROW1R.4h */
   3010 +    ssubl           v6.4s,    ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
   3011 +    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100 /* ROW6L.4h <-> ROW2R.4h */
   3012 +    smlal           v4.4s,    ROW6R.4h, XFIX_0_541196100_MINUS_1_847759065
   3013 +    mov             v8.16b,   v12.16b
   3014 +    smlsl           v12.4s,   ROW5R.4h, XFIX_2_562915447
   3015 +    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
   3016 +    shl             v6.4s,    v6.4s,    #13
   3017 +    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
   3018 +    add             v2.4s,    v6.4s,    v4.4s
   3019 +    mov             v10.16b,  v14.16b
   3020 +    add             v2.4s,    v2.4s,    v12.4s
   3021 +    smlsl           v14.4s,   ROW7R.4h, XFIX_0_899976223
   3022 +    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223 /* ROW5L.4h <-> ROW1R.4h */
   3023 +    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
   3024 +    sub             v2.4s,    v2.4s,    v12.4s
   3025 +    smlal           v10.4s,   ROW5R.4h, XFIX_2_053119869_MINUS_2_562915447
   3026 +    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447 /* ROW7L.4h <-> ROW3R.4h */
   3027 +    sub             v2.4s,    v2.4s,    v12.4s
   3028 +    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865 /* ROW6L.4h <-> ROW2R.4h */
   3029 +    smlal           v12.4s,   ROW6R.4h, XFIX_0_541196100
   3030 +    sub             v6.4s,    v6.4s,    v4.4s
   3031 +    shrn            ROW6R.4h, v2.4s,    #16
   3032 +    add             v2.4s,    v6.4s,    v10.4s
   3033 +    sub             v6.4s,    v6.4s,    v10.4s
   3034 +    saddl           v10.4s,   ROW4L.4h, ROW4R.4h /* ROW4L.4h <-> ROW0R.4h */
   3035 +    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
   3036 +    shrn            ROW5R.4h, v6.4s,    #16
   3037 +    shl             v10.4s,   v10.4s,   #13
   3038 +    smlal           v8.4s,    ROW7R.4h, XFIX_0_298631336_MINUS_0_899976223
   3039 +    add             v4.4s,    v10.4s,   v12.4s
   3040 +    sub             v2.4s,    v10.4s,   v12.4s
   3041 +    add             v12.4s,   v4.4s,    v14.4s
   3042 +    sub             v4.4s,    v4.4s,    v14.4s
   3043 +    add             v10.4s,   v2.4s,    v8.4s
   3044 +    sub             v6.4s,    v2.4s,    v8.4s
   3045 +    shrn            ROW7R.4h, v4.4s,    #16
   3046 +    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
   3047 +    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
   3048 +    shrn            ROW4R.4h, v6.4s,    #16
   3049 +
   3050 +2:  /* Descale to 8-bit and range limit, then transpose, re-centre, store the 8 rows, restore registers and return */
   3051 +    ins             v16.2d[1], v17.2d[0]
   3052 +    ins             v18.2d[1], v19.2d[0]
   3053 +    ins             v20.2d[1], v21.2d[0]
   3054 +    ins             v22.2d[1], v23.2d[0]
   3055 +    sqrshrn         v16.8b,   v16.8h,   #2
   3056 +    sqrshrn2        v16.16b,  v18.8h,   #2
   3057 +    sqrshrn         v18.8b,   v20.8h,   #2
   3058 +    sqrshrn2        v18.16b,  v22.8h,   #2
   3059 +
   3060 +    /* restore v8-v15 (low 64 bits) from the copy stored right after x4/x5; sp then points at the saved x15 */
   3061 +    ld1             {v8.4h - v11.4h}, [sp], 32
   3062 +    ld1             {v12.4h - v15.4h}, [sp], 32
   3063 +    ins             v24.2d[1], v25.2d[0]
   3064 +
   3065 +    sqrshrn         v20.8b,   v24.8h,   #2
   3066 +      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
   3067 +    /* trn1            v16.8h,    v16.8h,  v18.8h */
   3068 +    transpose       v16, v18, v3, .16b, .8h
   3069 +    ins             v26.2d[1], v27.2d[0]
   3070 +    ins             v28.2d[1], v29.2d[0]
   3071 +    ins             v30.2d[1], v31.2d[0]
   3072 +    sqrshrn2        v20.16b,  v26.8h,   #2
   3073 +    sqrshrn         v22.8b,   v28.8h,   #2
   3074 +    movi            v0.16b,   #(CENTERJSAMPLE)  /* bias added to every sample below (signed -> unsigned) */
   3075 +    sqrshrn2        v22.16b,  v30.8h,   #2
   3076 +    transpose_single v16, v17, v3, .2d, .8b
   3077 +    transpose_single v18, v19, v3, .2d, .8b
   3078 +    add             v16.8b,   v16.8b,   v0.8b
   3079 +    add             v17.8b,   v17.8b,   v0.8b
   3080 +    add             v18.8b,   v18.8b,   v0.8b
   3081 +    add             v19.8b,   v19.8b,   v0.8b
   3082 +    transpose       v20, v22, v3, .16b, .8h
   3083 +    /* Store results to the output buffer.  The column offset is added as "w3, uxtw" (w3 = low word of OUTPUT_COL/x3): the argument is 32 bits wide in libjpeg's C prototype (JDIMENSION; prototype not visible here), so the upper half of x3 must not reach the row addresses. */
   3084 +    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
   3085 +    add             TMP1,     TMP1,     w3, uxtw
   3086 +    add             TMP2,     TMP2,     w3, uxtw
   3087 +    st1             {v16.8b}, [TMP1]
   3088 +    transpose_single v20, v21, v3, .2d, .8b
   3089 +    st1             {v17.8b}, [TMP2]
   3090 +    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
   3091 +    add             TMP1,     TMP1,     w3, uxtw
   3092 +    add             TMP2,     TMP2,     w3, uxtw
   3093 +    st1             {v18.8b}, [TMP1]
   3094 +    add             v20.8b,   v20.8b,   v0.8b
   3095 +    add             v21.8b,   v21.8b,   v0.8b
   3096 +    st1             {v19.8b}, [TMP2]
   3097 +    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
   3098 +    ldp             TMP3,     TMP4,     [OUTPUT_BUF]  /* last two row pointers; TMP3 is x2 itself, so OUTPUT_BUF is consumed here */
   3099 +    add             TMP1,     TMP1,     w3, uxtw
   3100 +    add             TMP2,     TMP2,     w3, uxtw
   3101 +    add             TMP3,     TMP3,     w3, uxtw
   3102 +    add             TMP4,     TMP4,     w3, uxtw
   3103 +    transpose_single v22, v23, v3, .2d, .8b
   3104 +    st1             {v20.8b}, [TMP1]
   3105 +    add             v22.8b,   v22.8b,   v0.8b
   3106 +    add             v23.8b,   v23.8b,   v0.8b
   3107 +    st1             {v21.8b}, [TMP2]
   3108 +    st1             {v22.8b}, [TMP3]
   3109 +    st1             {v23.8b}, [TMP4]
   3110 +    ldr             x15, [sp], 16  /* reload x15 and the low 64 bits of v0-v31 saved on entry; sp ends at its entry value */
   3111 +    ld1             {v0.8b - v3.8b}, [sp], 32
   3112 +    ld1             {v4.8b - v7.8b}, [sp], 32
   3113 +    ld1             {v8.8b - v11.8b}, [sp], 32
   3114 +    ld1             {v12.8b - v15.8b}, [sp], 32
   3115 +    ld1             {v16.8b - v19.8b}, [sp], 32
   3116 +    ld1             {v20.8b - v23.8b}, [sp], 32
   3117 +    ld1             {v24.8b - v27.8b}, [sp], 32
   3118 +    ld1             {v28.8b - v31.8b}, [sp], 32
   3119 +    ret  /* return via x30; replaces "blr x30", which is a call instruction and rewrites x30 as a side effect */
   3120 +
   3121 +3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
   3122 +    /* Reached when rows 1-7 of the right half are all zero in memory; x0 still holds the OR of row 0's right-half words */
   3123 +    /* Transpose left 4x8 half */
   3124 +    transpose       ROW6L, ROW7L, v3, .16b, .4h
   3125 +    transpose       ROW2L, ROW3L, v3, .16b, .4h
   3126 +    transpose       ROW0L, ROW1L, v3, .16b, .4h
   3127 +    transpose       ROW4L, ROW5L, v3, .16b, .4h
   3128 +    shl             ROW0R.4h, ROW0R.4h, #2 /* PASS1_BITS */
   3129 +    transpose       ROW1L, ROW3L, v3, .16b, .2s
   3130 +    transpose       ROW4L, ROW6L, v3, .16b, .2s
   3131 +    transpose       ROW0L, ROW2L, v3, .16b, .2s
   3132 +    transpose       ROW5L, ROW7L, v3, .16b, .2s
   3133 +    cmp             x0, #0  /* is row 0 of the right half zero as well? */
   3134 +    beq             4f /* Right 4x8 half has all zeros, go to 'sparse' second pass */
   3135 +
   3136 +    /* Only row 0 is non-zero for the right 4x8 half: ROWnR and ROW(n+4)R each become a broadcast of lane n of the shifted ROW0R (ROW0R itself is overwritten last) */
   3137 +    dup             ROW1R.4h, ROW0R.4h[1]
   3138 +    dup             ROW2R.4h, ROW0R.4h[2]
   3139 +    dup             ROW3R.4h, ROW0R.4h[3]
   3140 +    dup             ROW4R.4h, ROW0R.4h[0]
   3141 +    dup             ROW5R.4h, ROW0R.4h[1]
   3142 +    dup             ROW6R.4h, ROW0R.4h[2]
   3143 +    dup             ROW7R.4h, ROW0R.4h[3]
   3144 +    dup             ROW0R.4h, ROW0R.4h[0]
   3145 +    b               1b /* Go to 'normal' second pass */
   3146 +
   3147 +4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half: the normal variant with every ROWnR input term left out */
   3148 +    ld1             {v2.4h},  [x15]    /* reload constants (v2 was used as scratch in pass 1) */
   3149 +    smull           v12.4s,   ROW1L.4h, XFIX_1_175875602
   3150 +    smlal           v12.4s,   ROW3L.4h, XFIX_1_175875602_MINUS_1_961570560
   3151 +    smull           v14.4s,   ROW3L.4h, XFIX_1_175875602
   3152 +    smlal           v14.4s,   ROW1L.4h, XFIX_1_175875602_MINUS_0_390180644
   3153 +    smull           v4.4s,    ROW2L.4h, XFIX_0_541196100
   3154 +    sshll           v6.4s,    ROW0L.4h, #13  /* stands in for the normal variant's ssubl + shl #13 */
   3155 +    mov             v8.16b,   v12.16b
   3156 +    smlal           v12.4s,   ROW3L.4h, XFIX_3_072711026_MINUS_2_562915447
   3157 +    smlsl           v8.4s,    ROW1L.4h, XFIX_0_899976223
   3158 +    add             v2.4s,    v6.4s,    v4.4s
   3159 +    mov             v10.16b,  v14.16b
   3160 +    smlal           v14.4s,   ROW1L.4h, XFIX_1_501321110_MINUS_0_899976223
   3161 +    add             v2.4s,    v2.4s,    v12.4s
   3162 +    add             v12.4s,   v12.4s,   v12.4s  /* 2 * v12, so the single sub below equals the two subs of the normal variant */
   3163 +    smlsl           v10.4s,   ROW3L.4h, XFIX_2_562915447
   3164 +    shrn            ROW1L.4h, v2.4s,    #16
   3165 +    sub             v2.4s,    v2.4s,    v12.4s
   3166 +    smull           v12.4s,   ROW2L.4h, XFIX_0_541196100_PLUS_0_765366865
   3167 +    sub             v6.4s,    v6.4s,    v4.4s
   3168 +    shrn            ROW2R.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
   3169 +    add             v2.4s,    v6.4s,    v10.4s
   3170 +    sub             v6.4s,    v6.4s,    v10.4s
   3171 +    sshll           v10.4s,   ROW0L.4h, #13  /* stands in for the normal variant's saddl + shl #13 */
   3172 +    shrn            ROW2L.4h, v2.4s,    #16
   3173 +    shrn            ROW1R.4h, v6.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
   3174 +    add             v4.4s,    v10.4s,   v12.4s
   3175 +    sub             v2.4s,    v10.4s,   v12.4s
   3176 +    add             v12.4s,   v4.4s,    v14.4s
   3177 +    sub             v4.4s,    v4.4s,    v14.4s
   3178 +    add             v10.4s,   v2.4s,    v8.4s
   3179 +    sub             v6.4s,    v2.4s,    v8.4s
   3180 +    shrn            ROW3R.4h, v4.4s,    #16 /* ROW7L.4h <-> ROW3R.4h */
   3181 +    shrn            ROW3L.4h, v10.4s,   #16
   3182 +    shrn            ROW0L.4h, v12.4s,   #16
   3183 +    shrn            ROW0R.4h, v6.4s,    #16 /* ROW4L.4h <-> ROW0R.4h */
   3184 +    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half (inputs are ROW4L-ROW7L only) */
   3185 +    ld1             {v2.4h},  [x15]    /* reload constants */
   3186 +    smull           v12.4s,   ROW5L.4h, XFIX_1_175875602
   3187 +    smlal           v12.4s,   ROW7L.4h, XFIX_1_175875602_MINUS_1_961570560
   3188 +    smull           v14.4s,   ROW7L.4h, XFIX_1_175875602
   3189 +    smlal           v14.4s,   ROW5L.4h, XFIX_1_175875602_MINUS_0_390180644
   3190 +    smull           v4.4s,    ROW6L.4h, XFIX_0_541196100
   3191 +    sshll           v6.4s,    ROW4L.4h, #13
   3192 +    mov             v8.16b,   v12.16b
   3193 +    smlal           v12.4s,   ROW7L.4h, XFIX_3_072711026_MINUS_2_562915447
   3194 +    smlsl           v8.4s,    ROW5L.4h, XFIX_0_899976223
   3195 +    add             v2.4s,    v6.4s,    v4.4s
   3196 +    mov             v10.16b,  v14.16b
   3197 +    smlal           v14.4s,   ROW5L.4h, XFIX_1_501321110_MINUS_0_899976223
   3198 +    add             v2.4s,    v2.4s,    v12.4s
   3199 +    add             v12.4s,   v12.4s,   v12.4s  /* 2 * v12, as in the left half above */
   3200 +    smlsl           v10.4s,   ROW7L.4h, XFIX_2_562915447
   3201 +    shrn            ROW5L.4h, v2.4s,    #16 /* ROW5L.4h <-> ROW1R.4h */
   3202 +    sub             v2.4s,    v2.4s,    v12.4s
   3203 +    smull           v12.4s,   ROW6L.4h, XFIX_0_541196100_PLUS_0_765366865
   3204 +    sub             v6.4s,    v6.4s,    v4.4s
   3205 +    shrn            ROW6R.4h, v2.4s,    #16
   3206 +    add             v2.4s,    v6.4s,    v10.4s
   3207 +    sub             v6.4s,    v6.4s,    v10.4s
   3208 +    sshll           v10.4s,   ROW4L.4h, #13
   3209 +    shrn            ROW6L.4h, v2.4s,    #16 /* ROW6L.4h <-> ROW2R.4h */
   3210 +    shrn            ROW5R.4h, v6.4s,    #16
   3211 +    add             v4.4s,    v10.4s,   v12.4s
   3212 +    sub             v2.4s,    v10.4s,   v12.4s
   3213 +    add             v12.4s,   v4.4s,    v14.4s
   3214 +    sub             v4.4s,    v4.4s,    v14.4s
   3215 +    add             v10.4s,   v2.4s,    v8.4s
   3216 +    sub             v6.4s,    v2.4s,    v8.4s
   3217 +    shrn            ROW7R.4h, v4.4s,    #16
   3218 +    shrn            ROW7L.4h, v10.4s,   #16 /* ROW7L.4h <-> ROW3R.4h */
   3219 +    shrn            ROW4L.4h, v12.4s,   #16 /* ROW4L.4h <-> ROW0R.4h */
   3220 +    shrn            ROW4R.4h, v6.4s,    #16
   3221 +    b               2b /* Go to epilogue */
   3222 +
   3223 +    .unreq          DCT_TABLE
   3224 +    .unreq          COEF_BLOCK
   3225 +    .unreq          OUTPUT_BUF
   3226 +    .unreq          OUTPUT_COL
   3227 +    .unreq          TMP1
   3228 +    .unreq          TMP2
   3229 +    .unreq          TMP3
   3230 +    .unreq          TMP4
   3231 +
   3232 +    .unreq          ROW0L
   3233 +    .unreq          ROW0R
   3234 +    .unreq          ROW1L
   3235 +    .unreq          ROW1R
   3236 +    .unreq          ROW2L
   3237 +    .unreq          ROW2R
   3238 +    .unreq          ROW3L
   3239 +    .unreq          ROW3R
   3240 +    .unreq          ROW4L
   3241 +    .unreq          ROW4R
   3242 +    .unreq          ROW5L
   3243 +    .unreq          ROW5R
   3244 +    .unreq          ROW6L
   3245 +    .unreq          ROW6R
   3246 +    .unreq          ROW7L
   3247 +    .unreq          ROW7R
   3248 +
   3249 +
   3250 +/*****************************************************************************/
   3251 +
   3252 +/*
   3253 + * jsimd_idct_ifast_neon
   3254 + *
   3255 + * This function contains a fast, not so accurate integer implementation of
   3256 + * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
   3257 + * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
   3258 + * function from jidctfst.c
   3259 + *
   3260 + * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
   3261 + * But in the ARM NEON case some extra additions are required because the
   3262 + * SQDMULH instruction can't handle constants larger than 1. So expressions
   3263 + * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
   3264 + * which introduces an extra addition. Overall, there are 6 extra additions
   3265 + * per 1-D IDCT pass, for a total of 5 SQDMULH and 35 ADD/SUB instructions.
   3266 + */
   3267 +
   3268 +#define XFIX_1_082392200 v0.4h[0]
   3269 +#define XFIX_1_414213562 v0.4h[1]
   3270 +#define XFIX_1_847759065 v0.4h[2]
   3271 +#define XFIX_2_613125930 v0.4h[3]
   3272 +/* Each entry stores the multiplier minus 1.0 (minus 2.0 for 2.613125930); the IDCT code adds the operand back after sqdmulh */
   3273 +.balign 16
   3274 +jsimd_idct_ifast_neon_consts:
   3275 +    .short (277 * 128 - 256 * 128) /* XFIX_1_082392200 */
   3276 +    .short (362 * 128 - 256 * 128) /* XFIX_1_414213562 */
   3277 +    .short (473 * 128 - 256 * 128) /* XFIX_1_847759065 */
   3278 +    .short (669 * 128 - 512 * 128) /* XFIX_2_613125930 */
   3279 +
   3280 +asm_function jsimd_idct_ifast_neon
   3281 +
   3282 +    DCT_TABLE       .req x0
   3283 +    COEF_BLOCK      .req x1
   3284 +    OUTPUT_BUF      .req x2
   3285 +    OUTPUT_COL      .req x3
   3286 +    TMP1            .req x0
   3287 +    TMP2            .req x1
   3288 +    TMP3            .req x2
   3289 +    TMP4            .req x22
   3290 +    TMP5            .req x23
   3291 +
   3292 +    /* Load and dequantize coefficients into NEON registers
   3293 +     * with the following allocation (one 8-coefficient row per register):
   3294 +     *       0 1 2 3 | 4 5 6 7
   3295 +     *      ---------+--------
   3296 +     *   0 |      v8.8h
   3297 +     *   1 |      v9.8h
   3298 +     *   2 |      v10.8h
   3299 +     *   3 |      v11.8h
   3300 +     *   4 |      v12.8h
   3301 +     *   5 |      v13.8h
   3302 +     *   6 |      v14.8h
   3303 +     *   7 |      v15.8h
   3304 +     */
   3305 +    /* Save x22/x23 and v0-v19 (low 64 bits); sp stays below the save area */
   3306 +    stp             x22, x23, [sp, #-176]!
   3307 +    add             TMP4, sp, #16       /* TMP4 (x22) walks the save area */
   3308 +    adr             x23, jsimd_idct_ifast_neon_consts
   3309 +    st1             {v0.8b - v3.8b}, [TMP4], 32
   3310 +    st1             {v4.8b - v7.8b}, [TMP4], 32
   3311 +    st1             {v8.8b - v11.8b}, [TMP4], 32
   3312 +    st1             {v12.8b - v15.8b}, [TMP4], 32
   3313 +    st1             {v16.8b - v19.8b}, [TMP4], 32
   3314 +    ld1             {v8.8h, v9.8h}, [COEF_BLOCK], 32
   3315 +    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
   3316 +    ld1             {v10.8h, v11.8h}, [COEF_BLOCK], 32
   3317 +    mul             v8.8h,  v8.8h,  v0.8h
   3318 +    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
   3319 +    mul             v9.8h,  v9.8h,  v1.8h
   3320 +    ld1             {v12.8h, v13.8h}, [COEF_BLOCK], 32
   3321 +    mul             v10.8h, v10.8h, v2.8h
   3322 +    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
   3323 +    mul             v11.8h, v11.8h, v3.8h
   3324 +    ld1             {v14.8h, v15.8h}, [COEF_BLOCK], 32
   3325 +    mul             v12.8h, v12.8h, v0.8h
   3326 +    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
   3327 +    mul             v14.8h, v14.8h, v2.8h
   3328 +    mul             v13.8h, v13.8h, v1.8h
   3329 +    ld1             {v0.4h}, [x23]      /* load constants */
   3330 +    mul             v15.8h, v15.8h, v3.8h
   3331 +
   3332 +    /* 1-D IDCT, pass 1 */
   3333 +    sub             v2.8h,    v10.8h,   v14.8h
   3334 +    add             v14.8h,   v10.8h,   v14.8h
   3335 +    sub             v1.8h,    v11.8h,   v13.8h
   3336 +    add             v13.8h,   v11.8h,   v13.8h
   3337 +    sub             v5.8h,    v9.8h,    v15.8h
   3338 +    add             v15.8h,   v9.8h,    v15.8h
   3339 +    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
   3340 +    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
   3341 +    add             v3.8h,    v1.8h,    v1.8h
   3342 +    sub             v1.8h,    v5.8h,    v1.8h
   3343 +    add             v10.8h,   v2.8h,    v4.8h
   3344 +    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
   3345 +    sub             v2.8h,    v15.8h,   v13.8h
   3346 +    add             v3.8h,    v3.8h,    v6.8h
   3347 +    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
   3348 +    add             v1.8h,    v1.8h,    v4.8h
   3349 +    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
   3350 +    sub             v10.8h,   v10.8h,   v14.8h
   3351 +    add             v2.8h,    v2.8h,    v6.8h
   3352 +    sub             v6.8h,    v8.8h,    v12.8h
   3353 +    add             v12.8h,   v8.8h,    v12.8h
   3354 +    add             v9.8h,    v5.8h,    v4.8h
   3355 +    add             v5.8h,    v6.8h,    v10.8h
   3356 +    sub             v10.8h,   v6.8h,    v10.8h
   3357 +    add             v6.8h,    v15.8h,   v13.8h
   3358 +    add             v8.8h,    v12.8h,   v14.8h
   3359 +    sub             v3.8h,    v6.8h,    v3.8h
   3360 +    sub             v12.8h,   v12.8h,   v14.8h
   3361 +    sub             v3.8h,    v3.8h,    v1.8h
   3362 +    sub             v1.8h,    v9.8h,    v1.8h
   3363 +    add             v2.8h,    v3.8h,    v2.8h
   3364 +    sub             v15.8h,   v8.8h,    v6.8h
   3365 +    add             v1.8h,    v1.8h,    v2.8h
   3366 +    add             v8.8h,    v8.8h,    v6.8h
   3367 +    add             v14.8h,   v5.8h,    v3.8h
   3368 +    sub             v9.8h,    v5.8h,    v3.8h
   3369 +    sub             v13.8h,   v10.8h,   v2.8h
   3370 +    add             v10.8h,   v10.8h,   v2.8h
   3371 +    /* Transpose  v8-v9 (16-bit elements, v18 is scratch) */
   3372 +    mov             v18.16b,  v8.16b
   3373 +    trn1            v8.8h,    v8.8h,    v9.8h
   3374 +    trn2            v9.8h,    v18.8h,   v9.8h
   3375 +    sub             v11.8h,   v12.8h,   v1.8h
   3376 +    /* Transpose  v14-v15 */
   3377 +    mov             v18.16b,  v14.16b
   3378 +    trn1            v14.8h,   v14.8h,   v15.8h
   3379 +    trn2            v15.8h,   v18.8h,   v15.8h
   3380 +    add             v12.8h,   v12.8h,   v1.8h
   3381 +    /* Transpose  v10-v11 */
   3382 +    mov             v18.16b,  v10.16b
   3383 +    trn1            v10.8h,   v10.8h,   v11.8h
   3384 +    trn2            v11.8h,   v18.8h,   v11.8h
   3385 +    /* Transpose  v12-v13 */
   3386 +    mov             v18.16b,  v12.16b
   3387 +    trn1            v12.8h,   v12.8h,   v13.8h
   3388 +    trn2            v13.8h,   v18.8h,   v13.8h
   3389 +    /* Transpose  v9-v11 (32-bit elements) */
   3390 +    mov             v18.16b,  v9.16b
   3391 +    trn1            v9.4s,    v9.4s,    v11.4s
   3392 +    trn2            v11.4s,   v18.4s,   v11.4s
   3393 +    /* Transpose  v12-v14 */
   3394 +    mov             v18.16b,  v12.16b
   3395 +    trn1            v12.4s,   v12.4s,   v14.4s
   3396 +    trn2            v14.4s,   v18.4s,   v14.4s
   3397 +    /* Transpose  v8-v10 */
   3398 +    mov             v18.16b,  v8.16b
   3399 +    trn1            v8.4s,    v8.4s,    v10.4s
   3400 +    trn2            v10.4s,   v18.4s,   v10.4s
   3401 +    /* Transpose  v13-v15 */
   3402 +    mov             v18.16b,  v13.16b
   3403 +    trn1            v13.4s,   v13.4s,   v15.4s
   3404 +    trn2            v15.4s,   v18.4s,   v15.4s
   3405 +    /* Swap v14 low half with v10 high half (x22 is scratch) */
   3406 +    umov            x22, v14.d[0]
   3407 +    ins             v14.2d[0], v10.2d[1]
   3408 +    ins             v10.2d[1], x22
   3409 +    /* Swap v13 low half with v9 high half */
   3410 +
   3411 +    umov            x22, v13.d[0]
   3412 +    ins             v13.2d[0], v9.2d[1]
   3413 +    ins             v9.2d[1], x22
   3414 +    /* 1-D IDCT, pass 2 */
   3415 +    sub             v2.8h,    v10.8h,   v14.8h
   3416 +    /* Swap v15 low half with v11 high half */
   3417 +    umov            x22, v15.d[0]
   3418 +    ins             v15.2d[0], v11.2d[1]
   3419 +    ins             v11.2d[1], x22
   3420 +    add             v14.8h,   v10.8h,   v14.8h
   3421 +    /* Swap v12 low half with v8 high half */
   3422 +    umov            x22, v12.d[0]
   3423 +    ins             v12.2d[0], v8.2d[1]
   3424 +    ins             v8.2d[1], x22
   3425 +    sub             v1.8h,    v11.8h,   v13.8h
   3426 +    add             v13.8h,   v11.8h,   v13.8h
   3427 +    sub             v5.8h,    v9.8h,    v15.8h
   3428 +    add             v15.8h,   v9.8h,    v15.8h
   3429 +    sqdmulh         v4.8h,    v2.8h,    XFIX_1_414213562
   3430 +    sqdmulh         v6.8h,    v1.8h,    XFIX_2_613125930
   3431 +    add             v3.8h,    v1.8h,    v1.8h
   3432 +    sub             v1.8h,    v5.8h,    v1.8h
   3433 +    add             v10.8h,   v2.8h,    v4.8h
   3434 +    sqdmulh         v4.8h,    v1.8h,    XFIX_1_847759065
   3435 +    sub             v2.8h,    v15.8h,   v13.8h
   3436 +    add             v3.8h,    v3.8h,    v6.8h
   3437 +    sqdmulh         v6.8h,    v2.8h,    XFIX_1_414213562
   3438 +    add             v1.8h,    v1.8h,    v4.8h
   3439 +    sqdmulh         v4.8h,    v5.8h,    XFIX_1_082392200
   3440 +    sub             v10.8h,   v10.8h,   v14.8h
   3441 +    add             v2.8h,    v2.8h,    v6.8h
   3442 +    sub             v6.8h,    v8.8h,    v12.8h
   3443 +    add             v12.8h,   v8.8h,    v12.8h
   3444 +    add             v9.8h,    v5.8h,    v4.8h
   3445 +    add             v5.8h,    v6.8h,    v10.8h
   3446 +    sub             v10.8h,   v6.8h,    v10.8h
   3447 +    add             v6.8h,    v15.8h,   v13.8h
   3448 +    add             v8.8h,    v12.8h,   v14.8h
   3449 +    sub             v3.8h,    v6.8h,    v3.8h
   3450 +    sub             v12.8h,   v12.8h,   v14.8h
   3451 +    sub             v3.8h,    v3.8h,    v1.8h
   3452 +    sub             v1.8h,    v9.8h,    v1.8h
   3453 +    add             v2.8h,    v3.8h,    v2.8h
   3454 +    sub             v15.8h,   v8.8h,    v6.8h
   3455 +    add             v1.8h,    v1.8h,    v2.8h
   3456 +    add             v8.8h,    v8.8h,    v6.8h
   3457 +    add             v14.8h,   v5.8h,    v3.8h
   3458 +    sub             v9.8h,    v5.8h,    v3.8h
   3459 +    sub             v13.8h,   v10.8h,   v2.8h
   3460 +    add             v10.8h,   v10.8h,   v2.8h
   3461 +    sub             v11.8h,   v12.8h,   v1.8h
   3462 +    add             v12.8h,   v12.8h,   v1.8h
   3463 +    /* Descale to 8-bit and range limit */
   3464 +    movi            v0.16b,   #0x80
   3465 +    sqshrn          v8.8b,    v8.8h,    #5
   3466 +    sqshrn2         v8.16b,   v9.8h,    #5
   3467 +    sqshrn          v9.8b,    v10.8h,   #5
   3468 +    sqshrn2         v9.16b,   v11.8h,   #5
   3469 +    sqshrn          v10.8b,   v12.8h,   #5
   3470 +    sqshrn2         v10.16b,  v13.8h,   #5
   3471 +    sqshrn          v11.8b,   v14.8h,   #5
   3472 +    sqshrn2         v11.16b,  v15.8h,   #5
   3473 +    add             v8.16b,   v8.16b,   v0.16b
   3474 +    add             v9.16b,   v9.16b,   v0.16b
   3475 +    add             v10.16b,  v10.16b,  v0.16b
   3476 +    add             v11.16b,  v11.16b,  v0.16b
   3477 +    /* Transpose the final 8-bit samples */
   3478 +    /* Transpose  v8-v9 (16-bit elements) */
   3479 +    mov             v18.16b,  v8.16b
   3480 +    trn1            v8.8h,    v8.8h,    v9.8h
   3481 +    trn2            v9.8h,    v18.8h,   v9.8h
   3482 +    /* Transpose  v10-v11 */
   3483 +    mov             v18.16b,  v10.16b
   3484 +    trn1            v10.8h,   v10.8h,   v11.8h
   3485 +    trn2            v11.8h,   v18.8h,   v11.8h
   3486 +    /* Transpose  v8-v10 (32-bit elements) */
   3487 +    mov             v18.16b,  v8.16b
   3488 +    trn1            v8.4s,    v8.4s,    v10.4s
   3489 +    trn2            v10.4s,   v18.4s,   v10.4s
   3490 +    /* Transpose  v9-v11 */
   3491 +    mov             v18.16b,  v9.16b
   3492 +    trn1            v9.4s,    v9.4s,    v11.4s
   3493 +    trn2            v11.4s,   v18.4s,   v11.4s
   3494 +    /* Copy v8 high half into v17 */
   3495 +    ins             v17.2d[0], v8.2d[1]
   3496 +    /* Byte-transpose v8 low half with its high half (held in v17) */
   3497 +    mov             v18.16b,  v8.16b
   3498 +    trn1            v8.8b,    v8.8b,    v17.8b
   3499 +    trn2            v17.8b,   v18.8b,   v17.8b
   3500 +    /* Copy v9 high half into v19, then byte-transpose as above */
   3501 +    ins             v19.2d[0], v9.2d[1]
   3502 +    mov             v18.16b,  v9.16b
   3503 +    trn1            v9.8b,    v9.8b,    v19.8b
   3504 +    trn2            v19.8b,   v18.8b,   v19.8b
   3505 +    /* Store results to the output buffer */
   3506 +    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
   3507 +    add             TMP1,     TMP1,     OUTPUT_COL
   3508 +    add             TMP2,     TMP2,     OUTPUT_COL
   3509 +    st1             {v8.8b},  [TMP1]
   3510 +    st1             {v17.8b}, [TMP2]
   3511 +    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
   3512 +    add             TMP1,     TMP1,     OUTPUT_COL
   3513 +    add             TMP2,     TMP2,     OUTPUT_COL
   3514 +    st1             {v9.8b},  [TMP1]
   3515 +    /* Copy v10 high half into v7, then byte-transpose as above */
   3516 +    ins             v7.2d[0], v10.2d[1]
   3517 +    mov             v18.16b,  v10.16b
   3518 +    trn1            v10.8b,   v10.8b,   v7.8b
   3519 +    trn2            v7.8b,    v18.8b,   v7.8b
   3520 +    st1             {v19.8b}, [TMP2]
   3521 +    ldp             TMP1,     TMP2,     [OUTPUT_BUF], 16
   3522 +    ldp             TMP4,     TMP5,     [OUTPUT_BUF], 16
   3523 +    add             TMP1,     TMP1,     OUTPUT_COL
   3524 +    add             TMP2,     TMP2,     OUTPUT_COL
   3525 +    add             TMP4,     TMP4,     OUTPUT_COL
   3526 +    add             TMP5,     TMP5,     OUTPUT_COL
   3527 +    st1             {v10.8b}, [TMP1]
   3528 +    /* Copy v11 high half into v16, then byte-transpose as above */
   3529 +    ins             v16.2d[0], v11.2d[1]
   3530 +    mov             v18.16b,  v11.16b
   3531 +    trn1            v11.8b,   v11.8b,   v16.8b
   3532 +    trn2            v16.8b,   v18.8b,   v16.8b
   3533 +    st1             {v7.8b},  [TMP2]
   3534 +    st1             {v11.8b}, [TMP4]
   3535 +    st1             {v16.8b}, [TMP5]
   3536 +    /* Restore saved registers; the post-indexed loads also pop the frame */
   3537 +    ldp             x22, x23, [sp], 16
   3538 +    ld1             {v0.8b - v3.8b}, [sp], 32
   3539 +    ld1             {v4.8b - v7.8b}, [sp], 32
   3540 +    ld1             {v8.8b - v11.8b}, [sp], 32
   3541 +    ld1             {v12.8b - v15.8b}, [sp], 32
   3542 +    ld1             {v16.8b - v19.8b}, [sp], 32
   3543 +    ret
   3544 +
   3545 +    .unreq          DCT_TABLE
   3546 +    .unreq          COEF_BLOCK
   3547 +    .unreq          OUTPUT_BUF
   3548 +    .unreq          OUTPUT_COL
   3549 +    .unreq          TMP1
   3550 +    .unreq          TMP2
   3551 +    .unreq          TMP3
   3552 +    .unreq          TMP4
   3553 +    .unreq          TMP5
   3554 +
   3555 +/*****************************************************************************/
   3556 +
   3557 +/*
   3558 + * jsimd_idct_4x4_neon
   3559 + *
   3560 + * This function contains inverse-DCT code for getting reduced-size
   3561 + * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
   3562 + * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
   3563 + * function from jpeg-6b (jidctred.c).
   3564 + *
   3565 + * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
   3566 + *       requires much less arithmetic operations and hence should be faster.
   3567 + *       The primary purpose of this particular NEON optimized function is
   3568 + *       bit exact compatibility with jpeg-6b.
   3569 + *
   3570 + * TODO: a bit better instructions scheduling can be achieved by expanding
   3571 + *       idct_helper/transpose_4x4 macros and reordering instructions,
   3572 + *       but readability will suffer somewhat.
   3573 + */
   3574 +
   3575 +#define CONST_BITS  13
   3576 +
   3577 +#define FIX_0_211164243  (1730)  /* FIX(0.211164243) */
   3578 +#define FIX_0_509795579  (4176)  /* FIX(0.509795579) */
   3579 +#define FIX_0_601344887  (4926)  /* FIX(0.601344887) */
   3580 +#define FIX_0_720959822  (5906)  /* FIX(0.720959822) */
   3581 +#define FIX_0_765366865  (6270)  /* FIX(0.765366865) */
   3582 +#define FIX_0_850430095  (6967)  /* FIX(0.850430095) */
   3583 +#define FIX_0_899976223  (7373)  /* FIX(0.899976223) */
   3584 +#define FIX_1_061594337  (8697)  /* FIX(1.061594337) */
   3585 +#define FIX_1_272758580  (10426) /* FIX(1.272758580) */
   3586 +#define FIX_1_451774981  (11893) /* FIX(1.451774981) */
   3587 +#define FIX_1_847759065  (15137) /* FIX(1.847759065) */
   3588 +#define FIX_2_172734803  (17799) /* FIX(2.172734803) */
   3589 +#define FIX_2_562915447  (20995) /* FIX(2.562915447) */
   3590 +#define FIX_3_624509785  (29692) /* FIX(3.624509785) */
   3591 +
   3592 +.balign 16
   3593 +jsimd_idct_4x4_neon_consts:
   3594 +    .short     FIX_1_847759065     /* v0.4h[0] */
   3595 +    .short     -FIX_0_765366865    /* v0.4h[1] */
   3596 +    .short     -FIX_0_211164243    /* v0.4h[2] */
   3597 +    .short     FIX_1_451774981     /* v0.4h[3] */
   3598 +    .short     -FIX_2_172734803    /* v1.4h[0] */
   3599 +    .short     FIX_1_061594337     /* v1.4h[1] */
   3600 +    .short     -FIX_0_509795579    /* v1.4h[2] */
   3601 +    .short     -FIX_0_601344887    /* v1.4h[3] */
   3602 +    .short     FIX_0_899976223     /* v2.4h[0] */
   3603 +    .short     FIX_2_562915447     /* v2.4h[1] */
   3604 +    .short     1 << (CONST_BITS+1) /* v2.4h[2] */
   3605 +    .short     0                   /* v2.4h[3] */
   3606 +/* idct_helper: one 1-D pass over four 16-bit lanes; inputs \x4..\x16, results in \y26..\y29, scratch v20/v24/v26/v28/v30 */
   3607 +.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
   3608 +    smull           v28.4s, \x4,    v2.4h[2]
   3609 +    smlal           v28.4s, \x8,    v0.4h[0]
   3610 +    smlal           v28.4s, \x14,   v0.4h[1]
   3611 +    /* v26 = weighted sum of \x16, \x12, \x10, \x6 that is paired with v28 */
   3612 +    smull           v26.4s, \x16,   v1.4h[2]
   3613 +    smlal           v26.4s, \x12,   v1.4h[3]
   3614 +    smlal           v26.4s, \x10,   v2.4h[0]
   3615 +    smlal           v26.4s, \x6,    v2.4h[1]
   3616 +    /* v30 = same \x4 term as v28, but with the \x8 and \x14 terms subtracted */
   3617 +    smull           v30.4s, \x4,    v2.4h[2]
   3618 +    smlsl           v30.4s, \x8,    v0.4h[0]
   3619 +    smlsl           v30.4s, \x14,   v0.4h[1]
   3620 +    /* v24 = weighted sum of \x16, \x12, \x10, \x6 that is paired with v30 */
   3621 +    smull           v24.4s, \x16,   v0.4h[2]
   3622 +    smlal           v24.4s, \x12,   v0.4h[3]
   3623 +    smlal           v24.4s, \x10,   v1.4h[0]
   3624 +    smlal           v24.4s, \x6,    v1.4h[1]
   3625 +    /* First output pair: \y26 = v28 + v26, \y29 = v28 - v26 (then descaled) */
   3626 +    add             v20.4s, v28.4s, v26.4s
   3627 +    sub             v28.4s, v28.4s, v26.4s
   3628 +    /* rshrn to 16-bit lanes takes a shift of at most 16; larger shifts round-shift first, then narrow */
   3629 +.if \shift > 16
   3630 +    srshr           v20.4s, v20.4s, #\shift
   3631 +    srshr           v28.4s, v28.4s, #\shift
   3632 +    xtn             \y26,   v20.4s
   3633 +    xtn             \y29,   v28.4s
   3634 +.else
   3635 +    rshrn           \y26,   v20.4s, #\shift
   3636 +    rshrn           \y29,   v28.4s, #\shift
   3637 +.endif
   3638 +    /* Second output pair: \y27 = v30 + v24, \y28 = v30 - v24 (then descaled) */
   3639 +    add             v20.4s, v30.4s, v24.4s
   3640 +    sub             v30.4s, v30.4s, v24.4s
   3641 +
   3642 +.if \shift > 16
   3643 +    srshr           v20.4s, v20.4s, #\shift
   3644 +    srshr           v30.4s, v30.4s, #\shift
   3645 +    xtn             \y27,   v20.4s
   3646 +    xtn             \y28,   v30.4s
   3647 +.else
   3648 +    rshrn           \y27,   v20.4s, #\shift
   3649 +    rshrn           \y28,   v30.4s, #\shift
   3650 +.endif
   3651 +
   3652 +.endm
   3653 +
   3654 +asm_function jsimd_idct_4x4_neon
   3655 +
   3656 +    DCT_TABLE       .req x0
   3657 +    COEF_BLOCK      .req x1
   3658 +    OUTPUT_BUF      .req x2
   3659 +    OUTPUT_COL      .req x3
   3660 +    TMP1            .req x0
   3661 +    TMP2            .req x1
   3662 +    TMP3            .req x2
   3663 +    TMP4            .req x15
   3664 +
   3665 +    /* Save x15 and v0-v31 (low 64 bits); sp stays below the 272-byte frame */
   3666 +    sub             sp, sp, 272
   3667 +    str             x15, [sp]
   3668 +    add             TMP4, sp, 16        /* TMP4 (x15) walks the save area */
   3669 +    st1             {v0.8b - v3.8b}, [TMP4], 32
   3670 +    st1             {v4.8b - v7.8b}, [TMP4], 32
   3671 +    st1             {v8.8b - v11.8b}, [TMP4], 32
   3672 +    st1             {v12.8b - v15.8b}, [TMP4], 32
   3673 +    st1             {v16.8b - v19.8b}, [TMP4], 32
   3674 +    st1             {v20.8b - v23.8b}, [TMP4], 32
   3675 +    st1             {v24.8b - v27.8b}, [TMP4], 32
   3676 +    st1             {v28.8b - v31.8b}, [TMP4], 32
   3677 +    adr             TMP4, jsimd_idct_4x4_neon_consts
   3678 +    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4] /* constants (v3.4h is just padding) */
   3679 +
   3680 +    /* Load all COEF_BLOCK into NEON registers with the following allocation:
   3681 +     *       0 1 2 3 | 4 5 6 7
   3682 +     *      ---------+--------
   3683 +     *   0 | v4.4h   | v5.4h
   3684 +     *   1 | v6.4h   | v7.4h
   3685 +     *   2 | v8.4h   | v9.4h
   3686 +     *   3 | v10.4h  | v11.4h
   3687 +     *   4 | -       | -
   3688 +     *   5 | v12.4h  | v13.4h
   3689 +     *   6 | v14.4h  | v15.4h
   3690 +     *   7 | v16.4h  | v17.4h
   3691 +     */
   3692 +    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
   3693 +    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
   3694 +    add             COEF_BLOCK, COEF_BLOCK, #16
   3695 +    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
   3696 +    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
   3697 +    /* dequantize */
   3698 +    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
   3699 +    mul             v4.4h, v4.4h, v18.4h
   3700 +    mul             v5.4h, v5.4h, v19.4h
   3701 +    ins             v4.2d[1], v5.2d[0]    /* 128 bit q4 */
   3702 +    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
   3703 +    mul             v6.4h, v6.4h, v20.4h
   3704 +    mul             v7.4h, v7.4h, v21.4h
   3705 +    ins             v6.2d[1], v7.2d[0]    /* 128 bit q6 */
   3706 +    mul             v8.4h, v8.4h, v22.4h
   3707 +    mul             v9.4h, v9.4h, v23.4h
   3708 +    ins             v8.2d[1], v9.2d[0]    /* 128 bit q8 */
   3709 +    add             DCT_TABLE, DCT_TABLE, #16
   3710 +    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
   3711 +    mul             v10.4h, v10.4h, v24.4h
   3712 +    mul             v11.4h, v11.4h, v25.4h
   3713 +    ins             v10.2d[1], v11.2d[0]  /* 128 bit q10 */
   3714 +    mul             v12.4h, v12.4h, v26.4h
   3715 +    mul             v13.4h, v13.4h, v27.4h
   3716 +    ins             v12.2d[1], v13.2d[0]  /* 128 bit q12 */
   3717 +    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
   3718 +    mul             v14.4h, v14.4h, v28.4h
   3719 +    mul             v15.4h, v15.4h, v29.4h
   3720 +    ins             v14.2d[1], v15.2d[0]  /* 128 bit q14 */
   3721 +    mul             v16.4h, v16.4h, v30.4h
   3722 +    mul             v17.4h, v17.4h, v31.4h
   3723 +    ins             v16.2d[1], v17.2d[0]  /* 128 bit q16 */
   3724 +
   3725 +    /* Pass 1 */
   3726 +    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, v4.4h, v6.4h, v8.4h, v10.4h
   3727 +    transpose_4x4   v4, v6, v8, v10, v3
   3728 +    ins             v10.2d[1], v11.2d[0]
   3729 +    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, v5.4h, v7.4h, v9.4h, v11.4h
   3730 +    transpose_4x4   v5, v7, v9, v11, v3
   3731 +    ins             v10.2d[1], v11.2d[0]
   3732 +    /* Pass 2 */
   3733 +    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, v26.4h, v27.4h, v28.4h, v29.4h
   3734 +    transpose_4x4   v26, v27, v28, v29, v3
   3735 +
   3736 +    /* Range limit */
   3737 +    movi            v30.8h, #0x80
   3738 +    ins             v26.2d[1], v27.2d[0]
   3739 +    ins             v28.2d[1], v29.2d[0]
   3740 +    add             v26.8h, v26.8h, v30.8h
   3741 +    add             v28.8h, v28.8h, v30.8h
   3742 +    sqxtun          v26.8b, v26.8h
   3743 +    sqxtun          v27.8b, v28.8h
   3744 +
   3745 +    /* Store results to the output buffer */
   3746 +    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
   3747 +    ldp             TMP3, TMP4, [OUTPUT_BUF]
   3748 +    add             TMP1, TMP1, OUTPUT_COL
   3749 +    add             TMP2, TMP2, OUTPUT_COL
   3750 +    add             TMP3, TMP3, OUTPUT_COL
   3751 +    add             TMP4, TMP4, OUTPUT_COL
   3752 +
   3753 +#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
   3754 +    /* We can use much less instructions on little endian systems if the
   3755 +     * OS kernel is not configured to trap unaligned memory accesses
   3756 +     */
   3757 +    st1             {v26.s}[0], [TMP1], 4
   3758 +    st1             {v27.s}[0], [TMP3], 4
   3759 +    st1             {v26.s}[1], [TMP2], 4
   3760 +    st1             {v27.s}[1], [TMP4], 4
   3761 +#else
   3762 +    st1             {v26.b}[0], [TMP1], 1
   3763 +    st1             {v27.b}[0], [TMP3], 1
   3764 +    st1             {v26.b}[1], [TMP1], 1
   3765 +    st1             {v27.b}[1], [TMP3], 1
   3766 +    st1             {v26.b}[2], [TMP1], 1
   3767 +    st1             {v27.b}[2], [TMP3], 1
   3768 +    st1             {v26.b}[3], [TMP1], 1
   3769 +    st1             {v27.b}[3], [TMP3], 1
   3770 +
   3771 +    st1             {v26.b}[4], [TMP2], 1
   3772 +    st1             {v27.b}[4], [TMP4], 1
   3773 +    st1             {v26.b}[5], [TMP2], 1
   3774 +    st1             {v27.b}[5], [TMP4], 1
   3775 +    st1             {v26.b}[6], [TMP2], 1
   3776 +    st1             {v27.b}[6], [TMP4], 1
   3777 +    st1             {v26.b}[7], [TMP2], 1
   3778 +    st1             {v27.b}[7], [TMP4], 1
   3779 +#endif
   3780 +
   3781 +    /* Restore the saved registers.  The post-indexed loads also pop the
   3782 +     * 272-byte frame, so sp is back at its entry value before returning. */
   3783 +    ldr             x15, [sp], 16
   3784 +    ld1             {v0.8b - v3.8b}, [sp], 32
   3785 +    ld1             {v4.8b - v7.8b}, [sp], 32
   3786 +    ld1             {v8.8b - v11.8b}, [sp], 32
   3787 +    ld1             {v12.8b - v15.8b}, [sp], 32
   3788 +    ld1             {v16.8b - v19.8b}, [sp], 32
   3789 +    ld1             {v20.8b - v23.8b}, [sp], 32
   3790 +    ld1             {v24.8b - v27.8b}, [sp], 32
   3791 +    ld1             {v28.8b - v31.8b}, [sp], 32
   3792 +    ret
   3793 +
   3794 +    .unreq          DCT_TABLE
   3795 +    .unreq          COEF_BLOCK
   3796 +    .unreq          OUTPUT_BUF
   3797 +    .unreq          OUTPUT_COL
   3798 +    .unreq          TMP1
   3799 +    .unreq          TMP2
   3800 +    .unreq          TMP3
   3801 +    .unreq          TMP4
   3802 +
   3803 +.purgem idct_helper
   3804 +
   3805 +
   3806 +/*****************************************************************************/
   3807 +
   3808 +/*
   3809 + * jsimd_idct_2x2_neon
   3810 + *
   3811 + * This function contains inverse-DCT code for getting reduced-size
   3812 + * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
   3813 + * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
   3814 + * function from jpeg-6b (jidctred.c).
   3815 + *
   3816 + * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
   3817 + *       requires much less arithmetic operations and hence should be faster.
   3818 + *       The primary purpose of this particular NEON optimized function is
   3819 + *       bit exact compatibility with jpeg-6b.
   3820 + */
   3821 +
   3822 +.balign 8
   3823 +jsimd_idct_2x2_neon_consts:
   3824 +    .short     -FIX_0_720959822    /* v14.4h[0] */
   3825 +    .short     FIX_0_850430095     /* v14.4h[1] */
   3826 +    .short     -FIX_1_272758580    /* v14.4h[2] */
   3827 +    .short     FIX_3_624509785     /* v14.4h[3] */
   3828 +/* idct_helper: one 1-D pass over four 16-bit lanes; inputs \x4..\x16, results in \y26/\y27, scratch v15/v20/v26 (constants in v14) */
   3829 +.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
   3830 +    sshll      v15.4s, \x4,    #15
   3831 +    smull      v26.4s, \x6,    v14.4h[3]
   3832 +    smlal      v26.4s, \x10,   v14.4h[2]
   3833 +    smlal      v26.4s, \x12,   v14.4h[1]
   3834 +    smlal      v26.4s, \x16,   v14.4h[0]
   3835 +    /* \y26 comes from (\x4 << 15) + v26, \y27 from (\x4 << 15) - v26 */
   3836 +    add        v20.4s, v15.4s, v26.4s
   3837 +    sub        v15.4s, v15.4s, v26.4s
   3838 +    /* rshrn to 16-bit lanes takes a shift of at most 16; larger shifts round-shift first, then narrow */
   3839 +.if \shift > 16
   3840 +    srshr      v20.4s, v20.4s, #\shift
   3841 +    srshr      v15.4s, v15.4s, #\shift
   3842 +    xtn        \y26,   v20.4s
   3843 +    xtn        \y27,   v15.4s
   3844 +.else
   3845 +    rshrn      \y26,   v20.4s, #\shift
   3846 +    rshrn      \y27,   v15.4s, #\shift
   3847 +.endif
   3848 +
   3849 +.endm
   3850 +
   3851 +asm_function jsimd_idct_2x2_neon
   3852 +    /* Arguments arrive in x0-x3. x9 (no alias) is the save-area cursor in the prologue and epilogue. */
   3853 +    DCT_TABLE       .req x0     /* 16-bit multipliers applied to the coefficients */
   3854 +    COEF_BLOCK      .req x1     /* 16-bit coefficients, 16 bytes per row */
   3855 +    OUTPUT_BUF      .req x2     /* points at two output row pointers */
   3856 +    OUTPUT_COL      .req x3     /* added to both row pointers. NOTE(review): all 64 bits are used - confirm the caller passes a zero-extended value */
   3857 +    TMP1            .req x0     /* same register as DCT_TABLE; first written after the last DCT_TABLE load */
   3858 +    TMP2            .req x15
   3859 +
   3860 +    /* Reserve the 208-byte save area by lowering sp first, then fill it through x9, so that no saved data ever lives below sp */
   3861 +    str             x15, [sp, #-208]!
   3862 +    add             x9, sp, #16
   3863 +
   3864 +    /* Load constants */
   3865 +    adr             TMP2, jsimd_idct_2x2_neon_consts
   3866 +    st1             {v4.8b - v7.8b}, [x9], 32
   3867 +    st1             {v8.8b - v11.8b}, [x9], 32
   3868 +    st1             {v12.8b - v15.8b}, [x9], 32
   3869 +    st1             {v16.8b - v19.8b}, [x9], 32
   3870 +    st1             {v21.8b - v22.8b}, [x9], 16
   3871 +    st1             {v24.8b - v27.8b}, [x9], 32
   3872 +    st1             {v30.8b - v31.8b}, [x9], 16
   3873 +    ld1             {v14.4h}, [TMP2]
   3874 +
   3875 +    /* Load the needed COEF_BLOCK rows into NEON registers with the following allocation:
   3876 +     *       0 1 2 3 | 4 5 6 7
   3877 +     *      ---------+--------
   3878 +     *   0 | v4.4h   | v5.4h
   3879 +     *   1 | v6.4h   | v7.4h
   3880 +     *   2 | -       | -
   3881 +     *   3 | v10.4h  | v11.4h
   3882 +     *   4 | -       | -
   3883 +     *   5 | v12.4h  | v13.4h
   3884 +     *   6 | -       | -
   3885 +     *   7 | v16.4h  | v17.4h
   3886 +     */
   3887 +    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32    /* rows 0 and 1 */
   3888 +    add             COEF_BLOCK, COEF_BLOCK, #16                       /* skip row 2 */
   3889 +    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16                /* row 3 */
   3890 +    add             COEF_BLOCK, COEF_BLOCK, #16                       /* skip row 4 */
   3891 +    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16                /* row 5 */
   3892 +    add             COEF_BLOCK, COEF_BLOCK, #16                       /* skip row 6 */
   3893 +    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16                /* row 7 */
   3894 +    /* Dequantize: multiply each loaded row by the matching DCT_TABLE row, then pack both halves into one .8h register */
   3895 +    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
   3896 +    mul             v4.4h, v4.4h, v18.4h
   3897 +    mul             v5.4h, v5.4h, v19.4h
   3898 +    ins             v4.2d[1], v5.2d[0]                                /* v4.8h = row 0 */
   3899 +    mul             v6.4h, v6.4h, v20.4h
   3900 +    mul             v7.4h, v7.4h, v21.4h
   3901 +    ins             v6.2d[1], v7.2d[0]                                /* v6.8h = row 1 */
   3902 +    add             DCT_TABLE, DCT_TABLE, #16
   3903 +    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
   3904 +    mul             v10.4h, v10.4h, v24.4h
   3905 +    mul             v11.4h, v11.4h, v25.4h
   3906 +    ins             v10.2d[1], v11.2d[0]                              /* v10.8h = row 3 */
   3907 +    add             DCT_TABLE, DCT_TABLE, #16
   3908 +    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
   3909 +    mul             v12.4h, v12.4h, v26.4h
   3910 +    mul             v13.4h, v13.4h, v27.4h
   3911 +    ins             v12.2d[1], v13.2d[0]                              /* v12.8h = row 5 */
   3912 +    add             DCT_TABLE, DCT_TABLE, #16
   3913 +    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
   3914 +    mul             v16.4h, v16.4h, v30.4h
   3915 +    mul             v17.4h, v17.4h, v31.4h
   3916 +    ins             v16.2d[1], v17.2d[0]                              /* v16.8h = row 7 */
   3917 +
   3918 +    /* Pass 1: the #else branch is the live code; it is the idct_helper arithmetic written out for both column halves with a shift of 13 */
   3919 +#if 0
   3920 +    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
   3921 +    transpose_4x4   v4.4h, v6.4h, v8.4h,  v10.4h
   3922 +    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
   3923 +    transpose_4x4   v5.4h, v7.4h, v9.4h,  v11.4h
   3924 +#else
   3925 +    smull           v26.4s, v6.4h,  v14.4h[3]     /* v26 = weighted sum of rows 1,3,5,7 for columns 0-3 */
   3926 +    smlal           v26.4s, v10.4h, v14.4h[2]
   3927 +    smlal           v26.4s, v12.4h, v14.4h[1]
   3928 +    smlal           v26.4s, v16.4h, v14.4h[0]
   3929 +    smull           v24.4s, v7.4h,  v14.4h[3]     /* v24 = the same sum for columns 4-7 */
   3930 +    smlal           v24.4s, v11.4h, v14.4h[2]
   3931 +    smlal           v24.4s, v13.4h, v14.4h[1]
   3932 +    smlal           v24.4s, v17.4h, v14.4h[0]
   3933 +    sshll           v15.4s, v4.4h,  #15           /* row 0 << 15, columns 0-3 */
   3934 +    sshll           v30.4s, v5.4h,  #15           /* row 0 << 15, columns 4-7 */
   3935 +    add             v20.4s, v15.4s, v26.4s
   3936 +    sub             v15.4s, v15.4s, v26.4s
   3937 +    rshrn           v4.4h,  v20.4s, #13           /* sum,        columns 0-3 */
   3938 +    rshrn           v6.4h,  v15.4s, #13           /* difference, columns 0-3 */
   3939 +    add             v20.4s, v30.4s, v24.4s
   3940 +    sub             v15.4s, v30.4s, v24.4s
   3941 +    rshrn           v5.4h,  v20.4s, #13           /* sum,        columns 4-7 */
   3942 +    rshrn           v7.4h,  v15.4s, #13           /* difference, columns 4-7 */
   3943 +    ins             v4.2d[1], v5.2d[0]
   3944 +    ins             v6.2d[1], v7.2d[0]
   3945 +    transpose       v4, v6, v3, .16b, .8h         /* transpose is defined outside this section; presumably v3 is its scratch register - verify */
   3946 +    transpose       v6, v10, v3, .16b, .4s
   3947 +    ins             v11.2d[0], v10.2d[1]          /* split upper halves out so pass 2 can use .4h operands */
   3948 +    ins             v7.2d[0], v6.2d[1]
   3949 +#endif
   3950 +
   3951 +    /* Pass 2: shift of 20 exceeds 16, so idct_helper takes its srshr + xtn path */
   3952 +    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
   3953 +
   3954 +    /* Range limit: add 0x80 to every result and saturate to unsigned 8 bits */
   3955 +    movi            v30.8h, #0x80
   3956 +    ins             v26.2d[1], v27.2d[0]          /* v26.h[4..7] = the v27 results */
   3957 +    add             v26.8h, v26.8h, v30.8h
   3958 +    sqxtun          v30.8b, v26.8h
   3959 +    ins             v26.2d[0], v30.2d[0]          /* v26.b[0..7] = saturated bytes; v26.h[4..7] still holds the 16-bit values */
   3960 +    sqxtun          v27.8b, v26.8h                /* v27.b[4..7] = saturated bytes of v26.h[4..7] */
   3961 +
   3962 +    /* Store results to the output buffer: two bytes to each of the two rows */
   3963 +    ldp             TMP1, TMP2, [OUTPUT_BUF]
   3964 +    add             TMP1, TMP1, OUTPUT_COL
   3965 +    add             TMP2, TMP2, OUTPUT_COL
   3966 +
   3967 +    st1             {v26.b}[0], [TMP1], 1
   3968 +    st1             {v27.b}[4], [TMP1], 1
   3969 +    st1             {v26.b}[1], [TMP2], 1
   3970 +    st1             {v27.b}[5], [TMP2], 1
   3971 +    /* Restore in the order saved; sp is released only by the final post-indexed ldr */
   3972 +    add             x9, sp, #16
   3973 +    ld1             {v4.8b - v7.8b}, [x9], 32
   3974 +    ld1             {v8.8b - v11.8b}, [x9], 32
   3975 +    ld1             {v12.8b - v15.8b}, [x9], 32
   3976 +    ld1             {v16.8b - v19.8b}, [x9], 32
   3977 +    ld1             {v21.8b - v22.8b}, [x9], 16
   3978 +    ld1             {v24.8b - v27.8b}, [x9], 32
   3979 +    ld1             {v30.8b - v31.8b}, [x9], 16
   3980 +    ldr             x15, [sp], #208
   3981 +    ret                                           /* return through x30 (a plain return, not a branch-with-link) */
   3982 +
   3983 +    .unreq          DCT_TABLE
   3984 +    .unreq          COEF_BLOCK
   3985 +    .unreq          OUTPUT_BUF
   3986 +    .unreq          OUTPUT_COL
   3987 +    .unreq          TMP1
   3988 +    .unreq          TMP2
   3989 +
   3990 +.purgem idct_helper
   3991 +
   3992 +
   3993 +/*****************************************************************************/
   3994 +
   3995 +/*
   3996 + * jsimd_ycc_extrgb_convert_neon
   3997 + * jsimd_ycc_extbgr_convert_neon
   3998 + * jsimd_ycc_extrgbx_convert_neon
   3999 + * jsimd_ycc_extbgrx_convert_neon
   4000 + * jsimd_ycc_extxbgr_convert_neon
   4001 + * jsimd_ycc_extxrgb_convert_neon
   4002 + *
   4003 + * Colorspace conversion YCbCr -> RGB (jsimd_ycc_rgb565_convert_neon is generated by the same macro)
   4004 + */
   4005 +
   4006 +/* do_load size: read "size" bytes from each of U, V and Y into v4, v5 and v0, advancing the pointers. Sizes 4, 2 and 1 fill lanes 0-3, 4-5 and 6, so consecutive tail loads land in disjoint lanes. */
   4007 +.macro do_load size
   4008 +    .if \size == 8
   4009 +        ld1  {v4.8b}, [U], 8
   4010 +        ld1  {v5.8b}, [V], 8
   4011 +        ld1  {v0.8b}, [Y], 8
   4012 +        prfm PLDL1KEEP, [U, #64]    /* prefetch 64 bytes ahead of each stream */
   4013 +        prfm PLDL1KEEP, [V, #64]
   4014 +        prfm PLDL1KEEP, [Y, #64]
   4015 +    .elseif \size == 4
   4016 +        ld1  {v4.b}[0], [U], 1
   4017 +        ld1  {v4.b}[1], [U], 1
   4018 +        ld1  {v4.b}[2], [U], 1
   4019 +        ld1  {v4.b}[3], [U], 1
   4020 +        ld1  {v5.b}[0], [V], 1
   4021 +        ld1  {v5.b}[1], [V], 1
   4022 +        ld1  {v5.b}[2], [V], 1
   4023 +        ld1  {v5.b}[3], [V], 1
   4024 +        ld1  {v0.b}[0], [Y], 1
   4025 +        ld1  {v0.b}[1], [Y], 1
   4026 +        ld1  {v0.b}[2], [Y], 1
   4027 +        ld1  {v0.b}[3], [Y], 1
   4028 +    .elseif \size == 2
   4029 +        ld1  {v4.b}[4], [U], 1
   4030 +        ld1  {v4.b}[5], [U], 1
   4031 +        ld1  {v5.b}[4], [V], 1
   4032 +        ld1  {v5.b}[5], [V], 1
   4033 +        ld1  {v0.b}[4], [Y], 1
   4034 +        ld1  {v0.b}[5], [Y], 1
   4035 +    .elseif \size == 1
   4036 +        ld1  {v4.b}[6], [U], 1
   4037 +        ld1  {v5.b}[6], [V], 1
   4038 +        ld1  {v0.b}[6], [Y], 1
   4039 +    .else
   4040 +        .error unsupported macroblock size
   4041 +    .endif
   4042 +.endm
   4043 +/* do_store bpp, size: write "size" pixels to RGB and advance it. 24 bpp interleaves v10-v12, 32 bpp interleaves v10-v13, 16 bpp writes halfwords of v25. Sizes 4, 2 and 1 use lanes 0-3, 4-5 and 6, mirroring do_load. */
   4044 +.macro do_store bpp, size
   4045 +    .if \bpp == 24
   4046 +        .if \size == 8
   4047 +            st3  {v10.8b, v11.8b, v12.8b}, [RGB], 24
   4048 +        .elseif \size == 4
   4049 +            st3  {v10.b, v11.b, v12.b}[0], [RGB], 3
   4050 +            st3  {v10.b, v11.b, v12.b}[1], [RGB], 3
   4051 +            st3  {v10.b, v11.b, v12.b}[2], [RGB], 3
   4052 +            st3  {v10.b, v11.b, v12.b}[3], [RGB], 3
   4053 +        .elseif \size == 2
   4054 +            st3  {v10.b, v11.b, v12.b}[4], [RGB], 3
   4055 +            st3  {v10.b, v11.b, v12.b}[5], [RGB], 3
   4056 +        .elseif \size == 1
   4057 +            st3  {v10.b, v11.b, v12.b}[6], [RGB], 3
   4058 +        .else
   4059 +            .error unsupported macroblock size
   4060 +        .endif
   4061 +    .elseif \bpp == 32
   4062 +        .if \size == 8
   4063 +            st4  {v10.8b, v11.8b, v12.8b, v13.8b}, [RGB], 32
   4064 +        .elseif \size == 4
   4065 +            st4  {v10.b, v11.b, v12.b, v13.b}[0], [RGB], 4
   4066 +            st4  {v10.b, v11.b, v12.b, v13.b}[1], [RGB], 4
   4067 +            st4  {v10.b, v11.b, v12.b, v13.b}[2], [RGB], 4
   4068 +            st4  {v10.b, v11.b, v12.b, v13.b}[3], [RGB], 4
   4069 +        .elseif \size == 2
   4070 +            st4  {v10.b, v11.b, v12.b, v13.b}[4], [RGB], 4
   4071 +            st4  {v10.b, v11.b, v12.b, v13.b}[5], [RGB], 4
   4072 +        .elseif \size == 1
   4073 +            st4  {v10.b, v11.b, v12.b, v13.b}[6], [RGB], 4
   4074 +        .else
   4075 +            .error unsupported macroblock size
   4076 +        .endif
   4077 +    .elseif \bpp==16
   4078 +        .if \size == 8
   4079 +            st1  {v25.8h}, [RGB],16
   4080 +        .elseif \size == 4
   4081 +            st1  {v25.4h}, [RGB],8    /* low four halfwords = lanes 0-3 */
   4082 +        .elseif \size == 2
   4083 +            st1  {v25.h}[4], [RGB],2
   4084 +            st1  {v25.h}[5], [RGB],2
   4085 +        .elseif \size == 1
   4086 +            st1  {v25.h}[6], [RGB],2
   4087 +        .else
   4088 +            .error unsupported macroblock size
   4089 +        .endif
   4090 +     .else
   4091 +        .error unsupported bpp
   4092 +    .endif
   4093 +.endm
   4094 +
   4095 +.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, rsize, g_offs, gsize, b_offs, bsize, defsize
   4096 +    /* Emits jsimd_ycc_<colorid>_convert_neon plus its private constant table. r_offs/g_offs/b_offs pick which of v10..v13 receives each channel. */
   4097 +/*
   4098 + * 2-stage pipelined YCbCr->RGB conversion: stage 1 does the multiplies, stage 2 shifts, adds Y and packs
   4099 + */
   4100 +
   4101 +.macro do_yuv_to_rgb_stage1
   4102 +    uaddw        v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
   4103 +    uaddw        v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
   4104 +    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
   4105 +    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
   4106 +    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
   4107 +    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
   4108 +    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
   4109 +    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
   4110 +    smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
   4111 +    smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
   4112 +.endm
   4113 +
   4114 +.macro do_yuv_to_rgb_stage2
   4115 +    rshrn        v20.4h, v20.4s, #15     /* green term */
   4116 +    rshrn2       v20.8h, v22.4s, #15
   4117 +    rshrn        v24.4h, v24.4s, #14     /* red term */
   4118 +    rshrn2       v24.8h, v26.4s, #14
   4119 +    rshrn        v28.4h, v28.4s, #14     /* blue term */
   4120 +    rshrn2       v28.8h, v30.4s, #14
   4121 +    uaddw        v20.8h, v20.8h, v0.8b   /* add Y to each term */
   4122 +    uaddw        v24.8h, v24.8h, v0.8b
   4123 +    uaddw        v28.8h, v28.8h, v0.8b
   4124 +.if \bpp != 16
   4125 +    sqxtun       v1\g_offs\defsize, v20.8h   /* saturate to bytes in the channel's v10..v13 slot */
   4126 +    sqxtun       v1\r_offs\defsize, v24.8h
   4127 +    sqxtun       v1\b_offs\defsize, v28.8h
   4128 +.else
   4129 +    sqshlu       v21.8h, v20.8h, #8      /* clamp each channel and move it to the top byte of its halfword */
   4130 +    sqshlu       v25.8h, v24.8h, #8
   4131 +    sqshlu       v29.8h, v28.8h, #8
   4132 +    sri          v25.8h, v21.8h, #5      /* v25 = red in bits 15..11, green inserted below */
   4133 +    sri          v25.8h, v29.8h, #11     /* blue inserted in bits 4..0 */
   4134 +.endif
   4135 +
   4136 +.endm
   4137 +
   4138 +.macro do_yuv_to_rgb_stage2_store_load_stage1
   4139 +    rshrn        v20.4h, v20.4s, #15     /* stage 2 of the current 8 pixels, interleaved with the loads and stage 1 of the next 8 */
   4140 +    rshrn        v24.4h, v24.4s, #14
   4141 +    rshrn        v28.4h, v28.4s, #14
   4142 +    ld1          {v4.8b}, [U], 8
   4143 +    rshrn2       v20.8h, v22.4s, #15
   4144 +    rshrn2       v24.8h, v26.4s, #14
   4145 +    rshrn2       v28.8h, v30.4s, #14
   4146 +    ld1          {v5.8b}, [V], 8
   4147 +    uaddw        v20.8h, v20.8h, v0.8b
   4148 +    uaddw        v24.8h, v24.8h, v0.8b
   4149 +    uaddw        v28.8h, v28.8h, v0.8b
   4150 +.if \bpp != 16 /**************** rgb24/rgb32 *********************************/
   4151 +    sqxtun       v1\g_offs\defsize, v20.8h
   4152 +    ld1          {v0.8b}, [Y], 8
   4153 +    sqxtun       v1\r_offs\defsize, v24.8h
   4154 +    prfm         PLDL1KEEP, [U, #64]
   4155 +    prfm         PLDL1KEEP, [V, #64]
   4156 +    prfm         PLDL1KEEP, [Y, #64]
   4157 +    sqxtun       v1\b_offs\defsize, v28.8h
   4158 +    uaddw        v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
   4159 +    uaddw        v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
   4160 +    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
   4161 +    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
   4162 +    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
   4163 +    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
   4164 +    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
   4165 +    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
   4166 +.else /**************************** rgb565 ***********************************/
   4167 +    sqshlu       v21.8h, v20.8h, #8
   4168 +    sqshlu       v25.8h, v24.8h, #8
   4169 +    sqshlu       v29.8h, v28.8h, #8
   4170 +    uaddw        v6.8h, v2.8h, v4.8b     /* v6.8h = u - 128 */
   4171 +    uaddw        v8.8h, v2.8h, v5.8b     /* v8.8h = v - 128 */
   4172 +    ld1          {v0.8b}, [Y], 8
   4173 +    smull        v20.4s, v6.4h, v1.4h[1] /* multiply by -11277 */
   4174 +    smlal        v20.4s, v8.4h, v1.4h[2] /* multiply by -23401 */
   4175 +    smull2       v22.4s, v6.8h, v1.4h[1] /* multiply by -11277 */
   4176 +    smlal2       v22.4s, v8.8h, v1.4h[2] /* multiply by -23401 */
   4177 +    sri          v25.8h, v21.8h, #5
   4178 +    smull        v24.4s, v8.4h, v1.4h[0] /* multiply by 22971 */
   4179 +    smull2       v26.4s, v8.8h, v1.4h[0] /* multiply by 22971 */
   4180 +    prfm         PLDL1KEEP, [U, #64]
   4181 +    prfm         PLDL1KEEP, [V, #64]
   4182 +    prfm         PLDL1KEEP, [Y, #64]
   4183 +    sri          v25.8h, v29.8h, #11
   4184 +.endif
   4185 +    do_store     \bpp, 8
   4186 +    smull        v28.4s, v6.4h, v1.4h[3] /* multiply by 29033 */
   4187 +    smull2       v30.4s, v6.8h, v1.4h[3] /* multiply by 29033 */
   4188 +.endm
   4189 +
   4190 +.macro do_yuv_to_rgb
   4191 +    do_yuv_to_rgb_stage1
   4192 +    do_yuv_to_rgb_stage2
   4193 +.endm
   4194 +
   4195 +/* Apple gas crashes on adrl, work around that by using adr.
   4196 + * But this requires a copy of these constants for each function.
   4197 + */
   4198 +
   4199 +.balign 16
   4200 +jsimd_ycc_\colorid\()_neon_consts:
   4201 +    .short          0,      0,     0,      0         /* loaded into v0.4h, padding only */
   4202 +    .short          22971, -11277, -23401, 29033     /* loaded into v1.4h, the multipliers */
   4203 +    .short          -128,  -128,   -128,   -128      /* loaded into v2.8h, the -128 bias */
   4204 +    .short          -128,  -128,   -128,   -128
   4205 +
   4206 +asm_function jsimd_ycc_\colorid\()_convert_neon
   4207 +    OUTPUT_WIDTH    .req x0     /* pixels per row. NOTE(review): all 64 bits of x0, x2 and x4 are used - confirm callers pass extended values */
   4208 +    INPUT_BUF       .req x1     /* points at three plane pointers */
   4209 +    INPUT_ROW       .req x2     /* index of the first row to read from each plane */
   4210 +    OUTPUT_BUF      .req x3     /* array of output row pointers, consumed one per row */
   4211 +    NUM_ROWS        .req x4
   4212 +
   4213 +    INPUT_BUF0      .req x5
   4214 +    INPUT_BUF1      .req x6
   4215 +    INPUT_BUF2      .req INPUT_BUF
   4216 +
   4217 +    RGB             .req x7
   4218 +    Y               .req x8
   4219 +    U               .req x9
   4220 +    V               .req x10
   4221 +    N               .req x15
   4222 +
   4223 +    sub             sp, sp, #336        /* reserve the whole save area first so that nothing is kept below sp */
   4224 +    str             x15, [sp]
   4225 +    add             x11, sp, #16        /* x11 = save-area cursor (scratch register, not otherwise used here) */
   4226 +    /* Load constants: v1.4h = multipliers, v2.8h = -128 bias (v0.4h is just padding) */
   4227 +    adr             x15, jsimd_ycc_\colorid\()_neon_consts
   4228 +    /* Save NEON registers (low 64 bits of each) */
   4229 +    st1             {v0.8b - v3.8b}, [x11], 32
   4230 +    st1             {v4.8b - v7.8b}, [x11], 32
   4231 +    st1             {v8.8b - v11.8b}, [x11], 32
   4232 +    st1             {v12.8b - v15.8b}, [x11], 32
   4233 +    st1             {v16.8b - v19.8b}, [x11], 32
   4234 +    st1             {v20.8b - v23.8b}, [x11], 32
   4235 +    st1             {v24.8b - v27.8b}, [x11], 32
   4236 +    st1             {v28.8b - v31.8b}, [x11], 32
   4237 +    ld1             {v0.4h, v1.4h}, [x15], 16
   4238 +    ld1             {v2.8h}, [x15]
   4239 +
   4240 +    /* Save general registers x4-x10 and x30, then fetch the three plane pointers */
   4241 +    stp             x4, x5, [x11], 16
   4242 +    stp             x6, x7, [x11], 16
   4243 +    stp             x8, x9, [x11], 16
   4244 +    stp             x10, x30, [x11], 16
   4245 +    ldr             INPUT_BUF0, [INPUT_BUF]
   4246 +    ldr             INPUT_BUF1, [INPUT_BUF, 8]
   4247 +    ldr             INPUT_BUF2, [INPUT_BUF, 16]
   4248 +    .unreq          INPUT_BUF
   4249 +
   4250 +    /* Preset v10 and v13 to 0xFF: in 32 bpp formats whichever of them receives no colour channel is stored as the filler byte */
   4251 +    movi            v10.16b, #255
   4252 +    movi            v13.16b, #255
   4253 +
   4254 +    /* Outer loop over scanlines */
   4255 +    cmp             NUM_ROWS, #1
   4256 +    blt             9f
   4257 +0:
   4258 +    lsl             x16, INPUT_ROW, #3      /* byte offset of this row's entry in the 8-byte pointer arrays */
   4259 +    ldr             Y, [INPUT_BUF0, x16]
   4260 +    ldr             U, [INPUT_BUF1, x16]
   4261 +    mov             N, OUTPUT_WIDTH
   4262 +    ldr             V, [INPUT_BUF2, x16]
   4263 +    add             INPUT_ROW, INPUT_ROW, #1
   4264 +    ldr             RGB, [OUTPUT_BUF], #8
   4265 +
   4266 +    /* Inner loop over pixels, 8 at a time; stage 1 of the next group overlaps stage 2 of the current one */
   4267 +    subs            N, N, #8
   4268 +    blt             3f
   4269 +    do_load         8
   4270 +    do_yuv_to_rgb_stage1
   4271 +    subs            N, N, #8
   4272 +    blt             2f
   4273 +1:
   4274 +    do_yuv_to_rgb_stage2_store_load_stage1
   4275 +    subs            N, N, #8
   4276 +    bge             1b
   4277 +2:
   4278 +    do_yuv_to_rgb_stage2
   4279 +    do_store        \bpp, 8
   4280 +    tst             N, #7                   /* the low three bits of N still hold the leftover pixel count */
   4281 +    beq             8f
   4282 +3:
   4283 +    tst             N, #4
   4284 +    beq             3f
   4285 +    do_load         4
   4286 +3:
   4287 +    tst             N, #2
   4288 +    beq             4f
   4289 +    do_load         2
   4290 +4:
   4291 +    tst             N, #1
   4292 +    beq             5f
   4293 +    do_load         1
   4294 +5:
   4295 +    do_yuv_to_rgb
   4296 +    tst             N, #4
   4297 +    beq             6f
   4298 +    do_store        \bpp, 4
   4299 +6:
   4300 +    tst             N, #2
   4301 +    beq             7f
   4302 +    do_store        \bpp, 2
   4303 +7:
   4304 +    tst             N, #1
   4305 +    beq             8f
   4306 +    do_store        \bpp, 1
   4307 +8:
   4308 +    subs            NUM_ROWS, NUM_ROWS, #1
   4309 +    bgt             0b
   4310 +9:
   4311 +    /* Restore all registers in the order saved, release the save area and return */
   4312 +    add             x11, sp, #16
   4313 +    ld1             {v0.8b - v3.8b}, [x11], 32
   4314 +    ld1             {v4.8b - v7.8b}, [x11], 32
   4315 +    ld1             {v8.8b - v11.8b}, [x11], 32
   4316 +    ld1             {v12.8b - v15.8b}, [x11], 32
   4317 +    ld1             {v16.8b - v19.8b}, [x11], 32
   4318 +    ld1             {v20.8b - v23.8b}, [x11], 32
   4319 +    ld1             {v24.8b - v27.8b}, [x11], 32
   4320 +    ld1             {v28.8b - v31.8b}, [x11], 32
   4321 +    ldp             x4, x5, [x11], 16
   4322 +    ldp             x6, x7, [x11], 16
   4323 +    ldp             x8, x9, [x11], 16
   4324 +    ldp             x10, x30, [x11], 16
   4325 +    ldr             x15, [sp]
   4326 +    add             sp, sp, #336
   4327 +    ret
   4328 +    .unreq          OUTPUT_WIDTH
   4329 +    .unreq          INPUT_ROW
   4330 +    .unreq          OUTPUT_BUF
   4331 +    .unreq          NUM_ROWS
   4332 +    .unreq          INPUT_BUF0
   4333 +    .unreq          INPUT_BUF1
   4334 +    .unreq          INPUT_BUF2
   4335 +    .unreq          RGB
   4336 +    .unreq          Y
   4337 +    .unreq          U
   4338 +    .unreq          V
   4339 +    .unreq          N
   4340 +
   4341 +.purgem do_yuv_to_rgb
   4342 +.purgem do_yuv_to_rgb_stage1
   4343 +.purgem do_yuv_to_rgb_stage2
   4344 +.purgem do_yuv_to_rgb_stage2_store_load_stage1
   4345 +.endm
   4346 +/* One converter per output format. R/G/B are the digits appended to "v1" to pick the channel's register (v10..v13); rsize/gsize/bsize are not referenced by the macro body. */
   4347 +/*--------------------------------- id ----- bpp R  rsize  G  gsize  B  bsize  defsize   */
   4348 +generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, .4h,   1, .4h,   2, .4h,   .8b    /* R,G,B in v10,v11,v12 */
   4349 +generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, .4h,   1, .4h,   0, .4h,   .8b    /* B,G,R in v10,v11,v12 */
   4350 +generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, .4h,   1, .4h,   2, .4h,   .8b    /* R,G,B in v10,v11,v12; v13 keeps 0xFF */
   4351 +generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, .4h,   1, .4h,   0, .4h,   .8b    /* B,G,R in v10,v11,v12; v13 keeps 0xFF */
   4352 +generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, .4h,   2, .4h,   1, .4h,   .8b    /* v10 keeps 0xFF; B,G,R in v11,v12,v13 */
   4353 +generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, .4h,   2, .4h,   3, .4h,   .8b    /* v10 keeps 0xFF; R,G,B in v11,v12,v13 */
   4354 +generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, .4h,   0, .4h,   0, .4h,   .8b    /* 16 bpp packs into v25; the offsets are unused */
   4355 +.purgem do_load
   4356 +.purgem do_store
   4357