      1 /*
      2  *    Stack-less Just-In-Time compiler
      3  *
      4  *    Copyright 2009-2012 Zoltan Herczeg (hzmester (at) freemail.hu). All rights reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without modification, are
      7  * permitted provided that the following conditions are met:
      8  *
      9  *   1. Redistributions of source code must retain the above copyright notice, this list of
     10  *      conditions and the following disclaimer.
     11  *
     12  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
     13  *      of conditions and the following disclaimer in the documentation and/or other materials
     14  *      provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
     17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
     19  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
     21  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
     22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
     24  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
     28 {
     29 	return "x86" SLJIT_CPUINFO;
     30 }
     31 
     32 /*
     33    32b register indexes:
     34      0 - EAX
     35      1 - ECX
     36      2 - EDX
     37      3 - EBX
     38      4 - none
     39      5 - EBP
     40      6 - ESI
     41      7 - EDI
     42 */
     43 
     44 /*
     45    64b register indexes:
     46      0 - RAX
     47      1 - RCX
     48      2 - RDX
     49      3 - RBX
     50      4 - none
     51      5 - RBP
     52      6 - RSI
     53      7 - RDI
     54      8 - R8   - From now on REX prefix is required
     55      9 - R9
     56     10 - R10
     57     11 - R11
     58     12 - R12
     59     13 - R13
     60     14 - R14
     61     15 - R15
     62 */
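
         /* A minimal sketch (not part of the compiler, guarded out) of how these
            indexes become machine code: indexes 8-15 set an extension bit in the
            REX prefix, and only the low three bits fit into the ModRM byte. */
         #if 0
         static void encode_mov_reg_reg(sljit_u8 *out, sljit_s32 dst, sljit_s32 src)
         {
         	/* Hypothetical helper: emits "mov dst, src" for two 64b registers. */
         	*out++ = 0x48 | ((dst >= 8) ? 0x04 : 0) | ((src >= 8) ? 0x01 : 0); /* REX.W plus the REX.R/REX.B extension bits. */
         	*out++ = 0x8b; /* MOV r64, r/m64. */
         	*out++ = 0xc0 | ((dst & 0x7) << 3) | (src & 0x7); /* ModRM, mod = 11 (register direct). */
         }
         #endif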
     63 
     64 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
     65 
     66 /* Last register + 1. */
     67 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
     68 
     69 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
     70 	0, 0, 2, 1, 0, 0, 0, 0, 7, 6, 3, 4, 5
     71 };
     72 
     73 #define CHECK_EXTRA_REGS(p, w, do) \
     74 	if (p >= SLJIT_R3 && p <= SLJIT_R6) { \
     75 		w = SLJIT_LOCALS_OFFSET + ((p) - (SLJIT_R3 + 4)) * sizeof(sljit_sw); \
     76 		p = SLJIT_MEM1(SLJIT_SP); \
     77 		do; \
     78 	}
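
         /* On x86-32 only six general purpose registers are usable, so
            SLJIT_R3-SLJIT_R6 are virtual: CHECK_EXTRA_REGS rewrites such operands
            into stack slots addressed through SLJIT_SP (note that their reg_map
            entries above are 0). */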
     79 
     80 #else /* SLJIT_CONFIG_X86_32 */
     81 
     82 /* Last register + 1. */
     83 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
     84 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
     85 #define TMP_REG3	(SLJIT_NUMBER_OF_REGISTERS + 4)
     86 
      87 /* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
      88    Note: avoid using r12 and r13 for memory addressing;
      89    therefore r12 is better suited for SAVED_EREG than SAVED_REG. */
     90 #ifndef _WIN64
      91 /* 1st argument passed in rdi, 2nd in rsi, 3rd in rdx. */
     92 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
     93 	0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
     94 };
     95 /* low-map. reg_map & 0x7. */
     96 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
     97 	0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
     98 };
     99 #else
     100 /* 1st argument passed in rcx, 2nd in rdx, 3rd in r8. */
    101 static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
    102 	0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
    103 };
    104 /* low-map. reg_map & 0x7. */
    105 static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
    106 	0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
    107 };
    108 #endif
    109 
    110 #define REX_W		0x48
    111 #define REX_R		0x44
    112 #define REX_X		0x42
    113 #define REX_B		0x41
    114 #define REX		0x40
    115 
    116 #ifndef _WIN64
    117 #define HALFWORD_MAX 0x7fffffffl
    118 #define HALFWORD_MIN -0x80000000l
    119 #else
    120 #define HALFWORD_MAX 0x7fffffffll
    121 #define HALFWORD_MIN -0x80000000ll
    122 #endif
    123 
    124 #define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
    125 #define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
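
         /* x86-64 instructions can embed at most a sign-extended 32 bit immediate,
            so a NOT_HALFWORD constant must first be materialized in a register
            (see emit_load_imm64 in sljitNativeX86_64.c). */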
    126 
    127 #define CHECK_EXTRA_REGS(p, w, do)
    128 
    129 #endif /* SLJIT_CONFIG_X86_32 */
    130 
    131 #define TMP_FREG	(0)
    132 
    133 /* Size flags for emit_x86_instruction: */
    134 #define EX86_BIN_INS		0x0010
    135 #define EX86_SHIFT_INS		0x0020
    136 #define EX86_REX		0x0040
    137 #define EX86_NO_REXW		0x0080
    138 #define EX86_BYTE_ARG		0x0100
    139 #define EX86_HALF_ARG		0x0200
    140 #define EX86_PREF_66		0x0400
    141 #define EX86_PREF_F2		0x0800
    142 #define EX86_PREF_F3		0x1000
    143 #define EX86_SSE2_OP1		0x2000
    144 #define EX86_SSE2_OP2		0x4000
    145 #define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
    146 
    147 /* --------------------------------------------------------------------- */
     148 /*  Instruction forms                                                    */
    149 /* --------------------------------------------------------------------- */
    150 
    151 #define ADD		(/* BINARY */ 0 << 3)
    152 #define ADD_EAX_i32	0x05
    153 #define ADD_r_rm	0x03
    154 #define ADD_rm_r	0x01
    155 #define ADDSD_x_xm	0x58
    156 #define ADC		(/* BINARY */ 2 << 3)
    157 #define ADC_EAX_i32	0x15
    158 #define ADC_r_rm	0x13
    159 #define ADC_rm_r	0x11
    160 #define AND		(/* BINARY */ 4 << 3)
    161 #define AND_EAX_i32	0x25
    162 #define AND_r_rm	0x23
    163 #define AND_rm_r	0x21
    164 #define ANDPD_x_xm	0x54
    165 #define BSR_r_rm	(/* GROUP_0F */ 0xbd)
    166 #define CALL_i32	0xe8
    167 #define CALL_rm		(/* GROUP_FF */ 2 << 3)
    168 #define CDQ		0x99
    169 #define CMOVNE_r_rm	(/* GROUP_0F */ 0x45)
    170 #define CMP		(/* BINARY */ 7 << 3)
    171 #define CMP_EAX_i32	0x3d
    172 #define CMP_r_rm	0x3b
    173 #define CMP_rm_r	0x39
    174 #define CVTPD2PS_x_xm	0x5a
    175 #define CVTSI2SD_x_rm	0x2a
    176 #define CVTTSD2SI_r_xm	0x2c
    177 #define DIV		(/* GROUP_F7 */ 6 << 3)
    178 #define DIVSD_x_xm	0x5e
    179 #define INT3		0xcc
    180 #define IDIV		(/* GROUP_F7 */ 7 << 3)
    181 #define IMUL		(/* GROUP_F7 */ 5 << 3)
    182 #define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
    183 #define IMUL_r_rm_i8	0x6b
    184 #define IMUL_r_rm_i32	0x69
    185 #define JE_i8		0x74
    186 #define JNE_i8		0x75
    187 #define JMP_i8		0xeb
    188 #define JMP_i32		0xe9
    189 #define JMP_rm		(/* GROUP_FF */ 4 << 3)
    190 #define LEA_r_m		0x8d
    191 #define MOV_r_rm	0x8b
    192 #define MOV_r_i32	0xb8
    193 #define MOV_rm_r	0x89
    194 #define MOV_rm_i32	0xc7
    195 #define MOV_rm8_i8	0xc6
    196 #define MOV_rm8_r8	0x88
    197 #define MOVSD_x_xm	0x10
    198 #define MOVSD_xm_x	0x11
    199 #define MOVSXD_r_rm	0x63
    200 #define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
    201 #define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
    202 #define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
    203 #define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
    204 #define MUL		(/* GROUP_F7 */ 4 << 3)
    205 #define MULSD_x_xm	0x59
    206 #define NEG_rm		(/* GROUP_F7 */ 3 << 3)
    207 #define NOP		0x90
    208 #define NOT_rm		(/* GROUP_F7 */ 2 << 3)
    209 #define OR		(/* BINARY */ 1 << 3)
    210 #define OR_r_rm		0x0b
    211 #define OR_EAX_i32	0x0d
    212 #define OR_rm_r		0x09
    213 #define OR_rm8_r8	0x08
    214 #define POP_r		0x58
    215 #define POP_rm		0x8f
    216 #define POPF		0x9d
    217 #define PUSH_i32	0x68
    218 #define PUSH_r		0x50
    219 #define PUSH_rm		(/* GROUP_FF */ 6 << 3)
    220 #define PUSHF		0x9c
    221 #define RET_near	0xc3
    222 #define RET_i16		0xc2
    223 #define SBB		(/* BINARY */ 3 << 3)
    224 #define SBB_EAX_i32	0x1d
    225 #define SBB_r_rm	0x1b
    226 #define SBB_rm_r	0x19
    227 #define SAR		(/* SHIFT */ 7 << 3)
    228 #define SHL		(/* SHIFT */ 4 << 3)
    229 #define SHR		(/* SHIFT */ 5 << 3)
    230 #define SUB		(/* BINARY */ 5 << 3)
    231 #define SUB_EAX_i32	0x2d
    232 #define SUB_r_rm	0x2b
    233 #define SUB_rm_r	0x29
    234 #define SUBSD_x_xm	0x5c
    235 #define TEST_EAX_i32	0xa9
    236 #define TEST_rm_r	0x85
    237 #define UCOMISD_x_xm	0x2e
    238 #define UNPCKLPD_x_xm	0x14
    239 #define XCHG_EAX_r	0x90
    240 #define XCHG_r_rm	0x87
    241 #define XOR		(/* BINARY */ 6 << 3)
    242 #define XOR_EAX_i32	0x35
    243 #define XOR_r_rm	0x33
    244 #define XOR_rm_r	0x31
    245 #define XORPD_x_xm	0x57
    246 
    247 #define GROUP_0F	0x0f
    248 #define GROUP_F7	0xf7
    249 #define GROUP_FF	0xff
    250 #define GROUP_BINARY_81	0x81
    251 #define GROUP_BINARY_83	0x83
    252 #define GROUP_SHIFT_1	0xd1
    253 #define GROUP_SHIFT_N	0xc1
    254 #define GROUP_SHIFT_CL	0xd3
    255 
    256 #define MOD_REG		0xc0
    257 #define MOD_DISP8	0x40
    258 
    259 #define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))
    260 
    261 #define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
    262 #define POP_REG(r)			(*inst++ = (POP_r + (r)))
    263 #define RET()				(*inst++ = (RET_near))
    264 #define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
    265 /* r32, r/m32 */
    266 #define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
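         /* Usage sketch (assuming inst points into a buffer with two free bytes):
            MOV_RM(0x3, 0, 1) emits 0x8b 0xc1, i.e. "mov eax, ecx" in register
            direct form; real call sites pass reg_map-ped indexes. */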
    267 
     268 /* Multi-threading is not a problem for these static variables, since they
     269    store built-in CPU features: even if several threads detect the CPU
     270    features at the same time, they all write the same values into them. */
    271 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
    272 static sljit_s32 cpu_has_sse2 = -1;
    273 #endif
    274 static sljit_s32 cpu_has_cmov = -1;
    275 
    276 #ifdef _WIN32_WCE
    277 #include <cmnintrin.h>
    278 #elif defined(_MSC_VER) && _MSC_VER >= 1400
    279 #include <intrin.h>
    280 #endif
    281 
    282 /******************************************************/
    283 /*    Unaligned-store functions                       */
    284 /******************************************************/
    285 
    286 static SLJIT_INLINE void sljit_unaligned_store_s16(void *addr, sljit_s16 value)
    287 {
    288 	SLJIT_MEMCPY(addr, &value, sizeof(value));
    289 }
    290 
    291 static SLJIT_INLINE void sljit_unaligned_store_s32(void *addr, sljit_s32 value)
    292 {
    293 	SLJIT_MEMCPY(addr, &value, sizeof(value));
    294 }
    295 
    296 static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
    297 {
    298 	SLJIT_MEMCPY(addr, &value, sizeof(value));
    299 }
    300 
    301 /******************************************************/
    302 /*    Utility functions                               */
    303 /******************************************************/
    304 
    305 static void get_cpu_features(void)
    306 {
    307 	sljit_u32 features;
    308 
    309 #if defined(_MSC_VER) && _MSC_VER >= 1400
    310 
    311 	int CPUInfo[4];
    312 	__cpuid(CPUInfo, 1);
    313 	features = (sljit_u32)CPUInfo[3];
    314 
    315 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)
    316 
    317 	/* AT&T syntax. */
    318 	__asm__ (
    319 		"movl $0x1, %%eax\n"
    320 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    321 		/* On x86-32, there is no red zone, so this
    322 		   should work (no need for a local variable). */
    323 		"push %%ebx\n"
    324 #endif
    325 		"cpuid\n"
    326 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    327 		"pop %%ebx\n"
    328 #endif
    329 		"movl %%edx, %0\n"
    330 		: "=g" (features)
    331 		:
    332 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    333 		: "%eax", "%ecx", "%edx"
    334 #else
    335 		: "%rax", "%rbx", "%rcx", "%rdx"
    336 #endif
    337 	);
    338 
    339 #else /* _MSC_VER && _MSC_VER >= 1400 */
    340 
    341 	/* Intel syntax. */
    342 	__asm {
    343 		mov eax, 1
    344 		cpuid
    345 		mov features, edx
    346 	}
    347 
    348 #endif /* _MSC_VER && _MSC_VER >= 1400 */
    349 
    350 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
    351 	cpu_has_sse2 = (features >> 26) & 0x1;
    352 #endif
    353 	cpu_has_cmov = (features >> 15) & 0x1;
    354 }
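
         /* The cached flags are consumed lazily by the emitters below, e.g.:

              if (cpu_has_cmov == -1)
                  get_cpu_features();
              if (cpu_has_cmov) { ... emit cmovne ... }

            In the CPUID leaf 1 result above, EDX bit 15 is CMOV and bit 26 is SSE2. */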
    355 
    356 static sljit_u8 get_jump_code(sljit_s32 type)
    357 {
    358 	switch (type) {
    359 	case SLJIT_EQUAL:
    360 	case SLJIT_EQUAL_F64:
    361 		return 0x84 /* je */;
    362 
    363 	case SLJIT_NOT_EQUAL:
    364 	case SLJIT_NOT_EQUAL_F64:
    365 		return 0x85 /* jne */;
    366 
    367 	case SLJIT_LESS:
    368 	case SLJIT_LESS_F64:
    369 		return 0x82 /* jc */;
    370 
    371 	case SLJIT_GREATER_EQUAL:
    372 	case SLJIT_GREATER_EQUAL_F64:
    373 		return 0x83 /* jae */;
    374 
    375 	case SLJIT_GREATER:
    376 	case SLJIT_GREATER_F64:
    377 		return 0x87 /* jnbe */;
    378 
    379 	case SLJIT_LESS_EQUAL:
    380 	case SLJIT_LESS_EQUAL_F64:
    381 		return 0x86 /* jbe */;
    382 
    383 	case SLJIT_SIG_LESS:
    384 		return 0x8c /* jl */;
    385 
    386 	case SLJIT_SIG_GREATER_EQUAL:
    387 		return 0x8d /* jnl */;
    388 
    389 	case SLJIT_SIG_GREATER:
    390 		return 0x8f /* jnle */;
    391 
    392 	case SLJIT_SIG_LESS_EQUAL:
    393 		return 0x8e /* jle */;
    394 
    395 	case SLJIT_OVERFLOW:
    396 	case SLJIT_MUL_OVERFLOW:
    397 		return 0x80 /* jo */;
    398 
    399 	case SLJIT_NOT_OVERFLOW:
    400 	case SLJIT_MUL_NOT_OVERFLOW:
    401 		return 0x81 /* jno */;
    402 
    403 	case SLJIT_UNORDERED_F64:
    404 		return 0x8a /* jp */;
    405 
    406 	case SLJIT_ORDERED_F64:
    407 		return 0x8b /* jpo */;
    408 	}
    409 	return 0;
    410 }
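
         /* These values are the second opcode bytes of the two byte 0x0f 0x8?
            (Jcc rel32) forms; the matching one byte Jcc rel8 opcode is always
            0x10 smaller (e.g. 0x84 "je rel32" -> 0x74 "je rel8"), which is what
            generate_near_jump_code relies on below. */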
    411 
    412 static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type);
    413 
    414 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    415 static sljit_u8* generate_fixed_jump(sljit_u8 *code_ptr, sljit_sw addr, sljit_s32 type);
    416 #endif
    417 
    418 static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_s32 type)
    419 {
    420 	sljit_s32 short_jump;
    421 	sljit_uw label_addr;
    422 
    423 	if (jump->flags & JUMP_LABEL)
    424 		label_addr = (sljit_uw)(code + jump->u.label->size);
    425 	else
    426 		label_addr = jump->u.target;
    427 	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
    428 
    429 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    430 	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
    431 		return generate_far_jump_code(jump, code_ptr, type);
    432 #endif
    433 
    434 	if (type == SLJIT_JUMP) {
    435 		if (short_jump)
    436 			*code_ptr++ = JMP_i8;
    437 		else
    438 			*code_ptr++ = JMP_i32;
    439 		jump->addr++;
    440 	}
    441 	else if (type >= SLJIT_FAST_CALL) {
    442 		short_jump = 0;
    443 		*code_ptr++ = CALL_i32;
    444 		jump->addr++;
    445 	}
    446 	else if (short_jump) {
    447 		*code_ptr++ = get_jump_code(type) - 0x10;
    448 		jump->addr++;
    449 	}
    450 	else {
    451 		*code_ptr++ = GROUP_0F;
    452 		*code_ptr++ = get_jump_code(type);
    453 		jump->addr += 2;
    454 	}
    455 
    456 	if (short_jump) {
    457 		jump->flags |= PATCH_MB;
    458 		code_ptr += sizeof(sljit_s8);
    459 	} else {
    460 		jump->flags |= PATCH_MW;
    461 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    462 		code_ptr += sizeof(sljit_sw);
    463 #else
    464 		code_ptr += sizeof(sljit_s32);
    465 #endif
    466 	}
    467 
    468 	return code_ptr;
    469 }
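
         /* At this point only the opcode bytes are emitted; jump->addr has been
            advanced to the displacement field, and the second pass in
            sljit_generate_code stores label_addr - (jump->addr + disp_size)
            there, i.e. a displacement relative to the end of the instruction. */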
    470 
    471 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
    472 {
    473 	struct sljit_memory_fragment *buf;
    474 	sljit_u8 *code;
    475 	sljit_u8 *code_ptr;
    476 	sljit_u8 *buf_ptr;
    477 	sljit_u8 *buf_end;
    478 	sljit_u8 len;
    479 
    480 	struct sljit_label *label;
    481 	struct sljit_jump *jump;
    482 	struct sljit_const *const_;
    483 
    484 	CHECK_ERROR_PTR();
    485 	CHECK_PTR(check_sljit_generate_code(compiler));
    486 	reverse_buf(compiler);
    487 
    488 	/* Second code generation pass. */
    489 	code = (sljit_u8*)SLJIT_MALLOC_EXEC(compiler->size);
    490 	PTR_FAIL_WITH_EXEC_IF(code);
    491 	buf = compiler->buf;
    492 
    493 	code_ptr = code;
    494 	label = compiler->labels;
    495 	jump = compiler->jumps;
    496 	const_ = compiler->consts;
    497 	do {
    498 		buf_ptr = buf->memory;
    499 		buf_end = buf_ptr + buf->used_size;
    500 		do {
    501 			len = *buf_ptr++;
    502 			if (len > 0) {
    503 				/* The code is already generated. */
    504 				SLJIT_MEMCPY(code_ptr, buf_ptr, len);
    505 				code_ptr += len;
    506 				buf_ptr += len;
    507 			}
    508 			else {
    509 				if (*buf_ptr >= 4) {
    510 					jump->addr = (sljit_uw)code_ptr;
    511 					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
    512 						code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 4);
    513 					else
    514 						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 4);
    515 					jump = jump->next;
    516 				}
    517 				else if (*buf_ptr == 0) {
    518 					label->addr = (sljit_uw)code_ptr;
    519 					label->size = code_ptr - code;
    520 					label = label->next;
    521 				}
    522 				else if (*buf_ptr == 1) {
    523 					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
    524 					const_ = const_->next;
    525 				}
    526 				else {
    527 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    528 					*code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
    529 					buf_ptr++;
    530 					sljit_unaligned_store_sw(code_ptr, *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw)));
    531 					code_ptr += sizeof(sljit_sw);
    532 					buf_ptr += sizeof(sljit_sw) - 1;
    533 #else
    534 					code_ptr = generate_fixed_jump(code_ptr, *(sljit_sw*)(buf_ptr + 1), *buf_ptr);
    535 					buf_ptr += sizeof(sljit_sw);
    536 #endif
    537 				}
    538 				buf_ptr++;
    539 			}
    540 		} while (buf_ptr < buf_end);
    541 		SLJIT_ASSERT(buf_ptr == buf_end);
    542 		buf = buf->next;
    543 	} while (buf);
    544 
    545 	SLJIT_ASSERT(!label);
    546 	SLJIT_ASSERT(!jump);
    547 	SLJIT_ASSERT(!const_);
    548 
    549 	jump = compiler->jumps;
    550 	while (jump) {
    551 		if (jump->flags & PATCH_MB) {
    552 			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8))) <= 127);
    553 			*(sljit_u8*)jump->addr = (sljit_u8)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8)));
    554 		} else if (jump->flags & PATCH_MW) {
    555 			if (jump->flags & JUMP_LABEL) {
    556 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    557 				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sw))));
    558 #else
    559 				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
    560 				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32))));
    561 #endif
    562 			}
    563 			else {
    564 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    565 				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_sw))));
    566 #else
    567 				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
    568 				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.target - (jump->addr + sizeof(sljit_s32))));
    569 #endif
    570 			}
    571 		}
    572 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    573 		else if (jump->flags & PATCH_MD)
    574 			sljit_unaligned_store_sw((void*)jump->addr, jump->u.label->addr);
    575 #endif
    576 
    577 		jump = jump->next;
    578 	}
    579 
    580 	/* Maybe we waste some space because of short jumps. */
    581 	SLJIT_ASSERT(code_ptr <= code + compiler->size);
    582 	compiler->error = SLJIT_ERR_COMPILED;
    583 	compiler->executable_size = code_ptr - code;
    584 	return (void*)code;
    585 }
    586 
    587 /* --------------------------------------------------------------------- */
    588 /*  Operators                                                            */
    589 /* --------------------------------------------------------------------- */
    590 
    591 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
    592 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
    593 	sljit_s32 dst, sljit_sw dstw,
    594 	sljit_s32 src1, sljit_sw src1w,
    595 	sljit_s32 src2, sljit_sw src2w);
    596 
    597 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
    598 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
    599 	sljit_s32 dst, sljit_sw dstw,
    600 	sljit_s32 src1, sljit_sw src1w,
    601 	sljit_s32 src2, sljit_sw src2w);
    602 
    603 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
    604 	sljit_s32 dst, sljit_sw dstw,
    605 	sljit_s32 src, sljit_sw srcw);
    606 
    607 static SLJIT_INLINE sljit_s32 emit_save_flags(struct sljit_compiler *compiler)
    608 {
    609 	sljit_u8 *inst;
    610 
    611 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    612 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
    613 	FAIL_IF(!inst);
    614 	INC_SIZE(5);
    615 #else
    616 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 6);
    617 	FAIL_IF(!inst);
    618 	INC_SIZE(6);
    619 	*inst++ = REX_W;
    620 #endif
    621 	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
    622 	*inst++ = 0x64;
    623 	*inst++ = 0x24;
    624 	*inst++ = (sljit_u8)sizeof(sljit_sw);
    625 	*inst++ = PUSHF;
    626 	compiler->flags_saved = 1;
    627 	return SLJIT_SUCCESS;
    628 }
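
         /* Byte sequence emitted above on x86-32 (x86-64 adds a REX.W prefix and
            uses a displacement of 8):

              8d 64 24 04    lea esp, [esp + 4]
              9c             pushfd

            The pushf cancels the lea, so esp is left unchanged and EFLAGS ends up
            in the reserved word at [esp]; emit_restore_flags mirrors this below. */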
    629 
    630 static SLJIT_INLINE sljit_s32 emit_restore_flags(struct sljit_compiler *compiler, sljit_s32 keep_flags)
    631 {
    632 	sljit_u8 *inst;
    633 
    634 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    635 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
    636 	FAIL_IF(!inst);
    637 	INC_SIZE(5);
    638 	*inst++ = POPF;
    639 #else
    640 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 6);
    641 	FAIL_IF(!inst);
    642 	INC_SIZE(6);
    643 	*inst++ = POPF;
    644 	*inst++ = REX_W;
    645 #endif
    646 	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
    647 	*inst++ = 0x64;
    648 	*inst++ = 0x24;
    649 	*inst++ = (sljit_u8)(-(sljit_s8)sizeof(sljit_sw));
    650 	compiler->flags_saved = keep_flags;
    651 	return SLJIT_SUCCESS;
    652 }
    653 
    654 #ifdef _WIN32
    655 #include <malloc.h>
    656 
    657 static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
    658 {
     659 	/* Workaround for calling the internal _chkstk() function on Windows.
     660 	This function touches all 4K pages belonging to the requested stack
     661 	space, whose size is passed in local_size. This is necessary on Windows,
     662 	where the stack can only grow in 4K steps. If the stack is already large
     663 	enough, the function merely burns CPU cycles; but since that cannot be
     664 	known in advance, it must always be called. I think this is a bad design
     665 	in general, even if it has its reasons. */
    666 	*(volatile sljit_s32*)alloca(local_size) = 0;
    667 }
    668 
    669 #endif
    670 
    671 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    672 #include "sljitNativeX86_32.c"
    673 #else
    674 #include "sljitNativeX86_64.c"
    675 #endif
    676 
    677 static sljit_s32 emit_mov(struct sljit_compiler *compiler,
    678 	sljit_s32 dst, sljit_sw dstw,
    679 	sljit_s32 src, sljit_sw srcw)
    680 {
    681 	sljit_u8* inst;
    682 
    683 	if (dst == SLJIT_UNUSED) {
     684 		/* No destination: no need to set up the flags. */
    685 		if (src & SLJIT_MEM) {
    686 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
    687 			FAIL_IF(!inst);
    688 			*inst = MOV_r_rm;
    689 		}
    690 		return SLJIT_SUCCESS;
    691 	}
    692 	if (FAST_IS_REG(src)) {
    693 		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
    694 		FAIL_IF(!inst);
    695 		*inst = MOV_rm_r;
    696 		return SLJIT_SUCCESS;
    697 	}
    698 	if (src & SLJIT_IMM) {
    699 		if (FAST_IS_REG(dst)) {
    700 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    701 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
    702 #else
    703 			if (!compiler->mode32) {
    704 				if (NOT_HALFWORD(srcw))
    705 					return emit_load_imm64(compiler, dst, srcw);
    706 			}
    707 			else
    708 				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
    709 #endif
    710 		}
    711 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    712 		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
    713 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
    714 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
    715 			FAIL_IF(!inst);
    716 			*inst = MOV_rm_r;
    717 			return SLJIT_SUCCESS;
    718 		}
    719 #endif
    720 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
    721 		FAIL_IF(!inst);
    722 		*inst = MOV_rm_i32;
    723 		return SLJIT_SUCCESS;
    724 	}
    725 	if (FAST_IS_REG(dst)) {
    726 		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
    727 		FAIL_IF(!inst);
    728 		*inst = MOV_r_rm;
    729 		return SLJIT_SUCCESS;
    730 	}
    731 
     732 	/* Memory to memory move. Requires two instructions. */
    733 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
    734 	FAIL_IF(!inst);
    735 	*inst = MOV_r_rm;
    736 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
    737 	FAIL_IF(!inst);
    738 	*inst = MOV_rm_r;
    739 	return SLJIT_SUCCESS;
    740 }
    741 
    742 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
    743 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
    744 
    745 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
    746 {
    747 	sljit_u8 *inst;
    748 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    749 	sljit_s32 size;
    750 #endif
    751 
    752 	CHECK_ERROR();
    753 	CHECK(check_sljit_emit_op0(compiler, op));
    754 
    755 	switch (GET_OPCODE(op)) {
    756 	case SLJIT_BREAKPOINT:
    757 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
    758 		FAIL_IF(!inst);
    759 		INC_SIZE(1);
    760 		*inst = INT3;
    761 		break;
    762 	case SLJIT_NOP:
    763 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
    764 		FAIL_IF(!inst);
    765 		INC_SIZE(1);
    766 		*inst = NOP;
    767 		break;
    768 	case SLJIT_LMUL_UW:
    769 	case SLJIT_LMUL_SW:
    770 	case SLJIT_DIVMOD_UW:
    771 	case SLJIT_DIVMOD_SW:
    772 	case SLJIT_DIV_UW:
    773 	case SLJIT_DIV_SW:
    774 		compiler->flags_saved = 0;
    775 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    776 #ifdef _WIN64
    777 		SLJIT_COMPILE_ASSERT(
    778 			reg_map[SLJIT_R0] == 0
    779 			&& reg_map[SLJIT_R1] == 2
    780 			&& reg_map[TMP_REG1] > 7,
    781 			invalid_register_assignment_for_div_mul);
    782 #else
    783 		SLJIT_COMPILE_ASSERT(
    784 			reg_map[SLJIT_R0] == 0
    785 			&& reg_map[SLJIT_R1] < 7
    786 			&& reg_map[TMP_REG1] == 2,
    787 			invalid_register_assignment_for_div_mul);
    788 #endif
    789 		compiler->mode32 = op & SLJIT_I32_OP;
    790 #endif
    791 		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);
    792 
    793 		op = GET_OPCODE(op);
    794 		if ((op | 0x2) == SLJIT_DIV_UW) {
    795 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
    796 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
    797 			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
    798 #else
    799 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
    800 #endif
    801 			FAIL_IF(!inst);
    802 			*inst = XOR_r_rm;
    803 		}
    804 
    805 		if ((op | 0x2) == SLJIT_DIV_SW) {
    806 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
    807 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
    808 #endif
    809 
    810 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    811 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
    812 			FAIL_IF(!inst);
    813 			INC_SIZE(1);
    814 			*inst = CDQ;
    815 #else
    816 			if (compiler->mode32) {
    817 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
    818 				FAIL_IF(!inst);
    819 				INC_SIZE(1);
    820 				*inst = CDQ;
    821 			} else {
    822 				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
    823 				FAIL_IF(!inst);
    824 				INC_SIZE(2);
    825 				*inst++ = REX_W;
    826 				*inst = CDQ;
    827 			}
    828 #endif
    829 		}
    830 
    831 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    832 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
    833 		FAIL_IF(!inst);
    834 		INC_SIZE(2);
    835 		*inst++ = GROUP_F7;
    836 		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
    837 #else
    838 #ifdef _WIN64
    839 		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
    840 #else
    841 		size = (!compiler->mode32) ? 3 : 2;
    842 #endif
    843 		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
    844 		FAIL_IF(!inst);
    845 		INC_SIZE(size);
    846 #ifdef _WIN64
    847 		if (!compiler->mode32)
    848 			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
    849 		else if (op >= SLJIT_DIVMOD_UW)
    850 			*inst++ = REX_B;
    851 		*inst++ = GROUP_F7;
    852 		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
    853 #else
    854 		if (!compiler->mode32)
    855 			*inst++ = REX_W;
    856 		*inst++ = GROUP_F7;
    857 		*inst = MOD_REG | reg_map[SLJIT_R1];
    858 #endif
    859 #endif
    860 		switch (op) {
    861 		case SLJIT_LMUL_UW:
    862 			*inst |= MUL;
    863 			break;
    864 		case SLJIT_LMUL_SW:
    865 			*inst |= IMUL;
    866 			break;
    867 		case SLJIT_DIVMOD_UW:
    868 		case SLJIT_DIV_UW:
    869 			*inst |= DIV;
    870 			break;
    871 		case SLJIT_DIVMOD_SW:
    872 		case SLJIT_DIV_SW:
    873 			*inst |= IDIV;
    874 			break;
    875 		}
    876 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
    877 		if (op <= SLJIT_DIVMOD_SW)
    878 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
    879 #else
    880 		if (op >= SLJIT_DIV_UW)
    881 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
    882 #endif
    883 		break;
    884 	}
    885 
    886 	return SLJIT_SUCCESS;
    887 }
    888 
    889 #define ENCODE_PREFIX(prefix) \
    890 	do { \
    891 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); \
    892 		FAIL_IF(!inst); \
    893 		INC_SIZE(1); \
    894 		*inst = (prefix); \
    895 	} while (0)
    896 
    897 static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
    898 	sljit_s32 dst, sljit_sw dstw,
    899 	sljit_s32 src, sljit_sw srcw)
    900 {
    901 	sljit_u8* inst;
    902 	sljit_s32 dst_r;
    903 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    904 	sljit_s32 work_r;
    905 #endif
    906 
    907 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    908 	compiler->mode32 = 0;
    909 #endif
    910 
    911 	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
    912 		return SLJIT_SUCCESS; /* Empty instruction. */
    913 
    914 	if (src & SLJIT_IMM) {
    915 		if (FAST_IS_REG(dst)) {
    916 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    917 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
    918 #else
    919 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
    920 			FAIL_IF(!inst);
    921 			*inst = MOV_rm_i32;
    922 			return SLJIT_SUCCESS;
    923 #endif
    924 		}
    925 		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
    926 		FAIL_IF(!inst);
    927 		*inst = MOV_rm8_i8;
    928 		return SLJIT_SUCCESS;
    929 	}
    930 
    931 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
    932 
    933 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
    934 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    935 		if (reg_map[src] >= 4) {
    936 			SLJIT_ASSERT(dst_r == TMP_REG1);
    937 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
    938 		} else
    939 			dst_r = src;
    940 #else
    941 		dst_r = src;
    942 #endif
    943 	}
    944 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    945 	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
    946 		/* src, dst are registers. */
    947 		SLJIT_ASSERT(SLOW_IS_REG(dst));
    948 		if (reg_map[dst] < 4) {
    949 			if (dst != src)
    950 				EMIT_MOV(compiler, dst, 0, src, 0);
    951 			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
    952 			FAIL_IF(!inst);
    953 			*inst++ = GROUP_0F;
    954 			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
    955 		}
    956 		else {
    957 			if (dst != src)
    958 				EMIT_MOV(compiler, dst, 0, src, 0);
    959 			if (sign) {
    960 				/* shl reg, 24 */
    961 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
    962 				FAIL_IF(!inst);
    963 				*inst |= SHL;
    964 				/* sar reg, 24 */
    965 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
    966 				FAIL_IF(!inst);
    967 				*inst |= SAR;
    968 			}
    969 			else {
    970 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
    971 				FAIL_IF(!inst);
    972 				*(inst + 1) |= AND;
    973 			}
    974 		}
    975 		return SLJIT_SUCCESS;
    976 	}
    977 #endif
    978 	else {
     979 		/* src is either memory or, on x86-32, a register with reg_map[src] < 4. */
    980 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
    981 		FAIL_IF(!inst);
    982 		*inst++ = GROUP_0F;
    983 		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
    984 	}
    985 
    986 	if (dst & SLJIT_MEM) {
    987 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    988 		if (dst_r == TMP_REG1) {
     989 			/* Find an unused register whose reg_map value is < 4. */
    990 			if ((dst & REG_MASK) == SLJIT_R0) {
    991 				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
    992 					work_r = SLJIT_R2;
    993 				else
    994 					work_r = SLJIT_R1;
    995 			}
    996 			else {
    997 				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
    998 					work_r = SLJIT_R0;
    999 				else if ((dst & REG_MASK) == SLJIT_R1)
   1000 					work_r = SLJIT_R2;
   1001 				else
   1002 					work_r = SLJIT_R1;
   1003 			}
   1004 
   1005 			if (work_r == SLJIT_R0) {
   1006 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
   1007 			}
   1008 			else {
   1009 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
   1010 				FAIL_IF(!inst);
   1011 				*inst = XCHG_r_rm;
   1012 			}
   1013 
   1014 			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
   1015 			FAIL_IF(!inst);
   1016 			*inst = MOV_rm8_r8;
   1017 
   1018 			if (work_r == SLJIT_R0) {
   1019 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
   1020 			}
   1021 			else {
   1022 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
   1023 				FAIL_IF(!inst);
   1024 				*inst = XCHG_r_rm;
   1025 			}
   1026 		}
   1027 		else {
   1028 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
   1029 			FAIL_IF(!inst);
   1030 			*inst = MOV_rm8_r8;
   1031 		}
   1032 #else
   1033 		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
   1034 		FAIL_IF(!inst);
   1035 		*inst = MOV_rm8_r8;
   1036 #endif
   1037 	}
   1038 
   1039 	return SLJIT_SUCCESS;
   1040 }
   1041 
   1042 static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
   1043 	sljit_s32 dst, sljit_sw dstw,
   1044 	sljit_s32 src, sljit_sw srcw)
   1045 {
   1046 	sljit_u8* inst;
   1047 	sljit_s32 dst_r;
   1048 
   1049 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1050 	compiler->mode32 = 0;
   1051 #endif
   1052 
   1053 	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
   1054 		return SLJIT_SUCCESS; /* Empty instruction. */
   1055 
   1056 	if (src & SLJIT_IMM) {
   1057 		if (FAST_IS_REG(dst)) {
   1058 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1059 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
   1060 #else
   1061 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
   1062 			FAIL_IF(!inst);
   1063 			*inst = MOV_rm_i32;
   1064 			return SLJIT_SUCCESS;
   1065 #endif
   1066 		}
   1067 		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
   1068 		FAIL_IF(!inst);
   1069 		*inst = MOV_rm_i32;
   1070 		return SLJIT_SUCCESS;
   1071 	}
   1072 
   1073 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
   1074 
   1075 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
   1076 		dst_r = src;
   1077 	else {
   1078 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
   1079 		FAIL_IF(!inst);
   1080 		*inst++ = GROUP_0F;
   1081 		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
   1082 	}
   1083 
   1084 	if (dst & SLJIT_MEM) {
   1085 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
   1086 		FAIL_IF(!inst);
   1087 		*inst = MOV_rm_r;
   1088 	}
   1089 
   1090 	return SLJIT_SUCCESS;
   1091 }
   1092 
   1093 static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
   1094 	sljit_s32 dst, sljit_sw dstw,
   1095 	sljit_s32 src, sljit_sw srcw)
   1096 {
   1097 	sljit_u8* inst;
   1098 
   1099 	if (dst == SLJIT_UNUSED) {
   1100 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   1101 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
   1102 		FAIL_IF(!inst);
   1103 		*inst++ = GROUP_F7;
   1104 		*inst |= opcode;
   1105 		return SLJIT_SUCCESS;
   1106 	}
   1107 	if (dst == src && dstw == srcw) {
   1108 		/* Same input and output */
   1109 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
   1110 		FAIL_IF(!inst);
   1111 		*inst++ = GROUP_F7;
   1112 		*inst |= opcode;
   1113 		return SLJIT_SUCCESS;
   1114 	}
   1115 	if (FAST_IS_REG(dst)) {
   1116 		EMIT_MOV(compiler, dst, 0, src, srcw);
   1117 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
   1118 		FAIL_IF(!inst);
   1119 		*inst++ = GROUP_F7;
   1120 		*inst |= opcode;
   1121 		return SLJIT_SUCCESS;
   1122 	}
   1123 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   1124 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
   1125 	FAIL_IF(!inst);
   1126 	*inst++ = GROUP_F7;
   1127 	*inst |= opcode;
   1128 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   1129 	return SLJIT_SUCCESS;
   1130 }
   1131 
   1132 static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
   1133 	sljit_s32 dst, sljit_sw dstw,
   1134 	sljit_s32 src, sljit_sw srcw)
   1135 {
   1136 	sljit_u8* inst;
   1137 
   1138 	if (dst == SLJIT_UNUSED) {
   1139 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   1140 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
   1141 		FAIL_IF(!inst);
   1142 		*inst++ = GROUP_F7;
   1143 		*inst |= NOT_rm;
   1144 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
   1145 		FAIL_IF(!inst);
   1146 		*inst = OR_r_rm;
   1147 		return SLJIT_SUCCESS;
   1148 	}
   1149 	if (FAST_IS_REG(dst)) {
   1150 		EMIT_MOV(compiler, dst, 0, src, srcw);
   1151 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
   1152 		FAIL_IF(!inst);
   1153 		*inst++ = GROUP_F7;
   1154 		*inst |= NOT_rm;
   1155 		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
   1156 		FAIL_IF(!inst);
   1157 		*inst = OR_r_rm;
   1158 		return SLJIT_SUCCESS;
   1159 	}
   1160 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   1161 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
   1162 	FAIL_IF(!inst);
   1163 	*inst++ = GROUP_F7;
   1164 	*inst |= NOT_rm;
   1165 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
   1166 	FAIL_IF(!inst);
   1167 	*inst = OR_r_rm;
   1168 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   1169 	return SLJIT_SUCCESS;
   1170 }
   1171 
   1172 static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
   1173 	sljit_s32 dst, sljit_sw dstw,
   1174 	sljit_s32 src, sljit_sw srcw)
   1175 {
   1176 	sljit_u8* inst;
   1177 	sljit_s32 dst_r;
   1178 
   1179 	SLJIT_UNUSED_ARG(op_flags);
   1180 	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
   1181 		/* Just set the zero flag. */
   1182 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   1183 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
   1184 		FAIL_IF(!inst);
   1185 		*inst++ = GROUP_F7;
   1186 		*inst |= NOT_rm;
   1187 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1188 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
   1189 #else
   1190 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, TMP_REG1, 0);
   1191 #endif
   1192 		FAIL_IF(!inst);
   1193 		*inst |= SHR;
   1194 		return SLJIT_SUCCESS;
   1195 	}
   1196 
   1197 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
   1198 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
   1199 		src = TMP_REG1;
   1200 		srcw = 0;
   1201 	}
   1202 
   1203 	inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
   1204 	FAIL_IF(!inst);
   1205 	*inst++ = GROUP_0F;
   1206 	*inst = BSR_r_rm;
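
         	/* CLZ via BSR: bsr yields the index of the highest set bit, so
         	   clz = 31 - bsr (63 - bsr in 64 bit mode), computed below as an XOR
         	   with the all-ones mask 31 (63). The preload of dst_r with 32 + 31
         	   (64 + 63) makes the untouched zero-input case come out as 32 (64). */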
   1207 
   1208 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1209 	if (FAST_IS_REG(dst))
   1210 		dst_r = dst;
   1211 	else {
   1212 		/* Find an unused temporary register. */
   1213 		if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
   1214 			dst_r = SLJIT_R0;
   1215 		else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
   1216 			dst_r = SLJIT_R1;
   1217 		else
   1218 			dst_r = SLJIT_R2;
   1219 		EMIT_MOV(compiler, dst, dstw, dst_r, 0);
   1220 	}
   1221 	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
   1222 #else
   1223 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
   1224 	compiler->mode32 = 0;
   1225 	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 64 + 63 : 32 + 31);
   1226 	compiler->mode32 = op_flags & SLJIT_I32_OP;
   1227 #endif
   1228 
   1229 	if (cpu_has_cmov == -1)
   1230 		get_cpu_features();
   1231 
   1232 	if (cpu_has_cmov) {
   1233 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
   1234 		FAIL_IF(!inst);
   1235 		*inst++ = GROUP_0F;
   1236 		*inst = CMOVNE_r_rm;
   1237 	} else {
   1238 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1239 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
   1240 		FAIL_IF(!inst);
   1241 		INC_SIZE(4);
   1242 
   1243 		*inst++ = JE_i8;
   1244 		*inst++ = 2;
   1245 		*inst++ = MOV_r_rm;
   1246 		*inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
   1247 #else
   1248 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
   1249 		FAIL_IF(!inst);
   1250 		INC_SIZE(5);
   1251 
   1252 		*inst++ = JE_i8;
   1253 		*inst++ = 3;
   1254 		*inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
   1255 		*inst++ = MOV_r_rm;
   1256 		*inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
   1257 #endif
   1258 	}
   1259 
   1260 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1261 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
   1262 #else
   1263 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
   1264 #endif
   1265 	FAIL_IF(!inst);
   1266 	*(inst + 1) |= XOR;
   1267 
   1268 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1269 	if (dst & SLJIT_MEM) {
   1270 		inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
   1271 		FAIL_IF(!inst);
   1272 		*inst = XCHG_r_rm;
   1273 	}
   1274 #else
   1275 	if (dst & SLJIT_MEM)
   1276 		EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
   1277 #endif
   1278 	return SLJIT_SUCCESS;
   1279 }
   1280 
   1281 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
   1282 	sljit_s32 dst, sljit_sw dstw,
   1283 	sljit_s32 src, sljit_sw srcw)
   1284 {
   1285 	sljit_u8* inst;
   1286 	sljit_s32 update = 0;
   1287 	sljit_s32 op_flags = GET_ALL_FLAGS(op);
   1288 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1289 	sljit_s32 dst_is_ereg = 0;
   1290 	sljit_s32 src_is_ereg = 0;
   1291 #else
   1292 #	define src_is_ereg 0
   1293 #endif
   1294 
   1295 	CHECK_ERROR();
   1296 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
   1297 	ADJUST_LOCAL_OFFSET(dst, dstw);
   1298 	ADJUST_LOCAL_OFFSET(src, srcw);
   1299 
   1300 	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
   1301 	CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
   1302 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1303 	compiler->mode32 = op_flags & SLJIT_I32_OP;
   1304 #endif
   1305 
   1306 	op = GET_OPCODE(op);
   1307 	if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
   1308 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1309 		compiler->mode32 = 0;
   1310 #endif
   1311 
   1312 		if (op_flags & SLJIT_I32_OP) {
   1313 			if (FAST_IS_REG(src) && src == dst) {
   1314 				if (!TYPE_CAST_NEEDED(op))
   1315 					return SLJIT_SUCCESS;
   1316 			}
   1317 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1318 			if (op == SLJIT_MOV_S32 && (src & SLJIT_MEM))
   1319 				op = SLJIT_MOV_U32;
   1320 			if (op == SLJIT_MOVU_S32 && (src & SLJIT_MEM))
   1321 				op = SLJIT_MOVU_U32;
   1322 			if (op == SLJIT_MOV_U32 && (src & SLJIT_IMM))
   1323 				op = SLJIT_MOV_S32;
   1324 			if (op == SLJIT_MOVU_U32 && (src & SLJIT_IMM))
   1325 				op = SLJIT_MOVU_S32;
   1326 #endif
   1327 		}
   1328 
   1329 		SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
   1330 		if (op >= SLJIT_MOVU) {
   1331 			update = 1;
   1332 			op -= 8;
   1333 		}
   1334 
   1335 		if (src & SLJIT_IMM) {
   1336 			switch (op) {
   1337 			case SLJIT_MOV_U8:
   1338 				srcw = (sljit_u8)srcw;
   1339 				break;
   1340 			case SLJIT_MOV_S8:
   1341 				srcw = (sljit_s8)srcw;
   1342 				break;
   1343 			case SLJIT_MOV_U16:
   1344 				srcw = (sljit_u16)srcw;
   1345 				break;
   1346 			case SLJIT_MOV_S16:
   1347 				srcw = (sljit_s16)srcw;
   1348 				break;
   1349 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1350 			case SLJIT_MOV_U32:
   1351 				srcw = (sljit_u32)srcw;
   1352 				break;
   1353 			case SLJIT_MOV_S32:
   1354 				srcw = (sljit_s32)srcw;
   1355 				break;
   1356 #endif
   1357 			}
   1358 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1359 			if (SLJIT_UNLIKELY(dst_is_ereg))
   1360 				return emit_mov(compiler, dst, dstw, src, srcw);
   1361 #endif
   1362 		}
   1363 
   1364 		if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
   1365 			inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
   1366 			FAIL_IF(!inst);
   1367 			*inst = LEA_r_m;
   1368 			src &= SLJIT_MEM | 0xf;
   1369 			srcw = 0;
   1370 		}
   1371 
   1372 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1373 		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
   1374 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
   1375 			dst = TMP_REG1;
   1376 		}
   1377 #endif
   1378 
   1379 		switch (op) {
   1380 		case SLJIT_MOV:
   1381 		case SLJIT_MOV_P:
   1382 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1383 		case SLJIT_MOV_U32:
   1384 		case SLJIT_MOV_S32:
   1385 #endif
   1386 			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
   1387 			break;
   1388 		case SLJIT_MOV_U8:
   1389 			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
   1390 			break;
   1391 		case SLJIT_MOV_S8:
   1392 			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
   1393 			break;
   1394 		case SLJIT_MOV_U16:
   1395 			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
   1396 			break;
   1397 		case SLJIT_MOV_S16:
   1398 			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
   1399 			break;
   1400 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1401 		case SLJIT_MOV_U32:
   1402 			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
   1403 			break;
   1404 		case SLJIT_MOV_S32:
   1405 			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
   1406 			break;
   1407 #endif
   1408 		}
   1409 
   1410 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1411 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
   1412 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
   1413 #endif
   1414 
   1415 		if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
   1416 			inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
   1417 			FAIL_IF(!inst);
   1418 			*inst = LEA_r_m;
   1419 		}
   1420 		return SLJIT_SUCCESS;
   1421 	}
   1422 
   1423 	if (SLJIT_UNLIKELY(GET_FLAGS(op_flags)))
   1424 		compiler->flags_saved = 0;
   1425 
   1426 	switch (op) {
   1427 	case SLJIT_NOT:
   1428 		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
   1429 			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
   1430 		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
   1431 
   1432 	case SLJIT_NEG:
   1433 		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
   1434 			FAIL_IF(emit_save_flags(compiler));
   1435 		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);
   1436 
   1437 	case SLJIT_CLZ:
   1438 		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
   1439 			FAIL_IF(emit_save_flags(compiler));
   1440 		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
   1441 	}
   1442 
   1443 	return SLJIT_SUCCESS;
   1444 
   1445 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1446 #	undef src_is_ereg
   1447 #endif
   1448 }
   1449 
   1450 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1451 
   1452 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
   1453 	if (IS_HALFWORD(immw) || compiler->mode32) { \
   1454 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
   1455 		FAIL_IF(!inst); \
   1456 		*(inst + 1) |= (op_imm); \
   1457 	} \
   1458 	else { \
   1459 		FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
   1460 		inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
   1461 		FAIL_IF(!inst); \
   1462 		*inst = (op_mr); \
   1463 	}
   1464 
   1465 #define BINARY_EAX_IMM(op_eax_imm, immw) \
   1466 	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
   1467 
   1468 #else
   1469 
   1470 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
   1471 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
   1472 	FAIL_IF(!inst); \
   1473 	*(inst + 1) |= (op_imm);
   1474 
   1475 #define BINARY_EAX_IMM(op_eax_imm, immw) \
   1476 	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
   1477 
   1478 #endif
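
         /* Usage sketch: BINARY_IMM(ADD, ADD_rm_r, src2w, dst, dstw) picks the
            0x81/0x83 immediate group form when the constant fits in a sign
            extended 32 bit field (always true on x86-32); otherwise, on x86-64,
            it loads the constant into TMP_REG2 and falls back to the rm,r form. */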
   1479 
   1480 static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
   1481 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
   1482 	sljit_s32 dst, sljit_sw dstw,
   1483 	sljit_s32 src1, sljit_sw src1w,
   1484 	sljit_s32 src2, sljit_sw src2w)
   1485 {
   1486 	sljit_u8* inst;
   1487 
   1488 	if (dst == SLJIT_UNUSED) {
   1489 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1490 		if (src2 & SLJIT_IMM) {
   1491 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
   1492 		}
   1493 		else {
   1494 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1495 			FAIL_IF(!inst);
   1496 			*inst = op_rm;
   1497 		}
   1498 		return SLJIT_SUCCESS;
   1499 	}
   1500 
   1501 	if (dst == src1 && dstw == src1w) {
   1502 		if (src2 & SLJIT_IMM) {
   1503 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1504 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
   1505 #else
   1506 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
   1507 #endif
   1508 				BINARY_EAX_IMM(op_eax_imm, src2w);
   1509 			}
   1510 			else {
   1511 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
   1512 			}
   1513 		}
   1514 		else if (FAST_IS_REG(dst)) {
   1515 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
   1516 			FAIL_IF(!inst);
   1517 			*inst = op_rm;
   1518 		}
   1519 		else if (FAST_IS_REG(src2)) {
   1520 			/* Special exception for sljit_emit_op_flags. */
   1521 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
   1522 			FAIL_IF(!inst);
   1523 			*inst = op_mr;
   1524 		}
   1525 		else {
   1526 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
   1527 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
   1528 			FAIL_IF(!inst);
   1529 			*inst = op_mr;
   1530 		}
   1531 		return SLJIT_SUCCESS;
   1532 	}
   1533 
   1534 	/* Only for cumulative operations. */
   1535 	if (dst == src2 && dstw == src2w) {
   1536 		if (src1 & SLJIT_IMM) {
   1537 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1538 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
   1539 #else
   1540 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
   1541 #endif
   1542 				BINARY_EAX_IMM(op_eax_imm, src1w);
   1543 			}
   1544 			else {
   1545 				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
   1546 			}
   1547 		}
   1548 		else if (FAST_IS_REG(dst)) {
   1549 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
   1550 			FAIL_IF(!inst);
   1551 			*inst = op_rm;
   1552 		}
   1553 		else if (FAST_IS_REG(src1)) {
   1554 			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
   1555 			FAIL_IF(!inst);
   1556 			*inst = op_mr;
   1557 		}
   1558 		else {
   1559 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1560 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
   1561 			FAIL_IF(!inst);
   1562 			*inst = op_mr;
   1563 		}
   1564 		return SLJIT_SUCCESS;
   1565 	}
   1566 
   1567 	/* General version. */
   1568 	if (FAST_IS_REG(dst)) {
   1569 		EMIT_MOV(compiler, dst, 0, src1, src1w);
   1570 		if (src2 & SLJIT_IMM) {
   1571 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
   1572 		}
   1573 		else {
   1574 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
   1575 			FAIL_IF(!inst);
   1576 			*inst = op_rm;
   1577 		}
   1578 	}
   1579 	else {
    1580 		/* This version requires fewer memory writes. */
   1581 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1582 		if (src2 & SLJIT_IMM) {
   1583 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
   1584 		}
   1585 		else {
   1586 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1587 			FAIL_IF(!inst);
   1588 			*inst = op_rm;
   1589 		}
   1590 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   1591 	}
   1592 
   1593 	return SLJIT_SUCCESS;
   1594 }
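
/* Encoding rationale (illustrative): BINARY_EAX_IMM exists because x86 has
   a shorter accumulator form for ALU immediates.  With a 32-bit immediate:

       05 id      add eax, imm32    ; 5 bytes
       81 /0 id   add r/m32, imm32  ; 6 bytes

   The dst == SLJIT_R0 tests above exclude byte-range immediates because
   those already use the 3-byte sign-extended form (83 /0 ib). */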
   1595 
   1596 static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
   1597 	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
   1598 	sljit_s32 dst, sljit_sw dstw,
   1599 	sljit_s32 src1, sljit_sw src1w,
   1600 	sljit_s32 src2, sljit_sw src2w)
   1601 {
   1602 	sljit_u8* inst;
   1603 
   1604 	if (dst == SLJIT_UNUSED) {
   1605 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1606 		if (src2 & SLJIT_IMM) {
   1607 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
   1608 		}
   1609 		else {
   1610 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1611 			FAIL_IF(!inst);
   1612 			*inst = op_rm;
   1613 		}
   1614 		return SLJIT_SUCCESS;
   1615 	}
   1616 
   1617 	if (dst == src1 && dstw == src1w) {
   1618 		if (src2 & SLJIT_IMM) {
   1619 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1620 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
   1621 #else
   1622 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
   1623 #endif
   1624 				BINARY_EAX_IMM(op_eax_imm, src2w);
   1625 			}
   1626 			else {
   1627 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
   1628 			}
   1629 		}
   1630 		else if (FAST_IS_REG(dst)) {
   1631 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
   1632 			FAIL_IF(!inst);
   1633 			*inst = op_rm;
   1634 		}
   1635 		else if (FAST_IS_REG(src2)) {
   1636 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
   1637 			FAIL_IF(!inst);
   1638 			*inst = op_mr;
   1639 		}
   1640 		else {
   1641 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
   1642 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
   1643 			FAIL_IF(!inst);
   1644 			*inst = op_mr;
   1645 		}
   1646 		return SLJIT_SUCCESS;
   1647 	}
   1648 
   1649 	/* General version. */
   1650 	if (FAST_IS_REG(dst) && dst != src2) {
   1651 		EMIT_MOV(compiler, dst, 0, src1, src1w);
   1652 		if (src2 & SLJIT_IMM) {
   1653 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
   1654 		}
   1655 		else {
   1656 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
   1657 			FAIL_IF(!inst);
   1658 			*inst = op_rm;
   1659 		}
   1660 	}
   1661 	else {
    1662 		/* This version requires fewer memory writes. */
   1663 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1664 		if (src2 & SLJIT_IMM) {
   1665 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
   1666 		}
   1667 		else {
   1668 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1669 			FAIL_IF(!inst);
   1670 			*inst = op_rm;
   1671 		}
   1672 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   1673 	}
   1674 
   1675 	return SLJIT_SUCCESS;
   1676 }
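
/* Sketch of why the general case also requires dst != src2: for a
   non-commutative operation such as SUB, the in-place form would execute
   "mov dst, src1" and destroy src2 before it is read, so that operand
   shape falls through to the TMP_REG1 path instead. */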
   1677 
   1678 static sljit_s32 emit_mul(struct sljit_compiler *compiler,
   1679 	sljit_s32 dst, sljit_sw dstw,
   1680 	sljit_s32 src1, sljit_sw src1w,
   1681 	sljit_s32 src2, sljit_sw src2w)
   1682 {
   1683 	sljit_u8* inst;
   1684 	sljit_s32 dst_r;
   1685 
   1686 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
   1687 
   1688 	/* Register destination. */
   1689 	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
   1690 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
   1691 		FAIL_IF(!inst);
   1692 		*inst++ = GROUP_0F;
   1693 		*inst = IMUL_r_rm;
   1694 	}
   1695 	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
   1696 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
   1697 		FAIL_IF(!inst);
   1698 		*inst++ = GROUP_0F;
   1699 		*inst = IMUL_r_rm;
   1700 	}
   1701 	else if (src1 & SLJIT_IMM) {
   1702 		if (src2 & SLJIT_IMM) {
   1703 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
   1704 			src2 = dst_r;
   1705 			src2w = 0;
   1706 		}
   1707 
   1708 		if (src1w <= 127 && src1w >= -128) {
   1709 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
   1710 			FAIL_IF(!inst);
   1711 			*inst = IMUL_r_rm_i8;
   1712 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
   1713 			FAIL_IF(!inst);
   1714 			INC_SIZE(1);
   1715 			*inst = (sljit_s8)src1w;
   1716 		}
   1717 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1718 		else {
   1719 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
   1720 			FAIL_IF(!inst);
   1721 			*inst = IMUL_r_rm_i32;
   1722 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
   1723 			FAIL_IF(!inst);
   1724 			INC_SIZE(4);
   1725 			sljit_unaligned_store_sw(inst, src1w);
   1726 		}
   1727 #else
   1728 		else if (IS_HALFWORD(src1w)) {
   1729 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
   1730 			FAIL_IF(!inst);
   1731 			*inst = IMUL_r_rm_i32;
   1732 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
   1733 			FAIL_IF(!inst);
   1734 			INC_SIZE(4);
   1735 			sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
   1736 		}
   1737 		else {
   1738 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
   1739 			if (dst_r != src2)
   1740 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
   1741 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
   1742 			FAIL_IF(!inst);
   1743 			*inst++ = GROUP_0F;
   1744 			*inst = IMUL_r_rm;
   1745 		}
   1746 #endif
   1747 	}
   1748 	else if (src2 & SLJIT_IMM) {
   1749 		/* Note: src1 is NOT immediate. */
   1750 
   1751 		if (src2w <= 127 && src2w >= -128) {
   1752 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
   1753 			FAIL_IF(!inst);
   1754 			*inst = IMUL_r_rm_i8;
   1755 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
   1756 			FAIL_IF(!inst);
   1757 			INC_SIZE(1);
   1758 			*inst = (sljit_s8)src2w;
   1759 		}
   1760 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1761 		else {
   1762 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
   1763 			FAIL_IF(!inst);
   1764 			*inst = IMUL_r_rm_i32;
   1765 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
   1766 			FAIL_IF(!inst);
   1767 			INC_SIZE(4);
   1768 			sljit_unaligned_store_sw(inst, src2w);
   1769 		}
   1770 #else
   1771 		else if (IS_HALFWORD(src2w)) {
   1772 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
   1773 			FAIL_IF(!inst);
   1774 			*inst = IMUL_r_rm_i32;
   1775 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
   1776 			FAIL_IF(!inst);
   1777 			INC_SIZE(4);
   1778 			sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
   1779 		}
   1780 		else {
   1781 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
   1782 			if (dst_r != src1)
   1783 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
   1784 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
   1785 			FAIL_IF(!inst);
   1786 			*inst++ = GROUP_0F;
   1787 			*inst = IMUL_r_rm;
   1788 		}
   1789 #endif
   1790 	}
   1791 	else {
   1792 		/* Neither argument is immediate. */
   1793 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
   1794 			dst_r = TMP_REG1;
   1795 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
   1796 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
   1797 		FAIL_IF(!inst);
   1798 		*inst++ = GROUP_0F;
   1799 		*inst = IMUL_r_rm;
   1800 	}
   1801 
   1802 	if (dst_r == TMP_REG1)
   1803 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   1804 
   1805 	return SLJIT_SUCCESS;
   1806 }
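
/* The three IMUL forms chosen above (illustrative encodings):

       6B /r ib   imul r32, r/m32, imm8   ; sign-extended byte immediate
       69 /r id   imul r32, r/m32, imm32
       0F AF /r   imul r32, r/m32         ; two-operand register form

   Since no form takes a 64-bit immediate, an out-of-range constant is
   first loaded into TMP_REG2 and multiplied with the register form. */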
   1807 
   1808 static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler, sljit_s32 keep_flags,
   1809 	sljit_s32 dst, sljit_sw dstw,
   1810 	sljit_s32 src1, sljit_sw src1w,
   1811 	sljit_s32 src2, sljit_sw src2w)
   1812 {
   1813 	sljit_u8* inst;
   1814 	sljit_s32 dst_r, done = 0;
   1815 
    1816 	/* These cases are better left to be handled the normal way. */
   1817 	if (!keep_flags) {
   1818 		if (dst == src1 && dstw == src1w)
   1819 			return SLJIT_ERR_UNSUPPORTED;
   1820 		if (dst == src2 && dstw == src2w)
   1821 			return SLJIT_ERR_UNSUPPORTED;
   1822 	}
   1823 
   1824 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
   1825 
   1826 	if (FAST_IS_REG(src1)) {
   1827 		if (FAST_IS_REG(src2)) {
   1828 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
   1829 			FAIL_IF(!inst);
   1830 			*inst = LEA_r_m;
   1831 			done = 1;
   1832 		}
   1833 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1834 		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
   1835 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
   1836 #else
   1837 		if (src2 & SLJIT_IMM) {
   1838 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
   1839 #endif
   1840 			FAIL_IF(!inst);
   1841 			*inst = LEA_r_m;
   1842 			done = 1;
   1843 		}
   1844 	}
   1845 	else if (FAST_IS_REG(src2)) {
   1846 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1847 		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
   1848 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
   1849 #else
   1850 		if (src1 & SLJIT_IMM) {
   1851 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
   1852 #endif
   1853 			FAIL_IF(!inst);
   1854 			*inst = LEA_r_m;
   1855 			done = 1;
   1856 		}
   1857 	}
   1858 
   1859 	if (done) {
   1860 		if (dst_r == TMP_REG1)
   1861 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
   1862 		return SLJIT_SUCCESS;
   1863 	}
   1864 	return SLJIT_ERR_UNSUPPORTED;
   1865 }
   1866 
   1867 static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
   1868 	sljit_s32 src1, sljit_sw src1w,
   1869 	sljit_s32 src2, sljit_sw src2w)
   1870 {
   1871 	sljit_u8* inst;
   1872 
   1873 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1874 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
   1875 #else
   1876 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
   1877 #endif
   1878 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
   1879 		return SLJIT_SUCCESS;
   1880 	}
   1881 
   1882 	if (FAST_IS_REG(src1)) {
   1883 		if (src2 & SLJIT_IMM) {
   1884 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
   1885 		}
   1886 		else {
   1887 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
   1888 			FAIL_IF(!inst);
   1889 			*inst = CMP_r_rm;
   1890 		}
   1891 		return SLJIT_SUCCESS;
   1892 	}
   1893 
   1894 	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
   1895 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
   1896 		FAIL_IF(!inst);
   1897 		*inst = CMP_rm_r;
   1898 		return SLJIT_SUCCESS;
   1899 	}
   1900 
   1901 	if (src2 & SLJIT_IMM) {
   1902 		if (src1 & SLJIT_IMM) {
   1903 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1904 			src1 = TMP_REG1;
   1905 			src1w = 0;
   1906 		}
   1907 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
   1908 	}
   1909 	else {
   1910 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1911 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1912 		FAIL_IF(!inst);
   1913 		*inst = CMP_r_rm;
   1914 	}
   1915 	return SLJIT_SUCCESS;
   1916 }
   1917 
   1918 static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
   1919 	sljit_s32 src1, sljit_sw src1w,
   1920 	sljit_s32 src2, sljit_sw src2w)
   1921 {
   1922 	sljit_u8* inst;
   1923 
   1924 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1925 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
   1926 #else
   1927 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
   1928 #endif
   1929 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
   1930 		return SLJIT_SUCCESS;
   1931 	}
   1932 
   1933 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    1934 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
   1935 #else
   1936 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
   1937 #endif
   1938 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
   1939 		return SLJIT_SUCCESS;
   1940 	}
   1941 
   1942 	if (!(src1 & SLJIT_IMM)) {
   1943 		if (src2 & SLJIT_IMM) {
   1944 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1945 			if (IS_HALFWORD(src2w) || compiler->mode32) {
   1946 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
   1947 				FAIL_IF(!inst);
   1948 				*inst = GROUP_F7;
   1949 			}
   1950 			else {
   1951 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
   1952 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
   1953 				FAIL_IF(!inst);
   1954 				*inst = TEST_rm_r;
   1955 			}
   1956 #else
   1957 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
   1958 			FAIL_IF(!inst);
   1959 			*inst = GROUP_F7;
   1960 #endif
   1961 			return SLJIT_SUCCESS;
   1962 		}
   1963 		else if (FAST_IS_REG(src1)) {
   1964 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
   1965 			FAIL_IF(!inst);
   1966 			*inst = TEST_rm_r;
   1967 			return SLJIT_SUCCESS;
   1968 		}
   1969 	}
   1970 
   1971 	if (!(src2 & SLJIT_IMM)) {
   1972 		if (src1 & SLJIT_IMM) {
   1973 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1974 			if (IS_HALFWORD(src1w) || compiler->mode32) {
   1975 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
   1976 				FAIL_IF(!inst);
   1977 				*inst = GROUP_F7;
   1978 			}
   1979 			else {
   1980 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
   1981 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
   1982 				FAIL_IF(!inst);
   1983 				*inst = TEST_rm_r;
   1984 			}
   1985 #else
   1986 			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, src2w);
   1987 			FAIL_IF(!inst);
   1988 			*inst = GROUP_F7;
   1989 #endif
   1990 			return SLJIT_SUCCESS;
   1991 		}
   1992 		else if (FAST_IS_REG(src2)) {
   1993 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
   1994 			FAIL_IF(!inst);
   1995 			*inst = TEST_rm_r;
   1996 			return SLJIT_SUCCESS;
   1997 		}
   1998 	}
   1999 
   2000 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2001 	if (src2 & SLJIT_IMM) {
   2002 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2003 		if (IS_HALFWORD(src2w) || compiler->mode32) {
   2004 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
   2005 			FAIL_IF(!inst);
   2006 			*inst = GROUP_F7;
   2007 		}
   2008 		else {
   2009 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
   2010 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
   2011 			FAIL_IF(!inst);
   2012 			*inst = TEST_rm_r;
   2013 		}
   2014 #else
   2015 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
   2016 		FAIL_IF(!inst);
   2017 		*inst = GROUP_F7;
   2018 #endif
   2019 	}
   2020 	else {
   2021 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   2022 		FAIL_IF(!inst);
   2023 		*inst = TEST_rm_r;
   2024 	}
   2025 	return SLJIT_SUCCESS;
   2026 }
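
/* Note (illustrative): unlike the ALU group handled by BINARY_IMM, TEST
   has no sign-extended imm8 form for full-width operands, only F7 /0 with
   a full-size immediate (GROUP_F7 above); this is why 64-bit immediates
   that do not fit in 32 bits are materialized in TMP_REG2 first. */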
   2027 
   2028 static sljit_s32 emit_shift(struct sljit_compiler *compiler,
   2029 	sljit_u8 mode,
   2030 	sljit_s32 dst, sljit_sw dstw,
   2031 	sljit_s32 src1, sljit_sw src1w,
   2032 	sljit_s32 src2, sljit_sw src2w)
   2033 {
   2034 	sljit_u8* inst;
   2035 
   2036 	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
   2037 		if (dst == src1 && dstw == src1w) {
   2038 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
   2039 			FAIL_IF(!inst);
   2040 			*inst |= mode;
   2041 			return SLJIT_SUCCESS;
   2042 		}
   2043 		if (dst == SLJIT_UNUSED) {
   2044 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2045 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
   2046 			FAIL_IF(!inst);
   2047 			*inst |= mode;
   2048 			return SLJIT_SUCCESS;
   2049 		}
   2050 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
   2051 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2052 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2053 			FAIL_IF(!inst);
   2054 			*inst |= mode;
   2055 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2056 			return SLJIT_SUCCESS;
   2057 		}
   2058 		if (FAST_IS_REG(dst)) {
   2059 			EMIT_MOV(compiler, dst, 0, src1, src1w);
   2060 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
   2061 			FAIL_IF(!inst);
   2062 			*inst |= mode;
   2063 			return SLJIT_SUCCESS;
   2064 		}
   2065 
   2066 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2067 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
   2068 		FAIL_IF(!inst);
   2069 		*inst |= mode;
   2070 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   2071 		return SLJIT_SUCCESS;
   2072 	}
   2073 
   2074 	if (dst == SLJIT_PREF_SHIFT_REG) {
   2075 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2076 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
   2077 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2078 		FAIL_IF(!inst);
   2079 		*inst |= mode;
   2080 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2081 	}
   2082 	else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
   2083 		if (src1 != dst)
   2084 			EMIT_MOV(compiler, dst, 0, src1, src1w);
   2085 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
   2086 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
   2087 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
   2088 		FAIL_IF(!inst);
   2089 		*inst |= mode;
   2090 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2091 	}
   2092 	else {
    2093 		/* This case is really difficult, since ecx itself may be used for
    2094 		   addressing, and we must ensure this works even in that case. */
   2095 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2096 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2097 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
   2098 #else
   2099 		/* [esp+0] contains the flags. */
   2100 		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
   2101 #endif
   2102 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
   2103 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2104 		FAIL_IF(!inst);
   2105 		*inst |= mode;
   2106 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2107 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
   2108 #else
   2109 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
   2110 #endif
   2111 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   2112 	}
   2113 
   2114 	return SLJIT_SUCCESS;
   2115 }
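
/* Sketch of the sequence emitted by the fallback above, since variable
   shifts hard-wire the count in cl:

       mov TMP_REG1, src1
       mov [esp + sizeof(sljit_sw)], ecx   ; mov TMP_REG2, rcx on x86-64
       mov ecx, src2
       shl TMP_REG1, cl                    ; or shr/sar, selected by "mode"
       mov ecx, [esp + sizeof(sljit_sw)]
       mov dst, TMP_REG1
*/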
   2116 
   2117 static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
   2118 	sljit_u8 mode, sljit_s32 set_flags,
   2119 	sljit_s32 dst, sljit_sw dstw,
   2120 	sljit_s32 src1, sljit_sw src1w,
   2121 	sljit_s32 src2, sljit_sw src2w)
   2122 {
   2123 	/* The CPU does not set flags if the shift count is 0. */
   2124 	if (src2 & SLJIT_IMM) {
   2125 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2126 		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
   2127 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
   2128 #else
   2129 		if ((src2w & 0x1f) != 0)
   2130 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
   2131 #endif
   2132 		if (!set_flags)
   2133 			return emit_mov(compiler, dst, dstw, src1, src1w);
   2134 		/* OR dst, src, 0 */
   2135 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
   2136 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
   2137 	}
   2138 
   2139 	if (!set_flags)
   2140 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
   2141 
   2142 	if (!FAST_IS_REG(dst))
   2143 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
   2144 
    2145 	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
   2146 
   2147 	if (FAST_IS_REG(dst))
   2148 		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
   2149 	return SLJIT_SUCCESS;
   2150 }
   2151 
   2152 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
   2153 	sljit_s32 dst, sljit_sw dstw,
   2154 	sljit_s32 src1, sljit_sw src1w,
   2155 	sljit_s32 src2, sljit_sw src2w)
   2156 {
   2157 	CHECK_ERROR();
   2158 	CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
   2159 	ADJUST_LOCAL_OFFSET(dst, dstw);
   2160 	ADJUST_LOCAL_OFFSET(src1, src1w);
   2161 	ADJUST_LOCAL_OFFSET(src2, src2w);
   2162 
   2163 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
   2164 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
   2165 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
   2166 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2167 	compiler->mode32 = op & SLJIT_I32_OP;
   2168 #endif
   2169 
   2170 	if (GET_OPCODE(op) >= SLJIT_MUL) {
   2171 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
   2172 			compiler->flags_saved = 0;
   2173 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
   2174 			FAIL_IF(emit_save_flags(compiler));
   2175 	}
   2176 
   2177 	switch (GET_OPCODE(op)) {
   2178 	case SLJIT_ADD:
   2179 		if (!GET_FLAGS(op)) {
   2180 			if (emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
   2181 				return compiler->error;
   2182 		}
   2183 		else
   2184 			compiler->flags_saved = 0;
   2185 		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
   2186 			FAIL_IF(emit_save_flags(compiler));
   2187 		return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
   2188 			dst, dstw, src1, src1w, src2, src2w);
   2189 	case SLJIT_ADDC:
   2190 		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
   2191 			FAIL_IF(emit_restore_flags(compiler, 1));
   2192 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
   2193 			FAIL_IF(emit_save_flags(compiler));
   2194 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
   2195 			compiler->flags_saved = 0;
   2196 		return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
   2197 			dst, dstw, src1, src1w, src2, src2w);
   2198 	case SLJIT_SUB:
   2199 		if (!GET_FLAGS(op)) {
   2200 			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
   2201 				return compiler->error;
   2202 		}
   2203 		else
   2204 			compiler->flags_saved = 0;
   2205 		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
   2206 			FAIL_IF(emit_save_flags(compiler));
   2207 		if (dst == SLJIT_UNUSED)
   2208 			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
   2209 		return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
   2210 			dst, dstw, src1, src1w, src2, src2w);
   2211 	case SLJIT_SUBC:
   2212 		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
   2213 			FAIL_IF(emit_restore_flags(compiler, 1));
   2214 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
   2215 			FAIL_IF(emit_save_flags(compiler));
   2216 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
   2217 			compiler->flags_saved = 0;
   2218 		return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
   2219 			dst, dstw, src1, src1w, src2, src2w);
   2220 	case SLJIT_MUL:
   2221 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
   2222 	case SLJIT_AND:
   2223 		if (dst == SLJIT_UNUSED)
   2224 			return emit_test_binary(compiler, src1, src1w, src2, src2w);
   2225 		return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
   2226 			dst, dstw, src1, src1w, src2, src2w);
   2227 	case SLJIT_OR:
   2228 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
   2229 			dst, dstw, src1, src1w, src2, src2w);
   2230 	case SLJIT_XOR:
   2231 		return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
   2232 			dst, dstw, src1, src1w, src2, src2w);
   2233 	case SLJIT_SHL:
   2234 		return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
   2235 			dst, dstw, src1, src1w, src2, src2w);
   2236 	case SLJIT_LSHR:
   2237 		return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
   2238 			dst, dstw, src1, src1w, src2, src2w);
   2239 	case SLJIT_ASHR:
   2240 		return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
   2241 			dst, dstw, src1, src1w, src2, src2w);
   2242 	}
   2243 
   2244 	return SLJIT_SUCCESS;
   2245 }
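
/* Usage sketch (hypothetical caller): computing R0 = R1 + [SP + 8] while
   requesting the zero flag for a later conditional jump could look like:

       sljit_emit_op2(compiler, SLJIT_ADD | SLJIT_SET_E,
           SLJIT_R0, 0, SLJIT_R1, 0, SLJIT_MEM1(SLJIT_SP), 8);
*/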
   2246 
   2247 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
   2248 {
   2249 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
   2250 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   2251 	if (reg >= SLJIT_R3 && reg <= SLJIT_R6)
   2252 		return -1;
   2253 #endif
   2254 	return reg_map[reg];
   2255 }
   2256 
   2257 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
   2258 {
   2259 	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
   2260 	return reg;
   2261 }
   2262 
   2263 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
   2264 	void *instruction, sljit_s32 size)
   2265 {
   2266 	sljit_u8 *inst;
   2267 
   2268 	CHECK_ERROR();
   2269 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
   2270 
   2271 	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
   2272 	FAIL_IF(!inst);
   2273 	INC_SIZE(size);
   2274 	SLJIT_MEMCPY(inst, instruction, size);
   2275 	return SLJIT_SUCCESS;
   2276 }
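
/* Usage sketch (hypothetical caller): any raw machine instruction can be
   injected this way, for example RDTSC (0F 31):

       sljit_u8 rdtsc[2] = { 0x0F, 0x31 };
       sljit_emit_op_custom(compiler, rdtsc, 2);
*/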
   2277 
   2278 /* --------------------------------------------------------------------- */
   2279 /*  Floating point operators                                             */
   2280 /* --------------------------------------------------------------------- */
   2281 
    2282 /* Alignment + 2 * 16 bytes per precision (4 * 16 bytes in total). */
   2283 static sljit_s32 sse2_data[3 + (4 + 4) * 2];
   2284 static sljit_s32 *sse2_buffer;
   2285 
   2286 static void init_compiler(void)
   2287 {
   2288 	sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf);
    2289 	/* Single precision constants (sign and absolute value masks). */
   2290 	sse2_buffer[0] = 0x80000000;
   2291 	sse2_buffer[4] = 0x7fffffff;
    2292 	/* Double precision constants (the same masks, 64 bits wide). */
   2293 	sse2_buffer[8] = 0;
   2294 	sse2_buffer[9] = 0x80000000;
   2295 	sse2_buffer[12] = 0xffffffff;
   2296 	sse2_buffer[13] = 0x7fffffff;
   2297 }
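
/* Resulting sse2_buffer layout (16-byte aligned, 32-bit words):

       [0..3]   0x80000000 0 0 0            f32 sign mask (XORPD negates)
       [4..7]   0x7fffffff 0 0 0            f32 abs mask  (ANDPD clears sign)
       [8..11]  0 0x80000000 0 0            f64 sign mask (bit 63)
       [12..15] 0xffffffff 0x7fffffff 0 0   f64 abs mask

   Only lane 0 of the destination carries the scalar value, so the effect
   of the masks on the upper lanes is irrelevant. */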
   2298 
   2299 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_is_fpu_available(void)
   2300 {
   2301 #ifdef SLJIT_IS_FPU_AVAILABLE
   2302 	return SLJIT_IS_FPU_AVAILABLE;
   2303 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
   2304 	if (cpu_has_sse2 == -1)
   2305 		get_cpu_features();
   2306 	return cpu_has_sse2;
   2307 #else /* SLJIT_DETECT_SSE2 */
   2308 	return 1;
   2309 #endif /* SLJIT_DETECT_SSE2 */
   2310 }
   2311 
   2312 static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
   2313 	sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
   2314 {
   2315 	sljit_u8 *inst;
   2316 
   2317 	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
   2318 	FAIL_IF(!inst);
   2319 	*inst++ = GROUP_0F;
   2320 	*inst = opcode;
   2321 	return SLJIT_SUCCESS;
   2322 }
   2323 
   2324 static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode,
   2325 	sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
   2326 {
   2327 	sljit_u8 *inst;
   2328 
   2329 	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
   2330 	FAIL_IF(!inst);
   2331 	*inst++ = GROUP_0F;
   2332 	*inst = opcode;
   2333 	return SLJIT_SUCCESS;
   2334 }
   2335 
   2336 static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
   2337 	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
   2338 {
   2339 	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
   2340 }
   2341 
   2342 static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
   2343 	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
   2344 {
   2345 	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
   2346 }
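
/* Note (illustrative): MOVSD_x_xm / MOVSD_xm_x are just the 0F 10 / 0F 11
   opcode bytes; the mandatory prefix selected in emit_sse2 determines the
   width, so F3 0F 10 decodes as MOVSS and F2 0F 10 as MOVSD. */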
   2347 
   2348 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
   2349 	sljit_s32 dst, sljit_sw dstw,
   2350 	sljit_s32 src, sljit_sw srcw)
   2351 {
   2352 	sljit_s32 dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
   2353 	sljit_u8 *inst;
   2354 
   2355 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2356 	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
   2357 		compiler->mode32 = 0;
   2358 #endif
   2359 
   2360 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
   2361 	FAIL_IF(!inst);
   2362 	*inst++ = GROUP_0F;
   2363 	*inst = CVTTSD2SI_r_xm;
   2364 
   2365 	if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
   2366 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
   2367 	return SLJIT_SUCCESS;
   2368 }
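
/* CVTTSD2SI/CVTTSS2SI ("convert with truncation") round toward zero
   regardless of the MXCSR rounding mode, matching C cast semantics. */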
   2369 
   2370 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
   2371 	sljit_s32 dst, sljit_sw dstw,
   2372 	sljit_s32 src, sljit_sw srcw)
   2373 {
   2374 	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
   2375 	sljit_u8 *inst;
   2376 
   2377 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2378 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
   2379 		compiler->mode32 = 0;
   2380 #endif
   2381 
   2382 	if (src & SLJIT_IMM) {
   2383 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2384 		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
   2385 			srcw = (sljit_s32)srcw;
   2386 #endif
   2387 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   2388 		src = TMP_REG1;
   2389 		srcw = 0;
   2390 	}
   2391 
   2392 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
   2393 	FAIL_IF(!inst);
   2394 	*inst++ = GROUP_0F;
   2395 	*inst = CVTSI2SD_x_rm;
   2396 
   2397 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2398 	compiler->mode32 = 1;
   2399 #endif
   2400 	if (dst_r == TMP_FREG)
   2401 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
   2402 	return SLJIT_SUCCESS;
   2403 }
   2404 
   2405 static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
   2406 	sljit_s32 src1, sljit_sw src1w,
   2407 	sljit_s32 src2, sljit_sw src2w)
   2408 {
   2409 	compiler->flags_saved = 0;
   2410 	if (!FAST_IS_REG(src1)) {
   2411 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
   2412 		src1 = TMP_FREG;
   2413 	}
   2414 	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w);
   2415 }
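
/* UCOMISD/UCOMISS set ZF, PF and CF the way an unsigned integer compare
   would, and report unordered (NaN) operands through PF; this is what
   allows the float conditions to map onto the unsigned jump codes. */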
   2416 
   2417 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
   2418 	sljit_s32 dst, sljit_sw dstw,
   2419 	sljit_s32 src, sljit_sw srcw)
   2420 {
   2421 	sljit_s32 dst_r;
   2422 
   2423 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2424 	compiler->mode32 = 1;
   2425 #endif
   2426 
   2427 	CHECK_ERROR();
   2428 	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
   2429 
   2430 	if (GET_OPCODE(op) == SLJIT_MOV_F64) {
   2431 		if (FAST_IS_REG(dst))
   2432 			return emit_sse2_load(compiler, op & SLJIT_F32_OP, dst, src, srcw);
   2433 		if (FAST_IS_REG(src))
   2434 			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, src);
   2435 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src, srcw));
   2436 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
   2437 	}
   2438 
   2439 	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
   2440 		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
   2441 		if (FAST_IS_REG(src)) {
    2442 			/* We overwrite the high bits of the source register. From SLJIT's
    2443 			   point of view, this is not an issue.
    2444 			   Note: with SSE3, we could also use MOVDDUP or MOVSLDUP. */
   2445 			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_F32_OP, src, src, 0));
   2446 		}
   2447 		else {
   2448 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_F32_OP), TMP_FREG, src, srcw));
   2449 			src = TMP_FREG;
   2450 		}
   2451 
   2452 		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_F32_OP, dst_r, src, 0));
   2453 		if (dst_r == TMP_FREG)
   2454 			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
   2455 		return SLJIT_SUCCESS;
   2456 	}
   2457 
   2458 	if (SLOW_IS_REG(dst)) {
   2459 		dst_r = dst;
   2460 		if (dst != src)
   2461 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
   2462 	}
   2463 	else {
   2464 		dst_r = TMP_FREG;
   2465 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
   2466 	}
   2467 
   2468 	switch (GET_OPCODE(op)) {
   2469 	case SLJIT_NEG_F64:
   2470 		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer : sse2_buffer + 8)));
   2471 		break;
   2472 
   2473 	case SLJIT_ABS_F64:
   2474 		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
   2475 		break;
   2476 	}
   2477 
   2478 	if (dst_r == TMP_FREG)
   2479 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
   2480 	return SLJIT_SUCCESS;
   2481 }
   2482 
   2483 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
   2484 	sljit_s32 dst, sljit_sw dstw,
   2485 	sljit_s32 src1, sljit_sw src1w,
   2486 	sljit_s32 src2, sljit_sw src2w)
   2487 {
   2488 	sljit_s32 dst_r;
   2489 
   2490 	CHECK_ERROR();
   2491 	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
   2492 	ADJUST_LOCAL_OFFSET(dst, dstw);
   2493 	ADJUST_LOCAL_OFFSET(src1, src1w);
   2494 	ADJUST_LOCAL_OFFSET(src2, src2w);
   2495 
   2496 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2497 	compiler->mode32 = 1;
   2498 #endif
   2499 
   2500 	if (FAST_IS_REG(dst)) {
   2501 		dst_r = dst;
   2502 		if (dst == src1)
   2503 			; /* Do nothing here. */
   2504 		else if (dst == src2 && (op == SLJIT_ADD_F64 || op == SLJIT_MUL_F64)) {
   2505 			/* Swap arguments. */
   2506 			src2 = src1;
   2507 			src2w = src1w;
   2508 		}
   2509 		else if (dst != src2)
   2510 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src1, src1w));
   2511 		else {
   2512 			dst_r = TMP_FREG;
   2513 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
   2514 		}
   2515 	}
   2516 	else {
   2517 		dst_r = TMP_FREG;
   2518 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
   2519 	}
   2520 
   2521 	switch (GET_OPCODE(op)) {
   2522 	case SLJIT_ADD_F64:
   2523 		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
   2524 		break;
   2525 
   2526 	case SLJIT_SUB_F64:
   2527 		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
   2528 		break;
   2529 
   2530 	case SLJIT_MUL_F64:
   2531 		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
   2532 		break;
   2533 
   2534 	case SLJIT_DIV_F64:
   2535 		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
   2536 		break;
   2537 	}
   2538 
   2539 	if (dst_r == TMP_FREG)
   2540 		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
   2541 	return SLJIT_SUCCESS;
   2542 }
   2543 
   2544 /* --------------------------------------------------------------------- */
   2545 /*  Conditional instructions                                             */
   2546 /* --------------------------------------------------------------------- */
   2547 
   2548 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
   2549 {
   2550 	sljit_u8 *inst;
   2551 	struct sljit_label *label;
   2552 
   2553 	CHECK_ERROR_PTR();
   2554 	CHECK_PTR(check_sljit_emit_label(compiler));
   2555 
    2556 	/* We should restore the flags before the label,
    2557 	   since other jumps taken to this label have their own flags as well. */
   2558 	if (SLJIT_UNLIKELY(compiler->flags_saved))
   2559 		PTR_FAIL_IF(emit_restore_flags(compiler, 0));
   2560 
   2561 	if (compiler->last_label && compiler->last_label->size == compiler->size)
   2562 		return compiler->last_label;
   2563 
   2564 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
   2565 	PTR_FAIL_IF(!label);
   2566 	set_label(label, compiler);
   2567 
   2568 	inst = (sljit_u8*)ensure_buf(compiler, 2);
   2569 	PTR_FAIL_IF(!inst);
   2570 
   2571 	*inst++ = 0;
   2572 	*inst++ = 0;
   2573 
   2574 	return label;
   2575 }
   2576 
   2577 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
   2578 {
   2579 	sljit_u8 *inst;
   2580 	struct sljit_jump *jump;
   2581 
   2582 	CHECK_ERROR_PTR();
   2583 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
   2584 
   2585 	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
   2586 		if ((type & 0xff) <= SLJIT_JUMP)
   2587 			PTR_FAIL_IF(emit_restore_flags(compiler, 0));
   2588 		compiler->flags_saved = 0;
   2589 	}
   2590 
   2591 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
   2592 	PTR_FAIL_IF_NULL(jump);
   2593 	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
   2594 	type &= 0xff;
   2595 
   2596 	if (type >= SLJIT_CALL1)
   2597 		PTR_FAIL_IF(call_with_args(compiler, type));
   2598 
   2599 	/* Worst case size. */
   2600 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   2601 	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
   2602 #else
   2603 	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
   2604 #endif
   2605 
   2606 	inst = (sljit_u8*)ensure_buf(compiler, 2);
   2607 	PTR_FAIL_IF_NULL(inst);
   2608 
   2609 	*inst++ = 0;
   2610 	*inst++ = type + 4;
   2611 	return jump;
   2612 }
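
/* Where the worst-case sizes come from (illustrative):
   x86-32: jmp rel32 is 5 bytes, jcc rel32 is 6 bytes.
   x86-64: mov r64, imm64 is 10 bytes and the indirect jump or call through
   the register (REX + FF /4 or FF /2) is 3 bytes; conditional jumps add a
   2-byte short jcc that skips this sequence. */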
   2613 
   2614 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
   2615 {
   2616 	sljit_u8 *inst;
   2617 	struct sljit_jump *jump;
   2618 
   2619 	CHECK_ERROR();
   2620 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
   2621 	ADJUST_LOCAL_OFFSET(src, srcw);
   2622 
   2623 	CHECK_EXTRA_REGS(src, srcw, (void)0);
   2624 
   2625 	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
   2626 		if (type <= SLJIT_JUMP)
   2627 			FAIL_IF(emit_restore_flags(compiler, 0));
   2628 		compiler->flags_saved = 0;
   2629 	}
   2630 
   2631 	if (type >= SLJIT_CALL1) {
   2632 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   2633 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
   2634 		if (src == SLJIT_R2) {
   2635 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
   2636 			src = TMP_REG1;
   2637 		}
   2638 		if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
   2639 			srcw += sizeof(sljit_sw);
   2640 #endif
   2641 #endif
   2642 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
   2643 		if (src == SLJIT_R2) {
   2644 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
   2645 			src = TMP_REG1;
   2646 		}
   2647 #endif
   2648 		FAIL_IF(call_with_args(compiler, type));
   2649 	}
   2650 
   2651 	if (src == SLJIT_IMM) {
   2652 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
   2653 		FAIL_IF_NULL(jump);
   2654 		set_jump(jump, compiler, JUMP_ADDR);
   2655 		jump->u.target = srcw;
   2656 
   2657 		/* Worst case size. */
   2658 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   2659 		compiler->size += 5;
   2660 #else
   2661 		compiler->size += 10 + 3;
   2662 #endif
   2663 
   2664 		inst = (sljit_u8*)ensure_buf(compiler, 2);
   2665 		FAIL_IF_NULL(inst);
   2666 
   2667 		*inst++ = 0;
   2668 		*inst++ = type + 4;
   2669 	}
   2670 	else {
   2671 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2672 		/* REX_W is not necessary (src is not immediate). */
   2673 		compiler->mode32 = 1;
   2674 #endif
   2675 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
   2676 		FAIL_IF(!inst);
   2677 		*inst++ = GROUP_FF;
   2678 		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
   2679 	}
   2680 	return SLJIT_SUCCESS;
   2681 }
   2682 
   2683 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
   2684 	sljit_s32 dst, sljit_sw dstw,
   2685 	sljit_s32 src, sljit_sw srcw,
   2686 	sljit_s32 type)
   2687 {
   2688 	sljit_u8 *inst;
   2689 	sljit_u8 cond_set = 0;
   2690 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2691 	sljit_s32 reg;
   2692 #else
    2693 	/* CHECK_EXTRA_REGS might overwrite these values. */
   2694 	sljit_s32 dst_save = dst;
   2695 	sljit_sw dstw_save = dstw;
   2696 #endif
   2697 
   2698 	CHECK_ERROR();
   2699 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
   2700 	SLJIT_UNUSED_ARG(srcw);
   2701 
   2702 	if (dst == SLJIT_UNUSED)
   2703 		return SLJIT_SUCCESS;
   2704 
   2705 	ADJUST_LOCAL_OFFSET(dst, dstw);
   2706 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
   2707 	if (SLJIT_UNLIKELY(compiler->flags_saved))
   2708 		FAIL_IF(emit_restore_flags(compiler, op & SLJIT_KEEP_FLAGS));
   2709 
   2710 	type &= 0xff;
   2711 	/* setcc = jcc + 0x10. */
   2712 	cond_set = get_jump_code(type) + 0x10;
   2713 
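
	/* Worked instance (illustrative): for SLJIT_EQUAL get_jump_code
	   returns 0x84 (JE rel32 is 0F 84), so cond_set becomes 0x94 (SETE is
	   0F 94); the cmovcc rule below likewise gives 0x94 - 0x50 = 0x44
	   (CMOVE is 0F 44). */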
   2714 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2715 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
   2716 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
   2717 		FAIL_IF(!inst);
   2718 		INC_SIZE(4 + 3);
   2719 		/* Set low register to conditional flag. */
   2720 		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
   2721 		*inst++ = GROUP_0F;
   2722 		*inst++ = cond_set;
   2723 		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
   2724 		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
   2725 		*inst++ = OR_rm8_r8;
   2726 		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
   2727 		return SLJIT_SUCCESS;
   2728 	}
   2729 
   2730 	reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;
   2731 
   2732 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
   2733 	FAIL_IF(!inst);
   2734 	INC_SIZE(4 + 4);
   2735 	/* Set low register to conditional flag. */
   2736 	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
   2737 	*inst++ = GROUP_0F;
   2738 	*inst++ = cond_set;
   2739 	*inst++ = MOD_REG | reg_lmap[reg];
   2740 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
   2741 	*inst++ = GROUP_0F;
   2742 	*inst++ = MOVZX_r_rm8;
   2743 	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
   2744 
   2745 	if (reg != TMP_REG1)
   2746 		return SLJIT_SUCCESS;
   2747 
   2748 	if (GET_OPCODE(op) < SLJIT_ADD) {
   2749 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
   2750 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
   2751 	}
   2752 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
   2753 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
   2754 	compiler->skip_checks = 1;
   2755 #endif
   2756 	return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
   2757 #else /* SLJIT_CONFIG_X86_64 */
   2758 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
   2759 		if (reg_map[dst] <= 4) {
   2760 			/* Low byte is accessible. */
   2761 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
   2762 			FAIL_IF(!inst);
   2763 			INC_SIZE(3 + 3);
   2764 			/* Set low byte to conditional flag. */
   2765 			*inst++ = GROUP_0F;
   2766 			*inst++ = cond_set;
   2767 			*inst++ = MOD_REG | reg_map[dst];
   2768 
   2769 			*inst++ = GROUP_0F;
   2770 			*inst++ = MOVZX_r_rm8;
   2771 			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
   2772 			return SLJIT_SUCCESS;
   2773 		}
   2774 
   2775 		/* Low byte is not accessible. */
   2776 		if (cpu_has_cmov == -1)
   2777 			get_cpu_features();
   2778 
   2779 		if (cpu_has_cmov) {
   2780 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
    2781 			/* An "xor reg, reg" operation would overwrite the flags. */
   2782 			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
   2783 
   2784 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
   2785 			FAIL_IF(!inst);
   2786 			INC_SIZE(3);
   2787 
   2788 			*inst++ = GROUP_0F;
   2789 			/* cmovcc = setcc - 0x50. */
   2790 			*inst++ = cond_set - 0x50;
   2791 			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
   2792 			return SLJIT_SUCCESS;
   2793 		}
   2794 
   2795 		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
   2796 		FAIL_IF(!inst);
   2797 		INC_SIZE(1 + 3 + 3 + 1);
   2798 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2799 		/* Set al to conditional flag. */
   2800 		*inst++ = GROUP_0F;
   2801 		*inst++ = cond_set;
   2802 		*inst++ = MOD_REG | 0 /* eax */;
   2803 
   2804 		*inst++ = GROUP_0F;
   2805 		*inst++ = MOVZX_r_rm8;
   2806 		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
   2807 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2808 		return SLJIT_SUCCESS;
   2809 	}
   2810 
   2811 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
   2812 		SLJIT_COMPILE_ASSERT(reg_map[SLJIT_R0] == 0, scratch_reg1_must_be_eax);
   2813 		if (dst != SLJIT_R0) {
   2814 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
   2815 			FAIL_IF(!inst);
   2816 			INC_SIZE(1 + 3 + 2 + 1);
   2817 			/* Set low register to conditional flag. */
   2818 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2819 			*inst++ = GROUP_0F;
   2820 			*inst++ = cond_set;
   2821 			*inst++ = MOD_REG | 0 /* eax */;
   2822 			*inst++ = OR_rm8_r8;
   2823 			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
   2824 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2825 		}
   2826 		else {
   2827 			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
   2828 			FAIL_IF(!inst);
   2829 			INC_SIZE(2 + 3 + 2 + 2);
   2830 			/* Set low register to conditional flag. */
   2831 			*inst++ = XCHG_r_rm;
   2832 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
   2833 			*inst++ = GROUP_0F;
   2834 			*inst++ = cond_set;
   2835 			*inst++ = MOD_REG | 1 /* ecx */;
   2836 			*inst++ = OR_rm8_r8;
   2837 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
   2838 			*inst++ = XCHG_r_rm;
   2839 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
   2840 		}
   2841 		return SLJIT_SUCCESS;
   2842 	}
   2843 
   2844 	/* Set TMP_REG1 to the bit. */
   2845 	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
   2846 	FAIL_IF(!inst);
   2847 	INC_SIZE(1 + 3 + 3 + 1);
   2848 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2849 	/* Set al to conditional flag. */
   2850 	*inst++ = GROUP_0F;
   2851 	*inst++ = cond_set;
   2852 	*inst++ = MOD_REG | 0 /* eax */;
   2853 
   2854 	*inst++ = GROUP_0F;
   2855 	*inst++ = MOVZX_r_rm8;
   2856 	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
   2857 
   2858 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2859 
   2860 	if (GET_OPCODE(op) < SLJIT_ADD)
   2861 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
   2862 
   2863 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
   2864 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
   2865 	compiler->skip_checks = 1;
   2866 #endif
   2867 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
   2868 #endif /* SLJIT_CONFIG_X86_64 */
   2869 }
   2870 
   2871 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
   2872 {
   2873 	CHECK_ERROR();
   2874 	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
   2875 	ADJUST_LOCAL_OFFSET(dst, dstw);
   2876 
   2877 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
   2878 
   2879 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2880 	compiler->mode32 = 0;
   2881 #endif
   2882 
   2883 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
   2884 
   2885 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2886 	if (NOT_HALFWORD(offset)) {
   2887 		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
   2888 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
   2889 		SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
   2890 		return compiler->error;
   2891 #else
   2892 		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
   2893 #endif
   2894 	}
   2895 #endif
   2896 
   2897 	if (offset != 0)
   2898 		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
   2899 	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
   2900 }
   2901 
   2902 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
   2903 {
   2904 	sljit_u8 *inst;
   2905 	struct sljit_const *const_;
   2906 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2907 	sljit_s32 reg;
   2908 #endif
   2909 
   2910 	CHECK_ERROR_PTR();
   2911 	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
   2912 	ADJUST_LOCAL_OFFSET(dst, dstw);
   2913 
   2914 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
   2915 
   2916 	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
   2917 	PTR_FAIL_IF(!const_);
   2918 	set_const(const_, compiler);
   2919 
   2920 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2921 	compiler->mode32 = 0;
   2922 	reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;
   2923 
   2924 	if (emit_load_imm64(compiler, reg, init_value))
   2925 		return NULL;
   2926 #else
   2927 	if (dst == SLJIT_UNUSED)
   2928 		dst = TMP_REG1;
   2929 
   2930 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
   2931 		return NULL;
   2932 #endif
   2933 
   2934 	inst = (sljit_u8*)ensure_buf(compiler, 2);
   2935 	PTR_FAIL_IF(!inst);
   2936 
   2937 	*inst++ = 0;
   2938 	*inst++ = 1;
   2939 
   2940 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2941 	if (dst & SLJIT_MEM)
   2942 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
   2943 			return NULL;
   2944 #endif
   2945 
   2946 	return const_;
   2947 }
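
/* The 0 byte emitted above marks an out-of-line record in the code buffer
   (labels emit 0,0; jumps emit 0,type + 4; constants emit 0,1); the final
   code generation pass consumes these markers. */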
   2948 
   2949 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
   2950 {
   2951 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   2952 	sljit_unaligned_store_sw((void*)addr, new_addr - (addr + 4));
   2953 #else
   2954 	sljit_unaligned_store_sw((void*)addr, (sljit_sw) new_addr);
   2955 #endif
   2956 }
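
/* On x86-32 the patched location holds a rel32 displacement: addr points
   at the 4-byte immediate and the displacement is measured from the end of
   the instruction, hence new_addr - (addr + 4).  On x86-64 the target sits
   in a mov r64, imm64, so the absolute address is stored instead. */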
   2957 
   2958 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
   2959 {
   2960 	sljit_unaligned_store_sw((void*)addr, new_constant);
   2961 }
   2962 
   2963 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_sse2_available(void)
   2964 {
   2965 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
   2966 	if (cpu_has_sse2 == -1)
   2967 		get_cpu_features();
   2968 	return cpu_has_sse2;
   2969 #else
   2970 	return 1;
   2971 #endif
   2972 }
   2973 
   2974 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_cmov_available(void)
   2975 {
   2976 	if (cpu_has_cmov == -1)
   2977 		get_cpu_features();
   2978 	return cpu_has_cmov;
   2979 }
   2980 
   2981 SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_emit_cmov(struct sljit_compiler *compiler,
   2982 	sljit_s32 type,
   2983 	sljit_s32 dst_reg,
   2984 	sljit_s32 src, sljit_sw srcw)
   2985 {
   2986 	sljit_u8* inst;
   2987 
   2988 	CHECK_ERROR();
   2989 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
   2990 	CHECK_ARGUMENT(sljit_x86_is_cmov_available());
   2991 	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_I32_OP)));
   2992 	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_ORDERED_F64);
   2993 	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_I32_OP));
   2994 	FUNCTION_CHECK_SRC(src, srcw);
   2995 #endif
   2996 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
   2997 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
   2998 		fprintf(compiler->verbose, "  x86_cmov%s %s%s, ",
   2999 			!(dst_reg & SLJIT_I32_OP) ? "" : ".i",
   3000 			jump_names[type & 0xff], JUMP_POSTFIX(type));
   3001 		sljit_verbose_reg(compiler, dst_reg & ~SLJIT_I32_OP);
   3002 		fprintf(compiler->verbose, ", ");
   3003 		sljit_verbose_param(compiler, src, srcw);
   3004 		fprintf(compiler->verbose, "\n");
   3005 	}
   3006 #endif
   3007 
   3008 	ADJUST_LOCAL_OFFSET(src, srcw);
   3009 	CHECK_EXTRA_REGS(src, srcw, (void)0);
   3010 
   3011 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   3012 	compiler->mode32 = dst_reg & SLJIT_I32_OP;
   3013 #endif
   3014 	dst_reg &= ~SLJIT_I32_OP;
   3015 
   3016 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
   3017 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
   3018 		src = TMP_REG1;
   3019 		srcw = 0;
   3020 	}
   3021 
   3022 	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
   3023 	FAIL_IF(!inst);
   3024 	*inst++ = GROUP_0F;
   3025 	*inst = get_jump_code(type & 0xff) - 0x40;
   3026 	return SLJIT_SUCCESS;
   3027 }
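
/* Usage sketch (hypothetical caller): select R1 into R0 only when the
   preceding operation set the zero flag:

       sljit_emit_op2(compiler, SLJIT_SUB | SLJIT_SET_E, SLJIT_UNUSED, 0,
           SLJIT_R1, 0, SLJIT_R2, 0);
       sljit_x86_emit_cmov(compiler, SLJIT_EQUAL, SLJIT_R0, SLJIT_R1, 0);
*/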
   3028