      1 /*
      2  *    Stack-less Just-In-Time compiler
      3  *
      4  *    Copyright 2009-2012 Zoltan Herczeg (hzmester (at) freemail.hu). All rights reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without modification, are
      7  * permitted provided that the following conditions are met:
      8  *
      9  *   1. Redistributions of source code must retain the above copyright notice, this list of
     10  *      conditions and the following disclaimer.
     11  *
     12  *   2. Redistributions in binary form must reproduce the above copyright notice, this list
     13  *      of conditions and the following disclaimer in the documentation and/or other materials
     14  *      provided with the distribution.
     15  *
     16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
     17  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     18  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
     19  * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
     20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
     21  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
     22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
     23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
     24  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 SLJIT_API_FUNC_ATTRIBUTE SLJIT_CONST char* sljit_get_platform_name(void)
     28 {
     29 	return "x86" SLJIT_CPUINFO;
     30 }
     31 
     32 /*
     33    32b register indexes:
     34      0 - EAX
     35      1 - ECX
     36      2 - EDX
     37      3 - EBX
     38      4 - none
     39      5 - EBP
     40      6 - ESI
     41      7 - EDI
     42 */
     43 
     44 /*
     45    64b register indexes:
     46      0 - RAX
     47      1 - RCX
     48      2 - RDX
     49      3 - RBX
     50      4 - none
     51      5 - RBP
     52      6 - RSI
     53      7 - RDI
     54      8 - R8   - From now on REX prefix is required
     55      9 - R9
     56     10 - R10
     57     11 - R11
     58     12 - R12
     59     13 - R13
     60     14 - R14
     61     15 - R15
     62 */
     63 
     64 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
     65 
     66 /* Last register + 1. */
     67 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
     68 
     69 static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
     70 	0, 0, 2, 1, 0, 0, 0, 0, 7, 6, 3, 4, 5
     71 };
     72 
     73 #define CHECK_EXTRA_REGS(p, w, do) \
     74 	if (p >= SLJIT_R3 && p <= SLJIT_R6) { \
     75 		w = SLJIT_LOCALS_OFFSET + ((p) - (SLJIT_R3 + 4)) * sizeof(sljit_sw); \
     76 		p = SLJIT_MEM1(SLJIT_SP); \
     77 		do; \
     78 	}
     79 
     80 #else /* SLJIT_CONFIG_X86_32 */
     81 
     82 /* Last register + 1. */
     83 #define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
     84 #define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
     85 #define TMP_REG3	(SLJIT_NUMBER_OF_REGISTERS + 4)
     86 
      87 /* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
      88    Note: avoid using r12 and r13 for memory addressing;
      89    therefore r12 is better suited for SAVED_EREG than SAVED_REG. */
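
/* Illustrative encodings (assembler output, not emitted verbatim here)
   showing why r12 and r13 need care in memory operands:
     mov rax, [rbx]  ->  48 8b 03      (mod=00, rm=011)
     mov rax, [r12]  ->  49 8b 04 24   (rm=100 forces a SIB byte)
     mov rax, [r13]  ->  49 8b 45 00   (mod=00 with rm=101 would mean
                                        RIP-relative, so a zero disp8
                                        is encoded instead) */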
     90 #ifndef _WIN64
      91 /* 1st argument passed in rdi, 2nd in rsi, 3rd in rdx. */
     92 static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
     93 	0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
     94 };
     95 /* low-map. reg_map & 0x7. */
     96 static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
     97 	0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
     98 };
     99 #else
     100 /* 1st argument passed in rcx, 2nd in rdx, 3rd in r8. */
    101 static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
    102 	0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
    103 };
    104 /* low-map. reg_map & 0x7. */
    105 static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
    106 	0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
    107 };
    108 #endif
    109 
    110 #define REX_W		0x48
    111 #define REX_R		0x44
    112 #define REX_X		0x42
    113 #define REX_B		0x41
    114 #define REX		0x40
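
/* The REX bits combine by OR-ing them together; illustrative examples:
     add rdx, rax  ->  REX_W         (48) 01 c2
     add r8, rax   ->  REX_W | REX_B (49) 01 c0
     add rax, r9   ->  REX_W | REX_R (4c) 01 c8 */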
    115 
    116 #ifndef _WIN64
    117 #define HALFWORD_MAX 0x7fffffffl
    118 #define HALFWORD_MIN -0x80000000l
    119 #else
    120 #define HALFWORD_MAX 0x7fffffffll
    121 #define HALFWORD_MIN -0x80000000ll
    122 #endif
    123 
    124 #define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
    125 #define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
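
/* Example: only sign-extended 32 bit immediates can be encoded directly
   in 64 bit instructions; anything wider goes through emit_load_imm64:
     IS_HALFWORD(0x7fffffff)   -> 1  (the add rax, imm32 form is usable)
     NOT_HALFWORD(0x100000000) -> 1  (must be movabs'd into a register) */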
    126 
    127 #define CHECK_EXTRA_REGS(p, w, do)
    128 
    129 #endif /* SLJIT_CONFIG_X86_32 */
    130 
    131 #define TMP_FREG	(0)
    132 
    133 /* Size flags for emit_x86_instruction: */
    134 #define EX86_BIN_INS		0x0010
    135 #define EX86_SHIFT_INS		0x0020
    136 #define EX86_REX		0x0040
    137 #define EX86_NO_REXW		0x0080
    138 #define EX86_BYTE_ARG		0x0100
    139 #define EX86_HALF_ARG		0x0200
    140 #define EX86_PREF_66		0x0400
    141 #define EX86_PREF_F2		0x0800
    142 #define EX86_PREF_F3		0x1000
    143 #define EX86_SSE2_OP1		0x2000
    144 #define EX86_SSE2_OP2		0x4000
    145 #define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)
    146 
    147 /* --------------------------------------------------------------------- */
     148 /*  Instruction forms                                                    */
    149 /* --------------------------------------------------------------------- */
    150 
    151 #define ADD		(/* BINARY */ 0 << 3)
    152 #define ADD_EAX_i32	0x05
    153 #define ADD_r_rm	0x03
    154 #define ADD_rm_r	0x01
    155 #define ADDSD_x_xm	0x58
    156 #define ADC		(/* BINARY */ 2 << 3)
    157 #define ADC_EAX_i32	0x15
    158 #define ADC_r_rm	0x13
    159 #define ADC_rm_r	0x11
    160 #define AND		(/* BINARY */ 4 << 3)
    161 #define AND_EAX_i32	0x25
    162 #define AND_r_rm	0x23
    163 #define AND_rm_r	0x21
    164 #define ANDPD_x_xm	0x54
    165 #define BSR_r_rm	(/* GROUP_0F */ 0xbd)
    166 #define CALL_i32	0xe8
    167 #define CALL_rm		(/* GROUP_FF */ 2 << 3)
    168 #define CDQ		0x99
    169 #define CMOVNE_r_rm	(/* GROUP_0F */ 0x45)
    170 #define CMP		(/* BINARY */ 7 << 3)
    171 #define CMP_EAX_i32	0x3d
    172 #define CMP_r_rm	0x3b
    173 #define CMP_rm_r	0x39
    174 #define CVTPD2PS_x_xm	0x5a
    175 #define CVTSI2SD_x_rm	0x2a
    176 #define CVTTSD2SI_r_xm	0x2c
    177 #define DIV		(/* GROUP_F7 */ 6 << 3)
    178 #define DIVSD_x_xm	0x5e
    179 #define INT3		0xcc
    180 #define IDIV		(/* GROUP_F7 */ 7 << 3)
    181 #define IMUL		(/* GROUP_F7 */ 5 << 3)
    182 #define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
    183 #define IMUL_r_rm_i8	0x6b
    184 #define IMUL_r_rm_i32	0x69
    185 #define JE_i8		0x74
    186 #define JNE_i8		0x75
    187 #define JMP_i8		0xeb
    188 #define JMP_i32		0xe9
    189 #define JMP_rm		(/* GROUP_FF */ 4 << 3)
    190 #define LEA_r_m		0x8d
    191 #define MOV_r_rm	0x8b
    192 #define MOV_r_i32	0xb8
    193 #define MOV_rm_r	0x89
    194 #define MOV_rm_i32	0xc7
    195 #define MOV_rm8_i8	0xc6
    196 #define MOV_rm8_r8	0x88
    197 #define MOVSD_x_xm	0x10
    198 #define MOVSD_xm_x	0x11
    199 #define MOVSXD_r_rm	0x63
    200 #define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
    201 #define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
    202 #define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
    203 #define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
    204 #define MUL		(/* GROUP_F7 */ 4 << 3)
    205 #define MULSD_x_xm	0x59
    206 #define NEG_rm		(/* GROUP_F7 */ 3 << 3)
    207 #define NOP		0x90
    208 #define NOT_rm		(/* GROUP_F7 */ 2 << 3)
    209 #define OR		(/* BINARY */ 1 << 3)
    210 #define OR_r_rm		0x0b
    211 #define OR_EAX_i32	0x0d
    212 #define OR_rm_r		0x09
    213 #define OR_rm8_r8	0x08
    214 #define POP_r		0x58
    215 #define POP_rm		0x8f
    216 #define POPF		0x9d
    217 #define PUSH_i32	0x68
    218 #define PUSH_r		0x50
    219 #define PUSH_rm		(/* GROUP_FF */ 6 << 3)
    220 #define PUSHF		0x9c
    221 #define RET_near	0xc3
    222 #define RET_i16		0xc2
    223 #define SBB		(/* BINARY */ 3 << 3)
    224 #define SBB_EAX_i32	0x1d
    225 #define SBB_r_rm	0x1b
    226 #define SBB_rm_r	0x19
    227 #define SAR		(/* SHIFT */ 7 << 3)
    228 #define SHL		(/* SHIFT */ 4 << 3)
    229 #define SHR		(/* SHIFT */ 5 << 3)
    230 #define SUB		(/* BINARY */ 5 << 3)
    231 #define SUB_EAX_i32	0x2d
    232 #define SUB_r_rm	0x2b
    233 #define SUB_rm_r	0x29
    234 #define SUBSD_x_xm	0x5c
    235 #define TEST_EAX_i32	0xa9
    236 #define TEST_rm_r	0x85
    237 #define UCOMISD_x_xm	0x2e
    238 #define UNPCKLPD_x_xm	0x14
    239 #define XCHG_EAX_r	0x90
    240 #define XCHG_r_rm	0x87
    241 #define XOR		(/* BINARY */ 6 << 3)
    242 #define XOR_EAX_i32	0x35
    243 #define XOR_r_rm	0x33
    244 #define XOR_rm_r	0x31
    245 #define XORPD_x_xm	0x57
    246 
    247 #define GROUP_0F	0x0f
    248 #define GROUP_F7	0xf7
    249 #define GROUP_FF	0xff
    250 #define GROUP_BINARY_81	0x81
    251 #define GROUP_BINARY_83	0x83
    252 #define GROUP_SHIFT_1	0xd1
    253 #define GROUP_SHIFT_N	0xc1
    254 #define GROUP_SHIFT_CL	0xd3
    255 
    256 #define MOD_REG		0xc0
    257 #define MOD_DISP8	0x40
    258 
    259 #define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))
    260 
    261 #define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
    262 #define POP_REG(r)			(*inst++ = (POP_r + (r)))
    263 #define RET()				(*inst++ = (RET_near))
    264 #define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
    265 /* r32, r/m32 */
    266 #define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
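
/* For example, MOV_RM(0x3, 0, 5) emits 8b c5, i.e. "mov eax, ebp"
   (mod 0x3 selects the register-direct form; reg 0 is eax, rm 5 is ebp). */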
    267 
     268 /* Multithreading does not affect these static variables, since they store
     269    built-in CPU features. Even if several threads detect the CPU features
     270    at the same time, they all write the same values. */
    271 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
    272 static sljit_si cpu_has_sse2 = -1;
    273 #endif
    274 static sljit_si cpu_has_cmov = -1;
    275 
    276 #ifdef _WIN32_WCE
    277 #include <cmnintrin.h>
    278 #elif defined(_MSC_VER) && _MSC_VER >= 1400
    279 #include <intrin.h>
    280 #endif
    281 
    282 static void get_cpu_features(void)
    283 {
    284 	sljit_ui features;
    285 
    286 #if defined(_MSC_VER) && _MSC_VER >= 1400
    287 
    288 	int CPUInfo[4];
    289 	__cpuid(CPUInfo, 1);
    290 	features = (sljit_ui)CPUInfo[3];
    291 
    292 #elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)
    293 
    294 	/* AT&T syntax. */
    295 	__asm__ (
    296 		"movl $0x1, %%eax\n"
    297 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    298 		/* On x86-32, there is no red zone, so this
    299 		   should work (no need for a local variable). */
    300 		"push %%ebx\n"
    301 #endif
    302 		"cpuid\n"
    303 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    304 		"pop %%ebx\n"
    305 #endif
    306 		"movl %%edx, %0\n"
    307 		: "=g" (features)
    308 		:
    309 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    310 		: "%eax", "%ecx", "%edx"
    311 #else
    312 		: "%rax", "%rbx", "%rcx", "%rdx"
    313 #endif
    314 	);
    315 
    316 #else /* _MSC_VER && _MSC_VER >= 1400 */
    317 
    318 	/* Intel syntax. */
    319 	__asm {
    320 		mov eax, 1
    321 		cpuid
    322 		mov features, edx
    323 	}
    324 
    325 #endif /* _MSC_VER && _MSC_VER >= 1400 */
    326 
    327 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
    328 	cpu_has_sse2 = (features >> 26) & 0x1;
    329 #endif
    330 	cpu_has_cmov = (features >> 15) & 0x1;
    331 }
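
/* The feature word tested above is CPUID.01H:EDX: bit 15 is CMOV and
   bit 26 is SSE2, matching the shifts used in get_cpu_features. Typical
   use in the emitters below:

     if (cpu_has_cmov == -1)
         get_cpu_features();
     if (cpu_has_cmov)
         ... emit 0f 45 (cmovne) ...
*/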
    332 
    333 static sljit_ub get_jump_code(sljit_si type)
    334 {
    335 	switch (type) {
    336 	case SLJIT_EQUAL:
    337 	case SLJIT_D_EQUAL:
    338 		return 0x84 /* je */;
    339 
    340 	case SLJIT_NOT_EQUAL:
    341 	case SLJIT_D_NOT_EQUAL:
    342 		return 0x85 /* jne */;
    343 
    344 	case SLJIT_LESS:
    345 	case SLJIT_D_LESS:
    346 		return 0x82 /* jc */;
    347 
    348 	case SLJIT_GREATER_EQUAL:
    349 	case SLJIT_D_GREATER_EQUAL:
    350 		return 0x83 /* jae */;
    351 
    352 	case SLJIT_GREATER:
    353 	case SLJIT_D_GREATER:
    354 		return 0x87 /* jnbe */;
    355 
    356 	case SLJIT_LESS_EQUAL:
    357 	case SLJIT_D_LESS_EQUAL:
    358 		return 0x86 /* jbe */;
    359 
    360 	case SLJIT_SIG_LESS:
    361 		return 0x8c /* jl */;
    362 
    363 	case SLJIT_SIG_GREATER_EQUAL:
    364 		return 0x8d /* jnl */;
    365 
    366 	case SLJIT_SIG_GREATER:
    367 		return 0x8f /* jnle */;
    368 
    369 	case SLJIT_SIG_LESS_EQUAL:
    370 		return 0x8e /* jle */;
    371 
    372 	case SLJIT_OVERFLOW:
    373 	case SLJIT_MUL_OVERFLOW:
    374 		return 0x80 /* jo */;
    375 
    376 	case SLJIT_NOT_OVERFLOW:
    377 	case SLJIT_MUL_NOT_OVERFLOW:
    378 		return 0x81 /* jno */;
    379 
    380 	case SLJIT_D_UNORDERED:
    381 		return 0x8a /* jp */;
    382 
    383 	case SLJIT_D_ORDERED:
    384 		return 0x8b /* jpo */;
    385 	}
    386 	return 0;
    387 }
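
/* All codes above are the 0f-prefixed near forms (0f 80 .. 0f 8f, rel32).
   The short rel8 forms are the same value minus 0x10 (e.g. je: 0f 84 vs 74),
   which is why generate_near_jump_code computes them as
   get_jump_code(type) - 0x10. */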
    388 
    389 static sljit_ub* generate_far_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_si type);
    390 
    391 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    392 static sljit_ub* generate_fixed_jump(sljit_ub *code_ptr, sljit_sw addr, sljit_si type);
    393 #endif
    394 
    395 static sljit_ub* generate_near_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_ub *code, sljit_si type)
    396 {
    397 	sljit_si short_jump;
    398 	sljit_uw label_addr;
    399 
    400 	if (jump->flags & JUMP_LABEL)
    401 		label_addr = (sljit_uw)(code + jump->u.label->size);
    402 	else
    403 		label_addr = jump->u.target;
    404 	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
    405 
    406 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    407 	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
    408 		return generate_far_jump_code(jump, code_ptr, type);
    409 #endif
    410 
    411 	if (type == SLJIT_JUMP) {
    412 		if (short_jump)
    413 			*code_ptr++ = JMP_i8;
    414 		else
    415 			*code_ptr++ = JMP_i32;
    416 		jump->addr++;
    417 	}
    418 	else if (type >= SLJIT_FAST_CALL) {
    419 		short_jump = 0;
    420 		*code_ptr++ = CALL_i32;
    421 		jump->addr++;
    422 	}
    423 	else if (short_jump) {
    424 		*code_ptr++ = get_jump_code(type) - 0x10;
    425 		jump->addr++;
    426 	}
    427 	else {
    428 		*code_ptr++ = GROUP_0F;
    429 		*code_ptr++ = get_jump_code(type);
    430 		jump->addr += 2;
    431 	}
    432 
    433 	if (short_jump) {
    434 		jump->flags |= PATCH_MB;
    435 		code_ptr += sizeof(sljit_sb);
    436 	} else {
    437 		jump->flags |= PATCH_MW;
    438 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    439 		code_ptr += sizeof(sljit_sw);
    440 #else
    441 		code_ptr += sizeof(sljit_si);
    442 #endif
    443 	}
    444 
    445 	return code_ptr;
    446 }
    447 
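/* Record format of the per-compiler buffer consumed below: a length byte
   greater than zero is followed by that many bytes of already generated
   machine code; a zero length byte is followed by a marker byte:
   0 = label, 1 = const, 2 = fixed call and 3 = fixed jump (both followed
   by the target address), and values >= 4 encode a jump of type
   (marker - 4). */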
    448 SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
    449 {
    450 	struct sljit_memory_fragment *buf;
    451 	sljit_ub *code;
    452 	sljit_ub *code_ptr;
    453 	sljit_ub *buf_ptr;
    454 	sljit_ub *buf_end;
    455 	sljit_ub len;
    456 
    457 	struct sljit_label *label;
    458 	struct sljit_jump *jump;
    459 	struct sljit_const *const_;
    460 
    461 	CHECK_ERROR_PTR();
    462 	CHECK_PTR(check_sljit_generate_code(compiler));
    463 	reverse_buf(compiler);
    464 
    465 	/* Second code generation pass. */
    466 	code = (sljit_ub*)SLJIT_MALLOC_EXEC(compiler->size);
    467 	PTR_FAIL_WITH_EXEC_IF(code);
    468 	buf = compiler->buf;
    469 
    470 	code_ptr = code;
    471 	label = compiler->labels;
    472 	jump = compiler->jumps;
    473 	const_ = compiler->consts;
    474 	do {
    475 		buf_ptr = buf->memory;
    476 		buf_end = buf_ptr + buf->used_size;
    477 		do {
    478 			len = *buf_ptr++;
    479 			if (len > 0) {
    480 				/* The code is already generated. */
    481 				SLJIT_MEMMOVE(code_ptr, buf_ptr, len);
    482 				code_ptr += len;
    483 				buf_ptr += len;
    484 			}
    485 			else {
    486 				if (*buf_ptr >= 4) {
    487 					jump->addr = (sljit_uw)code_ptr;
    488 					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
    489 						code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 4);
    490 					else
    491 						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 4);
    492 					jump = jump->next;
    493 				}
    494 				else if (*buf_ptr == 0) {
    495 					label->addr = (sljit_uw)code_ptr;
    496 					label->size = code_ptr - code;
    497 					label = label->next;
    498 				}
    499 				else if (*buf_ptr == 1) {
    500 					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
    501 					const_ = const_->next;
    502 				}
    503 				else {
    504 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    505 					*code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
    506 					buf_ptr++;
    507 					*(sljit_sw*)code_ptr = *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw));
    508 					code_ptr += sizeof(sljit_sw);
    509 					buf_ptr += sizeof(sljit_sw) - 1;
    510 #else
    511 					code_ptr = generate_fixed_jump(code_ptr, *(sljit_sw*)(buf_ptr + 1), *buf_ptr);
    512 					buf_ptr += sizeof(sljit_sw);
    513 #endif
    514 				}
    515 				buf_ptr++;
    516 			}
    517 		} while (buf_ptr < buf_end);
    518 		SLJIT_ASSERT(buf_ptr == buf_end);
    519 		buf = buf->next;
    520 	} while (buf);
    521 
    522 	SLJIT_ASSERT(!label);
    523 	SLJIT_ASSERT(!jump);
    524 	SLJIT_ASSERT(!const_);
    525 
    526 	jump = compiler->jumps;
    527 	while (jump) {
    528 		if (jump->flags & PATCH_MB) {
    529 			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) <= 127);
    530 			*(sljit_ub*)jump->addr = (sljit_ub)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb)));
    531 		} else if (jump->flags & PATCH_MW) {
    532 			if (jump->flags & JUMP_LABEL) {
    533 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    534 				*(sljit_sw*)jump->addr = (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sw)));
    535 #else
    536 				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
    537 				*(sljit_si*)jump->addr = (sljit_si)(jump->u.label->addr - (jump->addr + sizeof(sljit_si)));
    538 #endif
    539 			}
    540 			else {
    541 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    542 				*(sljit_sw*)jump->addr = (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_sw)));
    543 #else
    544 				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
    545 				*(sljit_si*)jump->addr = (sljit_si)(jump->u.target - (jump->addr + sizeof(sljit_si)));
    546 #endif
    547 			}
    548 		}
    549 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    550 		else if (jump->flags & PATCH_MD)
    551 			*(sljit_sw*)jump->addr = jump->u.label->addr;
    552 #endif
    553 
    554 		jump = jump->next;
    555 	}
    556 
     557 	/* Some space may be wasted because of short jumps. */
    558 	SLJIT_ASSERT(code_ptr <= code + compiler->size);
    559 	compiler->error = SLJIT_ERR_COMPILED;
    560 	compiler->executable_size = code_ptr - code;
    561 	return (void*)code;
    562 }
    563 
    564 /* --------------------------------------------------------------------- */
    565 /*  Operators                                                            */
    566 /* --------------------------------------------------------------------- */
    567 
    568 static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
    569 	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
    570 	sljit_si dst, sljit_sw dstw,
    571 	sljit_si src1, sljit_sw src1w,
    572 	sljit_si src2, sljit_sw src2w);
    573 
    574 static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
    575 	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
    576 	sljit_si dst, sljit_sw dstw,
    577 	sljit_si src1, sljit_sw src1w,
    578 	sljit_si src2, sljit_sw src2w);
    579 
    580 static sljit_si emit_mov(struct sljit_compiler *compiler,
    581 	sljit_si dst, sljit_sw dstw,
    582 	sljit_si src, sljit_sw srcw);
    583 
    584 static SLJIT_INLINE sljit_si emit_save_flags(struct sljit_compiler *compiler)
    585 {
    586 	sljit_ub *inst;
    587 
    588 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    589 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
    590 	FAIL_IF(!inst);
    591 	INC_SIZE(5);
    592 #else
    593 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
    594 	FAIL_IF(!inst);
    595 	INC_SIZE(6);
    596 	*inst++ = REX_W;
    597 #endif
    598 	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
    599 	*inst++ = 0x64;
    600 	*inst++ = 0x24;
    601 	*inst++ = (sljit_ub)sizeof(sljit_sw);
    602 	*inst++ = PUSHF;
    603 	compiler->flags_saved = 1;
    604 	return SLJIT_SUCCESS;
    605 }
    606 
    607 static SLJIT_INLINE sljit_si emit_restore_flags(struct sljit_compiler *compiler, sljit_si keep_flags)
    608 {
    609 	sljit_ub *inst;
    610 
    611 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    612 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
    613 	FAIL_IF(!inst);
    614 	INC_SIZE(5);
    615 	*inst++ = POPF;
    616 #else
    617 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
    618 	FAIL_IF(!inst);
    619 	INC_SIZE(6);
    620 	*inst++ = POPF;
    621 	*inst++ = REX_W;
    622 #endif
    623 	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
    624 	*inst++ = 0x64;
    625 	*inst++ = 0x24;
    626 	*inst++ = (sljit_ub)-(sljit_sb)sizeof(sljit_sw);
    627 	compiler->flags_saved = keep_flags;
    628 	return SLJIT_SUCCESS;
    629 }
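
/* On x86-32 the two helpers above emit, respectively:
     8d 64 24 04   lea esp, [esp + 4]
     9c            pushfd
   and
     9d            popfd
     8d 64 24 fc   lea esp, [esp - 4]
   so the flags word is parked at [esp] while esp itself ends up
   unchanged (x86-64 is identical apart from the REX_W prefix and
   an 8 byte slot). */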
    630 
    631 #ifdef _WIN32
    632 #include <malloc.h>
    633 
    634 static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
    635 {
     636 	/* Workaround for calling the internal _chkstk() function on Windows.
     637 	This function touches all 4k pages that belong to the requested stack
     638 	space, whose size is passed in local_size. This is necessary on Windows,
     639 	where the stack can only grow in 4k steps. The call just burns CPU
     640 	cycles if the stack is already large enough, but since that cannot be
     641 	known in advance, it must always be made. I think this is a bad design
     642 	in general, even if it has its reasons. */
    643 	*(volatile sljit_si*)alloca(local_size) = 0;
    644 }
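
/* Conceptually (illustrative sketch only), the probing done by _chkstk
   amounts to:

     for (offset = 4096; offset <= local_size; offset += 4096)
         touch(stack_pointer - offset);  // commit the next guard page

   where touch() stands for any single write; Windows only commits one
   guard page at a time, hence the page-by-page loop. */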
    645 
    646 #endif
    647 
    648 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    649 #include "sljitNativeX86_32.c"
    650 #else
    651 #include "sljitNativeX86_64.c"
    652 #endif
    653 
    654 static sljit_si emit_mov(struct sljit_compiler *compiler,
    655 	sljit_si dst, sljit_sw dstw,
    656 	sljit_si src, sljit_sw srcw)
    657 {
    658 	sljit_ub* inst;
    659 
    660 	if (dst == SLJIT_UNUSED) {
     661 		/* No destination; no need to set up flags. */
    662 		if (src & SLJIT_MEM) {
    663 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
    664 			FAIL_IF(!inst);
    665 			*inst = MOV_r_rm;
    666 		}
    667 		return SLJIT_SUCCESS;
    668 	}
    669 	if (FAST_IS_REG(src)) {
    670 		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
    671 		FAIL_IF(!inst);
    672 		*inst = MOV_rm_r;
    673 		return SLJIT_SUCCESS;
    674 	}
    675 	if (src & SLJIT_IMM) {
    676 		if (FAST_IS_REG(dst)) {
    677 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    678 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
    679 #else
    680 			if (!compiler->mode32) {
    681 				if (NOT_HALFWORD(srcw))
    682 					return emit_load_imm64(compiler, dst, srcw);
    683 			}
    684 			else
    685 				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
    686 #endif
    687 		}
    688 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    689 		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
    690 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
    691 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
    692 			FAIL_IF(!inst);
    693 			*inst = MOV_rm_r;
    694 			return SLJIT_SUCCESS;
    695 		}
    696 #endif
    697 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
    698 		FAIL_IF(!inst);
    699 		*inst = MOV_rm_i32;
    700 		return SLJIT_SUCCESS;
    701 	}
    702 	if (FAST_IS_REG(dst)) {
    703 		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
    704 		FAIL_IF(!inst);
    705 		*inst = MOV_r_rm;
    706 		return SLJIT_SUCCESS;
    707 	}
    708 
     709 	/* Memory to memory move. Requires two instructions. */
    710 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
    711 	FAIL_IF(!inst);
    712 	*inst = MOV_r_rm;
    713 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
    714 	FAIL_IF(!inst);
    715 	*inst = MOV_rm_r;
    716 	return SLJIT_SUCCESS;
    717 }
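
/* For instance, a memory-to-memory move is emitted as two instructions
   routed through TMP_REG1 (opcode bytes shown, registers illustrative):
     8b /r   mov tmp, [src]
     89 /r   mov [dst], tmp */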
    718 
    719 #define EMIT_MOV(compiler, dst, dstw, src, srcw) \
    720 	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
    721 
    722 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op)
    723 {
    724 	sljit_ub *inst;
    725 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    726 	sljit_si size;
    727 #endif
    728 
    729 	CHECK_ERROR();
    730 	CHECK(check_sljit_emit_op0(compiler, op));
    731 
    732 	switch (GET_OPCODE(op)) {
    733 	case SLJIT_BREAKPOINT:
    734 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
    735 		FAIL_IF(!inst);
    736 		INC_SIZE(1);
    737 		*inst = INT3;
    738 		break;
    739 	case SLJIT_NOP:
    740 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
    741 		FAIL_IF(!inst);
    742 		INC_SIZE(1);
    743 		*inst = NOP;
    744 		break;
    745 	case SLJIT_LUMUL:
    746 	case SLJIT_LSMUL:
    747 	case SLJIT_UDIVMOD:
    748 	case SLJIT_SDIVMOD:
    749 	case SLJIT_UDIVI:
    750 	case SLJIT_SDIVI:
    751 		compiler->flags_saved = 0;
    752 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    753 #ifdef _WIN64
    754 		SLJIT_COMPILE_ASSERT(
    755 			reg_map[SLJIT_R0] == 0
    756 			&& reg_map[SLJIT_R1] == 2
    757 			&& reg_map[TMP_REG1] > 7,
    758 			invalid_register_assignment_for_div_mul);
    759 #else
    760 		SLJIT_COMPILE_ASSERT(
    761 			reg_map[SLJIT_R0] == 0
    762 			&& reg_map[SLJIT_R1] < 7
    763 			&& reg_map[TMP_REG1] == 2,
    764 			invalid_register_assignment_for_div_mul);
    765 #endif
    766 		compiler->mode32 = op & SLJIT_INT_OP;
    767 #endif
    768 		SLJIT_COMPILE_ASSERT((SLJIT_UDIVMOD & 0x2) == 0 && SLJIT_UDIVI - 0x2 == SLJIT_UDIVMOD, bad_div_opcode_assignments);
    769 
    770 		op = GET_OPCODE(op);
    771 		if ((op | 0x2) == SLJIT_UDIVI) {
    772 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
    773 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
    774 			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
    775 #else
    776 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
    777 #endif
    778 			FAIL_IF(!inst);
    779 			*inst = XOR_r_rm;
    780 		}
    781 
    782 		if ((op | 0x2) == SLJIT_SDIVI) {
    783 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
    784 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
    785 #endif
    786 
    787 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    788 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
    789 			FAIL_IF(!inst);
    790 			INC_SIZE(1);
    791 			*inst = CDQ;
    792 #else
    793 			if (compiler->mode32) {
    794 				inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
    795 				FAIL_IF(!inst);
    796 				INC_SIZE(1);
    797 				*inst = CDQ;
    798 			} else {
    799 				inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
    800 				FAIL_IF(!inst);
    801 				INC_SIZE(2);
    802 				*inst++ = REX_W;
    803 				*inst = CDQ;
    804 			}
    805 #endif
    806 		}
    807 
    808 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    809 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
    810 		FAIL_IF(!inst);
    811 		INC_SIZE(2);
    812 		*inst++ = GROUP_F7;
    813 		*inst = MOD_REG | ((op >= SLJIT_UDIVMOD) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
    814 #else
    815 #ifdef _WIN64
    816 		size = (!compiler->mode32 || op >= SLJIT_UDIVMOD) ? 3 : 2;
    817 #else
    818 		size = (!compiler->mode32) ? 3 : 2;
    819 #endif
    820 		inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
    821 		FAIL_IF(!inst);
    822 		INC_SIZE(size);
    823 #ifdef _WIN64
    824 		if (!compiler->mode32)
    825 			*inst++ = REX_W | ((op >= SLJIT_UDIVMOD) ? REX_B : 0);
    826 		else if (op >= SLJIT_UDIVMOD)
    827 			*inst++ = REX_B;
    828 		*inst++ = GROUP_F7;
    829 		*inst = MOD_REG | ((op >= SLJIT_UDIVMOD) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
    830 #else
    831 		if (!compiler->mode32)
    832 			*inst++ = REX_W;
    833 		*inst++ = GROUP_F7;
    834 		*inst = MOD_REG | reg_map[SLJIT_R1];
    835 #endif
    836 #endif
    837 		switch (op) {
    838 		case SLJIT_LUMUL:
    839 			*inst |= MUL;
    840 			break;
    841 		case SLJIT_LSMUL:
    842 			*inst |= IMUL;
    843 			break;
    844 		case SLJIT_UDIVMOD:
    845 		case SLJIT_UDIVI:
    846 			*inst |= DIV;
    847 			break;
    848 		case SLJIT_SDIVMOD:
    849 		case SLJIT_SDIVI:
    850 			*inst |= IDIV;
    851 			break;
    852 		}
    853 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
    854 		if (op <= SLJIT_SDIVMOD)
    855 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
    856 #else
    857 		if (op >= SLJIT_UDIVI)
    858 			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
    859 #endif
    860 		break;
    861 	}
    862 
    863 	return SLJIT_SUCCESS;
    864 }
    865 
    866 #define ENCODE_PREFIX(prefix) \
    867 	do { \
    868 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1); \
    869 		FAIL_IF(!inst); \
    870 		INC_SIZE(1); \
    871 		*inst = (prefix); \
    872 	} while (0)
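
/* Used below to emit one-byte prefix-style forms such as xchg eax, reg:
   ENCODE_PREFIX(XCHG_EAX_r + reg_map[r]) produces the single byte
   90 + r (e.g. 91 for xchg eax, ecx). */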
    873 
    874 static sljit_si emit_mov_byte(struct sljit_compiler *compiler, sljit_si sign,
    875 	sljit_si dst, sljit_sw dstw,
    876 	sljit_si src, sljit_sw srcw)
    877 {
    878 	sljit_ub* inst;
    879 	sljit_si dst_r;
    880 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    881 	sljit_si work_r;
    882 #endif
    883 
    884 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    885 	compiler->mode32 = 0;
    886 #endif
    887 
    888 	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
    889 		return SLJIT_SUCCESS; /* Empty instruction. */
    890 
    891 	if (src & SLJIT_IMM) {
    892 		if (FAST_IS_REG(dst)) {
    893 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    894 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
    895 #else
    896 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
    897 			FAIL_IF(!inst);
    898 			*inst = MOV_rm_i32;
    899 			return SLJIT_SUCCESS;
    900 #endif
    901 		}
    902 		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
    903 		FAIL_IF(!inst);
    904 		*inst = MOV_rm8_i8;
    905 		return SLJIT_SUCCESS;
    906 	}
    907 
    908 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
    909 
    910 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
    911 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    912 		if (reg_map[src] >= 4) {
    913 			SLJIT_ASSERT(dst_r == TMP_REG1);
    914 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
    915 		} else
    916 			dst_r = src;
    917 #else
    918 		dst_r = src;
    919 #endif
    920 	}
    921 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    922 	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
    923 		/* src, dst are registers. */
    924 		SLJIT_ASSERT(SLOW_IS_REG(dst));
    925 		if (reg_map[dst] < 4) {
    926 			if (dst != src)
    927 				EMIT_MOV(compiler, dst, 0, src, 0);
    928 			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
    929 			FAIL_IF(!inst);
    930 			*inst++ = GROUP_0F;
    931 			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
    932 		}
    933 		else {
    934 			if (dst != src)
    935 				EMIT_MOV(compiler, dst, 0, src, 0);
    936 			if (sign) {
    937 				/* shl reg, 24 */
    938 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
    939 				FAIL_IF(!inst);
    940 				*inst |= SHL;
    941 				/* sar reg, 24 */
    942 				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
    943 				FAIL_IF(!inst);
    944 				*inst |= SAR;
    945 			}
    946 			else {
    947 				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
    948 				FAIL_IF(!inst);
    949 				*(inst + 1) |= AND;
    950 			}
    951 		}
    952 		return SLJIT_SUCCESS;
    953 	}
    954 #endif
    955 	else {
     956 		/* src is either a memory address or a register with reg_map[src] < 4 on x86-32. */
    957 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
    958 		FAIL_IF(!inst);
    959 		*inst++ = GROUP_0F;
    960 		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
    961 	}
    962 
    963 	if (dst & SLJIT_MEM) {
    964 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
    965 		if (dst_r == TMP_REG1) {
     966 			/* Find an unused register whose reg_map value is < 4 (i.e. it has an 8 bit form). */
    967 			if ((dst & REG_MASK) == SLJIT_R0) {
    968 				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
    969 					work_r = SLJIT_R2;
    970 				else
    971 					work_r = SLJIT_R1;
    972 			}
    973 			else {
    974 				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
    975 					work_r = SLJIT_R0;
    976 				else if ((dst & REG_MASK) == SLJIT_R1)
    977 					work_r = SLJIT_R2;
    978 				else
    979 					work_r = SLJIT_R1;
    980 			}
    981 
    982 			if (work_r == SLJIT_R0) {
    983 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
    984 			}
    985 			else {
    986 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
    987 				FAIL_IF(!inst);
    988 				*inst = XCHG_r_rm;
    989 			}
    990 
    991 			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
    992 			FAIL_IF(!inst);
    993 			*inst = MOV_rm8_r8;
    994 
    995 			if (work_r == SLJIT_R0) {
    996 				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
    997 			}
    998 			else {
    999 				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
   1000 				FAIL_IF(!inst);
   1001 				*inst = XCHG_r_rm;
   1002 			}
   1003 		}
   1004 		else {
   1005 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
   1006 			FAIL_IF(!inst);
   1007 			*inst = MOV_rm8_r8;
   1008 		}
   1009 #else
   1010 		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
   1011 		FAIL_IF(!inst);
   1012 		*inst = MOV_rm8_r8;
   1013 #endif
   1014 	}
   1015 
   1016 	return SLJIT_SUCCESS;
   1017 }
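
/* Background for the reg_map[...] >= 4 tests above: without a REX prefix
   only the first four encodings (eax, ecx, edx, ebx) have 8 bit
   counterparts (al, cl, dl, bl), so on x86-32 a value sitting in
   esi/edi/ebp must first be moved, shifted or masked through a low
   register. x86-64 has no such restriction, because a REX prefix makes
   spl/bpl/sil/dil addressable. */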
   1018 
   1019 static sljit_si emit_mov_half(struct sljit_compiler *compiler, sljit_si sign,
   1020 	sljit_si dst, sljit_sw dstw,
   1021 	sljit_si src, sljit_sw srcw)
   1022 {
   1023 	sljit_ub* inst;
   1024 	sljit_si dst_r;
   1025 
   1026 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1027 	compiler->mode32 = 0;
   1028 #endif
   1029 
   1030 	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
   1031 		return SLJIT_SUCCESS; /* Empty instruction. */
   1032 
   1033 	if (src & SLJIT_IMM) {
   1034 		if (FAST_IS_REG(dst)) {
   1035 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1036 			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
   1037 #else
   1038 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
   1039 			FAIL_IF(!inst);
   1040 			*inst = MOV_rm_i32;
   1041 			return SLJIT_SUCCESS;
   1042 #endif
   1043 		}
   1044 		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
   1045 		FAIL_IF(!inst);
   1046 		*inst = MOV_rm_i32;
   1047 		return SLJIT_SUCCESS;
   1048 	}
   1049 
   1050 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
   1051 
   1052 	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
   1053 		dst_r = src;
   1054 	else {
   1055 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
   1056 		FAIL_IF(!inst);
   1057 		*inst++ = GROUP_0F;
   1058 		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
   1059 	}
   1060 
   1061 	if (dst & SLJIT_MEM) {
   1062 		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
   1063 		FAIL_IF(!inst);
   1064 		*inst = MOV_rm_r;
   1065 	}
   1066 
   1067 	return SLJIT_SUCCESS;
   1068 }
   1069 
   1070 static sljit_si emit_unary(struct sljit_compiler *compiler, sljit_ub opcode,
   1071 	sljit_si dst, sljit_sw dstw,
   1072 	sljit_si src, sljit_sw srcw)
   1073 {
   1074 	sljit_ub* inst;
   1075 
   1076 	if (dst == SLJIT_UNUSED) {
   1077 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   1078 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
   1079 		FAIL_IF(!inst);
   1080 		*inst++ = GROUP_F7;
   1081 		*inst |= opcode;
   1082 		return SLJIT_SUCCESS;
   1083 	}
   1084 	if (dst == src && dstw == srcw) {
   1085 		/* Same input and output */
   1086 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
   1087 		FAIL_IF(!inst);
   1088 		*inst++ = GROUP_F7;
   1089 		*inst |= opcode;
   1090 		return SLJIT_SUCCESS;
   1091 	}
   1092 	if (FAST_IS_REG(dst)) {
   1093 		EMIT_MOV(compiler, dst, 0, src, srcw);
   1094 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
   1095 		FAIL_IF(!inst);
   1096 		*inst++ = GROUP_F7;
   1097 		*inst |= opcode;
   1098 		return SLJIT_SUCCESS;
   1099 	}
   1100 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   1101 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
   1102 	FAIL_IF(!inst);
   1103 	*inst++ = GROUP_F7;
   1104 	*inst |= opcode;
   1105 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   1106 	return SLJIT_SUCCESS;
   1107 }
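
/* The F7 group encodes the operation in the reg field of the ModRM byte,
   e.g. with opcode = NOT_rm (2 << 3):
     f7 d1   not ecx        (ModRM = MOD_REG | NOT_rm | rm)
   and with opcode = NEG_rm (3 << 3):
     f7 d9   neg ecx */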
   1108 
   1109 static sljit_si emit_not_with_flags(struct sljit_compiler *compiler,
   1110 	sljit_si dst, sljit_sw dstw,
   1111 	sljit_si src, sljit_sw srcw)
   1112 {
   1113 	sljit_ub* inst;
   1114 
   1115 	if (dst == SLJIT_UNUSED) {
   1116 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   1117 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
   1118 		FAIL_IF(!inst);
   1119 		*inst++ = GROUP_F7;
   1120 		*inst |= NOT_rm;
   1121 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
   1122 		FAIL_IF(!inst);
   1123 		*inst = OR_r_rm;
   1124 		return SLJIT_SUCCESS;
   1125 	}
   1126 	if (FAST_IS_REG(dst)) {
   1127 		EMIT_MOV(compiler, dst, 0, src, srcw);
   1128 		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
   1129 		FAIL_IF(!inst);
   1130 		*inst++ = GROUP_F7;
   1131 		*inst |= NOT_rm;
   1132 		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
   1133 		FAIL_IF(!inst);
   1134 		*inst = OR_r_rm;
   1135 		return SLJIT_SUCCESS;
   1136 	}
   1137 	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   1138 	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
   1139 	FAIL_IF(!inst);
   1140 	*inst++ = GROUP_F7;
   1141 	*inst |= NOT_rm;
   1142 	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
   1143 	FAIL_IF(!inst);
   1144 	*inst = OR_r_rm;
   1145 	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   1146 	return SLJIT_SUCCESS;
   1147 }
   1148 
   1149 static sljit_si emit_clz(struct sljit_compiler *compiler, sljit_si op_flags,
   1150 	sljit_si dst, sljit_sw dstw,
   1151 	sljit_si src, sljit_sw srcw)
   1152 {
   1153 	sljit_ub* inst;
   1154 	sljit_si dst_r;
   1155 
   1156 	SLJIT_UNUSED_ARG(op_flags);
   1157 	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
   1158 		/* Just set the zero flag. */
   1159 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   1160 		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
   1161 		FAIL_IF(!inst);
   1162 		*inst++ = GROUP_F7;
   1163 		*inst |= NOT_rm;
   1164 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1165 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
   1166 #else
   1167 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, TMP_REG1, 0);
   1168 #endif
   1169 		FAIL_IF(!inst);
   1170 		*inst |= SHR;
   1171 		return SLJIT_SUCCESS;
   1172 	}
   1173 
   1174 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
   1175 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
   1176 		src = TMP_REG1;
   1177 		srcw = 0;
   1178 	}
   1179 
   1180 	inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
   1181 	FAIL_IF(!inst);
   1182 	*inst++ = GROUP_0F;
   1183 	*inst = BSR_r_rm;
   1184 
   1185 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1186 	if (FAST_IS_REG(dst))
   1187 		dst_r = dst;
   1188 	else {
   1189 		/* Find an unused temporary register. */
   1190 		if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
   1191 			dst_r = SLJIT_R0;
   1192 		else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
   1193 			dst_r = SLJIT_R1;
   1194 		else
   1195 			dst_r = SLJIT_R2;
   1196 		EMIT_MOV(compiler, dst, dstw, dst_r, 0);
   1197 	}
   1198 	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
   1199 #else
   1200 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
   1201 	compiler->mode32 = 0;
   1202 	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 64 + 63 : 32 + 31);
   1203 	compiler->mode32 = op_flags & SLJIT_INT_OP;
   1204 #endif
   1205 
   1206 	if (cpu_has_cmov == -1)
   1207 		get_cpu_features();
   1208 
   1209 	if (cpu_has_cmov) {
   1210 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
   1211 		FAIL_IF(!inst);
   1212 		*inst++ = GROUP_0F;
   1213 		*inst = CMOVNE_r_rm;
   1214 	} else {
   1215 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1216 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
   1217 		FAIL_IF(!inst);
   1218 		INC_SIZE(4);
   1219 
   1220 		*inst++ = JE_i8;
   1221 		*inst++ = 2;
   1222 		*inst++ = MOV_r_rm;
   1223 		*inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
   1224 #else
   1225 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
   1226 		FAIL_IF(!inst);
   1227 		INC_SIZE(5);
   1228 
   1229 		*inst++ = JE_i8;
   1230 		*inst++ = 3;
   1231 		*inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
   1232 		*inst++ = MOV_r_rm;
   1233 		*inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
   1234 #endif
   1235 	}
   1236 
   1237 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1238 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
   1239 #else
   1240 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, dst_r, 0);
   1241 #endif
   1242 	FAIL_IF(!inst);
   1243 	*(inst + 1) |= XOR;
   1244 
   1245 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1246 	if (dst & SLJIT_MEM) {
   1247 		inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
   1248 		FAIL_IF(!inst);
   1249 		*inst = XCHG_r_rm;
   1250 	}
   1251 #else
   1252 	if (dst & SLJIT_MEM)
   1253 		EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
   1254 #endif
   1255 	return SLJIT_SUCCESS;
   1256 }
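
/* The sequence above computes clz as "(bit width - 1) - bsr(src)",
   folded into two steps: dst_r is preloaded with 2*width - 1 (e.g.
   32 + 31), cmovne (or the je-guarded mov fallback) replaces it with
   bsr's result when src is nonzero, and the final xor with width - 1
   turns bsr's bit index into a leading-zero count:
     src = 0x00000001: bsr = 0, and 0 ^ 31 = 31 leading zeros
     src = 0:          dst_r stays 63, and 63 ^ 31 = 32 */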
   1257 
   1258 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler, sljit_si op,
   1259 	sljit_si dst, sljit_sw dstw,
   1260 	sljit_si src, sljit_sw srcw)
   1261 {
   1262 	sljit_ub* inst;
   1263 	sljit_si update = 0;
   1264 	sljit_si op_flags = GET_ALL_FLAGS(op);
   1265 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1266 	sljit_si dst_is_ereg = 0;
   1267 	sljit_si src_is_ereg = 0;
   1268 #else
   1269 #	define src_is_ereg 0
   1270 #endif
   1271 
   1272 	CHECK_ERROR();
   1273 	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
   1274 	ADJUST_LOCAL_OFFSET(dst, dstw);
   1275 	ADJUST_LOCAL_OFFSET(src, srcw);
   1276 
   1277 	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
   1278 	CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
   1279 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1280 	compiler->mode32 = op_flags & SLJIT_INT_OP;
   1281 #endif
   1282 
   1283 	op = GET_OPCODE(op);
   1284 	if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
   1285 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1286 		compiler->mode32 = 0;
   1287 #endif
   1288 
   1289 		if (op_flags & SLJIT_INT_OP) {
   1290 			if (FAST_IS_REG(src) && src == dst) {
   1291 				if (!TYPE_CAST_NEEDED(op))
   1292 					return SLJIT_SUCCESS;
   1293 			}
   1294 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1295 			if (op == SLJIT_MOV_SI && (src & SLJIT_MEM))
   1296 				op = SLJIT_MOV_UI;
   1297 			if (op == SLJIT_MOVU_SI && (src & SLJIT_MEM))
   1298 				op = SLJIT_MOVU_UI;
   1299 			if (op == SLJIT_MOV_UI && (src & SLJIT_IMM))
   1300 				op = SLJIT_MOV_SI;
   1301 			if (op == SLJIT_MOVU_UI && (src & SLJIT_IMM))
   1302 				op = SLJIT_MOVU_SI;
   1303 #endif
   1304 		}
   1305 
   1306 		SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
   1307 		if (op >= SLJIT_MOVU) {
   1308 			update = 1;
   1309 			op -= 8;
   1310 		}
   1311 
   1312 		if (src & SLJIT_IMM) {
   1313 			switch (op) {
   1314 			case SLJIT_MOV_UB:
   1315 				srcw = (sljit_ub)srcw;
   1316 				break;
   1317 			case SLJIT_MOV_SB:
   1318 				srcw = (sljit_sb)srcw;
   1319 				break;
   1320 			case SLJIT_MOV_UH:
   1321 				srcw = (sljit_uh)srcw;
   1322 				break;
   1323 			case SLJIT_MOV_SH:
   1324 				srcw = (sljit_sh)srcw;
   1325 				break;
   1326 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1327 			case SLJIT_MOV_UI:
   1328 				srcw = (sljit_ui)srcw;
   1329 				break;
   1330 			case SLJIT_MOV_SI:
   1331 				srcw = (sljit_si)srcw;
   1332 				break;
   1333 #endif
   1334 			}
   1335 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1336 			if (SLJIT_UNLIKELY(dst_is_ereg))
   1337 				return emit_mov(compiler, dst, dstw, src, srcw);
   1338 #endif
   1339 		}
   1340 
   1341 		if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
   1342 			inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
   1343 			FAIL_IF(!inst);
   1344 			*inst = LEA_r_m;
   1345 			src &= SLJIT_MEM | 0xf;
   1346 			srcw = 0;
   1347 		}
   1348 
   1349 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1350 		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_UI || op == SLJIT_MOV_SI || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
   1351 			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
   1352 			dst = TMP_REG1;
   1353 		}
   1354 #endif
   1355 
   1356 		switch (op) {
   1357 		case SLJIT_MOV:
   1358 		case SLJIT_MOV_P:
   1359 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1360 		case SLJIT_MOV_UI:
   1361 		case SLJIT_MOV_SI:
   1362 #endif
   1363 			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
   1364 			break;
   1365 		case SLJIT_MOV_UB:
   1366 			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
   1367 			break;
   1368 		case SLJIT_MOV_SB:
   1369 			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
   1370 			break;
   1371 		case SLJIT_MOV_UH:
   1372 			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
   1373 			break;
   1374 		case SLJIT_MOV_SH:
   1375 			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
   1376 			break;
   1377 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1378 		case SLJIT_MOV_UI:
   1379 			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
   1380 			break;
   1381 		case SLJIT_MOV_SI:
   1382 			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
   1383 			break;
   1384 #endif
   1385 		}
   1386 
   1387 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1388 		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
   1389 			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
   1390 #endif
   1391 
   1392 		if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
   1393 			inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
   1394 			FAIL_IF(!inst);
   1395 			*inst = LEA_r_m;
   1396 		}
   1397 		return SLJIT_SUCCESS;
   1398 	}
   1399 
   1400 	if (SLJIT_UNLIKELY(GET_FLAGS(op_flags)))
   1401 		compiler->flags_saved = 0;
   1402 
   1403 	switch (op) {
   1404 	case SLJIT_NOT:
   1405 		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
   1406 			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
   1407 		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);
   1408 
   1409 	case SLJIT_NEG:
   1410 		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
   1411 			FAIL_IF(emit_save_flags(compiler));
   1412 		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);
   1413 
   1414 	case SLJIT_CLZ:
   1415 		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
   1416 			FAIL_IF(emit_save_flags(compiler));
   1417 		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
   1418 	}
   1419 
   1420 	return SLJIT_SUCCESS;
   1421 
   1422 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1423 #	undef src_is_ereg
   1424 #endif
   1425 }
   1426 
   1427 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1428 
   1429 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
   1430 	if (IS_HALFWORD(immw) || compiler->mode32) { \
   1431 		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
   1432 		FAIL_IF(!inst); \
   1433 		*(inst + 1) |= (op_imm); \
   1434 	} \
   1435 	else { \
   1436 		FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
   1437 		inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
   1438 		FAIL_IF(!inst); \
   1439 		*inst = (op_mr); \
   1440 	}
   1441 
   1442 #define BINARY_EAX_IMM(op_eax_imm, immw) \
   1443 	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))
   1444 
   1445 #else
   1446 
   1447 #define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
   1448 	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
   1449 	FAIL_IF(!inst); \
   1450 	*(inst + 1) |= (op_imm);
   1451 
   1452 #define BINARY_EAX_IMM(op_eax_imm, immw) \
   1453 	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))
   1454 
   1455 #endif
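
/* Example expansion: emit_cum_binary below invokes
     BINARY_IMM(ADD, ADD_rm_r, src2w, TMP_REG1, 0);
   which emits 81 /0 id (emit_x86_instruction picks the 83 /0 ib short
   form for byte-range immediates), while the 64 bit variant falls back
   to a movabs into TMP_REG2 plus "01 /r" when the immediate is not a
   sign-extended halfword. */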
   1456 
   1457 static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
   1458 	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
   1459 	sljit_si dst, sljit_sw dstw,
   1460 	sljit_si src1, sljit_sw src1w,
   1461 	sljit_si src2, sljit_sw src2w)
   1462 {
   1463 	sljit_ub* inst;
   1464 
   1465 	if (dst == SLJIT_UNUSED) {
   1466 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1467 		if (src2 & SLJIT_IMM) {
   1468 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
   1469 		}
   1470 		else {
   1471 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1472 			FAIL_IF(!inst);
   1473 			*inst = op_rm;
   1474 		}
   1475 		return SLJIT_SUCCESS;
   1476 	}
   1477 
   1478 	if (dst == src1 && dstw == src1w) {
   1479 		if (src2 & SLJIT_IMM) {
   1480 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1481 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
   1482 #else
   1483 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
   1484 #endif
   1485 				BINARY_EAX_IMM(op_eax_imm, src2w);
   1486 			}
   1487 			else {
   1488 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
   1489 			}
   1490 		}
   1491 		else if (FAST_IS_REG(dst)) {
   1492 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
   1493 			FAIL_IF(!inst);
   1494 			*inst = op_rm;
   1495 		}
   1496 		else if (FAST_IS_REG(src2)) {
   1497 			/* Special exception for sljit_emit_op_flags. */
   1498 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
   1499 			FAIL_IF(!inst);
   1500 			*inst = op_mr;
   1501 		}
   1502 		else {
   1503 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
   1504 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
   1505 			FAIL_IF(!inst);
   1506 			*inst = op_mr;
   1507 		}
   1508 		return SLJIT_SUCCESS;
   1509 	}
   1510 
   1511 	/* Only for cumulative operations. */
   1512 	if (dst == src2 && dstw == src2w) {
   1513 		if (src1 & SLJIT_IMM) {
   1514 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1515 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
   1516 #else
   1517 			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
   1518 #endif
   1519 				BINARY_EAX_IMM(op_eax_imm, src1w);
   1520 			}
   1521 			else {
   1522 				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
   1523 			}
   1524 		}
   1525 		else if (FAST_IS_REG(dst)) {
   1526 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
   1527 			FAIL_IF(!inst);
   1528 			*inst = op_rm;
   1529 		}
   1530 		else if (FAST_IS_REG(src1)) {
   1531 			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
   1532 			FAIL_IF(!inst);
   1533 			*inst = op_mr;
   1534 		}
   1535 		else {
   1536 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1537 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
   1538 			FAIL_IF(!inst);
   1539 			*inst = op_mr;
   1540 		}
   1541 		return SLJIT_SUCCESS;
   1542 	}
   1543 
   1544 	/* General version. */
   1545 	if (FAST_IS_REG(dst)) {
   1546 		EMIT_MOV(compiler, dst, 0, src1, src1w);
   1547 		if (src2 & SLJIT_IMM) {
   1548 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
   1549 		}
   1550 		else {
   1551 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
   1552 			FAIL_IF(!inst);
   1553 			*inst = op_rm;
   1554 		}
   1555 	}
   1556 	else {
    1557 		/* This version requires fewer memory writes. */
   1558 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1559 		if (src2 & SLJIT_IMM) {
   1560 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
   1561 		}
   1562 		else {
   1563 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1564 			FAIL_IF(!inst);
   1565 			*inst = op_rm;
   1566 		}
   1567 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   1568 	}
   1569 
   1570 	return SLJIT_SUCCESS;
   1571 }
   1572 
   1573 static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
   1574 	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
   1575 	sljit_si dst, sljit_sw dstw,
   1576 	sljit_si src1, sljit_sw src1w,
   1577 	sljit_si src2, sljit_sw src2w)
   1578 {
   1579 	sljit_ub* inst;
   1580 
   1581 	if (dst == SLJIT_UNUSED) {
   1582 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1583 		if (src2 & SLJIT_IMM) {
   1584 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
   1585 		}
   1586 		else {
   1587 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1588 			FAIL_IF(!inst);
   1589 			*inst = op_rm;
   1590 		}
   1591 		return SLJIT_SUCCESS;
   1592 	}
   1593 
   1594 	if (dst == src1 && dstw == src1w) {
   1595 		if (src2 & SLJIT_IMM) {
   1596 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1597 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
   1598 #else
   1599 			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
   1600 #endif
   1601 				BINARY_EAX_IMM(op_eax_imm, src2w);
   1602 			}
   1603 			else {
   1604 				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
   1605 			}
   1606 		}
   1607 		else if (FAST_IS_REG(dst)) {
   1608 			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
   1609 			FAIL_IF(!inst);
   1610 			*inst = op_rm;
   1611 		}
   1612 		else if (FAST_IS_REG(src2)) {
   1613 			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
   1614 			FAIL_IF(!inst);
   1615 			*inst = op_mr;
   1616 		}
   1617 		else {
   1618 			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
   1619 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
   1620 			FAIL_IF(!inst);
   1621 			*inst = op_mr;
   1622 		}
   1623 		return SLJIT_SUCCESS;
   1624 	}
   1625 
   1626 	/* General version. */
   1627 	if (FAST_IS_REG(dst) && dst != src2) {
   1628 		EMIT_MOV(compiler, dst, 0, src1, src1w);
   1629 		if (src2 & SLJIT_IMM) {
   1630 			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
   1631 		}
   1632 		else {
   1633 			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
   1634 			FAIL_IF(!inst);
   1635 			*inst = op_rm;
   1636 		}
   1637 	}
   1638 	else {
    1639 		/* This version requires fewer memory writes. */
   1640 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1641 		if (src2 & SLJIT_IMM) {
   1642 			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
   1643 		}
   1644 		else {
   1645 			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1646 			FAIL_IF(!inst);
   1647 			*inst = op_rm;
   1648 		}
   1649 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   1650 	}
   1651 
   1652 	return SLJIT_SUCCESS;
   1653 }
   1654 
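         /* Signed multiplication. Three IMUL encodings are used below:
              0F AF /r : reg = reg * r/m
              6B /r ib : reg = r/m * sign extended imm8
              69 /r id : reg = r/m * imm32
            The immediate forms avoid materializing the constant in a register
            (except the full 64 bit case, which goes through TMP_REG2). */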
   1655 static sljit_si emit_mul(struct sljit_compiler *compiler,
   1656 	sljit_si dst, sljit_sw dstw,
   1657 	sljit_si src1, sljit_sw src1w,
   1658 	sljit_si src2, sljit_sw src2w)
   1659 {
   1660 	sljit_ub* inst;
   1661 	sljit_si dst_r;
   1662 
   1663 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
   1664 
   1665 	/* Register destination. */
   1666 	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
   1667 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
   1668 		FAIL_IF(!inst);
   1669 		*inst++ = GROUP_0F;
   1670 		*inst = IMUL_r_rm;
   1671 	}
   1672 	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
   1673 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
   1674 		FAIL_IF(!inst);
   1675 		*inst++ = GROUP_0F;
   1676 		*inst = IMUL_r_rm;
   1677 	}
   1678 	else if (src1 & SLJIT_IMM) {
   1679 		if (src2 & SLJIT_IMM) {
   1680 			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
   1681 			src2 = dst_r;
   1682 			src2w = 0;
   1683 		}
   1684 
   1685 		if (src1w <= 127 && src1w >= -128) {
   1686 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
   1687 			FAIL_IF(!inst);
   1688 			*inst = IMUL_r_rm_i8;
   1689 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
   1690 			FAIL_IF(!inst);
   1691 			INC_SIZE(1);
   1692 			*inst = (sljit_sb)src1w;
   1693 		}
   1694 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1695 		else {
   1696 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
   1697 			FAIL_IF(!inst);
   1698 			*inst = IMUL_r_rm_i32;
   1699 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
   1700 			FAIL_IF(!inst);
   1701 			INC_SIZE(4);
   1702 			*(sljit_sw*)inst = src1w;
   1703 		}
   1704 #else
   1705 		else if (IS_HALFWORD(src1w)) {
   1706 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
   1707 			FAIL_IF(!inst);
   1708 			*inst = IMUL_r_rm_i32;
   1709 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
   1710 			FAIL_IF(!inst);
   1711 			INC_SIZE(4);
   1712 			*(sljit_si*)inst = (sljit_si)src1w;
   1713 		}
   1714 		else {
   1715 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
   1716 			if (dst_r != src2)
   1717 				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
   1718 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
   1719 			FAIL_IF(!inst);
   1720 			*inst++ = GROUP_0F;
   1721 			*inst = IMUL_r_rm;
   1722 		}
   1723 #endif
   1724 	}
   1725 	else if (src2 & SLJIT_IMM) {
   1726 		/* Note: src1 is NOT immediate. */
   1727 
   1728 		if (src2w <= 127 && src2w >= -128) {
   1729 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
   1730 			FAIL_IF(!inst);
   1731 			*inst = IMUL_r_rm_i8;
   1732 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
   1733 			FAIL_IF(!inst);
   1734 			INC_SIZE(1);
   1735 			*inst = (sljit_sb)src2w;
   1736 		}
   1737 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   1738 		else {
   1739 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
   1740 			FAIL_IF(!inst);
   1741 			*inst = IMUL_r_rm_i32;
   1742 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
   1743 			FAIL_IF(!inst);
   1744 			INC_SIZE(4);
   1745 			*(sljit_sw*)inst = src2w;
   1746 		}
   1747 #else
   1748 		else if (IS_HALFWORD(src2w)) {
   1749 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
   1750 			FAIL_IF(!inst);
   1751 			*inst = IMUL_r_rm_i32;
   1752 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
   1753 			FAIL_IF(!inst);
   1754 			INC_SIZE(4);
   1755 			*(sljit_si*)inst = (sljit_si)src2w;
   1756 		}
   1757 		else {
   1758 			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
   1759 			if (dst_r != src1)
   1760 				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
   1761 			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
   1762 			FAIL_IF(!inst);
   1763 			*inst++ = GROUP_0F;
   1764 			*inst = IMUL_r_rm;
   1765 		}
   1766 #endif
   1767 	}
   1768 	else {
   1769 		/* Neither argument is immediate. */
   1770 		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
   1771 			dst_r = TMP_REG1;
   1772 		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
   1773 		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
   1774 		FAIL_IF(!inst);
   1775 		*inst++ = GROUP_0F;
   1776 		*inst = IMUL_r_rm;
   1777 	}
   1778 
   1779 	if (dst_r == TMP_REG1)
   1780 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   1781 
   1782 	return SLJIT_SUCCESS;
   1783 }
   1784 
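         /* Try to emit a register + register or register + immediate addition as
            a single LEA. LEA computes the address without touching the flags, so
            it also serves the keep_flags case; SLJIT_ERR_UNSUPPORTED tells the
            caller to fall back to a plain ADD. */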
   1785 static sljit_si emit_lea_binary(struct sljit_compiler *compiler, sljit_si keep_flags,
   1786 	sljit_si dst, sljit_sw dstw,
   1787 	sljit_si src1, sljit_sw src1w,
   1788 	sljit_si src2, sljit_sw src2w)
   1789 {
   1790 	sljit_ub* inst;
   1791 	sljit_si dst_r, done = 0;
   1792 
    1793 	/* These cases are better left to the normal code path. */
   1794 	if (!keep_flags) {
   1795 		if (dst == src1 && dstw == src1w)
   1796 			return SLJIT_ERR_UNSUPPORTED;
   1797 		if (dst == src2 && dstw == src2w)
   1798 			return SLJIT_ERR_UNSUPPORTED;
   1799 	}
   1800 
   1801 	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
   1802 
   1803 	if (FAST_IS_REG(src1)) {
   1804 		if (FAST_IS_REG(src2)) {
   1805 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
   1806 			FAIL_IF(!inst);
   1807 			*inst = LEA_r_m;
   1808 			done = 1;
   1809 		}
   1810 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1811 		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
   1812 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_si)src2w);
   1813 #else
   1814 		if (src2 & SLJIT_IMM) {
   1815 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
   1816 #endif
   1817 			FAIL_IF(!inst);
   1818 			*inst = LEA_r_m;
   1819 			done = 1;
   1820 		}
   1821 	}
   1822 	else if (FAST_IS_REG(src2)) {
   1823 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1824 		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
   1825 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_si)src1w);
   1826 #else
   1827 		if (src1 & SLJIT_IMM) {
   1828 			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
   1829 #endif
   1830 			FAIL_IF(!inst);
   1831 			*inst = LEA_r_m;
   1832 			done = 1;
   1833 		}
   1834 	}
   1835 
   1836 	if (done) {
   1837 		if (dst_r == TMP_REG1)
   1838 			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
   1839 		return SLJIT_SUCCESS;
   1840 	}
   1841 	return SLJIT_ERR_UNSUPPORTED;
   1842 }
   1843 
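         /* CMP is a subtraction that only updates the flags. The CMP EAX, imm32
            short form (opcode 3D, no ModRM byte) is preferred when the first
            operand is EAX and the immediate does not fit in a sign extended byte. */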
   1844 static sljit_si emit_cmp_binary(struct sljit_compiler *compiler,
   1845 	sljit_si src1, sljit_sw src1w,
   1846 	sljit_si src2, sljit_sw src2w)
   1847 {
   1848 	sljit_ub* inst;
   1849 
   1850 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1851 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
   1852 #else
   1853 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
   1854 #endif
   1855 		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
   1856 		return SLJIT_SUCCESS;
   1857 	}
   1858 
   1859 	if (FAST_IS_REG(src1)) {
   1860 		if (src2 & SLJIT_IMM) {
   1861 			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
   1862 		}
   1863 		else {
   1864 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
   1865 			FAIL_IF(!inst);
   1866 			*inst = CMP_r_rm;
   1867 		}
   1868 		return SLJIT_SUCCESS;
   1869 	}
   1870 
   1871 	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
   1872 		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
   1873 		FAIL_IF(!inst);
   1874 		*inst = CMP_rm_r;
   1875 		return SLJIT_SUCCESS;
   1876 	}
   1877 
   1878 	if (src2 & SLJIT_IMM) {
   1879 		if (src1 & SLJIT_IMM) {
   1880 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1881 			src1 = TMP_REG1;
   1882 			src1w = 0;
   1883 		}
   1884 		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
   1885 	}
   1886 	else {
   1887 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1888 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1889 		FAIL_IF(!inst);
   1890 		*inst = CMP_r_rm;
   1891 	}
   1892 	return SLJIT_SUCCESS;
   1893 }
   1894 
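         /* TEST is an AND that discards its result and only updates the flags.
            It is symmetric, so the operands may be swapped freely; the immediate
            forms go through the F7 /0 opcode group. */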
   1895 static sljit_si emit_test_binary(struct sljit_compiler *compiler,
   1896 	sljit_si src1, sljit_sw src1w,
   1897 	sljit_si src2, sljit_sw src2w)
   1898 {
   1899 	sljit_ub* inst;
   1900 
   1901 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1902 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
   1903 #else
   1904 	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
   1905 #endif
   1906 		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
   1907 		return SLJIT_SUCCESS;
   1908 	}
   1909 
   1910 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
    1911 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
   1912 #else
   1913 	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
   1914 #endif
   1915 		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
   1916 		return SLJIT_SUCCESS;
   1917 	}
   1918 
   1919 	if (!(src1 & SLJIT_IMM)) {
   1920 		if (src2 & SLJIT_IMM) {
   1921 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1922 			if (IS_HALFWORD(src2w) || compiler->mode32) {
   1923 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
   1924 				FAIL_IF(!inst);
   1925 				*inst = GROUP_F7;
   1926 			}
   1927 			else {
   1928 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
   1929 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
   1930 				FAIL_IF(!inst);
   1931 				*inst = TEST_rm_r;
   1932 			}
   1933 #else
   1934 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
   1935 			FAIL_IF(!inst);
   1936 			*inst = GROUP_F7;
   1937 #endif
   1938 			return SLJIT_SUCCESS;
   1939 		}
   1940 		else if (FAST_IS_REG(src1)) {
   1941 			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
   1942 			FAIL_IF(!inst);
   1943 			*inst = TEST_rm_r;
   1944 			return SLJIT_SUCCESS;
   1945 		}
   1946 	}
   1947 
   1948 	if (!(src2 & SLJIT_IMM)) {
   1949 		if (src1 & SLJIT_IMM) {
   1950 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1951 			if (IS_HALFWORD(src1w) || compiler->mode32) {
   1952 				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
   1953 				FAIL_IF(!inst);
   1954 				*inst = GROUP_F7;
   1955 			}
   1956 			else {
   1957 				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
   1958 				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
   1959 				FAIL_IF(!inst);
   1960 				*inst = TEST_rm_r;
   1961 			}
   1962 #else
    1963 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
   1964 			FAIL_IF(!inst);
   1965 			*inst = GROUP_F7;
   1966 #endif
   1967 			return SLJIT_SUCCESS;
   1968 		}
   1969 		else if (FAST_IS_REG(src2)) {
   1970 			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
   1971 			FAIL_IF(!inst);
   1972 			*inst = TEST_rm_r;
   1973 			return SLJIT_SUCCESS;
   1974 		}
   1975 	}
   1976 
   1977 	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   1978 	if (src2 & SLJIT_IMM) {
   1979 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   1980 		if (IS_HALFWORD(src2w) || compiler->mode32) {
   1981 			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
   1982 			FAIL_IF(!inst);
   1983 			*inst = GROUP_F7;
   1984 		}
   1985 		else {
   1986 			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
   1987 			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
   1988 			FAIL_IF(!inst);
   1989 			*inst = TEST_rm_r;
   1990 		}
   1991 #else
   1992 		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
   1993 		FAIL_IF(!inst);
   1994 		*inst = GROUP_F7;
   1995 #endif
   1996 	}
   1997 	else {
   1998 		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
   1999 		FAIL_IF(!inst);
   2000 		*inst = TEST_rm_r;
   2001 	}
   2002 	return SLJIT_SUCCESS;
   2003 }
   2004 
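         /* A variable shift count must be in CL on x86, and SLJIT_PREF_SHIFT_REG
            maps to ecx/rcx, so most of the work below is moving the count into
            ecx without clobbering a live value. */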
   2005 static sljit_si emit_shift(struct sljit_compiler *compiler,
   2006 	sljit_ub mode,
   2007 	sljit_si dst, sljit_sw dstw,
   2008 	sljit_si src1, sljit_sw src1w,
   2009 	sljit_si src2, sljit_sw src2w)
   2010 {
   2011 	sljit_ub* inst;
   2012 
   2013 	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
   2014 		if (dst == src1 && dstw == src1w) {
   2015 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
   2016 			FAIL_IF(!inst);
   2017 			*inst |= mode;
   2018 			return SLJIT_SUCCESS;
   2019 		}
   2020 		if (dst == SLJIT_UNUSED) {
   2021 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2022 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
   2023 			FAIL_IF(!inst);
   2024 			*inst |= mode;
   2025 			return SLJIT_SUCCESS;
   2026 		}
   2027 		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
   2028 			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2029 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2030 			FAIL_IF(!inst);
   2031 			*inst |= mode;
   2032 			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2033 			return SLJIT_SUCCESS;
   2034 		}
   2035 		if (FAST_IS_REG(dst)) {
   2036 			EMIT_MOV(compiler, dst, 0, src1, src1w);
   2037 			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
   2038 			FAIL_IF(!inst);
   2039 			*inst |= mode;
   2040 			return SLJIT_SUCCESS;
   2041 		}
   2042 
   2043 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2044 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
   2045 		FAIL_IF(!inst);
   2046 		*inst |= mode;
   2047 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   2048 		return SLJIT_SUCCESS;
   2049 	}
   2050 
   2051 	if (dst == SLJIT_PREF_SHIFT_REG) {
   2052 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2053 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
   2054 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2055 		FAIL_IF(!inst);
   2056 		*inst |= mode;
   2057 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2058 	}
   2059 	else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
   2060 		if (src1 != dst)
   2061 			EMIT_MOV(compiler, dst, 0, src1, src1w);
   2062 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
   2063 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
   2064 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
   2065 		FAIL_IF(!inst);
   2066 		*inst |= mode;
   2067 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2068 	}
   2069 	else {
    2070 		/* This case is really difficult, since ecx itself may be used for
    2071 		   addressing, and we must ensure the code works even in that case. */
   2072 		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
   2073 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2074 		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
   2075 #else
    2076 		/* [esp+0] already contains the saved flags, so store ecx one word above it. */
   2077 		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
   2078 #endif
   2079 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
   2080 		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
   2081 		FAIL_IF(!inst);
   2082 		*inst |= mode;
   2083 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2084 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
   2085 #else
   2086 		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
   2087 #endif
   2088 		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
   2089 	}
   2090 
   2091 	return SLJIT_SUCCESS;
   2092 }
   2093 
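         /* emit_shift for callers that may need the resulting flags. A shift by
            zero leaves the flags unchanged, so that case is rewritten as a plain
            move (or a flag setting OR), and explicit CMP instructions materialize
            the flags: for memory destinations from the source value before the
            shift, for register destinations from the result after it. */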
   2094 static sljit_si emit_shift_with_flags(struct sljit_compiler *compiler,
   2095 	sljit_ub mode, sljit_si set_flags,
   2096 	sljit_si dst, sljit_sw dstw,
   2097 	sljit_si src1, sljit_sw src1w,
   2098 	sljit_si src2, sljit_sw src2w)
   2099 {
    2100 	/* The CPU leaves the flags unchanged if the shift count is 0. */
   2101 	if (src2 & SLJIT_IMM) {
   2102 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2103 		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
   2104 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
   2105 #else
   2106 		if ((src2w & 0x1f) != 0)
   2107 			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
   2108 #endif
   2109 		if (!set_flags)
   2110 			return emit_mov(compiler, dst, dstw, src1, src1w);
    2111 		/* Emit OR dst, src, 0 to copy src while setting the flags. */
   2112 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
   2113 			dst, dstw, src1, src1w, SLJIT_IMM, 0);
   2114 	}
   2115 
   2116 	if (!set_flags)
   2117 		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
   2118 
   2119 	if (!FAST_IS_REG(dst))
   2120 		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
   2121 
    2122 	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
   2123 
   2124 	if (FAST_IS_REG(dst))
   2125 		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
   2126 	return SLJIT_SUCCESS;
   2127 }
   2128 
   2129 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler, sljit_si op,
   2130 	sljit_si dst, sljit_sw dstw,
   2131 	sljit_si src1, sljit_sw src1w,
   2132 	sljit_si src2, sljit_sw src2w)
   2133 {
   2134 	CHECK_ERROR();
   2135 	CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
   2136 	ADJUST_LOCAL_OFFSET(dst, dstw);
   2137 	ADJUST_LOCAL_OFFSET(src1, src1w);
   2138 	ADJUST_LOCAL_OFFSET(src2, src2w);
   2139 
   2140 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
   2141 	CHECK_EXTRA_REGS(src1, src1w, (void)0);
   2142 	CHECK_EXTRA_REGS(src2, src2w, (void)0);
   2143 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2144 	compiler->mode32 = op & SLJIT_INT_OP;
   2145 #endif
   2146 
   2147 	if (GET_OPCODE(op) >= SLJIT_MUL) {
   2148 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
   2149 			compiler->flags_saved = 0;
   2150 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
   2151 			FAIL_IF(emit_save_flags(compiler));
   2152 	}
   2153 
   2154 	switch (GET_OPCODE(op)) {
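         	/* An ADD (or SUB of an immediate) that does not need flags can often
         	   be encoded as a flag neutral LEA; emit_lea_binary returns
         	   SLJIT_ERR_UNSUPPORTED when the operands do not fit that pattern. */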
   2155 	case SLJIT_ADD:
   2156 		if (!GET_FLAGS(op)) {
   2157 			if (emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
   2158 				return compiler->error;
   2159 		}
   2160 		else
   2161 			compiler->flags_saved = 0;
   2162 		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
   2163 			FAIL_IF(emit_save_flags(compiler));
   2164 		return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
   2165 			dst, dstw, src1, src1w, src2, src2w);
   2166 	case SLJIT_ADDC:
   2167 		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
   2168 			FAIL_IF(emit_restore_flags(compiler, 1));
   2169 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
   2170 			FAIL_IF(emit_save_flags(compiler));
   2171 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
   2172 			compiler->flags_saved = 0;
   2173 		return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
   2174 			dst, dstw, src1, src1w, src2, src2w);
   2175 	case SLJIT_SUB:
   2176 		if (!GET_FLAGS(op)) {
   2177 			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
   2178 				return compiler->error;
   2179 		}
   2180 		else
   2181 			compiler->flags_saved = 0;
   2182 		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
   2183 			FAIL_IF(emit_save_flags(compiler));
   2184 		if (dst == SLJIT_UNUSED)
   2185 			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
   2186 		return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
   2187 			dst, dstw, src1, src1w, src2, src2w);
   2188 	case SLJIT_SUBC:
   2189 		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
   2190 			FAIL_IF(emit_restore_flags(compiler, 1));
   2191 		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
   2192 			FAIL_IF(emit_save_flags(compiler));
   2193 		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
   2194 			compiler->flags_saved = 0;
   2195 		return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
   2196 			dst, dstw, src1, src1w, src2, src2w);
   2197 	case SLJIT_MUL:
   2198 		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
   2199 	case SLJIT_AND:
   2200 		if (dst == SLJIT_UNUSED)
   2201 			return emit_test_binary(compiler, src1, src1w, src2, src2w);
   2202 		return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
   2203 			dst, dstw, src1, src1w, src2, src2w);
   2204 	case SLJIT_OR:
   2205 		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
   2206 			dst, dstw, src1, src1w, src2, src2w);
   2207 	case SLJIT_XOR:
   2208 		return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
   2209 			dst, dstw, src1, src1w, src2, src2w);
   2210 	case SLJIT_SHL:
   2211 		return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
   2212 			dst, dstw, src1, src1w, src2, src2w);
   2213 	case SLJIT_LSHR:
   2214 		return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
   2215 			dst, dstw, src1, src1w, src2, src2w);
   2216 	case SLJIT_ASHR:
   2217 		return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
   2218 			dst, dstw, src1, src1w, src2, src2w);
   2219 	}
   2220 
   2221 	return SLJIT_SUCCESS;
   2222 }
   2223 
   2224 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg)
   2225 {
   2226 	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
   2227 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   2228 	if (reg >= SLJIT_R3 && reg <= SLJIT_R6)
   2229 		return -1;
   2230 #endif
   2231 	return reg_map[reg];
   2232 }
   2233 
   2234 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg)
   2235 {
   2236 	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
   2237 	return reg;
   2238 }
   2239 
   2240 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
   2241 	void *instruction, sljit_si size)
   2242 {
   2243 	sljit_ub *inst;
   2244 
   2245 	CHECK_ERROR();
   2246 	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));
   2247 
   2248 	inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
   2249 	FAIL_IF(!inst);
   2250 	INC_SIZE(size);
   2251 	SLJIT_MEMMOVE(inst, instruction, size);
   2252 	return SLJIT_SUCCESS;
   2253 }
   2254 
   2255 /* --------------------------------------------------------------------- */
   2256 /*  Floating point operators                                             */
   2257 /* --------------------------------------------------------------------- */
   2258 
   2259 /* Alignment + 2 * 16 bytes. */
   2260 static sljit_si sse2_data[3 + (4 + 4) * 2];
   2261 static sljit_si *sse2_buffer;
   2262 
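         /* After 16 byte alignment the buffer holds the masks used by
            sljit_emit_fop1 below for DNEG/DABS:
              sse2_buffer[0]      0x80000000 - sign bit mask, single precision (XORPD)
              sse2_buffer[4]      0x7fffffff - abs mask, single precision (ANDPD)
              sse2_buffer[8..9]   0x8000000000000000 - sign bit mask, double precision
              sse2_buffer[12..13] 0x7fffffffffffffff - abs mask, double precision
            The remaining words stay zero initialized. */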
   2263 static void init_compiler(void)
   2264 {
   2265 	sse2_buffer = (sljit_si*)(((sljit_uw)sse2_data + 15) & ~0xf);
   2266 	/* Single precision constants. */
   2267 	sse2_buffer[0] = 0x80000000;
   2268 	sse2_buffer[4] = 0x7fffffff;
   2269 	/* Double precision constants. */
   2270 	sse2_buffer[8] = 0;
   2271 	sse2_buffer[9] = 0x80000000;
   2272 	sse2_buffer[12] = 0xffffffff;
   2273 	sse2_buffer[13] = 0x7fffffff;
   2274 }
   2275 
   2276 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_is_fpu_available(void)
   2277 {
   2278 #ifdef SLJIT_IS_FPU_AVAILABLE
   2279 	return SLJIT_IS_FPU_AVAILABLE;
   2280 #elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
   2281 	if (cpu_has_sse2 == -1)
   2282 		get_cpu_features();
   2283 	return cpu_has_sse2;
   2284 #else /* SLJIT_DETECT_SSE2 */
   2285 	return 1;
   2286 #endif /* SLJIT_DETECT_SSE2 */
   2287 }
   2288 
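         /* Scalar SSE2 arithmetic. The F3 mandatory prefix selects the single
            precision form of the 0F opcode (e.g. ADDSS) and F2 the double
            precision form (e.g. ADDSD); emit_sse2_logic uses the 66 prefix for
            the packed double logic/compare forms (ANDPD, XORPD, UCOMISD). */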
   2289 static sljit_si emit_sse2(struct sljit_compiler *compiler, sljit_ub opcode,
   2290 	sljit_si single, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
   2291 {
   2292 	sljit_ub *inst;
   2293 
   2294 	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
   2295 	FAIL_IF(!inst);
   2296 	*inst++ = GROUP_0F;
   2297 	*inst = opcode;
   2298 	return SLJIT_SUCCESS;
   2299 }
   2300 
   2301 static sljit_si emit_sse2_logic(struct sljit_compiler *compiler, sljit_ub opcode,
   2302 	sljit_si pref66, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
   2303 {
   2304 	sljit_ub *inst;
   2305 
   2306 	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
   2307 	FAIL_IF(!inst);
   2308 	*inst++ = GROUP_0F;
   2309 	*inst = opcode;
   2310 	return SLJIT_SUCCESS;
   2311 }
   2312 
   2313 static SLJIT_INLINE sljit_si emit_sse2_load(struct sljit_compiler *compiler,
   2314 	sljit_si single, sljit_si dst, sljit_si src, sljit_sw srcw)
   2315 {
   2316 	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
   2317 }
   2318 
   2319 static SLJIT_INLINE sljit_si emit_sse2_store(struct sljit_compiler *compiler,
   2320 	sljit_si single, sljit_si dst, sljit_sw dstw, sljit_si src)
   2321 {
   2322 	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
   2323 }
   2324 
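         /* Float to integer conversion. CVTTSD2SI / CVTTSS2SI truncate toward
            zero, which matches C conversion semantics. */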
   2325 static SLJIT_INLINE sljit_si sljit_emit_fop1_convw_fromd(struct sljit_compiler *compiler, sljit_si op,
   2326 	sljit_si dst, sljit_sw dstw,
   2327 	sljit_si src, sljit_sw srcw)
   2328 {
   2329 	sljit_si dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
   2330 	sljit_ub *inst;
   2331 
   2332 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2333 	if (GET_OPCODE(op) == SLJIT_CONVW_FROMD)
   2334 		compiler->mode32 = 0;
   2335 #endif
   2336 
   2337 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_SINGLE_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
   2338 	FAIL_IF(!inst);
   2339 	*inst++ = GROUP_0F;
   2340 	*inst = CVTTSD2SI_r_xm;
   2341 
   2342 	if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
   2343 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
   2344 	return SLJIT_SUCCESS;
   2345 }
   2346 
   2347 static SLJIT_INLINE sljit_si sljit_emit_fop1_convd_fromw(struct sljit_compiler *compiler, sljit_si op,
   2348 	sljit_si dst, sljit_sw dstw,
   2349 	sljit_si src, sljit_sw srcw)
   2350 {
   2351 	sljit_si dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
   2352 	sljit_ub *inst;
   2353 
   2354 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2355 	if (GET_OPCODE(op) == SLJIT_CONVD_FROMW)
   2356 		compiler->mode32 = 0;
   2357 #endif
   2358 
   2359 	if (src & SLJIT_IMM) {
   2360 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2361 		if (GET_OPCODE(op) == SLJIT_CONVD_FROMI)
   2362 			srcw = (sljit_si)srcw;
   2363 #endif
   2364 		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
   2365 		src = TMP_REG1;
   2366 		srcw = 0;
   2367 	}
   2368 
   2369 	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_SINGLE_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
   2370 	FAIL_IF(!inst);
   2371 	*inst++ = GROUP_0F;
   2372 	*inst = CVTSI2SD_x_rm;
   2373 
   2374 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2375 	compiler->mode32 = 1;
   2376 #endif
   2377 	if (dst_r == TMP_FREG)
   2378 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
   2379 	return SLJIT_SUCCESS;
   2380 }
   2381 
   2382 static SLJIT_INLINE sljit_si sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_si op,
   2383 	sljit_si src1, sljit_sw src1w,
   2384 	sljit_si src2, sljit_sw src2w)
   2385 {
   2386 	compiler->flags_saved = 0;
   2387 	if (!FAST_IS_REG(src1)) {
   2388 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
   2389 		src1 = TMP_FREG;
   2390 	}
   2391 	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_SINGLE_OP), src1, src2, src2w);
   2392 }
   2393 
   2394 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop1(struct sljit_compiler *compiler, sljit_si op,
   2395 	sljit_si dst, sljit_sw dstw,
   2396 	sljit_si src, sljit_sw srcw)
   2397 {
   2398 	sljit_si dst_r;
   2399 
   2400 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2401 	compiler->mode32 = 1;
   2402 #endif
   2403 
   2404 	CHECK_ERROR();
   2405 	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
   2406 
   2407 	if (GET_OPCODE(op) == SLJIT_DMOV) {
   2408 		if (FAST_IS_REG(dst))
   2409 			return emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst, src, srcw);
   2410 		if (FAST_IS_REG(src))
   2411 			return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, src);
   2412 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src, srcw));
   2413 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
   2414 	}
   2415 
   2416 	if (GET_OPCODE(op) == SLJIT_CONVD_FROMS) {
   2417 		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
   2418 		if (FAST_IS_REG(src)) {
    2419 			/* We overwrite the high bits of the source. From the SLJIT
    2420 			   point of view, this is not an issue.
    2421 			   Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
   2422 			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_SINGLE_OP, src, src, 0));
   2423 		}
   2424 		else {
   2425 			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_SINGLE_OP), TMP_FREG, src, srcw));
   2426 			src = TMP_FREG;
   2427 		}
   2428 
   2429 		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_SINGLE_OP, dst_r, src, 0));
   2430 		if (dst_r == TMP_FREG)
   2431 			return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
   2432 		return SLJIT_SUCCESS;
   2433 	}
   2434 
   2435 	if (SLOW_IS_REG(dst)) {
   2436 		dst_r = dst;
   2437 		if (dst != src)
   2438 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
   2439 	}
   2440 	else {
   2441 		dst_r = TMP_FREG;
   2442 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
   2443 	}
   2444 
   2445 	switch (GET_OPCODE(op)) {
   2446 	case SLJIT_DNEG:
   2447 		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer : sse2_buffer + 8)));
   2448 		break;
   2449 
   2450 	case SLJIT_DABS:
   2451 		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
   2452 		break;
   2453 	}
   2454 
   2455 	if (dst_r == TMP_FREG)
   2456 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
   2457 	return SLJIT_SUCCESS;
   2458 }
   2459 
   2460 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop2(struct sljit_compiler *compiler, sljit_si op,
   2461 	sljit_si dst, sljit_sw dstw,
   2462 	sljit_si src1, sljit_sw src1w,
   2463 	sljit_si src2, sljit_sw src2w)
   2464 {
   2465 	sljit_si dst_r;
   2466 
   2467 	CHECK_ERROR();
   2468 	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
   2469 	ADJUST_LOCAL_OFFSET(dst, dstw);
   2470 	ADJUST_LOCAL_OFFSET(src1, src1w);
   2471 	ADJUST_LOCAL_OFFSET(src2, src2w);
   2472 
   2473 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2474 	compiler->mode32 = 1;
   2475 #endif
   2476 
   2477 	if (FAST_IS_REG(dst)) {
   2478 		dst_r = dst;
   2479 		if (dst == src1)
   2480 			; /* Do nothing here. */
   2481 		else if (dst == src2 && (op == SLJIT_DADD || op == SLJIT_DMUL)) {
   2482 			/* Swap arguments. */
   2483 			src2 = src1;
   2484 			src2w = src1w;
   2485 		}
   2486 		else if (dst != src2)
   2487 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src1, src1w));
   2488 		else {
   2489 			dst_r = TMP_FREG;
   2490 			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
   2491 		}
   2492 	}
   2493 	else {
   2494 		dst_r = TMP_FREG;
   2495 		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
   2496 	}
   2497 
   2498 	switch (GET_OPCODE(op)) {
   2499 	case SLJIT_DADD:
   2500 		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
   2501 		break;
   2502 
   2503 	case SLJIT_DSUB:
   2504 		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
   2505 		break;
   2506 
   2507 	case SLJIT_DMUL:
   2508 		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
   2509 		break;
   2510 
   2511 	case SLJIT_DDIV:
   2512 		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
   2513 		break;
   2514 	}
   2515 
   2516 	if (dst_r == TMP_FREG)
   2517 		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
   2518 	return SLJIT_SUCCESS;
   2519 }
   2520 
   2521 /* --------------------------------------------------------------------- */
   2522 /*  Conditional instructions                                             */
   2523 /* --------------------------------------------------------------------- */
   2524 
   2525 SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
   2526 {
   2527 	sljit_ub *inst;
   2528 	struct sljit_label *label;
   2529 
   2530 	CHECK_ERROR_PTR();
   2531 	CHECK_PTR(check_sljit_emit_label(compiler));
   2532 
    2533 	/* We should restore the flags before the label,
    2534 	   since other jumps arriving here may have their own flags as well. */
   2535 	if (SLJIT_UNLIKELY(compiler->flags_saved))
   2536 		PTR_FAIL_IF(emit_restore_flags(compiler, 0));
   2537 
   2538 	if (compiler->last_label && compiler->last_label->size == compiler->size)
   2539 		return compiler->last_label;
   2540 
   2541 	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
   2542 	PTR_FAIL_IF(!label);
   2543 	set_label(label, compiler);
   2544 
   2545 	inst = (sljit_ub*)ensure_buf(compiler, 2);
   2546 	PTR_FAIL_IF(!inst);
   2547 
   2548 	*inst++ = 0;
   2549 	*inst++ = 0;
   2550 
   2551 	return label;
   2552 }
   2553 
   2554 SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_si type)
   2555 {
   2556 	sljit_ub *inst;
   2557 	struct sljit_jump *jump;
   2558 
   2559 	CHECK_ERROR_PTR();
   2560 	CHECK_PTR(check_sljit_emit_jump(compiler, type));
   2561 
   2562 	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
   2563 		if ((type & 0xff) <= SLJIT_JUMP)
   2564 			PTR_FAIL_IF(emit_restore_flags(compiler, 0));
   2565 		compiler->flags_saved = 0;
   2566 	}
   2567 
   2568 	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
   2569 	PTR_FAIL_IF_NULL(jump);
   2570 	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
   2571 	type &= 0xff;
   2572 
   2573 	if (type >= SLJIT_CALL1)
   2574 		PTR_FAIL_IF(call_with_args(compiler, type));
   2575 
   2576 	/* Worst case size. */
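         	/* x86-32: jmp rel32 is 5 bytes, jcc rel32 (two byte 0F 8x opcode) is 6.
         	   x86-64: mov r64, imm64 (10 bytes) plus an indirect jump through the
         	   register (up to 3 bytes with a REX prefix); for conditional jumps the
         	   extra 2 bytes presumably cover a short inverted jcc that skips the pair. */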
   2577 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   2578 	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
   2579 #else
   2580 	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
   2581 #endif
   2582 
   2583 	inst = (sljit_ub*)ensure_buf(compiler, 2);
   2584 	PTR_FAIL_IF_NULL(inst);
   2585 
   2586 	*inst++ = 0;
   2587 	*inst++ = type + 4;
   2588 	return jump;
   2589 }
   2590 
   2591 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compiler, sljit_si type, sljit_si src, sljit_sw srcw)
   2592 {
   2593 	sljit_ub *inst;
   2594 	struct sljit_jump *jump;
   2595 
   2596 	CHECK_ERROR();
   2597 	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
   2598 	ADJUST_LOCAL_OFFSET(src, srcw);
   2599 
   2600 	CHECK_EXTRA_REGS(src, srcw, (void)0);
   2601 
   2602 	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
   2603 		if (type <= SLJIT_JUMP)
   2604 			FAIL_IF(emit_restore_flags(compiler, 0));
   2605 		compiler->flags_saved = 0;
   2606 	}
   2607 
   2608 	if (type >= SLJIT_CALL1) {
   2609 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   2610 #if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
   2611 		if (src == SLJIT_R2) {
   2612 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
   2613 			src = TMP_REG1;
   2614 		}
   2615 		if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
   2616 			srcw += sizeof(sljit_sw);
   2617 #endif
   2618 #endif
   2619 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
   2620 		if (src == SLJIT_R2) {
   2621 			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
   2622 			src = TMP_REG1;
   2623 		}
   2624 #endif
   2625 		FAIL_IF(call_with_args(compiler, type));
   2626 	}
   2627 
   2628 	if (src == SLJIT_IMM) {
   2629 		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
   2630 		FAIL_IF_NULL(jump);
   2631 		set_jump(jump, compiler, JUMP_ADDR);
   2632 		jump->u.target = srcw;
   2633 
   2634 		/* Worst case size. */
   2635 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   2636 		compiler->size += 5;
   2637 #else
   2638 		compiler->size += 10 + 3;
   2639 #endif
   2640 
   2641 		inst = (sljit_ub*)ensure_buf(compiler, 2);
   2642 		FAIL_IF_NULL(inst);
   2643 
   2644 		*inst++ = 0;
   2645 		*inst++ = type + 4;
   2646 	}
   2647 	else {
   2648 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2649 		/* REX_W is not necessary (src is not immediate). */
   2650 		compiler->mode32 = 1;
   2651 #endif
   2652 		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
   2653 		FAIL_IF(!inst);
   2654 		*inst++ = GROUP_FF;
   2655 		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
   2656 	}
   2657 	return SLJIT_SUCCESS;
   2658 }
   2659 
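         /* Store a condition flag into an operand. setcc writes only an 8 bit
            register, and on x86-32 only a few registers (eax..ebx) have directly
            addressable low bytes, so other destinations go through an xchg with
            eax, or through cmov where available. */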
   2660 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_si op,
   2661 	sljit_si dst, sljit_sw dstw,
   2662 	sljit_si src, sljit_sw srcw,
   2663 	sljit_si type)
   2664 {
   2665 	sljit_ub *inst;
   2666 	sljit_ub cond_set = 0;
   2667 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2668 	sljit_si reg;
   2669 #else
    2670 	/* CHECK_EXTRA_REGS might overwrite these values. */
   2671 	sljit_si dst_save = dst;
   2672 	sljit_sw dstw_save = dstw;
   2673 #endif
   2674 
   2675 	CHECK_ERROR();
   2676 	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
   2677 	SLJIT_UNUSED_ARG(srcw);
   2678 
   2679 	if (dst == SLJIT_UNUSED)
   2680 		return SLJIT_SUCCESS;
   2681 
   2682 	ADJUST_LOCAL_OFFSET(dst, dstw);
   2683 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
   2684 	if (SLJIT_UNLIKELY(compiler->flags_saved))
   2685 		FAIL_IF(emit_restore_flags(compiler, op & SLJIT_KEEP_FLAGS));
   2686 
   2687 	type &= 0xff;
   2688 	/* setcc = jcc + 0x10. */
   2689 	cond_set = get_jump_code(type) + 0x10;
   2690 
   2691 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2692 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
   2693 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 3);
   2694 		FAIL_IF(!inst);
   2695 		INC_SIZE(4 + 3);
   2696 		/* Set low register to conditional flag. */
   2697 		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
   2698 		*inst++ = GROUP_0F;
   2699 		*inst++ = cond_set;
   2700 		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
   2701 		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
   2702 		*inst++ = OR_rm8_r8;
   2703 		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
   2704 		return SLJIT_SUCCESS;
   2705 	}
   2706 
   2707 	reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;
   2708 
   2709 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 4);
   2710 	FAIL_IF(!inst);
   2711 	INC_SIZE(4 + 4);
   2712 	/* Set low register to conditional flag. */
   2713 	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
   2714 	*inst++ = GROUP_0F;
   2715 	*inst++ = cond_set;
   2716 	*inst++ = MOD_REG | reg_lmap[reg];
   2717 	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
   2718 	*inst++ = GROUP_0F;
   2719 	*inst++ = MOVZX_r_rm8;
   2720 	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
   2721 
   2722 	if (reg != TMP_REG1)
   2723 		return SLJIT_SUCCESS;
   2724 
   2725 	if (GET_OPCODE(op) < SLJIT_ADD) {
   2726 		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
   2727 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
   2728 	}
   2729 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
   2730 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
   2731 	compiler->skip_checks = 1;
   2732 #endif
   2733 	return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
   2734 #else /* SLJIT_CONFIG_X86_64 */
   2735 	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
   2736 		if (reg_map[dst] <= 4) {
   2737 			/* Low byte is accessible. */
   2738 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
   2739 			FAIL_IF(!inst);
   2740 			INC_SIZE(3 + 3);
   2741 			/* Set low byte to conditional flag. */
   2742 			*inst++ = GROUP_0F;
   2743 			*inst++ = cond_set;
   2744 			*inst++ = MOD_REG | reg_map[dst];
   2745 
   2746 			*inst++ = GROUP_0F;
   2747 			*inst++ = MOVZX_r_rm8;
   2748 			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
   2749 			return SLJIT_SUCCESS;
   2750 		}
   2751 
   2752 		/* Low byte is not accessible. */
   2753 		if (cpu_has_cmov == -1)
   2754 			get_cpu_features();
   2755 
   2756 		if (cpu_has_cmov) {
   2757 			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
    2758 			/* A xor reg, reg operation would overwrite the flags here. */
   2759 			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
   2760 
   2761 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 3);
   2762 			FAIL_IF(!inst);
   2763 			INC_SIZE(3);
   2764 
   2765 			*inst++ = GROUP_0F;
   2766 			/* cmovcc = setcc - 0x50. */
   2767 			*inst++ = cond_set - 0x50;
   2768 			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
   2769 			return SLJIT_SUCCESS;
   2770 		}
   2771 
   2772 		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
   2773 		FAIL_IF(!inst);
   2774 		INC_SIZE(1 + 3 + 3 + 1);
   2775 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2776 		/* Set al to conditional flag. */
   2777 		*inst++ = GROUP_0F;
   2778 		*inst++ = cond_set;
   2779 		*inst++ = MOD_REG | 0 /* eax */;
   2780 
   2781 		*inst++ = GROUP_0F;
   2782 		*inst++ = MOVZX_r_rm8;
   2783 		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
   2784 		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2785 		return SLJIT_SUCCESS;
   2786 	}
   2787 
   2788 	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
   2789 		SLJIT_COMPILE_ASSERT(reg_map[SLJIT_R0] == 0, scratch_reg1_must_be_eax);
   2790 		if (dst != SLJIT_R0) {
   2791 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
   2792 			FAIL_IF(!inst);
   2793 			INC_SIZE(1 + 3 + 2 + 1);
   2794 			/* Set low register to conditional flag. */
   2795 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2796 			*inst++ = GROUP_0F;
   2797 			*inst++ = cond_set;
   2798 			*inst++ = MOD_REG | 0 /* eax */;
   2799 			*inst++ = OR_rm8_r8;
   2800 			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
   2801 			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2802 		}
   2803 		else {
   2804 			inst = (sljit_ub*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
   2805 			FAIL_IF(!inst);
   2806 			INC_SIZE(2 + 3 + 2 + 2);
   2807 			/* Set low register to conditional flag. */
   2808 			*inst++ = XCHG_r_rm;
   2809 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
   2810 			*inst++ = GROUP_0F;
   2811 			*inst++ = cond_set;
   2812 			*inst++ = MOD_REG | 1 /* ecx */;
   2813 			*inst++ = OR_rm8_r8;
   2814 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
   2815 			*inst++ = XCHG_r_rm;
   2816 			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
   2817 		}
   2818 		return SLJIT_SUCCESS;
   2819 	}
   2820 
   2821 	/* Set TMP_REG1 to the bit. */
   2822 	inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
   2823 	FAIL_IF(!inst);
   2824 	INC_SIZE(1 + 3 + 3 + 1);
   2825 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2826 	/* Set al to conditional flag. */
   2827 	*inst++ = GROUP_0F;
   2828 	*inst++ = cond_set;
   2829 	*inst++ = MOD_REG | 0 /* eax */;
   2830 
   2831 	*inst++ = GROUP_0F;
   2832 	*inst++ = MOVZX_r_rm8;
   2833 	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
   2834 
   2835 	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
   2836 
   2837 	if (GET_OPCODE(op) < SLJIT_ADD)
   2838 		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
   2839 
   2840 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
   2841 		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
   2842 	compiler->skip_checks = 1;
   2843 #endif
   2844 	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
   2845 #endif /* SLJIT_CONFIG_X86_64 */
   2846 }
   2847 
   2848 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_local_base(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw offset)
   2849 {
   2850 	CHECK_ERROR();
   2851 	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
   2852 	ADJUST_LOCAL_OFFSET(dst, dstw);
   2853 
   2854 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
   2855 
   2856 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2857 	compiler->mode32 = 0;
   2858 #endif
   2859 
   2860 	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
   2861 
   2862 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2863 	if (NOT_HALFWORD(offset)) {
   2864 		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
   2865 #if (defined SLJIT_DEBUG && SLJIT_DEBUG)
   2866 		SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
   2867 		return compiler->error;
   2868 #else
   2869 		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
   2870 #endif
   2871 	}
   2872 #endif
   2873 
   2874 	if (offset != 0)
   2875 		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
   2876 	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
   2877 }
   2878 
   2879 SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
   2880 {
   2881 	sljit_ub *inst;
   2882 	struct sljit_const *const_;
   2883 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2884 	sljit_si reg;
   2885 #endif
   2886 
   2887 	CHECK_ERROR_PTR();
   2888 	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
   2889 	ADJUST_LOCAL_OFFSET(dst, dstw);
   2890 
   2891 	CHECK_EXTRA_REGS(dst, dstw, (void)0);
   2892 
   2893 	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
   2894 	PTR_FAIL_IF(!const_);
   2895 	set_const(const_, compiler);
   2896 
   2897 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2898 	compiler->mode32 = 0;
   2899 	reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;
   2900 
   2901 	if (emit_load_imm64(compiler, reg, init_value))
   2902 		return NULL;
   2903 #else
   2904 	if (dst == SLJIT_UNUSED)
   2905 		dst = TMP_REG1;
   2906 
   2907 	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
   2908 		return NULL;
   2909 #endif
   2910 
   2911 	inst = (sljit_ub*)ensure_buf(compiler, 2);
   2912 	PTR_FAIL_IF(!inst);
   2913 
   2914 	*inst++ = 0;
   2915 	*inst++ = 1;
   2916 
   2917 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2918 	if (dst & SLJIT_MEM)
   2919 		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
   2920 			return NULL;
   2921 #endif
   2922 
   2923 	return const_;
   2924 }
   2925 
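         /* Patch the target of an emitted jump. On x86-32 the field holds a
            rel32 displacement relative to the end of the 4 byte field; on
            x86-64 it is a 64 bit absolute address loaded by mov r64, imm64. */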
   2926 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
   2927 {
   2928 #if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
   2929 	*(sljit_sw*)addr = new_addr - (addr + 4);
   2930 #else
   2931 	*(sljit_uw*)addr = new_addr;
   2932 #endif
   2933 }
   2934 
   2935 SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
   2936 {
   2937 	*(sljit_sw*)addr = new_constant;
   2938 }
   2939 
   2940 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_sse2_available(void)
   2941 {
   2942 #if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
   2943 	if (cpu_has_sse2 == -1)
   2944 		get_cpu_features();
   2945 	return cpu_has_sse2;
   2946 #else
   2947 	return 1;
   2948 #endif
   2949 }
   2950 
   2951 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_is_cmov_available(void)
   2952 {
   2953 	if (cpu_has_cmov == -1)
   2954 		get_cpu_features();
   2955 	return cpu_has_cmov;
   2956 }
   2957 
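         /* Conditional move extension. cmovcc is encoded as 0F 4x, where x is
            the condition code, so it is derived from the matching jcc (0F 8x)
            by subtracting 0x40. cmov has no immediate form, hence immediates
            are loaded into TMP_REG1 first. */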
   2958 SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_x86_emit_cmov(struct sljit_compiler *compiler,
   2959 	sljit_si type,
   2960 	sljit_si dst_reg,
   2961 	sljit_si src, sljit_sw srcw)
   2962 {
   2963 	sljit_ub* inst;
   2964 
   2965 	CHECK_ERROR();
   2966 #if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
   2967 	CHECK_ARGUMENT(sljit_x86_is_cmov_available());
   2968 	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_INT_OP)));
   2969 	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_D_ORDERED);
   2970 	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_INT_OP));
   2971 	FUNCTION_CHECK_SRC(src, srcw);
   2972 #endif
   2973 #if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
   2974 	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
   2975 		fprintf(compiler->verbose, "  x86_cmov%s %s%s, ",
   2976 			!(dst_reg & SLJIT_INT_OP) ? "" : ".i",
   2977 			JUMP_PREFIX(type), jump_names[type & 0xff]);
   2978 		sljit_verbose_reg(compiler, dst_reg & ~SLJIT_INT_OP);
   2979 		fprintf(compiler->verbose, ", ");
   2980 		sljit_verbose_param(compiler, src, srcw);
   2981 		fprintf(compiler->verbose, "\n");
   2982 	}
   2983 #endif
   2984 
   2985 	ADJUST_LOCAL_OFFSET(src, srcw);
   2986 	CHECK_EXTRA_REGS(src, srcw, (void)0);
   2987 
   2988 #if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
   2989 	compiler->mode32 = dst_reg & SLJIT_INT_OP;
   2990 #endif
   2991 	dst_reg &= ~SLJIT_INT_OP;
   2992 
   2993 	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
   2994 		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
   2995 		src = TMP_REG1;
   2996 		srcw = 0;
   2997 	}
   2998 
   2999 	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
   3000 	FAIL_IF(!inst);
   3001 	*inst++ = GROUP_0F;
   3002 	*inst = get_jump_code(type & 0xff) - 0x40;
   3003 	return SLJIT_SUCCESS;
   3004 }
   3005