/*
 * IR - Lightweight JIT Compilation Framework
 * (x86/x86_64 native code generator based on DynAsm)
 * Copyright (C) 2022 Zend by Perforce.
 * Authors: Dmitry Stogov <dmitry@php.net>
 */

/* Select the DynASM target architecture: x64 when X64 is set, x86 otherwise. */
|.if X64
|.arch x64
|.else
|.arch x86
|.endif

/* Names of the generated action list and global-label prefix, and the
 * sections that the emitter can switch between. */
|.actionlist dasm_actions
|.globals ir_lb
|.section code, cold_code, rodata, jmp_table

/* DynASM-level constant (16); its users are outside this part of the file. */
|.define IR_LOOP_ALIGNMENT, 16

/*
 * ir_mem - a memory operand packed into 64 bits.
 *
 * In IR_DEBUG builds the value is wrapped in a struct so that it is a
 * distinct type from plain integers; otherwise it is a bare uint64_t.
 * IR_MEM_VAL() yields the raw 64-bit value in both configurations.
 */
#ifdef IR_DEBUG
typedef struct _ir_mem {uint64_t v;} ir_mem;

# define IR_MEM_VAL(loc)            ((loc).v)
#else
typedef uint64_t ir_mem;

# define IR_MEM_VAL(loc)            (loc)
#endif

/*
 * Field accessors for a packed ir_mem value:
 *   bits  0..31  signed 32-bit displacement
 *   bits 32..39  base register  (8 bits, converted back to ir_reg)
 *   bits 40..47  index register (8 bits, converted back to ir_reg)
 *   bits 48..55  scale factor
 */
#define IR_MEM_OFFSET(loc)          ((int32_t)(IR_MEM_VAL(loc) & 0xffffffff))
#define IR_MEM_BASE(loc)            ((ir_reg)((IR_MEM_VAL(loc) >> 32) & 0xff))
#define IR_MEM_INDEX(loc)           ((ir_reg)((IR_MEM_VAL(loc) >> 40) & 0xff))
#define IR_MEM_SCALE(loc)           ((int32_t)((IR_MEM_VAL(loc) >> 48) & 0xff))

/* Shorthand constructors: O = offset only, B = base only, BO = base + offset.
 * All of them pass IR_REG_NONE as index and 1 as scale. */
#define IR_MEM_O(addr)            IR_MEM(IR_REG_NONE, addr, IR_REG_NONE, 1)
#define IR_MEM_B(base)            IR_MEM(base, 0, IR_REG_NONE, 1)
#define IR_MEM_BO(base, offset)   IR_MEM(base, offset, IR_REG_NONE, 1)

/*
 * Pack a memory operand into an ir_mem value.
 *
 * base/index: a general-purpose register or IR_REG_NONE.
 * offset:     32-bit displacement, stored in the low 32 bits.
 * scale:      must be 1, 2, 4 or 8.
 *
 * Each register is narrowed to 8 bits before packing; the IR_MEM_BASE() and
 * IR_MEM_INDEX() accessors convert those 8 bits back to ir_reg.
 */
IR_ALWAYS_INLINE ir_mem IR_MEM(ir_reg base, int32_t offset, ir_reg index, int32_t scale)
{
	ir_mem mem;
	uint64_t packed;

	IR_ASSERT(base == IR_REG_NONE || (base >= IR_REG_GP_FIRST && base <= IR_REG_GP_LAST));
	IR_ASSERT(index == IR_REG_NONE || (index >= IR_REG_GP_FIRST && index <= IR_REG_GP_LAST));
	IR_ASSERT(scale == 1 || scale == 2 || scale == 4 || scale == 8);

	packed  = (uint64_t)(uint32_t)offset;
	packed |= (uint64_t)(uint8_t)base << 32;
	packed |= (uint64_t)(uint8_t)index << 40;
	packed |= (uint64_t)(uint8_t)scale << 48;

#ifdef IR_DEBUG
	mem.v = packed;
#else
	mem = packed;
#endif
	return mem;
}

/*
 * Range predicates used when deciding whether a value can be encoded as a
 * 32-bit immediate or displacement.
 *
 * IR_IS_SIGNED_32BIT      - val lies in [INT32_MIN, INT32_MAX].
 * IR_IS_SIGNED_NEG_32BIT  - same, but excludes INT32_MIN (lower bound is
 *                           -2147483647).
 * IR_IS_UNSIGNED_32BIT    - val, viewed as uintptr_t, is <= 0xffffffff.
 * IR_IS_32BIT             - picks the signed or unsigned test based on type;
 *                           val is an object with .i64 / .u64 members.
 * IR_IS_FP_ZERO           - all value bits are zero: tests .val.u64 for
 *                           IR_DOUBLE and .val.u32 otherwise. The argument is
 *                           parenthesized so that expressions such as *ptr
 *                           expand correctly.
 */
#define IR_IS_SIGNED_32BIT(val)     ((((intptr_t)(val)) <= 0x7fffffff) && (((intptr_t)(val)) >= (-2147483647 - 1)))
#define IR_IS_SIGNED_NEG_32BIT(val) ((((intptr_t)(val)) <= 0x7fffffff) && (((intptr_t)(val)) >= -2147483647))
#define IR_IS_UNSIGNED_32BIT(val)   (((uintptr_t)(val)) <= 0xffffffff)
#define IR_IS_32BIT(type, val)      (IR_IS_TYPE_SIGNED(type) ? IR_IS_SIGNED_32BIT((val).i64) : IR_IS_UNSIGNED_32BIT((val).u64))
#define IR_IS_FP_ZERO(insn)         (((insn).type == IR_DOUBLE) ? ((insn).val.u64 == 0) : ((insn).val.u32 == 0))
/*
 * True when code_buffer is non-NULL and the signed distance from addr to both
 * the start and the end of the code buffer fits in 32 bits.
 */
#define IR_MAY_USE_32BIT_ADDR(code_buffer, addr) \
	((code_buffer) && \
	IR_IS_SIGNED_32BIT((char*)(addr) - (char*)(code_buffer)->start) && \
	IR_IS_SIGNED_32BIT((char*)(addr) - ((char*)(code_buffer)->end)))

/*
 * Convert a spill position into a displacement. Requires a variable named
 * `ctx` in the expanding scope.
 *   - With IR_USE_FRAME_POINTER set: offset - (stack_frame_size - stack_frame_alignment)
 *   - Otherwise:                     offset + call_stack_size
 */
#define IR_SPILL_POS_TO_OFFSET(offset) \
	((ctx->flags & IR_USE_FRAME_POINTER) ? \
		((offset) - (ctx->stack_frame_size - ctx->stack_frame_alignment)) : \
		((offset) + ctx->call_stack_size))

/*
 * ASM_EXPAND_OP_MEM MACRO, op, type, op1
 *
 * Unpacks the ir_mem operand op1 into offset/base/index/scale and invokes
 * MACRO as "MACRO op, type, <address>", where <address> is the addressing
 * expression matching the fields that are present: [offset], [base+offset],
 * [index*scale+offset] or [base+index*scale+offset]. There is one DynASM
 * statement per supported scale (8, 4, 2, and 1 as the asserted fallback).
 */
|.macro ASM_EXPAND_OP_MEM, MACRO, op, type, op1
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [offset]
||			} else {
|				MACRO op, type, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)+offset]
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

/*
 * ASM_EXPAND_OP1_MEM MACRO, op, type, op1, op2
 *
 * Same address expansion as ASM_EXPAND_OP_MEM, for two-operand forms whose
 * FIRST operand (op1) is the ir_mem value. Invokes
 * "MACRO op, type, <address>, op2"; op2 is forwarded untouched.
 */
|.macro ASM_EXPAND_OP1_MEM, MACRO, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+offset], op2
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*8+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*8+offset], op2
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*4+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*4+offset], op2
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)*2+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)*2+offset], op2
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, [Ra(index)+offset], op2
||			} else {
|				MACRO op, type, [Ra(base)+Ra(index)+offset], op2
||			}
||		}
||	} while (0);
|.endmacro

/*
 * ASM_EXPAND_OP2_MEM MACRO, op, type, op1, op2
 *
 * Address expansion for two-operand forms whose SECOND operand (op2) is the
 * ir_mem value. Invokes "MACRO op, type, op1, <address>"; op1 is forwarded
 * untouched.
 */
|.macro ASM_EXPAND_OP2_MEM, MACRO, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)+offset]
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

/*
 * ASM_EXPAND_OP2_MEM_3 MACRO, op, type, op1, op2, op3
 *
 * Address expansion for three-operand forms whose MIDDLE operand (op2) is the
 * ir_mem value. Invokes "MACRO op, type, op1, <address>, op3".
 */
|.macro ASM_EXPAND_OP2_MEM_3, MACRO, op, type, op1, op2, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+offset], op3
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*8+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*8+offset], op3
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*4+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*4+offset], op3
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)*2+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)*2+offset], op3
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, [Ra(index)+offset], op3
||			} else {
|				MACRO op, type, op1, [Ra(base)+Ra(index)+offset], op3
||			}
||		}
||	} while (0);
|.endmacro

/*
 * ASM_EXPAND_OP3_MEM MACRO, op, type, op1, op2, op3
 *
 * Address expansion for three-operand forms whose LAST operand (op3) is the
 * ir_mem value. Invokes "MACRO op, type, op1, op2, <address>".
 */
|.macro ASM_EXPAND_OP3_MEM, MACRO, op, type, op1, op2, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op3);
||		int32_t base = IR_MEM_BASE(op3);
||		int32_t index = IR_MEM_INDEX(op3);
||		int32_t scale = IR_MEM_SCALE(op3);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*8+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*4+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)*2+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				MACRO op, type, op1, op2, [Ra(index)+offset]
||			} else {
|				MACRO op, type, op1, op2, [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

/*
 * ASM_EXPAND_TYPE_MEM op, type, op1
 *
 * Emits "op <size> op1" where <size> (byte/word/dword/qword) is chosen from
 * ir_type_size[type]. op1 is an already-formed address expression.
 * The default label asserts and has no break, so it continues into case 1.
 * The 8-byte case exists only in X64 builds.
 */
|.macro ASM_EXPAND_TYPE_MEM, op, type, op1
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1
||			break;
||		case 2:
|			op word op1
|| 			break;
||		case 4:
|			op dword op1
|| 			break;
|.if X64
||		case 8:
|			op qword op1
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_EXPAND_TYPE_MEM_REG op, type, op1, op2
 *
 * Emits "op <size> op1, R?(op2)": memory destination op1 (address
 * expression) and register source op2, both sized by ir_type_size[type]
 * (Rb/Rw/Rd/Rq). The default label asserts and continues into case 1;
 * the 8-byte case exists only in X64 builds.
 */
|.macro ASM_EXPAND_TYPE_MEM_REG, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, Rb(op2)
||			break;
||		case 2:
|			op word op1, Rw(op2)
|| 			break;
||		case 4:
|			op dword op1, Rd(op2)
|| 			break;
|.if X64
||		case 8:
|			op qword op1, Rq(op2)
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_EXPAND_TYPE_MEM_TXT op, type, op1, op2
 *
 * Emits "op <size> op1, op2" where op2 is passed through verbatim as
 * assembler text. Size selection and the assert-then-continue default are
 * the same as in ASM_EXPAND_TYPE_MEM.
 */
|.macro ASM_EXPAND_TYPE_MEM_TXT, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, op2
||			break;
||		case 2:
|			op word op1, op2
|| 			break;
||		case 4:
|			op dword op1, op2
|| 			break;
|.if X64
||		case 8:
|			op qword op1, op2
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_EXPAND_TYPE_MEM_IMM op, type, op1, op2
 *
 * Emits "op <size> op1, <imm>" with immediate op2. For 1- and 2-byte types
 * the immediate is masked to 0xff / 0xffff; for 4- and 8-byte types it is
 * passed unchanged. The default label asserts and continues into case 1;
 * the 8-byte case exists only in X64 builds.
 */
|.macro ASM_EXPAND_TYPE_MEM_IMM, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op byte op1, (op2 & 0xff)
||			break;
||		case 2:
|			op word op1, (op2 & 0xffff)
|| 			break;
||		case 4:
|			op dword op1, op2
|| 			break;
|.if X64
||		case 8:
|			op qword op1, op2
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_EXPAND_TYPE_REG_MEM op, type, op1, op2
 *
 * Emits "op R?(op1), <size> op2": register destination op1 and memory
 * source op2 (address expression), both sized by ir_type_size[type].
 * The default label asserts and continues into case 1; the 8-byte case
 * exists only in X64 builds.
 */
|.macro ASM_EXPAND_TYPE_REG_MEM, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), byte op2
||			break;
||		case 2:
|			op Rw(op1), word op2
|| 			break;
||		case 4:
|			op Rd(op1), dword op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), qword op2
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_TMEM_OP op, type, op1
 *
 * Single-operand instruction on an ir_mem operand. Unlike the ASM_EXPAND_*
 * macros, `type` here is pasted directly in front of the address as
 * assembler text ("op type [address]"); it is not used to index
 * ir_type_size.
 */
|.macro ASM_TMEM_OP, op, type, op1
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op type [offset]
||			} else {
|				op type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*8+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*4+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*2+offset]
||			} else {
|				op type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)+offset]
||			} else {
|				op type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

/*
 * ASM_TXT_TMEM_OP op, op1, type, op2
 *
 * Emits "op op1, type [address]": op1 is verbatim assembler text, op2 is the
 * ir_mem operand, and `type` is pasted as text in front of the address.
 */
|.macro ASM_TXT_TMEM_OP, op, op1, type, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op2);
||		int32_t base = IR_MEM_BASE(op2);
||		int32_t index = IR_MEM_INDEX(op2);
||		int32_t scale = IR_MEM_SCALE(op2);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op op1, type [offset]
||			} else {
|				op op1, type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*8+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*4+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)*2+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op op1, type [Ra(index)+offset]
||			} else {
|				op op1, type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

/*
 * ASM_TMEM_TXT_OP op, type, op1, op2
 *
 * Emits "op type [address], op2": op1 is the ir_mem operand, `type` is pasted
 * as text in front of the address, and op2 is verbatim assembler text.
 */
|.macro ASM_TMEM_TXT_OP, op, type, op1, op2
||	do {
||		int32_t offset = IR_MEM_OFFSET(op1);
||		int32_t base = IR_MEM_BASE(op1);
||		int32_t index = IR_MEM_INDEX(op1);
||		int32_t scale = IR_MEM_SCALE(op1);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op type [offset], op2
||			} else {
|				op type [Ra(base)+offset], op2
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*8+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*8+offset], op2
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*4+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*4+offset], op2
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)*2+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)*2+offset], op2
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op type [Ra(index)+offset], op2
||			} else {
|				op type [Ra(base)+Ra(index)+offset], op2
||			}
||		}
||	} while (0);
|.endmacro

/*
 * ASM_TXT_TXT_TMEM_OP op, op1, op2, type, op3
 *
 * Emits "op op1, op2, type [address]": op1 and op2 are verbatim assembler
 * text, op3 is the ir_mem operand, and `type` is pasted as text in front of
 * the address.
 */
|.macro ASM_TXT_TXT_TMEM_OP, op, op1, op2, type, op3
||	do {
||		int32_t offset = IR_MEM_OFFSET(op3);
||		int32_t base = IR_MEM_BASE(op3);
||		int32_t index = IR_MEM_INDEX(op3);
||		int32_t scale = IR_MEM_SCALE(op3);
||  	if (index == IR_REG_NONE) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [offset]
||			} else {
|				op op1, op2, type [Ra(base)+offset]
||			}
||		} else if (scale == 8) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*8+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*8+offset]
||			}
||		} else if (scale == 4) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*4+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*4+offset]
||			}
||		} else if (scale == 2) {
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)*2+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)*2+offset]
||			}
||		} else {
||			IR_ASSERT(scale == 1);
||			if (base == IR_REG_NONE) {
|				op op1, op2, type [Ra(index)+offset]
||			} else {
|				op op1, op2, type [Ra(base)+Ra(index)+offset]
||			}
||		}
||	} while (0);
|.endmacro

/*
 * ASM_REG_OP op, type, op1
 *
 * Single-register instruction: "op R?(op1)" with the register width
 * (Rb/Rw/Rd/Rq) selected by ir_type_size[type]. The default label asserts
 * and continues into case 1; the 8-byte case exists only in X64 builds.
 */
|.macro ASM_REG_OP, op, type, op1
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1)
||			break;
||		case 2:
|			op Rw(op1)
|| 			break;
||		case 4:
|			op Rd(op1)
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1)
|| 			break;
|.endif
||	}
|.endmacro

/* ASM_MEM_OP op, type, op1: single-operand instruction on ir_mem operand op1,
 * sized by type (address expansion + ASM_EXPAND_TYPE_MEM). */
|.macro ASM_MEM_OP, op, type, op1
|	ASM_EXPAND_OP_MEM ASM_EXPAND_TYPE_MEM, op, type, op1
|.endmacro

/*
 * ASM_REG_REG_OP op, type, op1, op2
 *
 * Register-to-register instruction: "op R?(op1), R?(op2)", both registers
 * using the width selected by ir_type_size[type]. The default label asserts
 * and continues into case 1; the 8-byte case exists only in X64 builds.
 */
|.macro ASM_REG_REG_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), Rb(op2)
||			break;
||		case 2:
|			op Rw(op1), Rw(op2)
|| 			break;
||		case 4:
|			op Rd(op1), Rd(op2)
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_REG_REG_OP2 op, type, op1, op2
 *
 * Like ASM_REG_REG_OP, but 1-byte types share the 16-bit (Rw) form with
 * 2-byte types; no 8-bit register encoding is emitted.
 */
|.macro ASM_REG_REG_OP2, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
||		case 2:
|			op Rw(op1), Rw(op2)
|| 			break;
||		case 4:
|			op Rd(op1), Rd(op2)
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_REG_TXT_OP op, type, op1, op2
 *
 * Emits "op R?(op1), op2": register op1 sized by ir_type_size[type], op2
 * forwarded verbatim as assembler text. The default label asserts and
 * continues into case 1; the 8-byte case exists only in X64 builds.
 */
|.macro ASM_REG_TXT_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), op2
||			break;
||		case 2:
|			op Rw(op1), op2
|| 			break;
||		case 4:
|			op Rd(op1), op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_REG_IMM_OP op, type, op1, op2
 *
 * Emits "op R?(op1), <imm>" with immediate op2. The immediate is masked to
 * 0xff / 0xffff for 1- and 2-byte types and passed unchanged otherwise.
 * The default label asserts and continues into case 1; the 8-byte case
 * exists only in X64 builds.
 */
|.macro ASM_REG_IMM_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
|			op Rb(op1), (op2 & 0xff)
||			break;
||		case 2:
|			op Rw(op1), (op2 & 0xffff)
|| 			break;
||		case 4:
|			op Rd(op1), op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_REG16_IMM_OP op, type, op1, op2
 *
 * Like ASM_REG_IMM_OP, but op1 accepts r16,r32,r64 (not r8): 1-byte types
 * share the 16-bit form, with the immediate masked to 0xffff.
 */
|.macro ASM_REG16_IMM_OP, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 1:
||		case 2:
|			op Rw(op1), (op2 & 0xffff)
|| 			break;
||		case 4:
|			op Rd(op1), op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
|| 			break;
|.endif
||	}
|.endmacro

/* ASM_MEM_REG_OP op, type, op1, op2: "op [mem], reg" with op1 an ir_mem
 * operand and op2 a register, both sized by type. */
|.macro ASM_MEM_REG_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_REG, op, type, op1, op2
|.endmacro

/* ASM_MEM_TXT_OP op, type, op1, op2: "op [mem], op2" with op1 an ir_mem
 * operand sized by type and op2 verbatim assembler text. */
|.macro ASM_MEM_TXT_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_TXT, op, type, op1, op2
|.endmacro

/* ASM_MEM_IMM_OP op, type, op1, op2: "op [mem], imm" with op1 an ir_mem
 * operand sized by type and op2 an immediate (masked for 1/2-byte types). */
|.macro ASM_MEM_IMM_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_EXPAND_TYPE_MEM_IMM, op, type, op1, op2
|.endmacro

/* ASM_REG_MEM_OP op, type, op1, op2: "op reg, [mem]" with op1 a register
 * sized by type and op2 an ir_mem operand. The address is passed to
 * ASM_REG_TXT_OP as text, so no explicit size keyword is attached to it. */
|.macro ASM_REG_MEM_OP, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_REG_TXT_OP, op, type, op1, op2
|.endmacro

/*
 * ASM_REG_REG_MUL op, type, op1, op2
 *
 * Register-to-register form for multiply-style instructions. Only 2-, 4- and
 * (X64) 8-byte widths are provided; there is no 1-byte case. Any other size
 * reaches the default label, which asserts and then continues into the
 * 16-bit form.
 */
|.macro ASM_REG_REG_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), Rw(op2)
|| 			break;
||		case 4:
|			op Rd(op1), Rd(op2)
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), Rq(op2)
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_REG_IMM_MUL op, type, op1, op2
 *
 * "op R?(op1), op2" for multiply-style instructions with an immediate.
 * Widths: 2, 4 and (X64) 8 bytes; the immediate is not masked. Any other
 * size asserts and continues into the 16-bit form.
 */
|.macro ASM_REG_IMM_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2
|| 			break;
||		case 4:
|			op Rd(op1), op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_REG_TXT_MUL op, type, op1, op2
 *
 * "op R?(op1), op2" for multiply-style instructions, with op2 forwarded as
 * verbatim assembler text. Widths: 2, 4 and (X64) 8 bytes. Any other size
 * asserts and continues into the 16-bit form.
 */
|.macro ASM_REG_TXT_MUL, op, type, op1, op2
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2
|| 			break;
||		case 4:
|			op Rd(op1), op2
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2
|| 			break;
|.endif
||	}
|.endmacro

/* ASM_REG_MEM_MUL op, type, op1, op2: multiply-style "op reg, [mem]" with
 * op1 a register sized by type and op2 an ir_mem operand. */
|.macro ASM_REG_MEM_MUL, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_REG_TXT_MUL, op, type, op1, op2
|.endmacro

/*
 * ASM_REG_TXT_TXT_MUL op, type, op1, op2, op3
 *
 * Three-operand multiply-style form: "op R?(op1), op2, op3" with op2 and op3
 * forwarded as verbatim assembler text. Widths: 2, 4 and (X64) 8 bytes. Any
 * other size asserts and continues into the 16-bit form.
 */
|.macro ASM_REG_TXT_TXT_MUL, op, type, op1, op2, op3
||	switch (ir_type_size[type]) {
|| 		default:
||			IR_ASSERT(0);
||		case 2:
|			op Rw(op1), op2, op3
|| 			break;
||		case 4:
|			op Rd(op1), op2, op3
|| 			break;
|.if X64
||		case 8:
|			op Rq(op1), op2, op3
|| 			break;
|.endif
||	}
|.endmacro

/*
 * ASM_REG_MEM_TXT_MUL op, type, op1, op2, op3
 *
 * Three-operand multiply-style form "op reg, [mem], op3": op1 is a register
 * sized by type, op2 is an ir_mem operand and op3 is verbatim assembler text.
 * The mnemonic given by the caller is forwarded rather than being fixed to
 * imul, so the macro honours its own first parameter.
 */
|.macro ASM_REG_MEM_TXT_MUL, op, type, op1, op2, op3
|	ASM_EXPAND_OP2_MEM_3 ASM_REG_TXT_TXT_MUL, op, type, op1, op2, op3
|.endmacro

/*
 * ASM_SSE2_REG_REG_OP op, type, op1, op2
 *
 * Scalar FP register-to-register instruction. The mnemonic is op with a "d"
 * suffix for IR_DOUBLE and an "s" suffix for IR_FLOAT (asserted). Register
 * numbers are rebased by IR_REG_FP_FIRST to obtain the xmm index.
 */
|.macro ASM_SSE2_REG_REG_OP, op, type, op1, op2
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST)
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST)
||	}
|.endmacro

/*
 * ASM_SSE2_REG_TXT_OP op, type, op1, op2
 *
 * Scalar FP instruction with xmm destination op1 and a memory source given as
 * assembler text op2; the source is tagged qword for IR_DOUBLE and dword for
 * IR_FLOAT (asserted).
 */
|.macro ASM_SSE2_REG_TXT_OP, op, type, op1, op2
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), qword op2
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), dword op2
||	}
|.endmacro

/* ASM_SSE2_REG_MEM_OP op, type, op1, op2: scalar FP "op xmm, [mem]" with op2
 * an ir_mem operand. */
|.macro ASM_SSE2_REG_MEM_OP, op, type, op1, op2
|	ASM_EXPAND_OP2_MEM ASM_SSE2_REG_TXT_OP, op, type, op1, op2
|.endmacro

/*
 * ASM_AVX_REG_REG_REG_OP op, type, op1, op2, op3
 *
 * Three-register scalar FP instruction: "op{d|s} xmm(op1), xmm(op2),
 * xmm(op3)", suffix chosen by type (IR_DOUBLE or asserted IR_FLOAT).
 */
|.macro ASM_AVX_REG_REG_REG_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST)
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST)
||	}
|.endmacro

/*
 * ASM_AVX_REG_REG_TXT_OP op, type, op1, op2, op3
 *
 * Three-operand scalar FP instruction with two xmm registers and a memory
 * source given as assembler text op3 (tagged qword for IR_DOUBLE, dword for
 * IR_FLOAT).
 */
|.macro ASM_AVX_REG_REG_TXT_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), qword op3
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), dword op3
||	}
|.endmacro

/* ASM_AVX_REG_REG_MEM_OP op, type, op1, op2, op3: three-operand scalar FP
 * "op xmm, xmm, [mem]" with op3 an ir_mem operand. */
|.macro ASM_AVX_REG_REG_MEM_OP, op, type, op1, op2, op3
|	ASM_EXPAND_OP3_MEM ASM_AVX_REG_REG_TXT_OP, op, type, op1, op2, op3
|.endmacro

/*
 * ASM_FP_REG_REG_OP op, type, op1, op2
 *
 * Two-register scalar FP instruction that picks the encoding at run time:
 * the "v"-prefixed mnemonic when ctx->mflags has IR_X86_AVX, the plain one
 * otherwise. Requires `ctx` in the expanding scope.
 */
|.macro ASM_FP_REG_REG_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_REG_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_REG_OP op, type, op1, op2
||	}
|.endmacro

/*
 * ASM_FP_TXT_REG_OP op, type, dst, src
 *
 * Scalar FP instruction with a memory destination given as assembler text
 * (dst, tagged qword for IR_DOUBLE / dword for IR_FLOAT) and an xmm source
 * register. Uses the "v"-prefixed mnemonic when ctx->mflags has IR_X86_AVX.
 */
|.macro ASM_FP_TXT_REG_OP, op, type, dst, src
||	if (type == IR_DOUBLE) {
||		if (ctx->mflags & IR_X86_AVX) {
|			v..op..d qword dst, xmm(src-IR_REG_FP_FIRST)
||		} else {
|			op..d qword dst, xmm(src-IR_REG_FP_FIRST)
||		}
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
||		if (ctx->mflags & IR_X86_AVX) {
|			v..op..s dword dst, xmm(src-IR_REG_FP_FIRST)
||		} else {
|			op..s dword dst, xmm(src-IR_REG_FP_FIRST)
||		}
||	}
|.endmacro

/* ASM_FP_MEM_REG_OP op, type, op1, op2: scalar FP "op [mem], xmm" with op1 an
 * ir_mem operand and op2 an FP register. */
|.macro ASM_FP_MEM_REG_OP, op, type, op1, op2
|	ASM_EXPAND_OP1_MEM ASM_FP_TXT_REG_OP, op, type, op1, op2
|.endmacro

/*
 * ASM_FP_REG_TXT_OP op, type, op1, op2
 *
 * Scalar FP "op xmm, <mem text>" that uses the "v"-prefixed mnemonic when
 * ctx->mflags has IR_X86_AVX and the plain one otherwise.
 */
|.macro ASM_FP_REG_TXT_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_TXT_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_TXT_OP op, type, op1, op2
||	}
|.endmacro

/*
 * ASM_FP_REG_MEM_OP op, type, op1, op2
 *
 * Scalar FP "op xmm, [mem]" with op2 an ir_mem operand; uses the
 * "v"-prefixed mnemonic when ctx->mflags has IR_X86_AVX.
 */
|.macro ASM_FP_REG_MEM_OP, op, type, op1, op2
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_MEM_OP v..op, type, op1, op2
||	} else {
|		ASM_SSE2_REG_MEM_OP op, type, op1, op2
||	}
|.endmacro

/*
 * ASM_SSE2_REG_REG_TXT_OP op, type, op1, op2, op3
 *
 * Scalar FP instruction with two xmm registers followed by a verbatim text
 * operand op3 (no size keyword is attached to it).
 */
|.macro ASM_SSE2_REG_REG_TXT_OP, op, type, op1, op2, op3
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), op3
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), op3
||	}
|.endmacro

/*
 * ASM_SSE2_REG_REG_REG_TXT_OP op, type, op1, op2, op3, op4
 *
 * Scalar FP instruction with three xmm registers followed by a verbatim text
 * operand op4.
 */
|.macro ASM_SSE2_REG_REG_REG_TXT_OP, op, type, op1, op2, op3, op4
||	if (type == IR_DOUBLE) {
|		op..d xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST), op4
||	} else {
||		IR_ASSERT(type == IR_FLOAT);
|		op..s xmm(op1-IR_REG_FP_FIRST), xmm(op2-IR_REG_FP_FIRST), xmm(op3-IR_REG_FP_FIRST), op4
||	}
|.endmacro

/*
 * ASM_FP_REG_REG_TXT_OP op, type, op1, op2, op3
 *
 * Scalar FP "op xmm(op1), xmm(op2), op3" (op3 is verbatim text), choosing the
 * encoding at run time from ctx->mflags.
 *
 * The AVX encoding has one more register operand than the SSE one, so the
 * destination op1 is passed twice: once as destination and once as the first
 * source. That keeps the AVX result equivalent to the two-operand SSE form.
 * It also gives ASM_SSE2_REG_REG_REG_TXT_OP the six arguments it is declared
 * with; a five-argument invocation does not match that macro.
 */
|.macro ASM_FP_REG_REG_TXT_OP, op, type, op1, op2, op3
||	if (ctx->mflags & IR_X86_AVX) {
|		ASM_SSE2_REG_REG_REG_TXT_OP v..op, type, op1, op1, op2, op3
||	} else {
|		ASM_SSE2_REG_REG_TXT_OP op, type, op1, op2, op3
||	}
|.endmacro

/*
 * Backend-private state for the x86 code generator.
 * NOTE(review): the field roles below are inferred from names and types only;
 * their users are outside this part of the file -- confirm before relying on
 * them.
 */
typedef struct _ir_backend_data {
    ir_reg_alloc_data  ra_data;
	uint32_t           dessa_from_block;
	dasm_State        *dasm_state;        /* DynASM state object */
	ir_bitset          emit_constants;
	int                rodata_label, jmp_table_label; /* presumably labels for the rodata / jmp_table sections */
	/* presumably record which FP helper constants are needed -- TODO confirm */
	bool               double_neg_const;
	bool               float_neg_const;
	bool               double_abs_const;
	bool               float_abs_const;
	bool               double_zero_const;
} ir_backend_data;

/*
 * X-macro callbacks that stringify one column of the IR_GP_REGS / IR_FP_REGS
 * register lists: the 64-, 32-, 16- or 8-bit name of a GP register, or the
 * name of an FP register.
 */
#define IR_GP_REG_NAME(code, name64, name32, name16, name8, name8h) \
	#name64,
#define IR_GP_REG_NAME32(code, name64, name32, name16, name8, name8h) \
	#name32,
#define IR_GP_REG_NAME16(code, name64, name32, name16, name8, name8h) \
	#name16,
#define IR_GP_REG_NAME8(code, name64, name32, name16, name8, name8h) \
	#name8,
#define IR_FP_REG_NAME(code, name) \
	#name,

/* Full-width names: GP registers followed by FP registers. */
static const char *_ir_reg_name[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME)
	IR_FP_REGS(IR_FP_REG_NAME)
};

/* Narrower GP names. Only GP registers are listed; any remaining slots of
 * these IR_REG_NUM-sized arrays are implicitly NULL. */
static const char *_ir_reg_name32[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME32)
};

static const char *_ir_reg_name16[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME16)
};

static const char *_ir_reg_name8[IR_REG_NUM] = {
	IR_GP_REGS(IR_GP_REG_NAME8)
};

/*
 * Calling Convention
 *
 * Tables of the registers used for passing integer and floating-point
 * arguments, in argument order. Three configurations:
 *   - _WIN64:           4 integer and 4 FP argument registers.
 *   - IR_TARGET_X64:    6 integer and 8 FP argument registers.
 *   - otherwise:        no default argument registers (NULL tables); a
 *                       separate "FC" table lists 2 integer registers
 *                       (presumably for fastcall functions -- confirm), and
 *                       there are no FP "FC" registers.
 */
#ifdef _WIN64

static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS] = {
	IR_REG_INT_ARG1,
	IR_REG_INT_ARG2,
	IR_REG_INT_ARG3,
	IR_REG_INT_ARG4,
};

static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS] = {
	IR_REG_FP_ARG1,
	IR_REG_FP_ARG2,
	IR_REG_FP_ARG3,
	IR_REG_FP_ARG4,
};

#elif defined(IR_TARGET_X64)

static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS] = {
	IR_REG_INT_ARG1,
	IR_REG_INT_ARG2,
	IR_REG_INT_ARG3,
	IR_REG_INT_ARG4,
	IR_REG_INT_ARG5,
	IR_REG_INT_ARG6,
};

static const int8_t _ir_fp_reg_params[IR_REG_FP_ARGS] = {
	IR_REG_FP_ARG1,
	IR_REG_FP_ARG2,
	IR_REG_FP_ARG3,
	IR_REG_FP_ARG4,
	IR_REG_FP_ARG5,
	IR_REG_FP_ARG6,
	IR_REG_FP_ARG7,
	IR_REG_FP_ARG8,
};

#else

static const int8_t *_ir_int_reg_params = NULL;
static const int8_t *_ir_fp_reg_params = NULL;
static const int8_t _ir_int_fc_reg_params[IR_REG_INT_FCARGS] = {
	IR_REG_INT_FCARG1,
	IR_REG_INT_FCARG2,
};
static const int8_t *_ir_fp_fc_reg_params = NULL;

#endif

/*
 * Return the printable name of a register for the given operand type.
 *
 * reg:  a register number below IR_REG_NUM, or one of the pseudo registers
 *       IR_REG_SCRATCH / IR_REG_ALL (reported as "SCRATCH" / "ALL").
 * type: selects the name width. IR_VOID means "natural width": IR_ADDR for
 *       registers below IR_REG_FP_FIRST, IR_DOUBLE otherwise.
 *
 * FP types and 8-byte types use the full-width table; 4-, 2- and 1-byte
 * integer types use the 32-, 16- and 8-bit name tables respectively.
 */
const char *ir_reg_name(int8_t reg, ir_type type)
{
	/* Pseudo registers live at and above IR_REG_NUM. */
	if (reg >= IR_REG_NUM) {
		if (reg != IR_REG_SCRATCH) {
			IR_ASSERT(reg == IR_REG_ALL);
			return "ALL";
		}
		return "SCRATCH";
	}

	IR_ASSERT(reg >= 0 && reg < IR_REG_NUM);

	if (type == IR_VOID) {
		if (reg < IR_REG_FP_FIRST) {
			type = IR_ADDR;
		} else {
			type = IR_DOUBLE;
		}
	}

	if (IR_IS_TYPE_FP(type)) {
		return _ir_reg_name[reg];
	}

	switch (ir_type_size[type]) {
		case 8:
			return _ir_reg_name[reg];
		case 4:
			return _ir_reg_name32[reg];
		case 2:
			return _ir_reg_name16[reg];
		default:
			IR_ASSERT(ir_type_size[type] == 1);
			return _ir_reg_name8[reg];
	}
}

/*
 * X-macro list of the backend's instruction-selection rule names. It is
 * expanded below once to build enum _ir_rule and once to build the
 * ir_rule_name string table, so both stay in the same order.
 * The LEA_* entries must stay contiguous: IR_LEA_FIRST / IR_LEA_LAST name the
 * first and last of them.
 */
#define IR_RULES(_)        \
	_(CMP_INT)             \
	_(CMP_FP)              \
	_(MUL_INT)             \
	_(DIV_INT)             \
	_(MOD_INT)             \
	_(TEST_INT)            \
	_(SETCC_INT)           \
	_(TESTCC_INT)          \
	_(LEA_OB)              \
	_(LEA_SI)              \
	_(LEA_SIB)             \
	_(LEA_IB)              \
	_(LEA_SI_O)            \
	_(LEA_SIB_O)           \
	_(LEA_IB_O)            \
	_(LEA_I_OB)            \
	_(LEA_OB_I)            \
	_(LEA_OB_SI)           \
	_(LEA_SI_OB)           \
	_(LEA_B_SI)            \
	_(LEA_SI_B)            \
	_(LEA_B_SI_O)          \
	_(LEA_SI_B_O)          \
	_(LEA_SYM_O)           \
	_(LEA_O_SYM)           \
	_(INC)                 \
	_(DEC)                 \
	_(MUL_PWR2)            \
	_(DIV_PWR2)            \
	_(MOD_PWR2)            \
	_(SDIV_PWR2)           \
	_(SMOD_PWR2)           \
	_(BOOL_NOT)            \
	_(BOOL_NOT_INT)        \
	_(ABS_INT)             \
	_(OP_INT)              \
	_(OP_FP)               \
	_(IMUL3)               \
	_(BINOP_INT)           \
	_(BINOP_SSE2)          \
	_(BINOP_AVX)           \
	_(SHIFT)               \
	_(SHIFT_CONST)         \
	_(COPY_INT)            \
	_(COPY_FP)             \
	_(CMP_AND_STORE_INT)   \
	_(CMP_AND_BRANCH_INT)  \
	_(CMP_AND_BRANCH_FP)   \
	_(TEST_AND_BRANCH_INT) \
	_(JCC_INT)             \
	_(COND_CMP_INT)        \
	_(COND_CMP_FP)         \
	_(GUARD_CMP_INT)       \
	_(GUARD_CMP_FP)        \
	_(GUARD_TEST_INT)      \
	_(GUARD_JCC_INT)       \
	_(GUARD_OVERFLOW)      \
	_(OVERFLOW_AND_BRANCH) \
	_(MIN_MAX_INT)         \
	_(MEM_OP_INT)          \
	_(MEM_INC)             \
	_(MEM_DEC)             \
	_(MEM_MUL_PWR2)        \
	_(MEM_DIV_PWR2)        \
	_(MEM_MOD_PWR2)        \
	_(MEM_BINOP_INT)       \
	_(MEM_SHIFT)           \
	_(MEM_SHIFT_CONST)     \
	_(REG_BINOP_INT)       \
	_(VSTORE_INT)          \
	_(VSTORE_FP)           \
	_(LOAD_INT)            \
	_(LOAD_FP)             \
	_(STORE_INT)           \
	_(STORE_FP)            \
	_(IF_INT)              \
	_(RETURN_VOID)         \
	_(RETURN_INT)          \
	_(RETURN_FP)           \
	_(BIT_COUNT)           \
	_(SSE_SQRT)            \
	_(SSE_RINT)            \
	_(SSE_FLOOR)           \
	_(SSE_CEIL)            \
	_(SSE_TRUNC)           \
	_(SSE_NEARBYINT)       \
	_(BIT_OP)              \

/* Bounds of the contiguous LEA_* rule range in IR_RULES. */
#define IR_LEA_FIRST IR_LEA_OB
#define IR_LEA_LAST  IR_LEA_O_SYM

/* IR_RULES callback: turns a rule name into its IR_-prefixed enumerator. */
#define IR_RULE_ENUM(name) IR_ ## name,

/* Rule value made of the IR_ALLOCA opcode plus the SKIPPED, FUSED and SIMPLE
 * flags; compared against ir_rule() results for operands. */
#define IR_STATIC_ALLOCA   (IR_SKIPPED | IR_FUSED | IR_SIMPLE | IR_ALLOCA)

/*
 * Backend rule numbers. They start right after the generic IR opcodes:
 * IR_FIRST_RULE equals IR_LAST_OP, the IR_RULES entries follow in list order,
 * and IR_LAST_RULE is one past the final rule.
 */
enum _ir_rule {
	IR_FIRST_RULE = IR_LAST_OP,
	IR_RULES(IR_RULE_ENUM)
	IR_LAST_RULE
};

/*
 * Printable rule names, laid out parallel to enum _ir_rule: the leading NULL
 * lines up with IR_FIRST_RULE, the IR_RULES names follow in list order, and
 * the trailing NULL lines up with IR_LAST_RULE.
 * NOTE(review): the array is dimensioned by IR_LAST_OP rather than by the
 * number of rules; this only works while the rule count plus two does not
 * exceed IR_LAST_OP -- confirm this is intended.
 */
#define IR_RULE_NAME(name)  #name,
const char *ir_rule_name[IR_LAST_OP] = {
	NULL,
	IR_RULES(IR_RULE_NAME)
	NULL
};

/*
 * Decide whether a constant address can be fused into an instruction.
 *
 * On targets with 4-byte pointers every address qualifies. Otherwise the
 * address must fit in a signed 32-bit value: for symbolic constants the
 * address resolved by ir_sym_addr() is tested (an unresolved symbol is
 * rejected); for all other constants the stored 64-bit value is tested.
 */
static bool ir_may_fuse_addr(ir_ctx *ctx, const ir_insn *addr_insn)
{
	void *sym_addr;

	if (sizeof(void*) == 4) {
		return 1;
	}

	if (!IR_IS_SYM_CONST(addr_insn->op)) {
		return IR_IS_SIGNED_32BIT(addr_insn->val.i64);
	}

	sym_addr = ir_sym_addr(ctx, addr_insn);
	if (sym_addr == NULL) {
		return 0;
	}
	return IR_IS_SIGNED_32BIT((int64_t)(intptr_t)sym_addr);
}

/*
 * Decide whether a constant can be used directly as an immediate operand.
 *
 * Non-address constants qualify when their type is at most 4 bytes wide or
 * their 64-bit value fits in a signed 32-bit range. IR_ADDR constants always
 * qualify on targets with 4-byte pointers; otherwise the resolved symbol
 * address (for symbolic constants; unresolved symbols are rejected) or the
 * stored value must fit in a signed 32-bit range.
 */
static bool ir_may_fuse_imm(ir_ctx *ctx, const ir_insn *val_insn)
{
	void *sym_addr;

	if (val_insn->type != IR_ADDR) {
		if (ir_type_size[val_insn->type] <= 4) {
			return 1;
		}
		return IR_IS_SIGNED_32BIT(val_insn->val.i64);
	}

	if (sizeof(void*) == 4) {
		return 1;
	}

	if (!IR_IS_SYM_CONST(val_insn->op)) {
		return IR_IS_SIGNED_32BIT(val_insn->val.i64);
	}

	sym_addr = ir_sym_addr(ctx, val_insn);
	if (sym_addr == NULL) {
		return 0;
	}
	return IR_IS_SIGNED_32BIT((intptr_t)sym_addr);
}

/* register allocation */
/*
 * Reserve a temporary register for a constant operand when the constant
 * cannot be encoded as an immediate.
 *
 * ref:         constant reference (asserted).
 * num:         operand number recorded in the temporary-register request.
 * n:           number of entries already present in constraints->tmp_regs.
 * constraints: receives the request at index n when one is needed.
 *
 * Returns the updated entry count: n + 1 if a request was added, n otherwise.
 */
static int ir_add_const_tmp_reg(ir_ctx *ctx, ir_ref ref, uint32_t num, int n, ir_target_constraints *constraints)
{
	const ir_insn *const_insn;

	IR_ASSERT(IR_IS_CONST_REF(ref));
	const_insn = &ctx->ir_base[ref];

	if (ir_may_fuse_imm(ctx, const_insn)) {
		/* The value fits as an immediate; no register is required. */
		return n;
	}

	constraints->tmp_regs[n] = IR_TMP_REG(num, const_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
	return n + 1;
}

/*
 * Describes the register constraints of instruction `ref` according to the
 * rule selected for it.
 *
 * Fills in `constraints`:
 *   - def_reg      fixed result register, or IR_REG_NONE
 *   - hints[]      preferred registers, hints_count entries are valid
 *   - tmp_regs[]   temporary/scratch registers, tmps_count entries are valid
 *
 * Returns a mask of IR_*_MUST_BE_IN_REG / IR_*_SHOULD_BE_IN_REG /
 * IR_DEF_* flags. Rules without a dedicated case keep the default: the
 * result and all three operands must be in registers, no temporaries.
 */
int ir_get_target_constraints(ir_ctx *ctx, ir_ref ref, ir_target_constraints *constraints)
{
	uint32_t rule = ir_rule(ctx, ref);
	const ir_insn *insn;
	int n = 0; /* number of used entries in constraints->tmp_regs */
	int flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;

	constraints->def_reg = IR_REG_NONE;
	constraints->hints_count = 0;
	switch (rule & IR_RULE_MASK) {
		case IR_BINOP_INT:
			insn = &ctx->ir_base[ref];
			if (rule & IR_FUSED) {
				if (ctx->ir_base[insn->op1].op == IR_RLOAD) {
					flags = IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
				} else {
					flags = IR_OP2_MUST_BE_IN_REG;
				}
			} else {
				flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op2)) {
				if (insn->op1 != insn->op2) {
					n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
				}
			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_IMUL3:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_SHIFT:
			if (rule & IR_FUSED) {
				flags = IR_OP2_MUST_BE_IN_REG;
			} else {
				flags = IR_DEF_REUSES_OP1_REG | IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			}
			/* the shift count (op2) is hinted to %rcx, which is also reserved as a scratch register */
			constraints->hints[1] = IR_REG_NONE;
			constraints->hints[2] = IR_REG_RCX;
			constraints->hints_count = 3;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RCX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			break;
		case IR_MUL_INT:
			/* %rax - used as input and result */
			constraints->def_reg = IR_REG_RAX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RDX, IR_USE_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			break;
		case IR_DIV_INT:
			/* %rax - used as input and result */
			constraints->def_reg = IR_REG_RAX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RDX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			goto op2_const;
		case IR_MOD_INT:
			/* %rax - hinted input, %rdx - result */
			constraints->def_reg = IR_REG_RDX;
			constraints->hints[1] = IR_REG_RAX;
			constraints->hints_count = 2;
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RAX, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RDX, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 2;
			goto op2_const;
		case IR_MIN_MAX_INT:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
op2_const:
			/*
			 * Shared tail: reserve a temporary for a constant or static-ALLOCA
			 * second operand. DIV_INT/MOD_INT arrive here with n == 2, so the
			 * counter must only ever be incremented, never reset.
			 */
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				/* a rule is never looked up for a constant reference */
				if (insn->op1 != insn->op2) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_CMP_INT:
		case IR_TEST_INT:
			insn = &ctx->ir_base[ref];
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			if (IR_IS_CONST_REF(insn->op1)) {
				const ir_insn *val_insn = &ctx->ir_base[insn->op1];
				constraints->tmp_regs[0] = IR_TMP_REG(1, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			} else if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			} else if (ir_rule(ctx, insn->op1) & IR_FUSED) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op2)) {
				if (insn->op1 != insn->op2) {
					flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
					n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
				}
			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_CMP_FP:
			insn = &ctx->ir_base[ref];
			if (!(rule & IR_FUSED)) {
				constraints->tmp_regs[0] = IR_TMP_REG(3, IR_BOOL, IR_DEF_SUB_REF, IR_SAVE_SUB_REF);
				n = 1;
			}
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			if (IR_IS_CONST_REF(insn->op1)) {
				const ir_insn *val_insn = &ctx->ir_base[insn->op1];
				constraints->tmp_regs[n] = IR_TMP_REG(1, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_BINOP_AVX:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op1)) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_COND:
			insn = &ctx->ir_base[ref];
			if (!IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
				break;
			}
			/* integer condition: same constraints as IR_COND_CMP_INT */
			IR_FALLTHROUGH;
		case IR_COND_CMP_INT:
			insn = &ctx->ir_base[ref];
			if (IR_IS_TYPE_INT(insn->type)) {
				if (IR_IS_CONST_REF(insn->op3) || ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA) {
					flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
					constraints->tmp_regs[0] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
					n = 1;
				} else if (IR_IS_CONST_REF(insn->op2) || ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
					flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
					constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
					n = 1;
				} else {
					flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
				}
			} else {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
			}
			break;
		case IR_COND_CMP_FP:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_VSTORE_INT:
			flags = IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op3)) {
				n = ir_add_const_tmp_reg(ctx, insn->op3, 3, n, constraints);
			} else if (ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_STORE_INT:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			if (IR_IS_CONST_REF(insn->op3)) {
				n = ir_add_const_tmp_reg(ctx, insn->op3, 3, n, constraints);
			} else if (ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA) {
				constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_VSTORE_FP:
			flags = IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op3)) {
				insn = &ctx->ir_base[insn->op3];
				constraints->tmp_regs[0] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_LOAD_FP:
		case IR_LOAD_INT:
		case IR_MEM_OP_INT:
		case IR_MEM_INC:
		case IR_MEM_DEC:
		case IR_MEM_MUL_PWR2:
		case IR_MEM_DIV_PWR2:
		case IR_MEM_MOD_PWR2:
		case IR_MEM_BINOP_INT:
		case IR_MEM_SHIFT:
		case IR_MEM_SHIFT_CONST:
		case IR_CMP_AND_STORE_INT:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			break;
		case IR_STORE_FP:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				n = ir_add_const_tmp_reg(ctx, insn->op2, 2, n, constraints);
			}
			if (IR_IS_CONST_REF(insn->op3)) {
				insn = &ctx->ir_base[insn->op3];
				constraints->tmp_regs[n] = IR_TMP_REG(3, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_SWITCH:
			flags = IR_OP2_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				insn = &ctx->ir_base[insn->op2];
				constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			/* we need a temporary register in case MIN CASE value is not zero or some CASE VAL can't fit into 32-bit */
			constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n++;
			break;
		case IR_ARGVAL:
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_RSI, IR_DEF_SUB_REF - IR_SUB_REFS_COUNT, IR_USE_SUB_REF);
			constraints->tmp_regs[1] = IR_SCRATCH_REG(IR_REG_RDI, IR_DEF_SUB_REF - IR_SUB_REFS_COUNT, IR_USE_SUB_REF);
			constraints->tmp_regs[2] = IR_SCRATCH_REG(IR_REG_RCX, IR_DEF_SUB_REF - IR_SUB_REFS_COUNT, IR_USE_SUB_REF);
			n = 3;
			break;
		case IR_CALL:
			insn = &ctx->ir_base[ref];
			if (IR_IS_TYPE_INT(insn->type)) {
				constraints->def_reg = IR_REG_INT_RET1;
#ifdef IR_REG_FP_RET1
			} else {
				constraints->def_reg = IR_REG_FP_RET1;
#endif
			}
			constraints->tmp_regs[0] = IR_SCRATCH_REG(IR_REG_SCRATCH, IR_USE_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			/* argument hints and the function-address temporary are shared with IR_TAILCALL */
			IR_FALLTHROUGH;
		case IR_TAILCALL:
			insn = &ctx->ir_base[ref];
			if (insn->inputs_count > 2) {
				constraints->hints[2] = IR_REG_NONE;
				constraints->hints_count = ir_get_args_regs(ctx, insn, constraints->hints);
				if (!IR_IS_CONST_REF(insn->op2)) {
					constraints->tmp_regs[n] = IR_TMP_REG(1, IR_ADDR, IR_LOAD_SUB_REF, IR_USE_SUB_REF);
					n++;
				}
			}
			flags = IR_USE_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG | IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_BINOP_SSE2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG | IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_SHIFT_CONST:
		case IR_INC:
		case IR_DEC:
		case IR_MUL_PWR2:
		case IR_DIV_PWR2:
		case IR_OP_INT:
		case IR_OP_FP:
		case IR_BIT_OP:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_MOD_PWR2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				/* (op2 - 1) needs a register when it doesn't fit into a signed 32-bit value */
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			break;
		case IR_SMOD_PWR2:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			constraints->tmp_regs[n] = IR_TMP_REG(3, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
			n++;
			break;
		case IR_SDIV_PWR2:
			flags = IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 8) {
				int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;
				if (!IR_IS_SIGNED_32BIT(offset)) {
					constraints->tmp_regs[n] = IR_TMP_REG(2, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
					n++;
				}
			}
			break;
		case IR_BIT_COUNT:
			insn = &ctx->ir_base[ref];
			if (ir_type_size[insn->type] == 1) {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			} else {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			}
			if (IR_IS_CONST_REF(insn->op1)) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_CTPOP:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			constraints->tmp_regs[0] = IR_TMP_REG(2, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
			n = 1;
			if (ir_type_size[insn->type] == 8) {
				/* 8-byte operands get one more temporary */
				constraints->tmp_regs[1] = IR_TMP_REG(3, insn->type, IR_USE_SUB_REF, IR_SAVE_SUB_REF);
				n = 2;
			}
			break;
		case IR_COPY_INT:
		case IR_COPY_FP:
		case IR_SEXT:
		case IR_ZEXT:
		case IR_TRUNC:
		case IR_PROTO:
		case IR_FP2FP:
			flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_BITCAST:
			insn = &ctx->ir_base[ref];
			if (IR_IS_TYPE_INT(insn->type) && IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
				flags = IR_DEF_REUSES_OP1_REG | IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			} else {
				flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			}
			break;
		case IR_FP2INT:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			break;
		case IR_INT2FP:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP1_SHOULD_BE_IN_REG;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op1)) {
				constraints->tmp_regs[0] = IR_TMP_REG(1, ctx->ir_base[insn->op1].type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
		case IR_ABS_INT:
			flags = IR_DEF_CONFLICTS_WITH_INPUT_REGS | IR_USE_MUST_BE_IN_REG | IR_OP1_MUST_BE_IN_REG;
			break;
		case IR_PARAM:
			constraints->def_reg = ir_get_param_reg(ctx, ref);
			flags = 0;
			break;
		case IR_PI:
		case IR_PHI:
			flags = IR_USE_SHOULD_BE_IN_REG;
			break;
		case IR_RLOAD:
			/* op2 of RLOAD holds the register to read */
			constraints->def_reg = ctx->ir_base[ref].op2;
			flags = IR_USE_SHOULD_BE_IN_REG;
			break;
		case IR_EXITCALL:
			flags = IR_USE_MUST_BE_IN_REG;
			constraints->def_reg = IR_REG_INT_RET1;
			break;
		case IR_IF_INT:
		case IR_GUARD:
		case IR_GUARD_NOT:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_IJMP:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			break;
		case IR_RSTORE:
			flags = IR_OP3_SHOULD_BE_IN_REG;
			break;
		case IR_RETURN_INT:
			flags = IR_OP2_SHOULD_BE_IN_REG;
			constraints->hints[2] = IR_REG_INT_RET1;
			constraints->hints_count = 3;
			break;
		case IR_RETURN_FP:
#ifdef IR_REG_FP_RET1
			flags = IR_OP2_SHOULD_BE_IN_REG;
			constraints->hints[2] = IR_REG_FP_RET1;
			constraints->hints_count = 3;
#endif
			break;
		case IR_SNAPSHOT:
			flags = 0;
			break;
		case IR_VA_START:
			flags = IR_OP2_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				constraints->tmp_regs[1] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 2;
			}
			break;
		case IR_VA_ARG:
			flags = IR_USE_MUST_BE_IN_REG | IR_OP2_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_SAVE_SUB_REF);
			n = 1;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				constraints->tmp_regs[1] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 2;
			}
			break;
		case IR_VA_COPY:
			flags = IR_OP2_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			constraints->tmp_regs[0] = IR_TMP_REG(1, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
			n = 1;
			insn = &ctx->ir_base[ref];
			if (IR_IS_CONST_REF(insn->op2)) {
				constraints->tmp_regs[n] = IR_TMP_REG(2, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			if (IR_IS_CONST_REF(insn->op3)) {
				constraints->tmp_regs[n] = IR_TMP_REG(3, IR_ADDR, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n++;
			}
			break;
		case IR_SSE_SQRT:
		case IR_SSE_RINT:
		case IR_SSE_FLOOR:
		case IR_SSE_CEIL:
		case IR_SSE_TRUNC:
		case IR_SSE_NEARBYINT:
			insn = &ctx->ir_base[ref];
			flags = IR_USE_MUST_BE_IN_REG | IR_OP3_MUST_BE_IN_REG;
			if (IR_IS_CONST_REF(insn->op3)) {
				const ir_insn *val_insn = &ctx->ir_base[insn->op3];
				constraints->tmp_regs[n] = IR_TMP_REG(3, val_insn->type, IR_LOAD_SUB_REF, IR_DEF_SUB_REF);
				n = 1;
			}
			break;
	}
	constraints->tmps_count = n;

	return flags;
}

/* instruction selection */
static uint32_t ir_match_insn(ir_ctx *ctx, ir_ref ref);
static bool ir_match_try_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root);

/* Exchanges the first and the second operand of `insn` in place. */
static void ir_swap_ops(ir_insn *insn)
{
	SWAP_REFS(insn->op1, insn->op2);
}

/*
 * Tries to turn an instruction back into a plain integer binary operation
 * (IR_BINOP_INT | IR_MAY_SWAP) when one of its two distinct operands can be
 * fused as a memory load. The operands are swapped if only op1 is fusible,
 * so the fused load always ends up as op2.
 *
 * Returns 1 when the rule of `ref` was rewritten, 0 otherwise.
 */
static bool ir_match_try_revert_lea_to_add(ir_ctx *ctx, ir_ref ref)
{
	ir_insn *add_insn = &ctx->ir_base[ref];

	/* TODO: This optimization makes sense only if the other operand is killed */
	if (add_insn->op1 == add_insn->op2) {
		return 0;
	}

	if (!ir_match_try_fuse_load(ctx, add_insn->op2, ref)) {
		if (!ir_match_try_fuse_load(ctx, add_insn->op1, ref)) {
			return 0;
		}
		/* swap for better load fusion */
		ir_swap_ops(add_insn);
	}

	ctx->rules[ref] = IR_BINOP_INT | IR_MAY_SWAP;
	return 1;
}

/*
 * Marks the address computation `addr_ref` as fused (IR_FUSED | IR_SIMPLE)
 * when it matches one of the LEA rules.
 *
 * The instruction is matched first if it has no rule yet. An IR_LEA_IB rule
 * may instead be reverted to a binary operation with a fused load. An
 * address with several uses is fused only if every use is a LOAD, or a STORE
 * that doesn't store the address itself (op3).
 */
static void ir_match_fuse_addr(ir_ctx *ctx, ir_ref addr_ref)
{
	uint32_t rule;
	ir_use_list *uses;

	if (IR_IS_CONST_REF(addr_ref)) {
		return;
	}

	rule = ctx->rules[addr_ref];
	if (!rule) {
		ctx->rules[addr_ref] = rule = ir_match_insn(ctx, addr_ref);
	}
	if (rule < IR_LEA_FIRST || rule > IR_LEA_LAST) {
		return;
	}
	if (rule == IR_LEA_IB && ir_match_try_revert_lea_to_add(ctx, addr_ref)) {
		return;
	}

	uses = &ctx->use_lists[addr_ref];
	if (uses->count > 1) {
		/* check if address is used only in LOAD and STORE */
		ir_ref *edge = &ctx->use_edges[uses->refs];
		ir_ref left;

		for (left = uses->count; left > 0; left--, edge++) {
			const ir_insn *user = &ctx->ir_base[*edge];

			if (user->op == IR_LOAD) {
				continue;
			}
			if (user->op == IR_STORE && user->op3 != addr_ref) {
				continue;
			}
			return;
		}
	}
	ctx->rules[addr_ref] = IR_FUSED | IR_SIMPLE | rule;
}

/*
 * Checks whether `use` is an ADD that can absorb the scaled index `ref`
 * as a part of an LEA addressing mode.
 */
static bool ir_match_may_fuse_SI(ir_ctx *ctx, ir_ref ref, ir_ref use)
{
	ir_insn *add_insn = &ctx->ir_base[use];
	ir_insn *offset_insn;

	if (add_insn->op != IR_ADD) {
		return 0;
	}

	if (add_insn->op1 != ref) {
		/* LEA_B_SI or LEA_OB_SI */
		return add_insn->op2 == ref && add_insn->op1 != add_insn->op2;
	}

	if (!IR_IS_CONST_REF(add_insn->op2)) {
		/* LEA_SI_B or LEA_SI_OB */
		return add_insn->op2 != ref;
	}

	/* LEA_SI_O: the constant offset must be encodable */
	offset_insn = &ctx->ir_base[add_insn->op2];
	if (IR_IS_SYM_CONST(offset_insn->op)) {
		return ir_may_fuse_addr(ctx, offset_insn);
	}
	return IR_IS_SIGNED_32BIT(offset_insn->val.i64);
}

/*
 * Checks whether a multi-use MUL(X, 2|4|8) may be fused as a scaled index
 * into every one of its uses.
 *
 * Returns 1 immediately if `ref` is already a fused IR_LEA_SI. For a not yet
 * matched multiplication by 2, 4 or 8 the rule is set to IR_LEA_SI (this is
 * done even if a later use check fails) and 1 is returned only when all uses
 * pass ir_match_may_fuse_SI().
 */
static bool ir_match_fuse_addr_all_useges(ir_ctx *ctx, ir_ref ref)
{
	uint32_t rule = ctx->rules[ref];
	ir_insn *insn, *scale_insn;
	ir_use_list *use_list;
	ir_ref n, i, *edges;

	if (rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
		return 1;
	}
	if (rule) {
		/* already matched as something else */
		return 0;
	}

	insn = &ctx->ir_base[ref];
	IR_ASSERT(IR_IS_TYPE_INT(insn->type) && ir_type_size[insn->type] >= 4);
	if (insn->op != IR_MUL || !IR_IS_CONST_REF(insn->op2)) {
		return 0;
	}

	scale_insn = &ctx->ir_base[insn->op2];
	if (IR_IS_SYM_CONST(scale_insn->op)) {
		return 0;
	}
	switch (scale_insn->val.u64) {
		case 2:
		case 4:
		case 8:
			break;
		default:
			return 0;
	}

	ctx->rules[ref] = IR_LEA_SI;

	use_list = &ctx->use_lists[ref];
	n = use_list->count;
	IR_ASSERT(n > 1);
	edges = &ctx->use_edges[use_list->refs];
	for (i = 0; i < n; i++) {
		if (!ir_match_may_fuse_SI(ctx, ref, edges[i])) {
			return 0;
		}
	}

	return 1;
}

/*
 * A naive check if there is a STORE or CALL between this LOAD (ref) and the
 * fusion root: walks ctx->prev_ref[] backwards from `root` until `ref` is
 * reached. Nothing is scanned when `root` directly follows `ref`.
 */
static bool ir_match_has_mem_deps(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	ir_ref cur;

	if (ref + 1 == root) {
		return 0;
	}

	cur = ctx->prev_ref[root];
	do {
		switch (ctx->ir_base[cur].op) {
			case IR_STORE:
				// TODO: check if LOAD and STORE addresses may alias
			case IR_CALL:
				return 1;
			default:
				break;
		}
		cur = ctx->prev_ref[cur];
	} while (cur != ref);

	return 0;
}

/*
 * Marks `ref` as a fused LOAD of the instruction `root` when possible.
 *
 * Requirements: `ref` is a LOAD in the current block, its use list holds
 * exactly two entries, and no STORE/CALL is found between it and `root`.
 * A constant address must additionally be encodable; a non-constant address
 * is passed on to ir_match_fuse_addr().
 */
static void ir_match_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	ir_ref addr_ref;

	if (!ir_in_same_block(ctx, ref) || ctx->ir_base[ref].op != IR_LOAD) {
		return;
	}
	if (ctx->use_lists[ref].count != 2 || ir_match_has_mem_deps(ctx, ref, root)) {
		return;
	}

	addr_ref = ctx->ir_base[ref].op2;
	if (!IR_IS_CONST_REF(addr_ref)) {
		ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
		ir_match_fuse_addr(ctx, addr_ref);
	} else if (ir_may_fuse_addr(ctx, &ctx->ir_base[addr_ref])) {
		ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
	}
}

/*
 * Like ir_match_fuse_load(), but reports whether `ref` can be used as a
 * memory operand of `root`.
 *
 * Returns 1 for:
 *   - a LOAD in the current block that was marked as fused here,
 *   - a PARAM with a single use and no assigned register,
 *   - a VLOAD.
 * Returns 0 in all other cases.
 */
static bool ir_match_try_fuse_load(ir_ctx *ctx, ir_ref ref, ir_ref root)
{
	ir_insn *insn = &ctx->ir_base[ref];

	if (ir_in_same_block(ctx, ref) && insn->op == IR_LOAD) {
		ir_ref addr_ref;

		if (ctx->use_lists[ref].count != 2 || ir_match_has_mem_deps(ctx, ref, root)) {
			return 0;
		}

		addr_ref = insn->op2;
		if (IR_IS_CONST_REF(addr_ref)) {
			if (!ir_may_fuse_addr(ctx, &ctx->ir_base[addr_ref])) {
				return 0;
			}
			ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
		} else {
			ctx->rules[ref] = IR_FUSED | IR_SIMPLE | IR_LOAD;
			ir_match_fuse_addr(ctx, addr_ref);
		}
		return 1;
	}

	if (insn->op == IR_PARAM) {
		return ctx->use_lists[ref].count == 1
			&& ir_get_param_reg(ctx, ref) == IR_REG_NONE;
	}

	return insn->op == IR_VLOAD;
}

/*
 * Load fusion for a commutative integer operation: op2 is preferred as an
 * immediate or as a fused load; otherwise, if op1 can be fused, the operands
 * are swapped so that the fused load becomes op2.
 */
static void ir_match_fuse_load_commutative_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	bool op2_is_imm = IR_IS_CONST_REF(insn->op2)
		&& ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2]);

	if (!op2_is_imm
	 && !ir_match_try_fuse_load(ctx, insn->op2, root)
	 && ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
	}
}

/*
 * Load fusion for a commutative floating point operation: nothing is done if
 * op2 is a constant or can be fused; otherwise the operands are swapped when
 * op1 is a constant or a fusible load.
 */
static void ir_match_fuse_load_commutative_fp(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)) {
		return;
	}
	if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		return;
	}
	if (IR_IS_CONST_REF(insn->op1) || ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
	}
}

/*
 * Load fusion for an integer comparison.
 *
 * With an immediate op2, op1 may be fused as a load. Otherwise op2 is tried
 * first; if only op1 is fusible the operands are swapped and every predicate
 * except EQ/NE is mirrored (op ^= 3).
 */
static void ir_match_fuse_load_cmp_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)
	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
		ir_match_fuse_load(ctx, insn->op1, root);
		return;
	}

	if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		return;
	}
	if (!ir_match_try_fuse_load(ctx, insn->op1, root)) {
		return;
	}

	ir_swap_ops(insn);
	switch (insn->op) {
		case IR_EQ:
		case IR_NE:
			/* symmetric predicates stay as they are */
			break;
		default:
			insn->op ^= 3;
			break;
	}
}

/*
 * Load fusion for an integer TEST: with an immediate op2, op1 may be fused
 * as a load; otherwise op2 is tried first and the operands are swapped when
 * only op1 is fusible.
 */
static void ir_match_fuse_load_test_int(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	if (IR_IS_CONST_REF(insn->op2)
	 && ir_may_fuse_imm(ctx, &ctx->ir_base[insn->op2])) {
		ir_match_fuse_load(ctx, insn->op1, root);
		return;
	}

	if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		return;
	}
	if (ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
	}
}

/*
 * Load fusion for a floating point comparison that produces a value.
 *
 * Predicates other than EQ/NE: LT/LE are turned into GT/GE by swapping the
 * operands, then op2 may be fused as a load.
 * EQ/NE: a non-zero constant op2 is left alone, otherwise op2 is tried as a
 * fused load; failing that, the operands are swapped when op1 is a non-zero
 * constant or a fusible load.
 */
static void ir_match_fuse_load_cmp_fp(ir_ctx *ctx, ir_insn *insn, ir_ref root)
{
	bool is_eq_or_ne = (insn->op == IR_EQ || insn->op == IR_NE);

	if (!is_eq_or_ne) {
		if (insn->op == IR_LT || insn->op == IR_LE) {
			/* swap operands to avoid P flag check */
			ir_swap_ops(insn);
			insn->op ^= 3;
		}
		ir_match_fuse_load(ctx, insn->op2, root);
		return;
	}

	if (IR_IS_CONST_REF(insn->op2) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op2])) {
		return;
	}
	if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		return;
	}
	if ((IR_IS_CONST_REF(insn->op1) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op1]))
	 || ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
		/* only EQ/NE get here, so the predicate is never mirrored on this path */
		switch (insn->op) {
			case IR_EQ:
			case IR_NE:
			case IR_ORDERED:
			case IR_UNORDERED:
				break;
			default:
				insn->op ^= 3;
				break;
		}
	}
}

/*
 * Load fusion for a floating point comparison that feeds a branch.
 *
 * Depending on `direct`, LT/LE (direct) or GT/GE (not direct) are mirrored
 * first by swapping the operands. Then a non-zero constant op2 is left
 * alone, otherwise op2 is tried as a fused load; failing that, the operands
 * are swapped when op1 is a non-zero constant or a fusible load, and
 * asymmetric predicates are mirrored once more.
 */
static void ir_match_fuse_load_cmp_fp_br(ir_ctx *ctx, ir_insn *insn, ir_ref root, bool direct)
{
	bool mirror;

	if (direct) {
		mirror = (insn->op == IR_LT || insn->op == IR_LE);
	} else {
		mirror = (insn->op == IR_GT || insn->op == IR_GE);
	}
	if (mirror) {
		/* swap operands to avoid P flag check */
		ir_swap_ops(insn);
		insn->op ^= 3;
	}

	if (IR_IS_CONST_REF(insn->op2) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op2])) {
		return;
	}
	if (ir_match_try_fuse_load(ctx, insn->op2, root)) {
		return;
	}
	if ((IR_IS_CONST_REF(insn->op1) && !IR_IS_FP_ZERO(ctx->ir_base[insn->op1]))
	 || ir_match_try_fuse_load(ctx, insn->op1, root)) {
		ir_swap_ops(insn);
		switch (insn->op) {
			case IR_EQ:
			case IR_NE:
			case IR_ORDERED:
			case IR_UNORDERED:
				/* symmetric predicates stay as they are */
				break;
			default:
				insn->op ^= 3;
				break;
		}
	}
}

/*
 * True when the `name_len` bytes at `name` are exactly the NUL-terminated
 * string `str`. `str` is evaluated more than once - pass side-effect free
 * expressions only. All arguments are parenthesized so that compound
 * expressions (e.g. "c ? 4 : 5") expand correctly.
 */
#define STR_EQUAL(name, name_len, str) ((name_len) == strlen(str) && memcmp((name), (str), strlen(str)) == 0)

/*
 * True when `proto` (a pointer to a prototype) describes a function that
 * takes exactly one parameter of type `_type` and returns `_type`.
 */
#define IR_IS_FP_FUNC_1(proto, _type)  ((proto)->params_count == 1 && \
                                        (proto)->param_types[0] == (_type) && \
                                        (proto)->ret_type == (_type))

/*
 * Maps a call to a known builtin math function onto a dedicated SSE rule.
 *
 * The function must be flagged as IR_BUILTIN_FUNC, its name must be one of
 * the entries of the table below, and its prototype must take and return
 * exactly one value of the listed floating point type. Returns the rule, or
 * 0 when the call has to be emitted as a regular call.
 */
static uint32_t ir_match_builtin_call(ir_ctx *ctx, const ir_insn *func)
{
	static const struct {
		const char *name;
		uint8_t     type;
		uint32_t    rule;
	} builtins[] = {
		{"sqrt",       IR_DOUBLE, IR_SSE_SQRT},
		{"sqrtf",      IR_FLOAT,  IR_SSE_SQRT},
		{"rint",       IR_DOUBLE, IR_SSE_RINT},
		{"rintf",      IR_FLOAT,  IR_SSE_RINT},
		{"floor",      IR_DOUBLE, IR_SSE_FLOOR},
		{"floorf",     IR_FLOAT,  IR_SSE_FLOOR},
		{"ceil",       IR_DOUBLE, IR_SSE_CEIL},
		{"ceilf",      IR_FLOAT,  IR_SSE_CEIL},
		{"trunc",      IR_DOUBLE, IR_SSE_TRUNC},
		{"truncf",     IR_FLOAT,  IR_SSE_TRUNC},
		{"nearbyint",  IR_DOUBLE, IR_SSE_NEARBYINT},
		{"nearbyintf", IR_FLOAT,  IR_SSE_NEARBYINT},
	};
	const ir_proto_t *proto = (const ir_proto_t *)ir_get_str(ctx, func->proto);
	const char *name;
	size_t name_len, i;

	if (!(proto->flags & IR_BUILTIN_FUNC)) {
		return 0;
	}

	name = ir_get_strl(ctx, func->val.name, &name_len);
	for (i = 0; i < sizeof(builtins) / sizeof(builtins[0]); i++) {
		if (STR_EQUAL(name, name_len, builtins[i].name)) {
			/* names are unique: a prototype mismatch means "not a builtin" */
			if (IR_IS_FP_FUNC_1(proto, builtins[i].type)) {
				return builtins[i].rule;
			}
			return 0;
		}
	}

	return 0;
}

static uint32_t ir_match_insn(ir_ctx *ctx, ir_ref ref)
{
	ir_insn *op2_insn;
	ir_insn *insn = &ctx->ir_base[ref];
	uint32_t store_rule;
	ir_op load_op;

	switch (insn->op) {
		case IR_EQ:
		case IR_NE:
		case IR_LT:
		case IR_GE:
		case IR_LE:
		case IR_GT:
		case IR_ULT:
		case IR_UGE:
		case IR_ULE:
		case IR_UGT:
			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
				if (IR_IS_CONST_REF(insn->op2)
				 && !IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op)
				 && ctx->ir_base[insn->op2].val.i64 == 0
				 && insn->op1 == ref - 1) { /* previous instruction */
					ir_insn *op1_insn = &ctx->ir_base[insn->op1];

					if (op1_insn->op == IR_AND && ctx->use_lists[insn->op1].count == 1) {
						/* v = AND(_, _); CMP(v, 0) => SKIP_TEST; TEST */
						ir_match_fuse_load_test_int(ctx, op1_insn, ref);
						ctx->rules[insn->op1] = IR_FUSED | IR_TEST_INT;
						return IR_TESTCC_INT;
					} else if ((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
							/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
							((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
								(insn->op == IR_EQ || insn->op == IR_NE ||
									insn->op == IR_LT || insn->op == IR_GE))) {
						/* v = BINOP(_, _); CMP(v, 0) => BINOP; SETCC */
						if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
							ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
							ctx->rules[insn->op1] = IR_BINOP_INT | IR_MAY_SWAP;
						} else {
							ir_match_fuse_load(ctx, op1_insn->op2, ref);
							ctx->rules[insn->op1] = IR_BINOP_INT;
						}
						return IR_SETCC_INT;
					}
				}
				ir_match_fuse_load_cmp_int(ctx, insn, ref);
				return IR_CMP_INT;
			} else {
				ir_match_fuse_load_cmp_fp(ctx, insn, ref);
				return IR_CMP_FP;
			}
			break;
		case IR_ORDERED:
		case IR_UNORDERED:
			ir_match_fuse_load_cmp_fp(ctx, insn, ref);
			return IR_CMP_FP;
		case IR_ADD:
		case IR_SUB:
			if (IR_IS_TYPE_INT(insn->type)) {
				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
					op2_insn = &ctx->ir_base[insn->op2];
					if (IR_IS_CONST_REF(insn->op1)) {
						ir_insn *op1_insn = &ctx->ir_base[insn->op1];

						if (insn->op == IR_ADD
						 && IR_IS_SYM_CONST(op1_insn->op)
						 && !IR_IS_SYM_CONST(op2_insn->op)
						 && IR_IS_SIGNED_32BIT((intptr_t)ir_sym_val(ctx, op1_insn) + (intptr_t)op2_insn->val.i64)) {
							return IR_LEA_SYM_O;
						} else if (insn->op == IR_ADD
						 && IR_IS_SYM_CONST(op2_insn->op)
						 && !IR_IS_SYM_CONST(op1_insn->op)
						 && IR_IS_SIGNED_32BIT((intptr_t)ir_sym_val(ctx, op2_insn) + (intptr_t)op1_insn->val.i64)) {
							return IR_LEA_O_SYM;
						}
						// const
						// TODO: add support for sym+offset ???
					} else if (IR_IS_SYM_CONST(op2_insn->op)) {
						if (insn->op == IR_ADD && ir_may_fuse_addr(ctx, op2_insn)) {
							goto lea;
						}
						/* pass */
					} else if (op2_insn->val.i64 == 0) {
						// return IR_COPY_INT;
					} else if ((ir_type_size[insn->type] >= 4 && insn->op == IR_ADD && IR_IS_SIGNED_32BIT(op2_insn->val.i64)) ||
							(ir_type_size[insn->type] >= 4 && insn->op == IR_SUB && IR_IS_SIGNED_NEG_32BIT(op2_insn->val.i64))) {
lea:
						if (ctx->use_lists[insn->op1].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op1)) {
							uint32_t rule = ctx->rules[insn->op1];

							if (!rule) {
								ctx->rules[insn->op1] = rule = ir_match_insn(ctx, insn->op1);
							}
							if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
								/* z = MUL(Y, 2|4|8) ... ADD(z, imm32) => SKIP ... LEA [Y*2|4|8+im32] */
								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
								return IR_LEA_SI_O;
							} else if (rule == IR_LEA_SIB || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SIB)) {
								/* z = ADD(X, MUL(Y, 2|4|8)) ... ADD(z, imm32) => SKIP ... LEA [X+Y*2|4|8+im32] */
								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SIB;
								return IR_LEA_SIB_O;
							} else if (rule == IR_LEA_IB || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_IB)) {
								/* z = ADD(X, Y) ... ADD(z, imm32) => SKIP ... LEA [X+Y+im32] */
								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_IB;
								return IR_LEA_IB_O;
							} else if (rule == IR_LEA_B_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_B_SI)) {
								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_B_SI;
								return IR_LEA_B_SI_O;
							} else if (rule == IR_LEA_SI_B || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI_B)) {
								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI_B;
								return IR_LEA_SI_B_O;
							}
						}
						/* ADD(X, imm32) => LEA [X+imm32] */
						return IR_LEA_OB;
					} else if (op2_insn->val.i64 == 1 || op2_insn->val.i64 == -1) {
						if (insn->op == IR_ADD) {
							if (op2_insn->val.i64 == 1) {
								/* ADD(_, 1) => INC */
								return IR_INC;
						    } else {
								/* ADD(_, -1) => DEC */
								return IR_DEC;
						    }
						} else {
							if (op2_insn->val.i64 == 1) {
								/* SUB(_, 1) => DEC */
								return IR_DEC;
						    } else {
								/* SUB(_, -1) => INC */
								return IR_INC;
						    }
						}
					}
				} else if ((ctx->flags & IR_OPT_CODEGEN) && insn->op == IR_ADD && ir_type_size[insn->type] >= 4) {
					if (insn->op1 != insn->op2) {
						if (ctx->use_lists[insn->op1].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op1)) {
							uint32_t rule =ctx->rules[insn->op1];
							if (!rule) {
								ctx->rules[insn->op1] = rule = ir_match_insn(ctx, insn->op1);
							}
							if (rule == IR_LEA_OB) {
								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
								if (ctx->use_lists[insn->op2].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op2)) {
									rule = ctx->rules[insn->op2];
									if (!rule) {
										ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
									}
									if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
										/* x = ADD(X, imm32) ... y = MUL(Y, 2|4|8) ... ADD(x, y) => SKIP ... SKIP ... LEA */
										ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
										return IR_LEA_OB_SI;
									}
								}
								/* x = ADD(X, imm32) ... ADD(x, Y) => SKIP ... LEA */
								return IR_LEA_OB_I;
							} else if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
								if (ctx->use_lists[insn->op2].count == 1) {
									rule = ctx->rules[insn->op2];
									if (!rule) {
										ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
									}
									if (rule == IR_LEA_OB || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_OB)) {
										/* x = ADD(X, imm32) ... y = MUL(Y, 2|4|8) ... ADD(y, x) => SKIP ... SKIP ... LEA */
										ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
										return IR_LEA_SI_OB;
									}
								}
								/* x = MUL(X, 2|4|8) ... ADD(x, Y) => SKIP ... LEA */
								return IR_LEA_SI_B;
							}
						}
						if (ctx->use_lists[insn->op2].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op2)) {
							uint32_t rule = ctx->rules[insn->op2];
							if (!rule) {
								ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
							}
							if (rule == IR_LEA_OB || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_OB)) {
								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
								/* x = ADD(X, imm32) ... ADD(Y, x) => SKIP ... LEA */
								return IR_LEA_I_OB;
							} else if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
								/* x = MUL(X, 2|4|8) ... ADD(Y, x) => SKIP ... LEA */
								return IR_LEA_B_SI;
							}
						}
					}
					/* ADD(X, Y) => LEA [X + Y] */
					return IR_LEA_IB;
				}
binop_int:
				if (ir_op_flags[insn->op] & IR_OP_FLAG_COMMUTATIVE) {
					ir_match_fuse_load_commutative_int(ctx, insn, ref);
					return IR_BINOP_INT | IR_MAY_SWAP;
				} else {
					ir_match_fuse_load(ctx, insn->op2, ref);
					return IR_BINOP_INT;
				}
			} else {
binop_fp:
				if (ir_op_flags[insn->op] & IR_OP_FLAG_COMMUTATIVE) {
					ir_match_fuse_load_commutative_fp(ctx, insn, ref);
					if (ctx->mflags & IR_X86_AVX) {
						return IR_BINOP_AVX;
					} else {
						return IR_BINOP_SSE2 | IR_MAY_SWAP;
					}
				} else {
					ir_match_fuse_load(ctx, insn->op2, ref);
					if (ctx->mflags & IR_X86_AVX) {
						return IR_BINOP_AVX;
					} else {
						return IR_BINOP_SSE2;
					}
				}
			}
			break;
		case IR_MUL:
			if (IR_IS_TYPE_INT(insn->type)) {
				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
					op2_insn = &ctx->ir_base[insn->op2];
					if (IR_IS_SYM_CONST(op2_insn->op)) {
						/* pass */
					} else if (IR_IS_CONST_REF(insn->op1)) {
						// const
					} else if (op2_insn->val.u64 == 0) {
						// 0
					} else if (op2_insn->val.u64 == 1) {
						// return IR_COPY_INT;
					} else if (ir_type_size[insn->type] >= 4 &&
							(op2_insn->val.u64 == 2 || op2_insn->val.u64 == 4 || op2_insn->val.u64 == 8)) {
						/* MUL(X, 2|4|8) => LEA [X*2|4|8] */
						return IR_LEA_SI;
					} else if (ir_type_size[insn->type] >= 4 &&
							(op2_insn->val.u64 == 3 || op2_insn->val.u64 == 5 || op2_insn->val.u64 == 9)) {
						/* MUL(X, 3|5|9) => LEA [X+X*2|4|8] */
						return IR_LEA_SIB;
					} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
						/* MUL(X, PWR2) => SHL */
						return IR_MUL_PWR2;
					} else if (IR_IS_TYPE_SIGNED(insn->type)
					 && ir_type_size[insn->type] != 1
					 && IR_IS_SIGNED_32BIT(op2_insn->val.i64)
					 && !IR_IS_CONST_REF(insn->op1)) {
						/* MUL(_, imm32) => IMUL */
						ir_match_fuse_load(ctx, insn->op1, ref);
						return IR_IMUL3;
					}
				}
				/* Prefer IMUL over MUL because it's more flexible and uses less registers ??? */
//				if (IR_IS_TYPE_SIGNED(insn->type) && ir_type_size[insn->type] != 1) {
				if (ir_type_size[insn->type] != 1) {
					goto binop_int;
				}
				ir_match_fuse_load(ctx, insn->op2, ref);
				return IR_MUL_INT;
			} else {
				goto binop_fp;
			}
			break;
		case IR_ADD_OV:
		case IR_SUB_OV:
			IR_ASSERT(IR_IS_TYPE_INT(insn->type));
			goto binop_int;
		case IR_MUL_OV:
			IR_ASSERT(IR_IS_TYPE_INT(insn->type));
			if (IR_IS_TYPE_SIGNED(insn->type) && ir_type_size[insn->type] != 1) {
				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
					op2_insn = &ctx->ir_base[insn->op2];
					if (!IR_IS_SYM_CONST(op2_insn->op)
					 && IR_IS_SIGNED_32BIT(op2_insn->val.i64)
					 && !IR_IS_CONST_REF(insn->op1)) {
						/* MUL(_, imm32) => IMUL */
						ir_match_fuse_load(ctx, insn->op1, ref);
						return IR_IMUL3;
					}
				}
				goto binop_int;
			}
			ir_match_fuse_load(ctx, insn->op2, ref);
			return IR_MUL_INT;
		case IR_DIV:
			if (IR_IS_TYPE_INT(insn->type)) {
				if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
					op2_insn = &ctx->ir_base[insn->op2];
					if (IR_IS_SYM_CONST(op2_insn->op)) {
						/* pass */
					} else if (IR_IS_CONST_REF(insn->op1)) {
						// const
					} else if (op2_insn->val.u64 == 1) {
						// return IR_COPY_INT;
					} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
						/* DIV(X, PWR2) => SHR */
						if (IR_IS_TYPE_UNSIGNED(insn->type)) {
							return IR_DIV_PWR2;
						} else {
							return IR_SDIV_PWR2;
						}
					}
				}
				ir_match_fuse_load(ctx, insn->op2, ref);
				return IR_DIV_INT;
			} else {
				goto binop_fp;
			}
			break;
		case IR_MOD:
			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
				op2_insn = &ctx->ir_base[insn->op2];
				if (IR_IS_SYM_CONST(op2_insn->op)) {
					/* pass */
				} else if (IR_IS_CONST_REF(insn->op1)) {
					// const
				} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64)) {
					/* MOD(X, PWR2) => AND */
					if (IR_IS_TYPE_UNSIGNED(insn->type)) {
						return IR_MOD_PWR2;
					} else {
						return IR_SMOD_PWR2;
					}
				}
			}
			ir_match_fuse_load(ctx, insn->op2, ref);
			return IR_MOD_INT;
		case IR_BSWAP:
			IR_ASSERT(IR_IS_TYPE_INT(insn->type));
			return IR_OP_INT;
		case IR_NOT:
			if (insn->type == IR_BOOL) {
				if (ctx->ir_base[insn->op1].type == IR_BOOL) {
					return IR_BOOL_NOT;
				} else {
					IR_ASSERT(IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)); // TODO: IR_BOOL_NOT_FP
					return IR_BOOL_NOT_INT;
				}
			} else {
				IR_ASSERT(IR_IS_TYPE_INT(insn->type));
				return IR_OP_INT;
			}
			break;
		case IR_NEG:
			if (IR_IS_TYPE_INT(insn->type)) {
				return IR_OP_INT;
			} else {
				return IR_OP_FP;
			}
		case IR_ABS:
			if (IR_IS_TYPE_INT(insn->type)) {
				return IR_ABS_INT; // movl %edi, %eax; negl %eax; cmovs %edi, %eax
			} else {
				return IR_OP_FP;
			}
		case IR_OR:
			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
				op2_insn = &ctx->ir_base[insn->op2];
				if (IR_IS_SYM_CONST(op2_insn->op)) {
					/* pass */
				} else if (IR_IS_CONST_REF(insn->op1)) {
					// const
				} else if (op2_insn->val.i64 == 0) {
					// return IR_COPY_INT;
				} else if (op2_insn->val.i64 == -1) {
					// -1
				} else if (IR_IS_POWER_OF_TWO(op2_insn->val.u64) && !IR_IS_SIGNED_32BIT(op2_insn->val.i64)) {
					/* OR(X, PWR2) => BTS */
					return IR_BIT_OP;
				}
			}
			goto binop_int;
		case IR_AND:
			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
				op2_insn = &ctx->ir_base[insn->op2];
				if (IR_IS_SYM_CONST(op2_insn->op)) {
					/* pass */
				} else if (IR_IS_CONST_REF(insn->op1)) {
					// const
				} else if (op2_insn->val.i64 == 0) {
					// 0
				} else if (op2_insn->val.i64 == -1) {
					// return IR_COPY_INT;
				} else if (IR_IS_POWER_OF_TWO(~op2_insn->val.u64) && !IR_IS_SIGNED_32BIT(op2_insn->val.i64)) {
					/* AND(X, ~PWR2) => BTR */
					return IR_BIT_OP;
				}
			}
			goto binop_int;
		case IR_XOR:
			if ((ctx->flags & IR_OPT_CODEGEN) && IR_IS_CONST_REF(insn->op2)) {
				op2_insn = &ctx->ir_base[insn->op2];
				if (IR_IS_SYM_CONST(op2_insn->op)) {
					/* pass */
				} else if (IR_IS_CONST_REF(insn->op1)) {
					// const
				}
			}
			goto binop_int;
		case IR_SHL:
			if (IR_IS_CONST_REF(insn->op2)) {
				if (ctx->flags & IR_OPT_CODEGEN) {
					op2_insn = &ctx->ir_base[insn->op2];
					if (IR_IS_SYM_CONST(op2_insn->op)) {
						/* pass */
					} else if (IR_IS_CONST_REF(insn->op1)) {
						// const
					} else if (op2_insn->val.u64 == 0) {
						// return IR_COPY_INT;
					} else if (ir_type_size[insn->type] >= 4) {
						if (op2_insn->val.u64 == 1) {
							// lea [op1*2]
						} else if (op2_insn->val.u64 == 2) {
							// lea [op1*4]
						} else if (op2_insn->val.u64 == 3) {
							// lea [op1*8]
						}
					}
				}
				return IR_SHIFT_CONST;
			}
			return IR_SHIFT;
		case IR_SHR:
		case IR_SAR:
		case IR_ROL:
		case IR_ROR:
			if (IR_IS_CONST_REF(insn->op2)) {
				if (ctx->flags & IR_OPT_CODEGEN) {
					op2_insn = &ctx->ir_base[insn->op2];
					if (IR_IS_SYM_CONST(op2_insn->op)) {
						/* pass */
					} else if (IR_IS_CONST_REF(insn->op1)) {
						// const
					} else if (op2_insn->val.u64 == 0) {
						// return IR_COPY_INT;
					}
				}
				return IR_SHIFT_CONST;
			}
			return IR_SHIFT;
		case IR_MIN:
		case IR_MAX:
			if (IR_IS_TYPE_INT(insn->type)) {
				return IR_MIN_MAX_INT | IR_MAY_SWAP;
			} else {
				goto binop_fp;
			}
			break;
		case IR_COPY:
			if (IR_IS_TYPE_INT(insn->type)) {
				return IR_COPY_INT | IR_MAY_REUSE;
			} else {
				return IR_COPY_FP | IR_MAY_REUSE;
			}
			break;
		case IR_CALL:
			if (IR_IS_CONST_REF(insn->op2)) {
				const ir_insn *func = &ctx->ir_base[insn->op2];

				if (func->op == IR_FUNC && func->proto) {
					uint32_t rule = ir_match_builtin_call(ctx, func);

					if (rule) {
						return rule;
					}
				}
			}
			ctx->flags2 |= IR_HAS_CALLS | IR_16B_FRAME_ALIGNMENT;
#ifndef IR_REG_FP_RET1
			if (IR_IS_TYPE_FP(insn->type)) {
				ctx->flags2 |= IR_HAS_FP_RET_SLOT;
			}
#endif
			IR_FALLTHROUGH;
		case IR_TAILCALL:
		case IR_IJMP:
			ir_match_fuse_load(ctx, insn->op2, ref);
			return insn->op;
		case IR_VAR:
			return IR_SKIPPED | IR_VAR;
		case IR_PARAM:
#ifndef _WIN64
			if (ctx->value_params && ctx->value_params[insn->op3 - 1].align) {
				return IR_STATIC_ALLOCA;
			}
#endif
			return ctx->use_lists[ref].count > 0 ? IR_PARAM : IR_SKIPPED | IR_PARAM;
		case IR_ALLOCA:
			/* alloca() may be used only in functions */
			if (ctx->flags & IR_FUNCTION) {
				if (IR_IS_CONST_REF(insn->op2) && ctx->cfg_map[ref] == 1) {
					ir_insn *val = &ctx->ir_base[insn->op2];

					if (!IR_IS_SYM_CONST(val->op)) {
						return IR_STATIC_ALLOCA;
					}
				}
				ctx->flags |= IR_USE_FRAME_POINTER;
				ctx->flags2 |= IR_HAS_ALLOCA | IR_16B_FRAME_ALIGNMENT;
			}
			return IR_ALLOCA;
		case IR_VSTORE:
			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
				store_rule = IR_VSTORE_INT;
				load_op = IR_VLOAD;
store_int:
				if ((ctx->flags & IR_OPT_CODEGEN)
				 && ir_in_same_block(ctx, insn->op3)
				 && (ctx->use_lists[insn->op3].count == 1 ||
				     (ctx->use_lists[insn->op3].count == 2
				   && (ctx->ir_base[insn->op3].op == IR_ADD_OV ||
				       ctx->ir_base[insn->op3].op == IR_SUB_OV)))) {
					ir_insn *op_insn = &ctx->ir_base[insn->op3];
					uint32_t rule = ctx->rules[insn->op3];

					if (!rule) {
						ctx->rules[insn->op3] = rule = ir_match_insn(ctx, insn->op3);
					}
					if (((rule & IR_RULE_MASK) == IR_BINOP_INT && op_insn->op != IR_MUL) || rule == IR_LEA_OB || rule == IR_LEA_IB) {
						if (insn->op1 == op_insn->op1
						 && ctx->ir_base[op_insn->op1].op == load_op
						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
						 && ctx->use_lists[op_insn->op1].count == 2) {
							/* l = LOAD(_, a) ... v = BINOP(l, _) ... STORE(l, a, v) => SKIP ... SKIP_MEM_BINOP ... MEM_BINOP */
							ctx->rules[insn->op3] = IR_FUSED | IR_BINOP_INT;
							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
							if (!IR_IS_CONST_REF(op_insn->op2)
							 && ctx->rules[op_insn->op2] == (IR_FUSED|IR_SIMPLE|IR_LOAD)) {
								ctx->rules[op_insn->op2] = IR_LOAD_INT;
							}
							return IR_MEM_BINOP_INT;
						} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
						 && insn->op1 == op_insn->op2
						 && ctx->ir_base[op_insn->op2].op == load_op
						 && ctx->ir_base[op_insn->op2].op2 == insn->op2
						 && ctx->use_lists[op_insn->op2].count == 2) {
							/* l = LOAD(_, a) ... v = BINOP(_, l) ... STORE(l, a, v) => SKIP ... SKIP_MEM_BINOP ... MEM_BINOP */
							ir_swap_ops(op_insn);
							ctx->rules[insn->op3] = IR_FUSED | IR_BINOP_INT;
							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
							return IR_MEM_BINOP_INT;
						}
					} else if (rule == IR_INC) {
						if (insn->op1 == op_insn->op1
						 && ctx->ir_base[op_insn->op1].op == load_op
						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
						 && ctx->use_lists[op_insn->op1].count == 2) {
							/* l = LOAD(_, a) ... v = INC(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_INC */
							ctx->rules[insn->op3] = IR_SKIPPED | IR_INC;
							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
							return IR_MEM_INC;
						}
					} else if (rule == IR_DEC) {
						if (insn->op1 == op_insn->op1
						 && ctx->ir_base[op_insn->op1].op == load_op
						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
						 && ctx->use_lists[op_insn->op1].count == 2){
							/* l = LOAD(_, a) ... v = DEC(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_DEC */
							ctx->rules[insn->op3] = IR_SKIPPED | IR_DEC;
							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
							return IR_MEM_DEC;
						}
					} else if (rule == IR_MUL_PWR2) {
						if (insn->op1 == op_insn->op1
						 && ctx->ir_base[op_insn->op1].op == load_op
						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
						 && ctx->use_lists[op_insn->op1].count == 2) {
							/* l = LOAD(_, a) ... v = MUL_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_MUL_PWR2 */
							ctx->rules[insn->op3] = IR_SKIPPED | IR_MUL_PWR2;
							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
							return IR_MEM_MUL_PWR2;
						}
					} else if (rule == IR_DIV_PWR2) {
						if (insn->op1 == op_insn->op1
						 && ctx->ir_base[op_insn->op1].op == load_op
						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
						 && ctx->use_lists[op_insn->op1].count == 2) {
							/* l = LOAD(_, a) ... v = DIV_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_DIV_PWR2 */
							ctx->rules[insn->op3] = IR_SKIPPED | IR_DIV_PWR2;
							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
							return IR_MEM_DIV_PWR2;
						}
					} else if (rule == IR_MOD_PWR2) {
						if (insn->op1 == op_insn->op1
						 && ctx->ir_base[op_insn->op1].op == load_op
						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
						 && ctx->use_lists[op_insn->op1].count == 2) {
							/* l = LOAD(_, a) ... v = MOD_PWR2(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_MOD_PWR2 */
							ctx->rules[insn->op3] = IR_SKIPPED | IR_MOD_PWR2;
							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
							return IR_MEM_MOD_PWR2;
						}
					} else if (rule == IR_SHIFT) {
						if (insn->op1 == op_insn->op1
						 && ctx->ir_base[op_insn->op1].op == load_op
						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
						 && ctx->use_lists[op_insn->op1].count == 2) {
							/* l = LOAD(_, a) ... v = SHIFT(l, _) ... STORE(l, a, v) => SKIP ... SKIP_SHIFT ... MEM_SHIFT */
							ctx->rules[insn->op3] = IR_FUSED | IR_SHIFT;
							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
							return IR_MEM_SHIFT;
						}
					} else if (rule == IR_SHIFT_CONST) {
						if (insn->op1 == op_insn->op1
						 && ctx->ir_base[op_insn->op1].op == load_op
						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
						 && ctx->use_lists[op_insn->op1].count == 2) {
							/* l = LOAD(_, a) ... v = SHIFT(l, CONST) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_SHIFT_CONST */
							ctx->rules[insn->op3] = IR_SKIPPED | IR_SHIFT_CONST;
							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
							return IR_MEM_SHIFT_CONST;
						}
					} else if (rule == IR_OP_INT && op_insn->op != IR_BSWAP) {
						if (insn->op1 == op_insn->op1
						 && ctx->ir_base[op_insn->op1].op == load_op
						 && ctx->ir_base[op_insn->op1].op2 == insn->op2
						 && ctx->use_lists[op_insn->op1].count == 2) {
							/* l = LOAD(_, a) ... v = OP(l) ... STORE(l, a, v) => SKIP ... SKIP ... MEM_OP */
							ctx->rules[insn->op3] = IR_SKIPPED | IR_OP_INT;
							ctx->rules[op_insn->op1] = IR_SKIPPED | load_op;
							return IR_MEM_OP_INT;
						}
					} else if (rule == IR_CMP_INT && load_op == IR_LOAD) {
						/* c = CMP(_, _) ... STORE(c) => SKIP_CMP ... CMP_AND_STORE_INT */
						ctx->rules[insn->op3] = IR_FUSED | IR_CMP_INT;
						return IR_CMP_AND_STORE_INT;
					}
				}
				return store_rule;
			} else {
				return IR_VSTORE_FP;
			}
			break;
		case IR_LOAD:
			ir_match_fuse_addr(ctx, insn->op2);
			if (IR_IS_TYPE_INT(insn->type)) {
				return IR_LOAD_INT;
			} else {
				return IR_LOAD_FP;
			}
			break;
		case IR_STORE:
			ir_match_fuse_addr(ctx, insn->op2);
			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op3].type)) {
				store_rule = IR_STORE_INT;
				load_op = IR_LOAD;
				goto store_int;
			} else {
				return IR_STORE_FP;
			}
			break;
		case IR_RLOAD:
			if (IR_REGSET_IN(IR_REGSET_UNION((ir_regset)ctx->fixed_regset, IR_REGSET_FIXED), insn->op2)) {
				return IR_SKIPPED | IR_RLOAD;
			}
			return IR_RLOAD;
		case IR_RSTORE:
			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
				if ((ctx->flags & IR_OPT_CODEGEN)
				 && ir_in_same_block(ctx, insn->op2)
				 && ctx->use_lists[insn->op2].count == 1
				 && IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
					ir_insn *op_insn = &ctx->ir_base[insn->op2];

					if (op_insn->op == IR_ADD ||
				        op_insn->op == IR_SUB ||
//				        op_insn->op == IR_MUL ||
				        op_insn->op == IR_OR  ||
				        op_insn->op == IR_AND ||
				        op_insn->op == IR_XOR) {
						if (insn->op1 == op_insn->op1
						 && ctx->ir_base[op_insn->op1].op == IR_RLOAD
						 && ctx->ir_base[op_insn->op1].op2 == insn->op3
						 && ctx->use_lists[op_insn->op1].count == 2) {
							/* l = RLOAD(r) ... v = BINOP(l, _) ... RSTORE(l, r, v) => SKIP ... SKIP_REG_BINOP ... REG_BINOP */
							ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
							ctx->rules[op_insn->op1] = IR_SKIPPED | IR_RLOAD;
							return IR_REG_BINOP_INT;
						} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
						 && insn->op1 == op_insn->op2
						 && ctx->ir_base[op_insn->op2].op == IR_RLOAD
						 && ctx->ir_base[op_insn->op2].op2 == insn->op3
						 && ctx->use_lists[op_insn->op2].count == 2) {
							/* l = RLOAD(r) ... v = BINOP(x, l) ... RSTORE(l, r, v) => SKIP ... SKIP_REG_BINOP ... REG_BINOP */
							ir_swap_ops(op_insn);
							ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
							ctx->rules[op_insn->op1] = IR_SKIPPED | IR_RLOAD;
							return IR_REG_BINOP_INT;
						}
					}
				}
			}
			ir_match_fuse_load(ctx, insn->op2, ref);
			return IR_RSTORE;
		case IR_START:
		case IR_BEGIN:
		case IR_IF_TRUE:
		case IR_IF_FALSE:
		case IR_CASE_VAL:
		case IR_CASE_RANGE:
		case IR_CASE_DEFAULT:
		case IR_MERGE:
		case IR_LOOP_BEGIN:
		case IR_UNREACHABLE:
			return IR_SKIPPED | insn->op;
		case IR_RETURN:
			if (!insn->op2) {
				return IR_RETURN_VOID;
			} else if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
				return IR_RETURN_INT;
			} else {
				return IR_RETURN_FP;
			}
		case IR_IF:
			if (!IR_IS_CONST_REF(insn->op2) && ctx->use_lists[insn->op2].count == 1) {
				op2_insn = &ctx->ir_base[insn->op2];
				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UNORDERED) {
					if (IR_IS_TYPE_INT(ctx->ir_base[op2_insn->op1].type)) {
						if (IR_IS_CONST_REF(op2_insn->op2)
						 && !IR_IS_SYM_CONST(ctx->ir_base[op2_insn->op2].op)
						 && ctx->ir_base[op2_insn->op2].val.i64 == 0
						 && op2_insn->op1 == insn->op2 - 1) { /* previous instruction */
							ir_insn *op1_insn = &ctx->ir_base[op2_insn->op1];

							if (op1_insn->op == IR_AND && ctx->use_lists[op2_insn->op1].count == 1) {
								/* v = AND(_, _); c = CMP(v, 0) ... IF(c) => SKIP_TEST; SKIP ... TEST_AND_BRANCH */
								ir_match_fuse_load_test_int(ctx, op1_insn, ref);
								ctx->rules[op2_insn->op1] = IR_FUSED | IR_TEST_INT;
								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_NOP;
								return IR_TEST_AND_BRANCH_INT;
							} else if (insn->op2 == ref - 1 && /* previous instruction */
									((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
										/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
										((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
											(op2_insn->op == IR_EQ || op2_insn->op == IR_NE ||
												op2_insn->op == IR_LT || op2_insn->op == IR_GE)))) {
								/* v = BINOP(_, _); c = CMP(v, 0) ... IF(c) => BINOP; SKIP_CMP ... JCC */
								if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
									ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
									ctx->rules[op2_insn->op1] = IR_BINOP_INT | IR_MAY_SWAP;
								} else {
									ir_match_fuse_load(ctx, op1_insn->op2, ref);
									ctx->rules[op2_insn->op1] = IR_BINOP_INT;
								}
								ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
								return IR_JCC_INT;
							}
						}
						/* c = CMP(_, _) ... IF(c) => SKIP_CMP ... CMP_AND_BRANCH */
						ir_match_fuse_load_cmp_int(ctx, op2_insn, ref);
						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
						return IR_CMP_AND_BRANCH_INT;
					} else {
						/* c = CMP(_, _) ... IF(c) => SKIP_CMP ... CMP_AND_BRANCH */
						ir_match_fuse_load_cmp_fp_br(ctx, op2_insn, ref, 1);
						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_FP;
						return IR_CMP_AND_BRANCH_FP;
					}
				} else if (op2_insn->op == IR_AND) {
					/* c = AND(_, _) ... IF(c) => SKIP_TEST ... TEST_AND_BRANCH */
					ir_match_fuse_load_test_int(ctx, op2_insn, ref);
					ctx->rules[insn->op2] = IR_FUSED | IR_TEST_INT;
					return IR_TEST_AND_BRANCH_INT;
				} else if (op2_insn->op == IR_OVERFLOW && ir_in_same_block(ctx, insn->op2)) {
					/* c = OVERFLOW(_) ... IF(c) => SKIP_OVERFLOW ... OVERFLOW_AND_BRANCH */
					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
					return IR_OVERFLOW_AND_BRANCH;
				}
			}
			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
				if (insn->op2 == ref - 1) { /* previous instruction */
					op2_insn = &ctx->ir_base[insn->op2];
					if (op2_insn->op == IR_ADD ||
					    op2_insn->op == IR_SUB ||
//					    op2_insn->op == IR_MUL ||
					    op2_insn->op == IR_OR  ||
					    op2_insn->op == IR_AND ||
					    op2_insn->op == IR_XOR) {

							/* v = BINOP(_, _); IF(v) => BINOP; JCC */
						if (ir_op_flags[op2_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
							ir_match_fuse_load_commutative_int(ctx, op2_insn, ref);
							ctx->rules[insn->op2] = IR_BINOP_INT | IR_MAY_SWAP;
						} else {
							ir_match_fuse_load(ctx, op2_insn->op2, ref);
							ctx->rules[insn->op2] = IR_BINOP_INT;
						}
						return IR_JCC_INT;
					}
				} else if ((ctx->flags & IR_OPT_CODEGEN)
				 && insn->op1 == ref - 1 /* previous instruction */
				 && insn->op2 == ref - 2 /* previous instruction */
				 && ctx->use_lists[insn->op2].count == 2
				 && IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
					ir_insn *store_insn = &ctx->ir_base[insn->op1];

					if (store_insn->op == IR_STORE && store_insn->op3 == insn->op2) {
						ir_insn *op_insn = &ctx->ir_base[insn->op2];

						if (op_insn->op == IR_ADD ||
						    op_insn->op == IR_SUB ||
//						    op_insn->op == IR_MUL ||
						    op_insn->op == IR_OR  ||
						    op_insn->op == IR_AND ||
						    op_insn->op == IR_XOR) {
							if (ctx->ir_base[op_insn->op1].op == IR_LOAD
							 && ctx->ir_base[op_insn->op1].op2 == store_insn->op2) {
								if (ir_in_same_block(ctx, op_insn->op1)
								 && ctx->use_lists[op_insn->op1].count == 2
								 && store_insn->op1 == op_insn->op1) {
									/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; JCC */
									ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
									ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
									ir_match_fuse_addr(ctx, store_insn->op2);
									ctx->rules[insn->op1] = IR_MEM_BINOP_INT;
									return IR_JCC_INT;
								}
							} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
							 && ctx->ir_base[op_insn->op2].op == IR_LOAD
							 && ctx->ir_base[op_insn->op2].op2 == store_insn->op2) {
								if (ir_in_same_block(ctx, op_insn->op2)
								 && ctx->use_lists[op_insn->op2].count == 2
								 && store_insn->op1 == op_insn->op2) {
									/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; JCC */
									ir_swap_ops(op_insn);
									ctx->rules[insn->op2] = IR_FUSED | IR_BINOP_INT;
									ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
									ir_match_fuse_addr(ctx, store_insn->op2);
									ctx->rules[insn->op1] = IR_MEM_BINOP_INT;
									return IR_JCC_INT;
								}
							}
						}
					}
				}
				ir_match_fuse_load(ctx, insn->op2, ref);
				return IR_IF_INT;
			} else {
				IR_ASSERT(0 && "NIY IR_IF_FP");
				break;
			}
		case IR_COND:
			if (!IR_IS_CONST_REF(insn->op1) && ctx->use_lists[insn->op1].count == 1) {
				ir_insn *op1_insn = &ctx->ir_base[insn->op1];

				if (op1_insn->op >= IR_EQ && op1_insn->op <= IR_UNORDERED) {
					if (IR_IS_TYPE_INT(ctx->ir_base[op1_insn->op1].type)) {
						ir_match_fuse_load_cmp_int(ctx, op1_insn, ref);
						ctx->rules[insn->op1] = IR_FUSED | IR_CMP_INT;
						return IR_COND_CMP_INT;
					} else {
						ir_match_fuse_load_cmp_fp_br(ctx, op1_insn, ref, 1);
						ctx->rules[insn->op1] = IR_FUSED | IR_CMP_FP;
						return IR_COND_CMP_FP;
					}
				}
			}
			return IR_COND;
		case IR_GUARD:
		case IR_GUARD_NOT:
			if (!IR_IS_CONST_REF(insn->op2) && ctx->use_lists[insn->op2].count == 1) {
				op2_insn = &ctx->ir_base[insn->op2];
				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UNORDERED
					// TODO: register allocator may clobber operands of CMP before they are used in the GUARD_CMP
				 && (insn->op2 == ref - 1 ||
				     (insn->op2 == ctx->prev_ref[ref] - 1
				   && ctx->ir_base[ctx->prev_ref[ref]].op == IR_SNAPSHOT))) {
					if (IR_IS_TYPE_INT(ctx->ir_base[op2_insn->op1].type)) {
						if (IR_IS_CONST_REF(op2_insn->op2)
						 && !IR_IS_SYM_CONST(ctx->ir_base[op2_insn->op2].op)
						 && ctx->ir_base[op2_insn->op2].val.i64 == 0) {
							if (op2_insn->op1 == insn->op2 - 1) { /* previous instruction */
								ir_insn *op1_insn = &ctx->ir_base[op2_insn->op1];

								if ((op1_insn->op == IR_OR || op1_insn->op == IR_AND || op1_insn->op == IR_XOR) ||
										/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
										((op1_insn->op == IR_ADD || op1_insn->op == IR_SUB) &&
											(op2_insn->op == IR_EQ || op2_insn->op == IR_NE ||
												op2_insn->op == IR_LT || op2_insn->op == IR_GE))) {
									if (ir_op_flags[op1_insn->op] & IR_OP_FLAG_COMMUTATIVE) {
										ir_match_fuse_load_commutative_int(ctx, op1_insn, ref);
										ctx->rules[op2_insn->op1] = IR_BINOP_INT | IR_MAY_SWAP;
									} else {
										ir_match_fuse_load(ctx, op1_insn->op2, ref);
										ctx->rules[op2_insn->op1] = IR_BINOP_INT;
									}
									/* v = BINOP(_, _); c = CMP(v, 0) ... IF(c) => BINOP; SKIP_CMP ... GUARD_JCC */
									ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
									return IR_GUARD_JCC_INT;
								}
							} else if ((ctx->flags & IR_OPT_CODEGEN)
							 && op2_insn->op1 == insn->op2 - 2 /* before previous instruction */
							 && ir_in_same_block(ctx, op2_insn->op1)
							 && ctx->use_lists[op2_insn->op1].count == 2) {
								ir_insn *store_insn = &ctx->ir_base[insn->op2 - 1];

								if (store_insn->op == IR_STORE && store_insn->op3 == op2_insn->op1) {
									ir_insn *op_insn = &ctx->ir_base[op2_insn->op1];

									if ((op_insn->op == IR_OR || op_insn->op == IR_AND || op_insn->op == IR_XOR) ||
											/* GT(ADD(_, _), 0) can't be optimized because ADD may overflow */
											((op_insn->op == IR_ADD || op_insn->op == IR_SUB) &&
												(op2_insn->op == IR_EQ || op2_insn->op == IR_NE ||
													op2_insn->op == IR_LT || op2_insn->op == IR_GE))) {
										if (ctx->ir_base[op_insn->op1].op == IR_LOAD
										 && ctx->ir_base[op_insn->op1].op2 == store_insn->op2) {
											if (ir_in_same_block(ctx, op_insn->op1)
											 && ctx->use_lists[op_insn->op1].count == 2
											 && store_insn->op1 == op_insn->op1) {
												/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; GUARD_JCC */
												ctx->rules[op2_insn->op1] = IR_FUSED | IR_BINOP_INT;
												ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
												ir_match_fuse_addr(ctx, store_insn->op2);
												ctx->rules[insn->op2 - 1] = IR_MEM_BINOP_INT;
												ctx->rules[insn->op2] = IR_SKIPPED | IR_NOP;
												return IR_GUARD_JCC_INT;
											}
										} else if ((ir_op_flags[op_insn->op] & IR_OP_FLAG_COMMUTATIVE)
										 && ctx->ir_base[op_insn->op2].op == IR_LOAD
										 && ctx->ir_base[op_insn->op2].op2 == store_insn->op2) {
											if (ir_in_same_block(ctx, op_insn->op2)
											 && ctx->use_lists[op_insn->op2].count == 2
											 && store_insn->op1 == op_insn->op2) {
												/* v = MEM_BINOP(_, _); IF(v) => MEM_BINOP; JCC */
												ir_swap_ops(op_insn);
												ctx->rules[op2_insn->op1] = IR_FUSED | IR_BINOP_INT;
												ctx->rules[op_insn->op1] = IR_SKIPPED | IR_LOAD;
												ir_match_fuse_addr(ctx, store_insn->op2);
												ctx->rules[insn->op2 - 1] = IR_MEM_BINOP_INT;
												ctx->rules[insn->op2] = IR_SKIPPED | IR_NOP;
												return IR_GUARD_JCC_INT;
											}
										}
									}
								}
							}
						}
						/* c = CMP(_, _) ... GUARD(c) => SKIP_CMP ... GUARD_CMP */
						ir_match_fuse_load_cmp_int(ctx, op2_insn, ref);
						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_INT;
						return IR_GUARD_CMP_INT;
					} else {
						/* c = CMP(_, _) ... GUARD(c) => SKIP_CMP ... GUARD_CMP */
						ir_match_fuse_load_cmp_fp_br(ctx, op2_insn, ref, insn->op == IR_GUARD_NOT);
						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_FP;
						return IR_GUARD_CMP_FP;
					}
				} else if (op2_insn->op == IR_AND) { // TODO: OR, XOR. etc
					/* c = AND(_, _) ... GUARD(c) => SKIP_TEST ... GUARD_TEST */
					ir_match_fuse_load_test_int(ctx, op2_insn, ref);
					ctx->rules[insn->op2] = IR_FUSED | IR_TEST_INT;
					return IR_GUARD_TEST_INT;
				} else if (op2_insn->op == IR_OVERFLOW && ir_in_same_block(ctx, insn->op2)) {
					/* c = OVERFLOW(_) ... GUARD(c) => SKIP_OVERFLOW ... GUARD_OVERFLOW */
					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
					return IR_GUARD_OVERFLOW;
				}
			}
			ir_match_fuse_load(ctx, insn->op2, ref);
			return insn->op;
		case IR_INT2FP:
			if (ir_type_size[ctx->ir_base[insn->op1].type] > (IR_IS_TYPE_SIGNED(ctx->ir_base[insn->op1].type) ? 2 : 4)) {
				ir_match_fuse_load(ctx, insn->op1, ref);
			}
			return insn->op;
		case IR_SEXT:
		case IR_ZEXT:
		case IR_FP2INT:
		case IR_FP2FP:
			ir_match_fuse_load(ctx, insn->op1, ref);
			return insn->op;
		case IR_TRUNC:
		case IR_PROTO:
			ir_match_fuse_load(ctx, insn->op1, ref);
			return insn->op | IR_MAY_REUSE;
		case IR_BITCAST:
			ir_match_fuse_load(ctx, insn->op1, ref);
			if (IR_IS_TYPE_INT(insn->type) && IR_IS_TYPE_INT(ctx->ir_base[insn->op1].type)) {
				return insn->op | IR_MAY_REUSE;
			} else {
				return insn->op;
			}
		case IR_CTLZ:
		case IR_CTTZ:
			ir_match_fuse_load(ctx, insn->op1, ref);
			return IR_BIT_COUNT;
		case IR_CTPOP:
			ir_match_fuse_load(ctx, insn->op1, ref);
			return (ctx->mflags & IR_X86_BMI1) ? IR_BIT_COUNT : IR_CTPOP;
		case IR_VA_START:
			ctx->flags2 |= IR_HAS_VA_START;
			if ((ctx->ir_base[insn->op2].op == IR_ALLOCA) || (ctx->ir_base[insn->op2].op == IR_VADDR)) {
				ir_use_list *use_list = &ctx->use_lists[insn->op2];
				ir_ref *p, n = use_list->count;
				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
					ir_insn *use_insn = &ctx->ir_base[*p];
					if (use_insn->op == IR_VA_START || use_insn->op == IR_VA_END) {
					} else if (use_insn->op == IR_VA_COPY) {
						if (use_insn->op3 == insn->op2) {
							ctx->flags2 |= IR_HAS_VA_COPY;
						}
					} else if (use_insn->op == IR_VA_ARG) {
						if (use_insn->op2 == insn->op2) {
							if (IR_IS_TYPE_INT(use_insn->type)) {
								ctx->flags2 |= IR_HAS_VA_ARG_GP;
							} else {
								IR_ASSERT(IR_IS_TYPE_FP(use_insn->type));
								ctx->flags2 |= IR_HAS_VA_ARG_FP;
							}
						}
					} else if (*p > ref) {
						/* direct va_list access */
						ctx->flags2 |= IR_HAS_VA_ARG_GP|IR_HAS_VA_ARG_FP;
					}
				}
			} else {
				/* va_list may escape */
				ctx->flags2 |= IR_HAS_VA_ARG_GP|IR_HAS_VA_ARG_FP;
			}
			return IR_VA_START;
		case IR_VA_END:
			return IR_SKIPPED | IR_NOP;
		case IR_VADDR:
			if (ctx->use_lists[ref].count > 0) {
				ir_use_list *use_list = &ctx->use_lists[ref];
				ir_ref *p, n = use_list->count;

				for (p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
					if (ctx->ir_base[*p].op != IR_VA_END) {
						return IR_STATIC_ALLOCA;
					}
				}
			}
			return IR_SKIPPED | IR_NOP;
		case IR_ARGVAL:
			return IR_FUSED | IR_ARGVAL;
		case IR_NOP:
			return IR_SKIPPED | IR_NOP;
		default:
			break;
	}

	return insn->op;
}

/*
 * Post-matching hook for a single instruction: when "ref" was matched
 * with the IR_LEA_IB rule, let ir_match_try_revert_lea_to_add() reconsider
 * it. Every other rule is left untouched.
 */
static void ir_match_insn2(ir_ctx *ctx, ir_ref ref, uint32_t rule)
{
	if (rule != IR_LEA_IB) {
		return;
	}
	ir_match_try_revert_lea_to_add(ctx, ref);
}

/* code generation */
/*
 * Resolves the spill slot of "ref" into a base register and displacement.
 *
 * The base register is stored into *reg and the displacement relative to
 * that register is returned. Slots whose live interval is flagged
 * IR_LIVE_INTERVAL_SPILL_SPECIAL are addressed through ctx->spill_base with
 * the recorded position used as is; all other slots are addressed through
 * the frame or stack pointer after IR_SPILL_POS_TO_OFFSET() translation.
 */
static int32_t ir_ref_spill_slot_offset(ir_ctx *ctx, ir_ref ref, ir_reg *reg)
{
	ir_live_interval *ival;
	int32_t pos;

	IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
	ival = ctx->live_intervals[ctx->vregs[ref]];
	pos = ival->stack_spill_pos;
	IR_ASSERT(pos != -1);

	if (!(ival->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL)) {
		if (ctx->flags & IR_USE_FRAME_POINTER) {
			*reg = IR_REG_FRAME_POINTER;
		} else {
			*reg = IR_REG_STACK_POINTER;
		}
		return IR_SPILL_POS_TO_OFFSET(pos);
	}

	IR_ASSERT(ctx->spill_base != IR_REG_NONE);
	*reg = ctx->spill_base;
	return pos;
}

/*
 * Builds the memory operand (base register + displacement) of the spill
 * slot assigned to virtual register "v".
 *
 * "Special" slots are relative to ctx->spill_base and use the stored
 * position directly; regular slots are relative to the frame or stack
 * pointer and go through IR_SPILL_POS_TO_OFFSET().
 */
static ir_mem ir_vreg_spill_slot(ir_ctx *ctx, ir_ref v)
{
	ir_live_interval *ival;
	int32_t pos;

	IR_ASSERT(v > 0 && v <= ctx->vregs_count && ctx->live_intervals[v]);
	ival = ctx->live_intervals[v];
	pos = ival->stack_spill_pos;
	IR_ASSERT(pos != -1);

	if (!(ival->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL)) {
		ir_reg frame_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;

		return IR_MEM_BO(frame_reg, IR_SPILL_POS_TO_OFFSET(pos));
	}

	IR_ASSERT(ctx->spill_base != IR_REG_NONE);
	return IR_MEM_BO(ctx->spill_base, pos);
}

/*
 * Returns the spill slot memory operand of a non-constant "ref" by looking
 * up the virtual register assigned to it.
 */
static ir_mem ir_ref_spill_slot(ir_ctx *ctx, ir_ref ref)
{
	ir_ref vreg;

	IR_ASSERT(!IR_IS_CONST_REF(ref));
	vreg = ctx->vregs[ref];
	return ir_vreg_spill_slot(ctx, vreg);
}

/*
 * Tells whether "mem" is exactly the spill slot of "ref", i.e. whether the
 * packed ir_mem values (base, index, scale and offset) are identical.
 */
static bool ir_is_same_spill_slot(ir_ctx *ctx, ir_ref ref, ir_mem mem)
{
	ir_mem slot = ir_ref_spill_slot(ctx, ref);

	if (IR_MEM_VAL(slot) != IR_MEM_VAL(mem)) {
		return 0;
	}
	return 1;
}

/*
 * Returns the memory operand of the stack slot of the IR_VAR instruction
 * "ref". The slot position is taken from the instruction's op3 and is
 * addressed through the frame pointer or the stack pointer.
 */
static ir_mem ir_var_spill_slot(ir_ctx *ctx, ir_ref ref)
{
	ir_insn *var_insn = &ctx->ir_base[ref];
	int32_t disp;

	IR_ASSERT(var_insn->op == IR_VAR);
	disp = IR_SPILL_POS_TO_OFFSET(var_insn->op3);
	if (ctx->flags & IR_USE_FRAME_POINTER) {
		return IR_MEM_BO(IR_REG_FRAME_POINTER, disp);
	}
	return IR_MEM_BO(IR_REG_STACK_POINTER, disp);
}

/*
 * Searches the live interval chain of "ref" for the use position that
 * belongs to instruction "use".
 *
 * Returns true when that use position is the last one in its list, or when
 * the use position following it has op_num == 0. Returns false otherwise,
 * including when no use position for "use" is found.
 */
static bool ir_may_avoid_spill_load(ir_ctx *ctx, ir_ref ref, ir_ref use)
{
	ir_live_interval *ival;
	ir_use_pos *up;

	IR_ASSERT(ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]);
	for (ival = ctx->live_intervals[ctx->vregs[ref]]; ival; ival = ival->next) {
		for (up = ival->use_pos; up; up = up->next) {
			if (IR_LIVE_POS_TO_REF(up->pos) != use) {
				continue;
			}
			if (!up->next) {
				return 1;
			}
			return up->next->op_num == 0;
		}
	}
	return 0;
}

/*
 * Emits code that loads the integer constant "val" into GP register "reg".
 *
 * For 8-byte types (asserted to happen only when pointers are 8 bytes) the
 * encoding is selected by the value range:
 *   - fits in unsigned 32 bits: 32-bit mov (comment in code: zero extended),
 *   - fits in signed 32 bits: 64-bit mov of a 32-bit immediate,
 *   - IR_ADDR accepted by IR_MAY_USE_32BIT_ADDR(): lea of the address,
 *   - otherwise: mov64 with the full immediate.
 * The instructions of these forms sit inside a "|.if X64" DynASM region.
 *
 * For every other type size "val" is truncated to int32_t and emitted
 * through the ASM_REG_IMM_OP macro.
 */
static void ir_emit_mov_imm_int(ir_ctx *ctx, ir_type type, ir_reg reg, int64_t val)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	if (ir_type_size[type] == 8) {
		IR_ASSERT(sizeof(void*) == 8);
|.if X64
		if (IR_IS_UNSIGNED_32BIT(val)) {
			|	mov Rd(reg), (uint32_t)val // zero extended load
		} else if (IR_IS_SIGNED_32BIT(val)) {
			|	mov Rq(reg), (int32_t)val // sign extended load
		} else if (type == IR_ADDR && IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, (intptr_t)val)) {
			|	lea Ra(reg), [&val]
		} else {
			|	mov64 Ra(reg), val
		}
|.endif
	} else {
		|	ASM_REG_IMM_OP mov, type, reg, (int32_t)val // sign extended load
	}
}

/*
 * Loads the integer constant "val" into register "reg".
 *
 * Zero is produced by xor-ing the register with itself; every other value
 * is delegated to ir_emit_mov_imm_int(). "type" must be an integer type.
 */
static void ir_emit_load_imm_int(ir_ctx *ctx, ir_type type, ir_reg reg, int64_t val)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	IR_ASSERT(IR_IS_TYPE_INT(type));
	if (val != 0) {
		ir_emit_mov_imm_int(ctx, type, reg, val);
		return;
	}
	|	ASM_REG_REG_OP xor, type, reg, reg
}

/*
 * Emits an integer "mov" from the memory operand "mem" into register "reg".
 * The instruction is produced by the ASM_REG_MEM_OP macro (defined
 * elsewhere in this file), which receives "type" - presumably to select
 * the operand size.
 */
static void ir_emit_load_mem_int(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	|	ASM_REG_MEM_OP mov, type, reg, mem
}

/*
 * Loads the floating point constant referenced by "src" into FP register
 * "reg".
 *
 * A constant whose bit pattern is all zeroes (u32 for IR_FLOAT, u64 for
 * IR_DOUBLE) is produced by xor-ing the register with itself, using the
 * VEX-encoded form when IR_X86_AVX is set in ctx->mflags. Any other
 * constant is loaded from the label returned by ir_const_label().
 */
static void ir_emit_load_imm_fp(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_insn *insn = &ctx->ir_base[src];
	int label;

	if (type == IR_FLOAT && insn->val.u32 == 0) {
		if (!(ctx->mflags & IR_X86_AVX)) {
			|	xorps xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
		} else {
			|	vxorps xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
		}
		return;
	}

	if (type == IR_DOUBLE && insn->val.u64 == 0) {
		if (!(ctx->mflags & IR_X86_AVX)) {
			|	xorpd xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
		} else {
			|	vxorpd xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST), xmm(reg-IR_REG_FP_FIRST)
		}
		return;
	}

	label = ir_const_label(ctx, src);
	|	ASM_FP_REG_TXT_OP movs, type, reg, [=>label]
}

/*
 * Emits a scalar floating point load from the memory operand "mem" into
 * FP register "reg". The instruction is produced by the ASM_FP_REG_MEM_OP
 * macro (defined elsewhere in this file) from the "movs" mnemonic stem and
 * "type".
 */
static void ir_emit_load_mem_fp(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	|	ASM_FP_REG_MEM_OP movs, type, reg, mem
}

/*
 * Loads a value of "type" from memory operand "mem" into "reg", dispatching
 * to the integer or the floating point loader depending on the type.
 */
static void ir_emit_load_mem(ir_ctx *ctx, ir_type type, ir_reg reg, ir_mem mem)
{
	if (!IR_IS_TYPE_INT(type)) {
		ir_emit_load_mem_fp(ctx, type, reg, mem);
		return;
	}
	ir_emit_load_mem_int(ctx, type, reg, mem);
}

/*
 * Returns the frame-relative offset of the stack storage that belongs to
 * "insn".
 *
 * For IR_PARAM the position comes from ctx->value_params[op3 - 1].offset;
 * for IR_VAR, IR_ALLOCA and IR_VADDR it comes from the instruction's op3.
 * Both are translated with IR_SPILL_POS_TO_OFFSET().
 */
static int32_t ir_local_offset(ir_ctx *ctx, ir_insn *insn)
{
	if (insn->op == IR_PARAM) {
		IR_ASSERT(ctx->value_params && ctx->value_params[insn->op3 - 1].align);
		return IR_SPILL_POS_TO_OFFSET(ctx->value_params[insn->op3 - 1].offset);
	}

	IR_ASSERT(insn->op == IR_VAR || insn->op == IR_ALLOCA || insn->op == IR_VADDR);
	return IR_SPILL_POS_TO_OFFSET(insn->op3);
}

/*
 * Materializes the address of the stack local "src" (matched with the
 * IR_STATIC_ALLOCA rule) in register "reg".
 *
 * For IR_VADDR the offset is taken from the instruction it refers to
 * (op1). A zero offset is emitted as a plain register move from the
 * frame/stack pointer, anything else as "lea".
 */
static void ir_load_local_addr(ir_ctx *ctx, ir_reg reg, ir_ref src)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_insn *local_insn = &ctx->ir_base[src];
	ir_reg base;
	int32_t offset;

	IR_ASSERT(ir_rule(ctx, src) == IR_STATIC_ALLOCA);
	if (local_insn->op == IR_VADDR) {
		local_insn = &ctx->ir_base[local_insn->op1];
	}
	offset = ir_local_offset(ctx, local_insn);

	if (ctx->flags & IR_USE_FRAME_POINTER) {
		base = IR_REG_FRAME_POINTER;
	} else {
		base = IR_REG_STACK_POINTER;
	}

	if (offset != 0) {
		| lea Ra(reg), [Ra(base)+offset]
	} else {
		| mov Ra(reg), Ra(base)
	}
}

/*
 * Loads the value of "src" into register "reg".
 *
 * Non-constant refs: a ref that has a virtual register is reloaded from
 * its spill slot, any other ref is handled as the address of a stack local.
 * Constant refs: FP constants go through ir_emit_load_imm_fp(); integer
 * constants are loaded as immediates, with IR_SYM/IR_FUNC resolved through
 * ir_sym_val() and IR_STR loaded as the address of its constant label.
 */
static void ir_emit_load(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src)
{
	ir_insn *insn;

	if (!IR_IS_CONST_REF(src)) {
		if (ctx->vregs[src]) {
			ir_emit_load_mem(ctx, type, reg, ir_ref_spill_slot(ctx, src));
		} else {
			ir_load_local_addr(ctx, reg, src);
		}
		return;
	}

	if (!IR_IS_TYPE_INT(type)) {
		ir_emit_load_imm_fp(ctx, type, reg, src);
		return;
	}

	insn = &ctx->ir_base[src];
	if (insn->op == IR_STR) {
		ir_backend_data *data = ctx->data;
		dasm_State **Dst = &data->dasm_state;
		int label = ir_const_label(ctx, src);

		|	lea Ra(reg), aword [=>label]
	} else if (insn->op == IR_SYM || insn->op == IR_FUNC) {
		void *addr = ir_sym_val(ctx, insn);

		ir_emit_load_imm_int(ctx, type, reg, (intptr_t)addr);
	} else {
		ir_emit_load_imm_int(ctx, type, reg, insn->val.i64);
	}
}

/*
 * Emits an integer "mov" from register "reg" to the memory operand "mem".
 * The instruction is produced by the ASM_MEM_REG_OP macro (defined
 * elsewhere in this file), which receives "type" - presumably to select
 * the operand size.
 */
static void ir_emit_store_mem_int(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	|	ASM_MEM_REG_OP mov, type, mem, reg
}

/*
 * Emits a scalar floating point store of FP register "reg" to the memory
 * operand "mem". The instruction is produced by the ASM_FP_MEM_REG_OP macro
 * (defined elsewhere in this file) from the "movs" mnemonic stem and "type".
 */
static void ir_emit_store_mem_fp(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	|	ASM_FP_MEM_REG_OP movs, type, mem, reg
}

/*
 * Emits a "mov" of the 32-bit immediate "imm" to the memory operand "mem".
 * The instruction is produced by the ASM_MEM_IMM_OP macro (defined
 * elsewhere in this file), which receives "type" - presumably to select
 * the operand size.
 */
static void ir_emit_store_mem_imm(ir_ctx *ctx, ir_type type, ir_mem mem, int32_t imm)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	|	ASM_MEM_IMM_OP mov, type, mem, imm
}

/*
 * Stores the integer/address constant "src" to the memory operand "mem".
 *
 * - IR_STR: inside the "|.if X64" region the address of the string's
 *   constant label is taken with "lea" into tmp_reg and then stored; the
 *   "|.else" region stores the label through ASM_TMEM_TXT_OP. tmp_reg is
 *   asserted to be available in both configurations.
 * - IR_FUNC / IR_SYM: the value is the address returned by ir_sym_val().
 * - If the constant's own type is at most 4 bytes, or the value fits in a
 *   signed 32-bit immediate, it is stored as an immediate. When "is_arg"
 *   is set and "type" is narrower than 4 bytes, the store is done as IR_U32.
 * - Otherwise the value is first loaded into tmp_reg (which must not be
 *   IR_REG_NONE) and stored from there.
 */
static void ir_emit_store_mem_int_const(ir_ctx *ctx, ir_type type, ir_mem mem, ir_ref src, ir_reg tmp_reg, bool is_arg)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_insn *val_insn = &ctx->ir_base[src];

	IR_ASSERT(IR_IS_CONST_REF(src));
	if (val_insn->op == IR_STR) {
		int label = ir_const_label(ctx, src);

		IR_ASSERT(tmp_reg != IR_REG_NONE);
|.if X64
		|	lea Ra(tmp_reg), aword [=>label]
||		ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
|.else
		|	ASM_TMEM_TXT_OP mov, aword, mem, =>label
|.endif
	} else {
		int64_t val = val_insn->val.i64;

		if (val_insn->op == IR_FUNC || val_insn->op == IR_SYM) {
			val = (int64_t)(intptr_t)ir_sym_val(ctx, val_insn);
		}

		if (ir_type_size[val_insn->type] <= 4 || IR_IS_SIGNED_32BIT(val)) {
			if (is_arg && ir_type_size[type] < 4) {
				type = IR_U32;
			}
			/* "val" is narrowed to the int32_t immediate parameter here */
			ir_emit_store_mem_imm(ctx, type, mem, val);
		} else {
			IR_ASSERT(tmp_reg != IR_REG_NONE);
			tmp_reg = IR_REG_NUM(tmp_reg);
			ir_emit_load_imm_int(ctx, type, tmp_reg, val);
			ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
		}
	}
}

/*
 * Stores the floating point constant "src" to the memory operand "mem".
 *
 * - IR_FLOAT: the 32-bit pattern is stored as an IR_U32 immediate.
 * - Otherwise, when pointers are 8 bytes: an all-zero pattern is stored as
 *   an IR_U64 immediate 0; a non-zero pattern is moved through the GP
 *   register tmp_reg if one was provided.
 * - In all remaining cases the constant is loaded into tmp_fp_reg and
 *   stored as IR_DOUBLE.
 */
static void ir_emit_store_mem_fp_const(ir_ctx *ctx, ir_type type, ir_mem mem, ir_ref src, ir_reg tmp_reg, ir_reg tmp_fp_reg)
{
	ir_val *val = &ctx->ir_base[src].val;

	if (type == IR_FLOAT) {
		ir_emit_store_mem_imm(ctx, IR_U32, mem, val->i32);
		return;
	}

	if (sizeof(void*) == 8) {
		if (val->i64 == 0) {
			ir_emit_store_mem_imm(ctx, IR_U64, mem, 0);
			return;
		}
		if (tmp_reg != IR_REG_NONE) {
			ir_emit_load_imm_int(ctx, IR_U64, tmp_reg, val->i64);
			ir_emit_store_mem_int(ctx, IR_U64, mem, tmp_reg);
			return;
		}
	}

	tmp_fp_reg = IR_REG_NUM(tmp_fp_reg);
	ir_emit_load(ctx, type, tmp_fp_reg, src);
	ir_emit_store_mem_fp(ctx, IR_DOUBLE, mem, tmp_fp_reg);
}

/*
 * Stores register "reg" holding a value of "type" to memory operand "mem",
 * dispatching to the integer or the floating point store depending on the
 * type.
 */
static void ir_emit_store_mem(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg)
{
	if (!IR_IS_TYPE_INT(type)) {
		ir_emit_store_mem_fp(ctx, type, mem, reg);
		return;
	}
	ir_emit_store_mem_int(ctx, type, mem, reg);
}

/*
 * Stores register "reg" into the spill slot that belongs to "dst".
 */
static void ir_emit_store(ir_ctx *ctx, ir_type type, ir_ref dst, ir_reg reg)
{
	ir_mem slot;

	IR_ASSERT(dst >= 0);
	slot = ir_ref_spill_slot(ctx, dst);
	ir_emit_store_mem(ctx, type, slot, reg);
}

/*
 * Emits a register-to-register integer "mov" from "src" to "dst" through
 * the ASM_REG_REG_OP macro (defined elsewhere in this file), which receives
 * "type" - presumably to select the operand size.
 */
static void ir_emit_mov(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	|	ASM_REG_REG_OP mov, type, dst, src
}

#define IR_HAVE_SWAP_INT

/*
 * Emits an "xchg" between the integer registers "dst" and "src" through
 * the ASM_REG_REG_OP macro (defined elsewhere in this file), which receives
 * "type" - presumably to select the operand size.
 */
static void ir_emit_swap(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	|	ASM_REG_REG_OP xchg, type, dst, src
}

/*
 * Moves "src" to "dst", widening values narrower than 4 bytes to 32 bits.
 *
 * Types wider than 2 bytes are copied with a plain "mov". 2-byte and
 * 1-byte values are sign-extended (movsx) for signed types and
 * zero-extended (movzx) for unsigned ones. The signed 2-byte case with both
 * registers being RAX uses "cwde" instead of movsx.
 */
static void ir_emit_mov_ext(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	if (ir_type_size[type] > 2) {
		|	ASM_REG_REG_OP mov, type, dst, src
		return;
	}

	if (ir_type_size[type] == 2) {
		if (!IR_IS_TYPE_SIGNED(type)) {
			|	movzx Rd(dst), Rw(src)
		} else if (dst == IR_REG_RAX && src == IR_REG_RAX) {
			|	cwde
		} else {
			|	movsx Rd(dst), Rw(src)
		}
		return;
	}

	/* remaining case: ir_type_size[type] == 1 */
	if (!IR_IS_TYPE_SIGNED(type)) {
		|	movzx Rd(dst), Rb(src)
	} else {
		|	movsx Rd(dst), Rb(src)
	}
}

/*
 * Emits a register-to-register floating point move from "src" to "dst".
 * The instruction is produced by the ASM_FP_REG_REG_OP macro (defined
 * elsewhere in this file) from the "movap" mnemonic stem and "type".
 */
static void ir_emit_fp_mov(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	|	ASM_FP_REG_REG_OP movap, type, dst, src
}

/*
 * Builds an absolute (no base, no index) memory operand for the constant
 * address "ref".
 *
 * Symbolic constants are resolved through ir_sym_val(); other constants use
 * their 32-bit value. Unless pointers are 4 bytes, the address is asserted
 * to fit in a signed 32-bit displacement.
 */
static ir_mem ir_fuse_addr_const(ir_ctx *ctx, ir_ref ref)
{
	ir_insn *addr_insn = &ctx->ir_base[ref];
	int32_t disp;

	IR_ASSERT(IR_IS_CONST_REF(ref));
	if (!IR_IS_SYM_CONST(addr_insn->op)) {
		IR_ASSERT(sizeof(void*) == 4 || IR_IS_SIGNED_32BIT(addr_insn->val.i64));
		disp = addr_insn->val.i32;
	} else {
		void *addr = ir_sym_val(ctx, addr_insn);

		IR_ASSERT(sizeof(void*) == 4 || IR_IS_SIGNED_32BIT((intptr_t)addr));
		disp = (int32_t)(intptr_t)addr;
	}
	return IR_MEM_O(disp);
}

/*
 * Builds the x86 memory operand (base + index*scale + offset) for the
 * address expression rooted at "ref", which must have been matched with one
 * of the IR_LEA_* rules or with IR_STATIC_ALLOCA.
 *
 * The switch decodes the rule into:
 *   - base_reg_ref / index_reg_ref: which operand slot supplies the base
 *     and index registers, encoded as "instruction ref * sizeof(ir_ref) +
 *     operand number" (IR_UNUSED when absent). The same encoded value is
 *     used below to index ctx->regs and ctx->ir_base as flat arrays and,
 *     divided by sizeof(ir_ref), to index ctx->rules;
 *   - scale: the index multiplier;
 *   - offset_insn: the instruction whose op2 constant contributes to the
 *     displacement (NULL when there is none);
 *   - offset / base_reg: preset when an operand is a stack local
 *     (IR_STATIC_ALLOCA); the base then becomes the frame/stack pointer
 *     and the local's frame offset seeds the displacement.
 *
 * After the switch the registers are resolved (through ir_get_fused_reg()
 * when the owning instruction's rule has IR_FUSED_REG set) and spilled
 * operands are reloaded, so this function may emit load instructions.
 *
 * "root" is only passed on to ir_get_fused_reg().
 */
static ir_mem ir_fuse_addr(ir_ctx *ctx, ir_ref root, ir_ref ref)
{
	uint32_t rule = ctx->rules[ref];
	ir_insn *insn = &ctx->ir_base[ref];
	ir_insn *op1_insn, *op2_insn, *offset_insn;
	ir_ref base_reg_ref, index_reg_ref;
	ir_reg base_reg = IR_REG_NONE, index_reg;
	int32_t offset = 0, scale;

	IR_ASSERT(((rule & IR_RULE_MASK) >= IR_LEA_FIRST &&
			(rule & IR_RULE_MASK) <= IR_LEA_LAST) ||
		rule == IR_STATIC_ALLOCA);
	switch (rule & IR_RULE_MASK) {
		default:
			IR_ASSERT(0);
		case IR_LEA_OB:
			offset_insn = insn;
			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[insn->op1]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
			} else {
				base_reg_ref = ref * sizeof(ir_ref) + 1;
			}
			index_reg_ref = IR_UNUSED;
			scale = 1;
			break;
		case IR_LEA_SI:
			scale = ctx->ir_base[insn->op2].val.i32;
			index_reg_ref = ref * sizeof(ir_ref) + 1;
			base_reg_ref = IR_UNUSED;
			offset_insn = NULL;
			break;
		case IR_LEA_SIB:
			/* the same operand serves as base and index, hence scale is constant - 1 */
			base_reg_ref = index_reg_ref = ref * sizeof(ir_ref) + 1;
			scale = ctx->ir_base[insn->op2].val.i32 - 1;
			offset_insn = NULL;
			break;
		case IR_LEA_IB:
			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[insn->op1]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
				index_reg_ref = ref * sizeof(ir_ref) + 2;
			} else if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[insn->op2]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
				index_reg_ref = ref * sizeof(ir_ref) + 1;
			} else {
				base_reg_ref = ref * sizeof(ir_ref) + 1;
				index_reg_ref = ref * sizeof(ir_ref) + 2;
			}
			offset_insn = NULL;
			scale = 1;
			break;
		case IR_LEA_OB_I:
			op1_insn = &ctx->ir_base[insn->op1];
			offset_insn = op1_insn;
			scale = 1;
			if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[insn->op2]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
				index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
			} else if (ir_rule(ctx, op1_insn->op1) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[op1_insn->op1]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
				index_reg_ref = ref * sizeof(ir_ref) + 2;
			} else {
				base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
				index_reg_ref = ref * sizeof(ir_ref) + 2;
			}
			break;
		case IR_LEA_I_OB:
			op2_insn = &ctx->ir_base[insn->op2];
			offset_insn = op2_insn;
			scale = 1;
			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[insn->op1]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
				index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
			} else if (ir_rule(ctx, op2_insn->op1) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[op2_insn->op1]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
				index_reg_ref = ref * sizeof(ir_ref) + 1;
			} else {
				base_reg_ref = ref * sizeof(ir_ref) + 1;
				index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
			}
			break;
		case IR_LEA_SI_O:
			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
			op1_insn = &ctx->ir_base[insn->op1];
			scale = ctx->ir_base[op1_insn->op2].val.i32;
			offset_insn = insn;
			base_reg_ref = IR_UNUSED;
			break;
		case IR_LEA_SIB_O:
			/* as for IR_LEA_SIB: one operand is both base and index */
			base_reg_ref = index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
			op1_insn = &ctx->ir_base[insn->op1];
			scale = ctx->ir_base[op1_insn->op2].val.i32 - 1;
			offset_insn = insn;
			break;
		case IR_LEA_IB_O:
			op1_insn = &ctx->ir_base[insn->op1];
			offset_insn = insn;
			scale = 1;
			if (ir_rule(ctx, op1_insn->op2) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[op1_insn->op2]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
				index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
			} else if (ir_rule(ctx, op1_insn->op1) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[op1_insn->op1]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
				index_reg_ref = insn->op1 * sizeof(ir_ref) + 2;
			} else {
				base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
				index_reg_ref = insn->op1 * sizeof(ir_ref) + 2;
			}
			break;
		case IR_LEA_OB_SI:
			index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
			op1_insn = &ctx->ir_base[insn->op1];
			offset_insn = op1_insn;
			op2_insn = &ctx->ir_base[insn->op2];
			scale = ctx->ir_base[op2_insn->op2].val.i32;
			if (ir_rule(ctx, op1_insn->op1) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[op1_insn->op1]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
			} else {
				base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
			}
			break;
		case IR_LEA_SI_OB:
			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
			op1_insn = &ctx->ir_base[insn->op1];
			scale = ctx->ir_base[op1_insn->op2].val.i32;
			op2_insn = &ctx->ir_base[insn->op2];
			offset_insn = op2_insn;
			if (ir_rule(ctx, op2_insn->op1) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[op2_insn->op1]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
			} else {
				base_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
			}
			break;
		case IR_LEA_B_SI:
			if (ir_rule(ctx, insn->op1) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[insn->op1]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
			} else {
				base_reg_ref = ref * sizeof(ir_ref) + 1;
			}
			index_reg_ref = insn->op2 * sizeof(ir_ref) + 1;
			op2_insn = &ctx->ir_base[insn->op2];
			scale = ctx->ir_base[op2_insn->op2].val.i32;
			offset_insn = NULL;
			break;
		case IR_LEA_SI_B:
			index_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
			if (ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[insn->op2]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
			} else {
				base_reg_ref = ref * sizeof(ir_ref) + 2;
			}
			op1_insn = &ctx->ir_base[insn->op1];
			scale = ctx->ir_base[op1_insn->op2].val.i32;
			offset_insn = NULL;
			break;
		case IR_LEA_B_SI_O:
			offset_insn = insn;
			op1_insn = &ctx->ir_base[insn->op1];
			if (ir_rule(ctx, op1_insn->op1) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[op1_insn->op1]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
			} else {
				base_reg_ref = insn->op1 * sizeof(ir_ref) + 1;
			}
			index_reg_ref = op1_insn->op2 * sizeof(ir_ref) + 1;
			op2_insn = &ctx->ir_base[op1_insn->op2];
			scale = ctx->ir_base[op2_insn->op2].val.i32;
			break;
		case IR_LEA_SI_B_O:
			offset_insn = insn;
			op1_insn = &ctx->ir_base[insn->op1];
			index_reg_ref = op1_insn->op1 * sizeof(ir_ref) + 1;
			if (ir_rule(ctx, op1_insn->op2) == IR_STATIC_ALLOCA) {
				offset = ir_local_offset(ctx, &ctx->ir_base[op1_insn->op2]);
				base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
				base_reg_ref = IR_UNUSED;
			} else {
				base_reg_ref = insn->op1 * sizeof(ir_ref) + 2;
			}
			/* op1_insn is re-pointed at the scaled-index instruction to read its constant */
			op1_insn = &ctx->ir_base[op1_insn->op1];
			scale = ctx->ir_base[op1_insn->op2].val.i32;
			break;
		case IR_LEA_SYM_O:
			/* symbol address + constant, folded into the 32-bit offset */
			op1_insn = &ctx->ir_base[insn->op1];
			op2_insn = &ctx->ir_base[insn->op2];
			offset = (intptr_t)ir_sym_val(ctx, op1_insn) + (intptr_t)op2_insn->val.i64;
			base_reg_ref = index_reg_ref = IR_UNUSED;
			scale = 1;
			offset_insn = NULL;
			break;
		case IR_LEA_O_SYM:
			/* constant + symbol address, folded into the 32-bit offset */
			op1_insn = &ctx->ir_base[insn->op1];
			op2_insn = &ctx->ir_base[insn->op2];
			offset = (intptr_t)ir_sym_val(ctx, op2_insn) + (intptr_t)op1_insn->val.i64;
			base_reg_ref = index_reg_ref = IR_UNUSED;
			scale = 1;
			offset_insn = NULL;
			break;
		case IR_ALLOCA:
			/* plain stack local: frame/stack pointer + local offset */
			offset = ir_local_offset(ctx, insn);
			base_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
			base_reg_ref = index_reg_ref = IR_UNUSED;
			scale = 1;
			offset_insn = NULL;
			break;
	}

	/* add the constant operand (op2 of offset_insn) to the displacement */
	if (offset_insn) {
		ir_insn *addr_insn = &ctx->ir_base[offset_insn->op2];

		if (IR_IS_SYM_CONST(addr_insn->op)) {
			void *addr = ir_sym_val(ctx, addr_insn);
			IR_ASSERT(sizeof(void*) != 8 || IR_IS_SIGNED_32BIT((intptr_t)addr));
			offset += (int64_t)(intptr_t)(addr);
		} else {
			if (offset_insn->op == IR_SUB) {
				offset -= addr_insn->val.i32;
			} else {
				offset += addr_insn->val.i32;
			}
		}
	}

	/* resolve the base register, reloading the operand if it was spilled */
	if (base_reg_ref) {
		if (UNEXPECTED(ctx->rules[base_reg_ref / sizeof(ir_ref)] & IR_FUSED_REG)) {
			base_reg = ir_get_fused_reg(ctx, root, base_reg_ref);
		} else {
			base_reg = ((int8_t*)ctx->regs)[base_reg_ref];
		}
		IR_ASSERT(base_reg != IR_REG_NONE);
		if (IR_REG_SPILLED(base_reg)) {
			base_reg = IR_REG_NUM(base_reg);
			ir_emit_load(ctx, insn->type, base_reg, ((ir_ref*)ctx->ir_base)[base_reg_ref]);
		}
	}

	/* resolve the index register; reuse base_reg when both name the same ref */
	index_reg = IR_REG_NONE;
	if (index_reg_ref) {
		if (base_reg_ref
			&& ((ir_ref*)ctx->ir_base)[index_reg_ref]
				== ((ir_ref*)ctx->ir_base)[base_reg_ref]) {
			index_reg = base_reg;
		} else {
			if (UNEXPECTED(ctx->rules[index_reg_ref / sizeof(ir_ref)] & IR_FUSED_REG)) {
				index_reg = ir_get_fused_reg(ctx, root, index_reg_ref);
			} else {
				index_reg = ((int8_t*)ctx->regs)[index_reg_ref];
			}
			IR_ASSERT(index_reg != IR_REG_NONE);
			if (IR_REG_SPILLED(index_reg)) {
				index_reg = IR_REG_NUM(index_reg);
				ir_emit_load(ctx, insn->type, index_reg, ((ir_ref*)ctx->ir_base)[index_reg_ref]);
			}
		}
	}

	return IR_MEM(base_reg, offset, index_reg, scale);
}

/*
 * Builds the memory operand for the address operand (op2) of "mem_insn".
 *
 * When a register was assigned to the address ("reg" is not IR_REG_NONE)
 * the operand is simply [reg], with the address reloaded first if the
 * register is marked as spilled. Without a register the address is either
 * a constant (absolute operand) or a fused address expression.
 */
static ir_mem ir_fuse_mem(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_insn *mem_insn, ir_reg reg)
{
	if (reg == IR_REG_NONE) {
		if (IR_IS_CONST_REF(mem_insn->op2)) {
			return ir_fuse_addr_const(ctx, mem_insn->op2);
		}
		return ir_fuse_addr(ctx, root, mem_insn->op2);
	}

	if (IR_REG_SPILLED(reg)) {
		reg = IR_REG_NUM(reg);
		ir_emit_load(ctx, IR_ADDR, reg, mem_insn->op2);
	}
	return IR_MEM_B(reg);
}

/*
 * Returns the memory operand addressed by the IR_LOAD instruction "ref" so
 * that the load can be folded into the instruction emitted for "root".
 *
 * The address register is the one recorded for operand 2 of the load, or
 * the one returned by ir_get_fused_reg() when the load's rule carries
 * IR_FUSED_REG.
 */
static ir_mem ir_fuse_load(ir_ctx *ctx, ir_ref root, ir_ref ref)
{
	ir_insn *load_insn = &ctx->ir_base[ref];
	ir_reg addr_reg;

	IR_ASSERT(load_insn->op == IR_LOAD);
	addr_reg = ctx->regs[ref][2];
	if (UNEXPECTED(ctx->rules[ref] & IR_FUSED_REG)) {
		addr_reg = ir_get_fused_reg(ctx, root, ref * sizeof(ir_ref) + 2);
	}
	return ir_fuse_mem(ctx, root, ref, load_insn, addr_reg);
}

/*
 * Returns the constant "ref" as a 32-bit immediate.
 *
 * Symbolic constants are resolved with ir_sym_val() and asserted to fit in
 * a signed 32-bit value. Other constants return their i32 field; they are
 * asserted to be either 4 bytes wide or to fit in a signed 32-bit value.
 */
static int32_t ir_fuse_imm(ir_ctx *ctx, ir_ref ref)
{
	ir_insn *val_insn = &ctx->ir_base[ref];
	void *addr;

	IR_ASSERT(IR_IS_CONST_REF(ref));
	if (!IR_IS_SYM_CONST(val_insn->op)) {
		IR_ASSERT(ir_type_size[val_insn->type] == 4 || IR_IS_SIGNED_32BIT(val_insn->val.i64));
		return val_insn->val.i32;
	}

	addr = ir_sym_val(ctx, val_insn);
	IR_ASSERT(IR_IS_SIGNED_32BIT((intptr_t)addr));
	return (int32_t)(intptr_t)addr;
}

/*
 * Variant of ir_emit_load() that is also able to load from a fused IR_LOAD.
 *
 * Constants are handled exactly like in ir_emit_load(). A non-constant
 * "src" matched as IR_STATIC_ALLOCA yields the address of the stack local;
 * otherwise the value is read from memory - from the fused load's address
 * when the rule of "src" has IR_FUSED set ("root" is the instruction the
 * load is fused into), or from the spill slot of "src".
 */
static void ir_emit_load_ex(ir_ctx *ctx, ir_type type, ir_reg reg, ir_ref src, ir_ref root)
{
	ir_insn *insn;

	if (!IR_IS_CONST_REF(src)) {
		ir_mem mem;

		if (ir_rule(ctx, src) == IR_STATIC_ALLOCA) {
			ir_load_local_addr(ctx, reg, src);
			return;
		}
		if (ir_rule(ctx, src) & IR_FUSED) {
			mem = ir_fuse_load(ctx, root, src);
		} else {
			mem = ir_ref_spill_slot(ctx, src);
		}
		ir_emit_load_mem(ctx, type, reg, mem);
		return;
	}

	if (!IR_IS_TYPE_INT(type)) {
		ir_emit_load_imm_fp(ctx, type, reg, src);
		return;
	}

	insn = &ctx->ir_base[src];
	if (insn->op == IR_STR) {
		ir_backend_data *data = ctx->data;
		dasm_State **Dst = &data->dasm_state;
		int label = ir_const_label(ctx, src);

		|	lea Ra(reg), aword [=>label]
	} else if (insn->op == IR_SYM || insn->op == IR_FUNC) {
		void *addr = ir_sym_val(ctx, insn);

		ir_emit_load_imm_int(ctx, type, reg, (intptr_t)addr);
	} else {
		ir_emit_load_imm_int(ctx, type, reg, insn->val.i64);
	}
}

/*
 * Emits the function prologue:
 *   1. optional frame pointer setup (push RBP; mov RBP, RSP);
 *   2. push of every used preserved GP register;
 *   3. stack frame allocation ("sub RSP, offset"), skipped when
 *      ctx->fixed_stack_red_zone is set;
 *   4. save of every used preserved FP register into the frame;
 *   5. for vararg functions that use va_start: spilling of the argument
 *      registers (Win64: the four integer argument registers; other x64
 *      targets: the unnamed GP registers and, guarded by a test of AL, the
 *      unnamed FP registers).
 *
 * "offset" starts as the full frame size and is decremented by
 * sizeof(void*) for every pushed GP register, so step 3 only allocates the
 * remainder; the same variable then tracks the FP save slots in step 4.
 */
static void ir_emit_prologue(ir_ctx *ctx)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	int offset = ctx->stack_frame_size + ctx->call_stack_size;

	if (ctx->flags & IR_USE_FRAME_POINTER) {
		|	push Ra(IR_REG_RBP)
		|	mov Ra(IR_REG_RBP), Ra(IR_REG_RSP)
	}
	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP)) {
		int i;
		ir_regset used_preserved_regs = IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP);

		for (i = IR_REG_GP_FIRST; i <= IR_REG_GP_LAST; i++) {
			if (IR_REGSET_IN(used_preserved_regs, i)) {
				offset -= sizeof(void*);
				|	push Ra(i)
			}
		}
	}
	if (ctx->stack_frame_size + ctx->call_stack_size) {
		if (ctx->fixed_stack_red_zone) {
			IR_ASSERT(ctx->stack_frame_size + ctx->call_stack_size <= ctx->fixed_stack_red_zone);
		} else if (offset) {
			|	sub Ra(IR_REG_RSP), offset
		}
	}
	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_FP)) {
		ir_reg fp;
		int i;
		ir_regset used_preserved_regs = IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_FP);

		if (ctx->flags & IR_USE_FRAME_POINTER) {
			fp = IR_REG_FRAME_POINTER;
			/* rebase: with a frame pointer the slots are addressed at negative offsets */
			offset -= ctx->stack_frame_size + ctx->call_stack_size;
		} else {
			fp = IR_REG_STACK_POINTER;
		}
		for (i = IR_REG_FP_FIRST; i <= IR_REG_FP_LAST; i++) {
			if (IR_REGSET_IN(used_preserved_regs, i)) {
				offset -= sizeof(void*);
				if (ctx->mflags & IR_X86_AVX) {
					|	vmovsd qword [Ra(fp)+offset], xmm(i-IR_REG_FP_FIRST)
				} else {
					|	movsd qword [Ra(fp)+offset], xmm(i-IR_REG_FP_FIRST)
				}
			}
		}
	}
	if ((ctx->flags & IR_VARARG_FUNC) && (ctx->flags2 & IR_HAS_VA_START)) {
#if defined(_WIN64)
		/* note: these locals shadow the function-level "offset" */
		ir_reg fp;
		int offset;

		if (ctx->flags & IR_USE_FRAME_POINTER) {
			fp = IR_REG_FRAME_POINTER;
			offset = sizeof(void*) * 2;
		} else {
			fp = IR_REG_STACK_POINTER;
			offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*);
		}
		|	mov [Ra(fp)+offset], Ra(IR_REG_INT_ARG1)
		|	mov [Ra(fp)+offset+8], Ra(IR_REG_INT_ARG2)
		|	mov [Ra(fp)+offset+16], Ra(IR_REG_INT_ARG3)
		|	mov [Ra(fp)+offset+24], Ra(IR_REG_INT_ARG4)
#elif defined(IR_TARGET_X64)
|.if X64
		/* note: these locals shadow the function-level "offset" */
		const int8_t *int_reg_params = _ir_int_reg_params;
		const int8_t *fp_reg_params = _ir_fp_reg_params;
		uint32_t i;
		ir_reg fp;
		int offset;

		if (ctx->flags & IR_USE_FRAME_POINTER) {
			fp = IR_REG_FRAME_POINTER;

			offset = -(ctx->stack_frame_size - ctx->stack_frame_alignment - ctx->locals_area_size);
		} else {
			fp = IR_REG_STACK_POINTER;
			offset = ctx->locals_area_size + ctx->call_stack_size;
		}

		if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
			/* skip named args */
			offset += sizeof(void*) * ctx->gp_reg_params;
			for (i = ctx->gp_reg_params; i < IR_REG_INT_ARGS; i++) {
				|	mov qword [Ra(fp)+offset], Rq(int_reg_params[i])
				offset += sizeof(void*);
			}
		}
		if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
			/* the FP register stores are jumped over when AL is zero */
			|	test al, al
			|	je	>1
			/* skip named args */
			offset += 16 * ctx->fp_reg_params;
			for (i = ctx->fp_reg_params; i < IR_REG_FP_ARGS; i++) {
				|	movaps [Ra(fp)+offset], xmm(fp_reg_params[i]-IR_REG_FP_FIRST)
				offset += 16;
			}
			|1:
		}
|.endif
#endif
	}
}

/*
 * Emits the function epilogue, undoing what ir_emit_prologue() set up:
 *   1. reloads every used preserved FP register from its save slot;
 *   2. if preserved GP registers were pushed: moves RSP to the lowest
 *      pushed register, pops them in reverse order and, with a frame
 *      pointer, pops RBP;
 *   3. otherwise: restores RSP/RBP from the frame pointer, or releases the
 *      stack frame with "add RSP, size" (nothing is emitted when
 *      ctx->fixed_stack_red_zone is set).
 *
 * No return instruction is emitted here.
 */
static void ir_emit_epilogue(ir_ctx *ctx)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_FP)) {
		int i;
		int offset;
		ir_reg fp;
		ir_regset used_preserved_regs = (ir_regset)ctx->used_preserved_regs;

		if (ctx->flags & IR_USE_FRAME_POINTER) {
			fp = IR_REG_FRAME_POINTER;
			offset = 0;
		} else {
			fp = IR_REG_STACK_POINTER;
			offset = ctx->stack_frame_size + ctx->call_stack_size;
		}
		/*
		 * Walk all preserved registers (GP and FP): every one of them owns
		 * a sizeof(void*) slot, but only FP registers are reloaded here;
		 * GP registers just advance the offset past their pushed slots.
		 */
		for (i = 0; i < IR_REG_NUM; i++) {
			if (IR_REGSET_IN(used_preserved_regs, i)) {
				offset -= sizeof(void*);
				if (i >= IR_REG_FP_FIRST) {
					if (ctx->mflags & IR_X86_AVX) {
						|	vmovsd xmm(i-IR_REG_FP_FIRST), qword [Ra(fp)+offset]
					} else {
						|	movsd xmm(i-IR_REG_FP_FIRST), qword [Ra(fp)+offset]
					}
				}
			}
		}
	}

	if (IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP)) {
		int i;
		ir_regset used_preserved_regs = IR_REGSET_INTERSECTION((ir_regset)ctx->used_preserved_regs, IR_REGSET_GP);
		int offset;

		if (ctx->flags & IR_USE_FRAME_POINTER) {
			offset = 0;
		} else {
			offset = ctx->stack_frame_size + ctx->call_stack_size;
		}
		/* the pushed GP registers occupy the top of the frame: exclude them */
		for (i = IR_REG_GP_LAST; i >= IR_REG_GP_FIRST; i--) {
			if (IR_REGSET_IN(used_preserved_regs, i)) {
				offset -= sizeof(void*);
			}
		}
		/* position RSP at the last pushed register */
		if (ctx->flags & IR_USE_FRAME_POINTER) {
			|	lea Ra(IR_REG_RSP), [Ra(IR_REG_RBP)+offset]
		} else if (offset) {
			|	add Ra(IR_REG_RSP), offset
		}
		/* pop in the reverse of the prologue's push order */
		for (i = IR_REG_GP_LAST; i >= IR_REG_GP_FIRST; i--) {
			if (IR_REGSET_IN(used_preserved_regs, i)) {
				|	pop Ra(i)
			}
		}
		if (ctx->flags & IR_USE_FRAME_POINTER) {
			|	pop Ra(IR_REG_RBP)
		}
	} else if (ctx->flags & IR_USE_FRAME_POINTER) {
		|	mov Ra(IR_REG_RSP), Ra(IR_REG_RBP)
		|	pop Ra(IR_REG_RBP)
	} else if (ctx->stack_frame_size + ctx->call_stack_size) {
		if (ctx->fixed_stack_red_zone) {
			IR_ASSERT(ctx->stack_frame_size + ctx->call_stack_size <= ctx->fixed_stack_red_zone);
		} else {
			|	add Ra(IR_REG_RSP), (ctx->stack_frame_size + ctx->call_stack_size)
		}
	}
}

/*
 * Emits a two-operand integer instruction for ADD, SUB, MUL (and their _OV
 * variants), OR, AND and XOR: def_reg = op1 <op> op2.
 *
 * op1 is first brought into def_reg (reloaded from its spill slot and/or
 * copied when it lives elsewhere). op2 is then applied in one of three
 * forms: from a register, as a 32-bit immediate when it is a constant
 * without a register, or directly from memory (a fused load or the spill
 * slot of op2). Finally the result is written back to the spill slot of
 * "def" if its register is marked as spilled.
 */
static void ir_emit_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_ref op2 = insn->op2;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	ir_reg op2_reg = ctx->regs[def][2];

	IR_ASSERT(def_reg != IR_REG_NONE);

	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	if (def_reg != op1_reg) {
		if (op1_reg != IR_REG_NONE) {
			ir_emit_mov(ctx, type, def_reg, op1_reg);
		} else {
			ir_emit_load(ctx, type, def_reg, op1);
		}
		/* both operands are the same value, which is now in def_reg */
		if (op1 == op2) {
			op2_reg = def_reg;
		}
	}

	if (op2_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			/* when op1 == op2 the reload is skipped here */
			if (op1 != op2) {
				ir_emit_load(ctx, type, op2_reg, op2);
			}
		}
		switch (insn->op) {
			default:
				/* no break after the assert: control continues into the ADD case */
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
			case IR_ADD_OV:
				|	ASM_REG_REG_OP add, type, def_reg, op2_reg
				break;
			case IR_SUB:
			case IR_SUB_OV:
				|	ASM_REG_REG_OP sub, type, def_reg, op2_reg
				break;
			case IR_MUL:
			case IR_MUL_OV:
				|	ASM_REG_REG_MUL imul, type, def_reg, op2_reg
				break;
			case IR_OR:
				|	ASM_REG_REG_OP or, type, def_reg, op2_reg
				break;
			case IR_AND:
				|	ASM_REG_REG_OP and, type, def_reg, op2_reg
				break;
			case IR_XOR:
				|	ASM_REG_REG_OP xor, type, def_reg, op2_reg
				break;
		}
	} else if (IR_IS_CONST_REF(op2)) {
		int32_t val = ir_fuse_imm(ctx, op2);

		switch (insn->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
			case IR_ADD_OV:
				|	ASM_REG_IMM_OP add, type, def_reg, val
				break;
			case IR_SUB:
			case IR_SUB_OV:
				|	ASM_REG_IMM_OP sub, type, def_reg, val
				break;
			case IR_MUL:
			case IR_MUL_OV:
				|	ASM_REG_IMM_MUL imul, type, def_reg, val
				break;
			case IR_OR:
				|	ASM_REG_IMM_OP or, type, def_reg, val
				break;
			case IR_AND:
				|	ASM_REG_IMM_OP and, type, def_reg, val
				break;
			case IR_XOR:
				|	ASM_REG_IMM_OP xor, type, def_reg, val
				break;
		}
	} else {
		ir_mem mem;

		/* op2 is read straight from memory: a fused load or its spill slot */
		if (ir_rule(ctx, op2) & IR_FUSED) {
			mem = ir_fuse_load(ctx, def, op2);
		} else {
			mem = ir_ref_spill_slot(ctx, op2);
		}
		switch (insn->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
			case IR_ADD_OV:
				|	ASM_REG_MEM_OP add, type, def_reg, mem
				break;
			case IR_SUB:
			case IR_SUB_OV:
				|	ASM_REG_MEM_OP sub, type, def_reg, mem
				break;
			case IR_MUL:
			case IR_MUL_OV:
				|	ASM_REG_MEM_MUL imul, type, def_reg, mem
				break;
			case IR_OR:
				|	ASM_REG_MEM_OP or, type, def_reg, mem
				break;
			case IR_AND:
				|	ASM_REG_MEM_OP and, type, def_reg, mem
				break;
			case IR_XOR:
				|	ASM_REG_MEM_OP xor, type, def_reg, mem
				break;
		}
	}
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emit the three-operand IMUL form: def = op1 * imm, where the immediate is
 * obtained from op2 through ir_fuse_imm(). op1 is read from its register
 * when it has one, otherwise straight from memory (a fused load or its
 * spill slot). The result is written back if def itself is spilled.
 */
static void ir_emit_imul3(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref src_ref = insn->op1;
	ir_reg dst_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg src_reg = ctx->regs[def][1];
	int32_t imm_val = ir_fuse_imm(ctx, insn->op2);

	IR_ASSERT(dst_reg != IR_REG_NONE);
	IR_ASSERT(!IR_IS_CONST_REF(src_ref));

	if (src_reg == IR_REG_NONE) {
		/* The multiplicand has no register: use the memory-operand form. */
		ir_mem src_mem;

		if (ir_rule(ctx, src_ref) & IR_FUSED) {
			src_mem = ir_fuse_load(ctx, def, src_ref);
		} else {
			src_mem = ir_ref_spill_slot(ctx, src_ref);
		}
		|	ASM_REG_MEM_TXT_MUL imul, type, dst_reg, src_mem, imm_val
	} else {
		if (IR_REG_SPILLED(src_reg)) {
			/* Bring the spilled multiplicand back into its register first. */
			src_reg = IR_REG_NUM(src_reg);
			ir_emit_load(ctx, type, src_reg, src_ref);
		}
		switch (ir_type_size[type]) {
			default:
				IR_ASSERT(0);
			case 2:
				|	imul Rw(dst_reg), Rw(src_reg), imm_val
				break;
			case 4:
				|	imul Rd(dst_reg), Rd(src_reg), imm_val
				break;
|.if X64
||			case 8:
|				imul Rq(dst_reg), Rq(src_reg), imm_val
||				break;
|.endif
		}
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, dst_reg);
	}
}

/*
 * Emit integer MIN/MAX as "cmp def, op2" followed by a conditional move.
 *
 * op1 is first copied (or loaded) into def_reg; op2 must have a register.
 * The cmov condition is chosen from the operation (IR_MIN/IR_MAX) and the
 * signedness of the type: MIN replaces def with op2 when def is greater
 * (cmovg/cmova), MAX when def is smaller (cmovl/cmovb).
 *
 * When both operands are the same reference, def_reg already holds the
 * answer after the initial copy, so the compare/select pair is omitted.
 * The spill store at the end is still performed in that case; returning
 * early would leave a spilled def without its value in memory.
 */
static void ir_emit_min_max_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_ref op2 = insn->op2;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	ir_reg op2_reg = ctx->regs[def][2];

	IR_ASSERT(def_reg != IR_REG_NONE && op2_reg != IR_REG_NONE);

	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	if (def_reg != op1_reg) {
		if (op1_reg != IR_REG_NONE) {
			ir_emit_mov(ctx, type, def_reg, op1_reg);
		} else {
			ir_emit_load(ctx, type, def_reg, op1);
		}
	}

	if (IR_REG_SPILLED(op2_reg)) {
		op2_reg = IR_REG_NUM(op2_reg);
		/* No need to reload op2 when it is the same value as op1. */
		if (op1 != op2) {
			ir_emit_load(ctx, type, op2_reg, op2);
		}
	}

	if (op1 != op2) {
		|	ASM_REG_REG_OP cmp, type, def_reg, op2_reg
		if (insn->op == IR_MIN) {
			if (IR_IS_TYPE_SIGNED(type)) {
				|	ASM_REG_REG_OP2 cmovg, type, def_reg, op2_reg
			} else {
				|	ASM_REG_REG_OP2 cmova, type, def_reg, op2_reg
			}
		} else {
			IR_ASSERT(insn->op == IR_MAX);
			if (IR_IS_TYPE_SIGNED(type)) {
				|	ASM_REG_REG_OP2 cmovl, type, def_reg, op2_reg
			} else {
				|	ASM_REG_REG_OP2 cmovb, type, def_reg, op2_reg
			}
		}
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Materialize an overflow condition into the low byte of def's register.
 * The flag is chosen from the type of the instruction referenced by op1:
 * "seto" (overflow flag) for signed types, "setc" (carry flag) otherwise.
 * The result is stored with insn->type when def is spilled.
 */
static void ir_emit_overflow(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg flag_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_type arith_type = ctx->ir_base[insn->op1].type;

	IR_ASSERT(flag_reg != IR_REG_NONE);
	IR_ASSERT(IR_IS_TYPE_INT(arith_type));

	if (!IR_IS_TYPE_SIGNED(arith_type)) {
		|	setc Rb(flag_reg)
	} else {
		|	seto Rb(flag_reg)
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, insn->type, def, flag_reg);
	}
}

/*
 * Emit a branch on the overflow condition of the instruction referenced
 * through insn->op2 (its op1 supplies the operand type).
 *
 * The true/false successors of block b are obtained from
 * ir_get_true_false_blocks(). If the true successor is next_block, the
 * condition is inverted and the jump goes to the false successor instead;
 * if the false successor is next_block, no trailing "jmp" is emitted.
 * Signed types test the overflow flag (jo/jno), other types the carry
 * flag (jc/jnc).
 */
static void ir_emit_overflow_and_branch(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_insn *ov_insn = &ctx->ir_base[insn->op2];
	ir_type type = ctx->ir_base[ov_insn->op1].type;
	uint32_t on_true, on_false;
	uint32_t jcc_target, jmp_target;
	bool inverted;

	ir_get_true_false_blocks(ctx, b, &on_true, &on_false);

	inverted = (on_true == next_block);
	if (inverted) {
		/* The true successor is reached by falling through. */
		jcc_target = on_false;
		jmp_target = 0;
	} else {
		jcc_target = on_true;
		jmp_target = (on_false == next_block) ? 0 : on_false;
	}

	if (!IR_IS_TYPE_SIGNED(type)) {
		if (inverted) {
			|	jnc =>jcc_target
		} else {
			|	jc =>jcc_target
		}
	} else if (inverted) {
		|	jno =>jcc_target
	} else {
		|	jo =>jcc_target
	}

	if (jmp_target) {
		|	jmp =>jmp_target
	}
}

/*
 * Emit a read-modify-write binary operation directly on memory, for a
 * STORE/VSTORE (insn) whose stored value (insn->op3) is a binary op.
 *
 * The destination address comes from ir_fuse_mem() for IR_STORE and from
 * the variable's spill slot for IR_VSTORE. The second operand of the
 * binary op is taken from its register if it has one, otherwise it is
 * encoded as an immediate obtained from ir_fuse_imm().
 */
static void ir_emit_mem_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_ref binop_ref = insn->op3;
	ir_insn *binop = &ctx->ir_base[binop_ref];
	ir_type type = binop->type;
	ir_ref rhs_ref = binop->op2;
	ir_reg rhs_reg = ctx->regs[binop_ref][2];
	ir_mem mem;

	if (insn->op != IR_STORE) {
		IR_ASSERT(insn->op == IR_VSTORE);
		mem = ir_var_spill_slot(ctx, insn->op2);
	} else {
		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
	}

	if (rhs_reg != IR_REG_NONE) {
		/* Register operand: reload it first if it lives in a spill slot. */
		if (IR_REG_SPILLED(rhs_reg)) {
			rhs_reg = IR_REG_NUM(rhs_reg);
			ir_emit_load(ctx, type, rhs_reg, rhs_ref);
		}
		switch (binop->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
			case IR_ADD_OV:
				|	ASM_MEM_REG_OP add, type, mem, rhs_reg
				break;
			case IR_SUB:
			case IR_SUB_OV:
				|	ASM_MEM_REG_OP sub, type, mem, rhs_reg
				break;
			case IR_OR:
				|	ASM_MEM_REG_OP or, type, mem, rhs_reg
				break;
			case IR_AND:
				|	ASM_MEM_REG_OP and, type, mem, rhs_reg
				break;
			case IR_XOR:
				|	ASM_MEM_REG_OP xor, type, mem, rhs_reg
				break;
		}
	} else {
		/* No register: the operand is folded in as an immediate. */
		int32_t imm_val = ir_fuse_imm(ctx, rhs_ref);

		switch (binop->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
			case IR_ADD_OV:
				|	ASM_MEM_IMM_OP add, type, mem, imm_val
				break;
			case IR_SUB:
			case IR_SUB_OV:
				|	ASM_MEM_IMM_OP sub, type, mem, imm_val
				break;
			case IR_OR:
				|	ASM_MEM_IMM_OP or, type, mem, imm_val
				break;
			case IR_AND:
				|	ASM_MEM_IMM_OP and, type, mem, imm_val
				break;
			case IR_XOR:
				|	ASM_MEM_IMM_OP xor, type, mem, imm_val
				break;
		}
	}
}

/*
 * Emit a binary operation performed in place on a fixed register, for an
 * RSTORE (insn) whose value (insn->op2) is a binary op. The target
 * register number is taken from insn->op3.
 *
 * The second operand of the binary op comes from its register if it has
 * one, otherwise it is encoded as an immediate via ir_fuse_imm().
 */
static void ir_emit_reg_binop_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_ref binop_ref = insn->op2;
	ir_insn *binop = &ctx->ir_base[binop_ref];
	ir_type type = binop->type;
	ir_ref rhs_ref = binop->op2;
	ir_reg rhs_reg = ctx->regs[binop_ref][2];
	ir_reg dst_reg;

	IR_ASSERT(insn->op == IR_RSTORE);
	dst_reg = insn->op3;

	if (rhs_reg != IR_REG_NONE) {
		/* Register operand: reload it first if it lives in a spill slot. */
		if (IR_REG_SPILLED(rhs_reg)) {
			rhs_reg = IR_REG_NUM(rhs_reg);
			ir_emit_load(ctx, type, rhs_reg, rhs_ref);
		}
		switch (binop->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
				|	ASM_REG_REG_OP add, type, dst_reg, rhs_reg
				break;
			case IR_SUB:
				|	ASM_REG_REG_OP sub, type, dst_reg, rhs_reg
				break;
			case IR_OR:
				|	ASM_REG_REG_OP or, type, dst_reg, rhs_reg
				break;
			case IR_AND:
				|	ASM_REG_REG_OP and, type, dst_reg, rhs_reg
				break;
			case IR_XOR:
				|	ASM_REG_REG_OP xor, type, dst_reg, rhs_reg
				break;
		}
	} else {
		/* No register: the operand is folded in as an immediate. */
		int32_t imm_val = ir_fuse_imm(ctx, rhs_ref);

		switch (binop->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
				|	ASM_REG_IMM_OP add, type, dst_reg, imm_val
				break;
			case IR_SUB:
				|	ASM_REG_IMM_OP sub, type, dst_reg, imm_val
				break;
			case IR_OR:
				|	ASM_REG_IMM_OP or, type, dst_reg, imm_val
				break;
			case IR_AND:
				|	ASM_REG_IMM_OP and, type, dst_reg, imm_val
				break;
			case IR_XOR:
				|	ASM_REG_IMM_OP xor, type, dst_reg, imm_val
				break;
		}
	}
}

/*
 * Emit MUL, DIV or MOD by a constant using shift/mask instructions.
 * op2 must be a non-symbolic constant; its u64 value is used through
 * IR_LOG2() (MUL, DIV) or as "value - 1" (MOD).
 *
 *   MUL: shl by log2(const), or "add def, def" when the shift is 1
 *   DIV: shr (logical shift right) by log2(const)
 *   MOD: and with (const - 1)
 *
 * NOTE(review): DIV/MOD use a logical shift and a plain mask, so this
 * presumably serves unsigned operands only; confirm against the matcher.
 */
static void ir_emit_mul_div_mod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];

	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
	IR_ASSERT(def_reg != IR_REG_NONE);

	/* Get op1 into def_reg: reload if spilled, then move or load. */
	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	if (def_reg != op1_reg) {
		if (op1_reg != IR_REG_NONE) {
			ir_emit_mov(ctx, type, def_reg, op1_reg);
		} else {
			ir_emit_load(ctx, type, def_reg, op1);
		}
	}
	if (insn->op == IR_MUL) {
		uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);

		if (shift == 1) {
			|	ASM_REG_REG_OP add, type, def_reg, def_reg
		} else {
			|	ASM_REG_IMM_OP shl, type, def_reg, shift
		}
	} else if (insn->op == IR_DIV) {
		uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);

		|	ASM_REG_IMM_OP shr, type, def_reg, shift
	} else {
		IR_ASSERT(insn->op == IR_MOD);
		uint64_t mask = ctx->ir_base[insn->op2].val.u64 - 1;

		/*
		 * On X64, an 8-byte operation with a register assigned in
		 * ctx->regs[def][2] loads the mask into that register first;
		 * every other case encodes the mask as an immediate.
		 */
|.if X64
||		if (ir_type_size[type] == 8 && ctx->regs[def][2] != IR_REG_NONE) {
||			ir_reg op2_reg = ctx->regs[def][2];
||
||			op2_reg = IR_REG_NUM(op2_reg);
||			ir_emit_load_imm_int(ctx, type, op2_reg, mask);
			|	ASM_REG_REG_OP and, type, def_reg, op2_reg
||		} else {
|.endif
			|	ASM_REG_IMM_OP and, type, def_reg, mask
|.if X64
||		}
|.endif
	}
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emit a single-bit set or clear on a register using bts/btr.
 *
 * op2 must be a non-symbolic constant. For IR_OR the bit index is
 * IR_LOG2(const) and "bts" is emitted; for IR_AND the index is
 * IR_LOG2(~const) and "btr" is emitted. op1 is first brought into
 * def_reg, and the result is stored back if def is spilled.
 */
static void ir_emit_bit_op(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref src_ref = insn->op1;
	ir_reg dst_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg src_reg = ctx->regs[def][1];
	uint32_t bit;

	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
	IR_ASSERT(dst_reg != IR_REG_NONE);

	if (src_reg != IR_REG_NONE && IR_REG_SPILLED(src_reg)) {
		src_reg = IR_REG_NUM(src_reg);
		ir_emit_load(ctx, type, src_reg, src_ref);
	}
	if (dst_reg != src_reg) {
		if (src_reg == IR_REG_NONE) {
			ir_emit_load(ctx, type, dst_reg, src_ref);
		} else {
			ir_emit_mov(ctx, type, dst_reg, src_reg);
		}
	}

	if (insn->op != IR_OR) {
		IR_ASSERT(insn->op == IR_AND);
		/* The AND mask has exactly one zero bit: that is the bit to reset. */
		bit = IR_LOG2(~ctx->ir_base[insn->op2].val.u64);
		|	ASM_REG16_IMM_OP, btr, type, dst_reg, bit
	} else {
		bit = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
		|	ASM_REG16_IMM_OP, bts, type, dst_reg, bit
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, dst_reg);
	}
}

/*
 * Emit a signed division by a power-of-two constant (op2).
 *
 * An arithmetic right shift alone rounds toward negative infinity, so the
 * dividend is biased first when it is negative:
 *   - shift == 1: the sign bit (op1 logically shifted right by width-1) is
 *     added to op1;
 *   - otherwise:  def = op1 + (divisor - 1) is computed with "lea", then
 *     replaced by the unbiased op1 via "cmovns" when op1 is non-negative.
 * The final "sar" by `shift` produces the quotient.
 *
 * Requires op1 and def to have distinct registers (asserted below).
 */
static void ir_emit_sdiv_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
	int64_t offset = ctx->ir_base[insn->op2].val.u64 - 1;

	IR_ASSERT(shift != 0);
	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
	IR_ASSERT(op1_reg != IR_REG_NONE && def_reg != IR_REG_NONE && op1_reg != def_reg);

	if (IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}

	if (shift == 1) {
		/* Division by 2: the bias is just the sign bit of op1. */
|.if X64
||		if (ir_type_size[type] == 8) {
			|	mov Rq(def_reg), Rq(op1_reg)
			|	ASM_REG_IMM_OP shr, type, def_reg, 63
			|	add Rq(def_reg), Rq(op1_reg)
||		} else {
|.endif
			|	mov Rd(def_reg), Rd(op1_reg)
			|	ASM_REG_IMM_OP shr, type, def_reg, (ir_type_size[type]*8-1)
			|	add Rd(def_reg), Rd(op1_reg)
|.if X64
||		}
|.endif
	} else {
		/*
		 * def = op1 + (divisor - 1). For 8-byte types on X64 the bias is
		 * taken from the register in ctx->regs[def][2] when one is
		 * assigned, otherwise it is encoded as a 32-bit displacement.
		 */
|.if X64
||		if (ir_type_size[type] == 8) {
||			ir_reg op2_reg = ctx->regs[def][2];
||
||			if (op2_reg != IR_REG_NONE) {
||				op2_reg =  IR_REG_NUM(op2_reg);
||				ir_emit_load_imm_int(ctx, type, op2_reg, offset);
				|	lea Rq(def_reg), [Rq(op1_reg)+Rq(op2_reg)]
||			} else {
				|	lea Rq(def_reg), [Rq(op1_reg)+(int32_t)offset]
||			}
||		} else {
|.endif
			|	lea Rd(def_reg), [Rd(op1_reg)+(int32_t)offset]
|.if X64
||		}
|.endif
		|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
		|	ASM_REG_REG_OP2 cmovns, type, def_reg, op1_reg
	}
	|	ASM_REG_IMM_OP sar, type, def_reg, shift

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emit a signed remainder by a power-of-two constant (op2).
 *
 * A bias is derived from the sign of op1 in tmp_reg (ctx->regs[def][3]):
 *   - shift == 1: a logical shift right by width-1 leaves the sign bit;
 *   - otherwise:  "sar" by width-1 replicates the sign bit, then "shr" by
 *     width-shift keeps only the low `shift` bits of that pattern.
 * The result is then computed as ((op1 + bias) & mask) - bias, where
 * mask is (divisor - 1).
 *
 * Requires def and tmp to have distinct registers (asserted below).
 */
static void ir_emit_smod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	ir_reg tmp_reg = ctx->regs[def][3];
	uint32_t shift = IR_LOG2(ctx->ir_base[insn->op2].val.u64);
	uint64_t mask = ctx->ir_base[insn->op2].val.u64 - 1;

	IR_ASSERT(shift != 0);
	IR_ASSERT(IR_IS_CONST_REF(insn->op2));
	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[insn->op2].op));
	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE && def_reg != tmp_reg);

	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	if (def_reg != op1_reg) {
		if (op1_reg != IR_REG_NONE) {
			ir_emit_mov(ctx, type, def_reg, op1_reg);
		} else {
			ir_emit_load(ctx, type, def_reg, op1);
		}
	}
	/* tmp_reg needs a copy of op1 too, unless it already is op1's register. */
	if (tmp_reg != op1_reg) {
		ir_emit_mov(ctx, type, tmp_reg, def_reg);
	}


	if (shift == 1) {
		|	ASM_REG_IMM_OP shr, type, tmp_reg, (ir_type_size[type]*8-1)
	} else {
		|	ASM_REG_IMM_OP sar, type, tmp_reg, (ir_type_size[type]*8-1)
		|	ASM_REG_IMM_OP shr, type, tmp_reg, (ir_type_size[type]*8-shift)
	}
	|	ASM_REG_REG_OP add, type, def_reg, tmp_reg

	/*
	 * Apply the mask: on X64, an 8-byte operation with a register assigned
	 * in ctx->regs[def][2] loads the mask into that register first; every
	 * other case encodes it as an immediate.
	 */
|.if X64
||	if (ir_type_size[type] == 8 && ctx->regs[def][2] != IR_REG_NONE) {
||		ir_reg op2_reg = ctx->regs[def][2];
||
||		op2_reg = IR_REG_NUM(op2_reg);
||		ir_emit_load_imm_int(ctx, type, op2_reg, mask);
		|	ASM_REG_REG_OP and, type, def_reg, op2_reg
||	} else {
|.endif
		|	ASM_REG_IMM_OP and, type, def_reg, mask
|.if X64
||	}
|.endif

	|	ASM_REG_REG_OP sub, type, def_reg, tmp_reg

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emit MUL, DIV or MOD by a constant directly on a memory operand, for a
 * STORE/VSTORE (insn) whose stored value (insn->op3) is such an operation.
 *
 *   MUL: shl [mem], log2(const)
 *   DIV: shr [mem], log2(const)
 *   MOD: and [mem], const - 1   (the mask must fit in 32 unsigned bits)
 *
 * The address comes from ir_fuse_mem() for IR_STORE and from the
 * variable's spill slot for IR_VSTORE.
 */
static void ir_emit_mem_mul_div_mod_pwr2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_insn *arith = &ctx->ir_base[insn->op3];
	ir_type type = arith->type;
	ir_mem mem;

	IR_ASSERT(IR_IS_CONST_REF(arith->op2));
	IR_ASSERT(!IR_IS_SYM_CONST(ctx->ir_base[arith->op2].op));

	if (insn->op != IR_STORE) {
		IR_ASSERT(insn->op == IR_VSTORE);
		mem = ir_var_spill_slot(ctx, insn->op2);
	} else {
		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
	}

	switch (arith->op) {
		case IR_MUL: {
			uint32_t shift = IR_LOG2(ctx->ir_base[arith->op2].val.u64);
			|	ASM_MEM_IMM_OP shl, type, mem, shift
			break;
		}
		case IR_DIV: {
			uint32_t shift = IR_LOG2(ctx->ir_base[arith->op2].val.u64);
			|	ASM_MEM_IMM_OP shr, type, mem, shift
			break;
		}
		default: {
			uint64_t mask;

			IR_ASSERT(arith->op == IR_MOD);
			mask = ctx->ir_base[arith->op2].val.u64 - 1;
			IR_ASSERT(IR_IS_UNSIGNED_32BIT(mask));
			|	ASM_MEM_IMM_OP and, type, mem, mask
			break;
		}
	}
}

/*
 * Emit a shift or rotate (SHL/SHR/SAR/ROL/ROR) whose count is variable.
 *
 * The count is placed in RCX (the instructions below use "cl") and the
 * value being shifted ends up in def_reg, which therefore must not be
 * RCX. If op1 currently sits in RCX while the count still has to be moved
 * there, op1 is first saved into def_reg so it is not overwritten.
 */
static void ir_emit_shift(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_reg dst_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg val_reg = ctx->regs[def][1];
	ir_reg cnt_reg = ctx->regs[def][2];

	IR_ASSERT(dst_reg != IR_REG_NONE && dst_reg != IR_REG_RCX);

	/* Reload whichever operands live in spill slots. */
	if (val_reg != IR_REG_NONE && IR_REG_SPILLED(val_reg)) {
		val_reg = IR_REG_NUM(val_reg);
		ir_emit_load(ctx, type, val_reg, insn->op1);
	}
	if (cnt_reg != IR_REG_NONE && IR_REG_SPILLED(cnt_reg)) {
		cnt_reg = IR_REG_NUM(cnt_reg);
		ir_emit_load(ctx, type, cnt_reg, insn->op2);
	}

	if (cnt_reg != IR_REG_RCX) {
		if (val_reg == IR_REG_RCX) {
			/* Rescue op1 before RCX is overwritten with the count. */
			ir_emit_mov(ctx, type, dst_reg, val_reg);
			val_reg = dst_reg;
		}
		if (cnt_reg == IR_REG_NONE) {
			ir_emit_load(ctx, type, IR_REG_RCX, insn->op2);
		} else {
			ir_emit_mov(ctx, type, IR_REG_RCX, cnt_reg);
		}
	}

	if (dst_reg != val_reg) {
		if (val_reg == IR_REG_NONE) {
			ir_emit_load(ctx, type, dst_reg, insn->op1);
		} else {
			ir_emit_mov(ctx, type, dst_reg, val_reg);
		}
	}

	switch (insn->op) {
		default:
			IR_ASSERT(0);
		case IR_SHL:
			|	ASM_REG_TXT_OP shl, type, dst_reg, cl
			break;
		case IR_SHR:
			|	ASM_REG_TXT_OP shr, type, dst_reg, cl
			break;
		case IR_SAR:
			|	ASM_REG_TXT_OP sar, type, dst_reg, cl
			break;
		case IR_ROL:
			|	ASM_REG_TXT_OP rol, type, dst_reg, cl
			break;
		case IR_ROR:
			|	ASM_REG_TXT_OP ror, type, dst_reg, cl
			break;
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, dst_reg);
	}
}

/*
 * Emit a variable-count shift or rotate applied directly to memory, for a
 * STORE/VSTORE (insn) whose stored value (insn->op3) is the shift.
 *
 * The count (op2 of the shift) is brought into RCX, then the instruction
 * is emitted with "cl" as the count operand. The address comes from
 * ir_fuse_mem() for IR_STORE and from the variable's spill slot for
 * IR_VSTORE.
 */
static void ir_emit_mem_shift(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_ref shift_ref = insn->op3;
	ir_insn *shift_insn = &ctx->ir_base[shift_ref];
	ir_type type = shift_insn->type;
	ir_ref cnt_ref = shift_insn->op2;
	ir_reg cnt_reg = ctx->regs[shift_ref][2];
	ir_mem mem;

	if (insn->op != IR_STORE) {
		IR_ASSERT(insn->op == IR_VSTORE);
		mem = ir_var_spill_slot(ctx, insn->op2);
	} else {
		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
	}

	if (cnt_reg == IR_REG_NONE) {
		/* The count has no register: load it straight into RCX. */
		ir_emit_load(ctx, type, IR_REG_RCX, cnt_ref);
	} else {
		if (IR_REG_SPILLED(cnt_reg)) {
			cnt_reg = IR_REG_NUM(cnt_reg);
			ir_emit_load(ctx, type, cnt_reg, cnt_ref);
		}
		if (cnt_reg != IR_REG_RCX) {
			ir_emit_mov(ctx, type, IR_REG_RCX, cnt_reg);
		}
	}

	switch (shift_insn->op) {
		default:
			IR_ASSERT(0);
		case IR_SHL:
			|	ASM_MEM_TXT_OP shl, type, mem, cl
			break;
		case IR_SHR:
			|	ASM_MEM_TXT_OP shr, type, mem, cl
			break;
		case IR_SAR:
			|	ASM_MEM_TXT_OP sar, type, mem, cl
			break;
		case IR_ROL:
			|	ASM_MEM_TXT_OP rol, type, mem, cl
			break;
		case IR_ROR:
			|	ASM_MEM_TXT_OP ror, type, mem, cl
			break;
	}
}

/*
 * Emit a shift or rotate (SHL/SHR/SAR/ROL/ROR) by a constant count.
 *
 * op2 must be a non-symbolic constant that fits in a signed 32-bit value;
 * it is encoded as the immediate count. op1 is first brought into def_reg
 * and the result is stored back if def is spilled.
 */
static void ir_emit_shift_const(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref src_ref = insn->op1;
	ir_insn *cnt_insn = &ctx->ir_base[insn->op2];
	ir_reg dst_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg src_reg = ctx->regs[def][1];
	int32_t shift;

	IR_ASSERT(!IR_IS_SYM_CONST(cnt_insn->op));
	IR_ASSERT(IR_IS_SIGNED_32BIT(cnt_insn->val.i64));
	shift = cnt_insn->val.i32;
	IR_ASSERT(dst_reg != IR_REG_NONE);

	if (src_reg != IR_REG_NONE && IR_REG_SPILLED(src_reg)) {
		src_reg = IR_REG_NUM(src_reg);
		ir_emit_load(ctx, type, src_reg, src_ref);
	}
	if (dst_reg != src_reg) {
		if (src_reg == IR_REG_NONE) {
			ir_emit_load(ctx, type, dst_reg, src_ref);
		} else {
			ir_emit_mov(ctx, type, dst_reg, src_reg);
		}
	}

	switch (insn->op) {
		default:
			IR_ASSERT(0);
		case IR_SHL:
			|	ASM_REG_IMM_OP shl, type, dst_reg, shift
			break;
		case IR_SHR:
			|	ASM_REG_IMM_OP shr, type, dst_reg, shift
			break;
		case IR_SAR:
			|	ASM_REG_IMM_OP sar, type, dst_reg, shift
			break;
		case IR_ROL:
			|	ASM_REG_IMM_OP rol, type, dst_reg, shift
			break;
		case IR_ROR:
			|	ASM_REG_IMM_OP ror, type, dst_reg, shift
			break;
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, dst_reg);
	}
}

/*
 * Emit a constant-count shift or rotate applied directly to memory, for a
 * STORE/VSTORE (insn) whose stored value (insn->op3) is the shift.
 *
 * The count (op2 of the shift) must be a non-symbolic constant that fits
 * in a signed 32-bit value. The address comes from ir_fuse_mem() for
 * IR_STORE and from the variable's spill slot for IR_VSTORE.
 */
static void ir_emit_mem_shift_const(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_insn *shift_insn = &ctx->ir_base[insn->op3];
	ir_insn *cnt_insn;
	ir_type type = shift_insn->type;
	int32_t shift;
	ir_mem mem;

	IR_ASSERT(IR_IS_CONST_REF(shift_insn->op2));
	cnt_insn = &ctx->ir_base[shift_insn->op2];
	IR_ASSERT(!IR_IS_SYM_CONST(cnt_insn->op));
	IR_ASSERT(IR_IS_SIGNED_32BIT(cnt_insn->val.i64));
	shift = cnt_insn->val.i32;

	if (insn->op != IR_STORE) {
		IR_ASSERT(insn->op == IR_VSTORE);
		mem = ir_var_spill_slot(ctx, insn->op2);
	} else {
		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
	}

	switch (shift_insn->op) {
		default:
			IR_ASSERT(0);
		case IR_SHL:
			|	ASM_MEM_IMM_OP shl, type, mem, shift
			break;
		case IR_SHR:
			|	ASM_MEM_IMM_OP shr, type, mem, shift
			break;
		case IR_SAR:
			|	ASM_MEM_IMM_OP sar, type, mem, shift
			break;
		case IR_ROL:
			|	ASM_MEM_IMM_OP rol, type, mem, shift
			break;
		case IR_ROR:
			|	ASM_MEM_IMM_OP ror, type, mem, shift
			break;
	}
}

/*
 * Emit a one-operand integer instruction on a register.
 *
 * The instruction is selected by `rule` (IR_INC -> inc, IR_DEC -> dec)
 * and otherwise by insn->op (IR_NOT -> not, IR_NEG -> neg, and finally
 * IR_BSWAP -> bswap for 4- and 8-byte types). op1 is first brought into
 * def_reg and the result is stored back if def is spilled.
 */
static void ir_emit_op_int(ir_ctx *ctx, ir_ref def, ir_insn *insn, uint32_t rule)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref src_ref = insn->op1;
	ir_reg dst_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg src_reg = ctx->regs[def][1];

	IR_ASSERT(dst_reg != IR_REG_NONE);

	if (src_reg != IR_REG_NONE && IR_REG_SPILLED(src_reg)) {
		src_reg = IR_REG_NUM(src_reg);
		ir_emit_load(ctx, type, src_reg, src_ref);
	}
	if (dst_reg != src_reg) {
		if (src_reg == IR_REG_NONE) {
			ir_emit_load(ctx, type, dst_reg, src_ref);
		} else {
			ir_emit_mov(ctx, type, dst_reg, src_reg);
		}
	}

	if (rule == IR_INC) {
		|	ASM_REG_OP inc, type, dst_reg
	} else if (rule == IR_DEC) {
		|	ASM_REG_OP dec, type, dst_reg
	} else if (insn->op == IR_NOT) {
		|	ASM_REG_OP not, type, dst_reg
	} else if (insn->op == IR_NEG) {
		|	ASM_REG_OP neg, type, dst_reg
	} else {
		IR_ASSERT(insn->op == IR_BSWAP);
		switch (ir_type_size[type]) {
			default:
				IR_ASSERT(0);
			case 4:
				|	bswap Rd(dst_reg)
				break;
			case 8:
				/* The 64-bit form is only assembled for X64 builds. */
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	bswap Rq(dst_reg)
|.endif
				break;
		}
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, dst_reg);
	}
}

/*
 * Emit CTLZ (count leading zeros), CTTZ (count trailing zeros) or CTPOP
 * (population count) using the dedicated x86 instructions.
 *
 * When ctx->mflags has IR_X86_BMI1, lzcnt/tzcnt are used. Otherwise CTLZ
 * is emitted as "bsr" followed by an xor with (bit width - 1), which
 * turns the index of the highest set bit into the number of leading
 * zeros, and CTTZ is emitted as "bsf". CTPOP always uses "popcnt".
 *
 * The operand is read from op1's register when it has one, otherwise
 * from memory (a fused load or op1's spill slot).
 *
 * NOTE(review): the memory path has no 1-byte case (it would hit the
 * default assertion) and does not assert IR_CTPOP in its final else
 * branches, unlike the register path; confirm that is intended.
 */
static void ir_emit_bit_count(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];

	IR_ASSERT(def_reg != IR_REG_NONE);

	if (op1_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op1_reg)) {
			op1_reg = IR_REG_NUM(op1_reg);
			ir_emit_load(ctx, type, op1_reg, op1);
		}
		switch (ir_type_size[insn->type]) {
			default:
				IR_ASSERT(0);
			case 2:
				if (insn->op == IR_CTLZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	lzcnt Rw(def_reg), Rw(op1_reg)
					} else {
						|	bsr Rw(def_reg), Rw(op1_reg)
						|	xor Rw(def_reg), 0xf
					}
				} else if (insn->op == IR_CTTZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	tzcnt Rw(def_reg), Rw(op1_reg)
					} else {
						|	bsf Rw(def_reg), Rw(op1_reg)
					}
				} else {
					IR_ASSERT(insn->op == IR_CTPOP);
					|	popcnt Rw(def_reg), Rw(op1_reg)
				}
				break;
			case 1:
				/*
				 * 1-byte operands are zero-extended in place (op1_reg is
				 * modified) and handled with 32-bit instructions. For CTLZ
				 * the 32-bit lzcnt result is corrected by subtracting the
				 * 24 extra leading zero bits; other ops fall through to the
				 * 4-byte case.
				 */
				|   movzx Rd(op1_reg), Rb(op1_reg)
				if (insn->op == IR_CTLZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	lzcnt Rd(def_reg), Rd(op1_reg)
						|	sub Rd(def_reg), 24
					} else {
						|	bsr Rd(def_reg), Rd(op1_reg)
						|	xor Rw(def_reg), 0x7
					}
					break;
				}
				IR_FALLTHROUGH;
			case 4:
				if (insn->op == IR_CTLZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	lzcnt Rd(def_reg), Rd(op1_reg)
					} else {
						|	bsr Rd(def_reg), Rd(op1_reg)
						|	xor Rw(def_reg), 0x1f
					}
				} else if (insn->op == IR_CTTZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	tzcnt Rd(def_reg), Rd(op1_reg)
					} else {
						|	bsf Rd(def_reg), Rd(op1_reg)
					}
				} else {
					IR_ASSERT(insn->op == IR_CTPOP);
					|	popcnt Rd(def_reg), Rd(op1_reg)
				}
				break;
|.if X64
			case 8:
				if (insn->op == IR_CTLZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	lzcnt Rq(def_reg), Rq(op1_reg)
					} else {
						|	bsr Rq(def_reg), Rq(op1_reg)
						|	xor Rw(def_reg), 0x3f
					}
				} else if (insn->op == IR_CTTZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	tzcnt Rq(def_reg), Rq(op1_reg)
					} else {
						|	bsf Rq(def_reg), Rq(op1_reg)
					}
				} else {
					IR_ASSERT(insn->op == IR_CTPOP);
					|	popcnt Rq(def_reg), Rq(op1_reg)
				}
				break;
|.endif
		}
	} else {
		/* op1 has no register: use the memory-operand forms. */
		ir_mem mem;

		if (ir_rule(ctx, op1) & IR_FUSED) {
			mem = ir_fuse_load(ctx, def, op1);
		} else {
			mem = ir_ref_spill_slot(ctx, op1);
		}
		switch (ir_type_size[insn->type]) {
			default:
				IR_ASSERT(0);
			case 2:
				if (insn->op == IR_CTLZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	ASM_TXT_TMEM_OP lzcnt, Rw(def_reg), word, mem
					} else {
						|	ASM_TXT_TMEM_OP bsr, Rw(def_reg), word, mem
						|	xor Rw(def_reg), 0xf
					}
				} else if (insn->op == IR_CTTZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	ASM_TXT_TMEM_OP tzcnt, Rw(def_reg), word, mem
					} else {
						|	ASM_TXT_TMEM_OP bsf, Rw(def_reg), word, mem
					}
				} else {
					|	ASM_TXT_TMEM_OP popcnt, Rw(def_reg), word, mem
				}
				break;
			case 4:
				if (insn->op == IR_CTLZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	ASM_TXT_TMEM_OP lzcnt, Rd(def_reg), dword, mem
					} else {
						|	ASM_TXT_TMEM_OP bsr, Rd(def_reg), dword, mem
						|	xor Rw(def_reg), 0x1f
					}
				} else if (insn->op == IR_CTTZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	ASM_TXT_TMEM_OP tzcnt, Rd(def_reg), dword, mem
					} else {
						|	ASM_TXT_TMEM_OP bsf, Rd(def_reg), dword, mem
					}
				} else {
					|	ASM_TXT_TMEM_OP popcnt, Rd(def_reg), dword, mem
				}
				break;
|.if X64
			case 8:
				if (insn->op == IR_CTLZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	ASM_TXT_TMEM_OP lzcnt, Rq(def_reg), qword, mem
					} else {
						|	ASM_TXT_TMEM_OP bsr, Rq(def_reg), qword, mem
						|	xor Rw(def_reg), 0x3f
					}
				} else if (insn->op == IR_CTTZ) {
					if (ctx->mflags & IR_X86_BMI1) {
						|	ASM_TXT_TMEM_OP tzcnt, Rq(def_reg), qword, mem
					} else {
						|	ASM_TXT_TMEM_OP bsf, Rq(def_reg), qword, mem
					}
				} else {
					|	ASM_TXT_TMEM_OP popcnt, Rq(def_reg), qword, mem
				}
				break;
|.endif
		}
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emit a population count without the "popcnt" instruction, using only
 * shifts, masks, adds and (for 4/8-byte types) a multiply.
 *
 * The operand is first copied into def_reg (zero-extended to 32 bits for
 * 1- and 2-byte types). Each sequence then:
 *   1. subtracts ((x >> 1) & 0x55..) from x, giving per-2-bit counts;
 *   2. adds the even and odd 2-bit fields (mask 0x33..), giving
 *      per-4-bit counts;
 *   3. adds the value to itself shifted right by 4 and masks with
 *      0x0f.., giving per-byte counts;
 *   4. sums the bytes: nothing more for 1 byte, an explicit shift/add for
 *      2 bytes, or a multiply by 0x0101.. followed by a shift that keeps
 *      the top byte for 4 and 8 bytes.
 *
 * tmp_reg (ctx->regs[def][2]) is a scratch register; on X64 the 8-byte
 * sequence also needs const_reg (ctx->regs[def][3]) to hold the 64-bit
 * mask constants.
 */
static void ir_emit_ctpop(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	ir_reg tmp_reg = ctx->regs[def][2];
|.if X64
||	ir_reg const_reg = ctx->regs[def][3];
|.endif

	IR_ASSERT(def_reg != IR_REG_NONE && tmp_reg != IR_REG_NONE);
	if (op1_reg == IR_REG_NONE) {
		/* No register for op1: load it into def_reg, then zero-extend. */
		ir_emit_load(ctx, type, def_reg, op1);
		if (ir_type_size[insn->type] == 1) {
			|	movzx Rd(def_reg), Rb(def_reg)
		} else if (ir_type_size[insn->type] == 2) {
			|	movzx Rd(def_reg), Rw(def_reg)
		}
	} else {
		if (IR_REG_SPILLED(op1_reg)) {
			op1_reg = IR_REG_NUM(op1_reg);
			ir_emit_load(ctx, type, op1_reg, op1);
		}
		/* Copy op1 into def_reg, zero-extending narrow types. */
		switch (ir_type_size[insn->type]) {
			default:
				IR_ASSERT(0);
			case 1:
				|	movzx Rd(def_reg), Rb(op1_reg)
				break;
			case 2:
				|	movzx Rd(def_reg), Rw(op1_reg)
				break;
			case 4:
				|	mov Rd(def_reg), Rd(op1_reg)
				break;
|.if X64
||			case 8:
				|	mov Rq(def_reg), Rq(op1_reg)
||				break;
|.endif
		}
	}
	switch (ir_type_size[insn->type]) {
		default:
			IR_ASSERT(0);
		case 1:
			/* 8-bit value: the per-byte count is already the result. */
			|	mov Rd(tmp_reg), Rd(def_reg)
			|	shr Rd(def_reg), 1
			|	and Rd(def_reg), 0x55
			|	sub Rd(tmp_reg), Rd(def_reg)
			|	mov Rd(def_reg), Rd(tmp_reg)
			|	and Rd(def_reg), 0x33
			|	shr Rd(tmp_reg), 2
			|	and Rd(tmp_reg), 0x33
			|	add Rd(tmp_reg), Rd(def_reg)
			|	mov Rd(def_reg), Rd(tmp_reg)
			|	shr Rd(def_reg), 4
			|	add Rd(def_reg), Rd(tmp_reg)
			|	and Rd(def_reg), 0x0f
			break;
		case 2:
			/* 16-bit value: the two byte counts are added explicitly. */
			|	mov Rd(tmp_reg), Rd(def_reg)
			|	shr Rd(def_reg), 1
			|	and Rd(def_reg), 0x5555
			|	sub Rd(tmp_reg), Rd(def_reg)
			|	mov Rd(def_reg), Rd(tmp_reg)
			|	and Rd(def_reg), 0x3333
			|	shr Rd(tmp_reg), 2
			|	and Rd(tmp_reg), 0x3333
			|	add Rd(tmp_reg), Rd(def_reg)
			|	mov Rd(def_reg), Rd(tmp_reg)
			|	shr Rd(def_reg), 4
			|	add Rd(def_reg), Rd(tmp_reg)
			|	and Rd(def_reg), 0x0f0f
			|	mov	Rd(tmp_reg), Rd(def_reg)
			|	shr Rd(tmp_reg), 8
			|	and Rd(def_reg), 0x0f
			|	add Rd(def_reg), Rd(tmp_reg)
			break;
		case 4:
			/* 32-bit value: byte counts are summed by multiply + shift. */
			|	mov Rd(tmp_reg), Rd(def_reg)
			|	shr Rd(def_reg), 1
			|	and Rd(def_reg), 0x55555555
			|	sub Rd(tmp_reg), Rd(def_reg)
			|	mov Rd(def_reg), Rd(tmp_reg)
			|	and Rd(def_reg), 0x33333333
			|	shr Rd(tmp_reg), 2
			|	and Rd(tmp_reg), 0x33333333
			|	add Rd(tmp_reg), Rd(def_reg)
			|	mov Rd(def_reg), Rd(tmp_reg)
			|	shr Rd(def_reg), 4
			|	add Rd(def_reg), Rd(tmp_reg)
			|	and Rd(def_reg), 0x0f0f0f0f
			|	imul Rd(def_reg), 0x01010101
			|	shr Rd(def_reg), 24
			break;
|.if X64
||		case 8:
||			IR_ASSERT(const_reg != IR_REG_NONE);
			|	mov Rq(tmp_reg), Rq(def_reg)
			|	shr Rq(def_reg), 1
			|	mov64 Rq(const_reg), 0x5555555555555555
			|	and Rq(def_reg), Rq(const_reg)
			|	sub Rq(tmp_reg), Rq(def_reg)
			|	mov Rq(def_reg), Rq(tmp_reg)
			|	mov64 Rq(const_reg), 0x3333333333333333
			|	and Rq(def_reg), Rq(const_reg)
			|	shr Rq(tmp_reg), 2
			|	and Rq(tmp_reg), Rq(const_reg)
			|	add Rq(tmp_reg), Rq(def_reg)
			|	mov Rq(def_reg), Rq(tmp_reg)
			|	shr Rq(def_reg), 4
			|	add Rq(def_reg), Rq(tmp_reg)
			|	mov64 Rq(const_reg), 0x0f0f0f0f0f0f0f0f
			|	and Rq(def_reg), Rq(const_reg)
			|	mov64 Rq(const_reg), 0x0101010101010101
			|	imul Rq(def_reg), Rq(const_reg)
			|	shr Rq(def_reg), 56
||			break;
|.endif
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emit a one-operand integer instruction directly on memory, for a
 * STORE/VSTORE (insn) whose stored value is insn->op3.
 *
 * The instruction is selected by `rule` (IR_MEM_INC -> inc,
 * IR_MEM_DEC -> dec) and otherwise by the op of insn->op3 (IR_NOT -> not,
 * IR_NEG -> neg). The address comes from ir_fuse_mem() for IR_STORE and
 * from the variable's spill slot for IR_VSTORE.
 */
static void ir_emit_mem_op_int(ir_ctx *ctx, ir_ref def, ir_insn *insn, uint32_t rule)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_insn *unop = &ctx->ir_base[insn->op3];
	ir_type type = unop->type;
	ir_mem mem;

	if (insn->op != IR_STORE) {
		IR_ASSERT(insn->op == IR_VSTORE);
		mem = ir_var_spill_slot(ctx, insn->op2);
	} else {
		mem = ir_fuse_mem(ctx, def, def, insn, ctx->regs[def][2]);
	}

	if (rule == IR_MEM_INC) {
		|	ASM_MEM_OP inc, type, mem
		return;
	}
	if (rule == IR_MEM_DEC) {
		|	ASM_MEM_OP dec, type, mem
		return;
	}
	if (unop->op == IR_NOT) {
		|	ASM_MEM_OP not, type, mem
		return;
	}

	IR_ASSERT(unop->op == IR_NEG);
	|	ASM_MEM_OP neg, type, mem
}

/*
 * Emit an integer absolute value without a branch.
 *
 * def_reg receives a copy of op1 and is negated; "neg" sets the sign flag
 * from its result, so "cmovs" copies the original op1 back whenever the
 * negated value came out negative. op1 and def must therefore be in
 * different registers (asserted below).
 */
static void ir_emit_abs_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_reg dst_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg src_reg = ctx->regs[def][1];

	IR_ASSERT(dst_reg != IR_REG_NONE);
	IR_ASSERT(src_reg != IR_REG_NONE);

	if (src_reg != IR_REG_NONE && IR_REG_SPILLED(src_reg)) {
		src_reg = IR_REG_NUM(src_reg);
		ir_emit_load(ctx, type, src_reg, insn->op1);
	}

	IR_ASSERT(dst_reg != src_reg);

	ir_emit_mov(ctx, type, dst_reg, src_reg);
	|	ASM_REG_OP neg, type, dst_reg
	|	ASM_REG_REG_OP2, cmovs, type, dst_reg, src_reg

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, dst_reg);
	}
}

/*
 * Emit a logical NOT of a boolean operand: the operand is brought into
 * def_reg and its lowest bit is flipped with "xor Rb(def_reg), 1".
 *
 * `type` is the type of op1 and is used for the loads and for the final
 * spill store.
 *
 * When op1 has no register assigned, it is loaded into def_reg with
 * ir_emit_load() instead of emitting a register-to-register move with an
 * invalid source register.
 */
static void ir_emit_bool_not(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = ctx->ir_base[insn->op1].type;
	ir_ref op1 = insn->op1;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];

	IR_ASSERT(def_reg != IR_REG_NONE);

	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}

	if (def_reg != op1_reg) {
		if (op1_reg != IR_REG_NONE) {
			|	mov Rb(def_reg), Rb(op1_reg)
		} else {
			/* op1 lives only in memory (or is a constant): load it. */
			ir_emit_load(ctx, type, def_reg, op1);
		}
	}

	|	xor Rb(def_reg), 1

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emit a logical NOT of an integer operand: def = (op1 == 0).
 *
 * op1 is compared against zero ("test reg, reg" when it has a register,
 * "cmp [spill slot], 0" otherwise) and "sete" writes the result into the
 * low byte of def_reg.
 *
 * `type` is the type of op1 and is used only for reading the operand.
 * Only the low byte of def_reg is defined by "sete", so the spill store
 * uses the result type (insn->type) rather than the operand type; storing
 * with the operand type could write undefined upper bytes.
 */
static void ir_emit_bool_not_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = ctx->ir_base[insn->op1].type;
	ir_ref op1 = insn->op1;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];

	IR_ASSERT(def_reg != IR_REG_NONE);

	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}

	if (op1_reg != IR_REG_NONE) {
		|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
	} else {
		ir_mem mem = ir_ref_spill_slot(ctx, op1);

		|	ASM_MEM_IMM_OP cmp, type, mem, 0
	}
	|	sete Rb(def_reg)

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, insn->type, def, def_reg);
	}
}

/*
 * Emits integer MUL, MUL_OV, DIV and MOD through the one-operand x86
 * instructions (imul/mul/idiv/div).
 *
 * ctx  - code generation context (register assignments, DynAsm state)
 * def  - reference of the instruction being emitted
 * insn - the instruction itself (op1 and op2 are the two operands)
 *
 * op1 is always placed into RAX first. The second operand is taken from a
 * register, from a fused load or from its spill slot. Afterwards the product
 * or quotient is taken from RAX and the remainder from RDX (from AH for
 * 1-byte types) and moved or stored to the location assigned to "def".
 */
static void ir_emit_mul_div_mod(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_ref op2 = insn->op2;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	ir_reg op2_reg = ctx->regs[def][2];
	ir_mem mem;

	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	/* the first operand has to end up in RAX */
	if (op1_reg != IR_REG_RAX) {
		if (op1_reg != IR_REG_NONE) {
			ir_emit_mov(ctx, type, IR_REG_RAX, op1_reg);
		} else {
			ir_emit_load(ctx, type, IR_REG_RAX, op1);
		}
	}
	if (op2_reg == IR_REG_NONE && op1 == op2) {
		/* both operands are the same value, which is already in RAX */
		op2_reg = IR_REG_RAX;
	} else if (op2_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, type, op2_reg, op2);
		}
	} else if (IR_IS_CONST_REF(op2)
	 && (insn->op == IR_MUL || insn->op == IR_MUL_OV)) {
		/* a constant multiplier is materialized in RDX */
		op2_reg = IR_REG_RDX;
		ir_emit_load(ctx, type, op2_reg, op2);
	}
	if (insn->op == IR_MUL || insn->op == IR_MUL_OV) {
		/* multiplication: imul for signed types, mul for unsigned ones */
		if (IR_IS_TYPE_SIGNED(insn->type)) {
			if (op2_reg != IR_REG_NONE) {
				|	ASM_REG_OP imul, type, op2_reg
			} else {
				if (ir_rule(ctx, op2) & IR_FUSED) {
					mem = ir_fuse_load(ctx, def, op2);
				} else {
					mem = ir_ref_spill_slot(ctx, op2);
				}
				|	ASM_MEM_OP imul, type, mem
			}
		} else {
			if (op2_reg != IR_REG_NONE) {
				|	ASM_REG_OP mul, type, op2_reg
			} else {
				if (ir_rule(ctx, op2) & IR_FUSED) {
					mem = ir_fuse_load(ctx, def, op2);
				} else {
					mem = ir_ref_spill_slot(ctx, op2);
				}
				|	ASM_MEM_OP mul, type, mem
			}
		}
	} else {
		/* division and modulo */
		if (IR_IS_TYPE_SIGNED(type)) {
			/* sign-extend the dividend; the instruction depends on the operand size */
			if (ir_type_size[type] == 8) {
				|	cqo
			} else if (ir_type_size[type] == 4) {
				|	cdq
			} else if (ir_type_size[type] == 2) {
				|	cwd
			} else {
				|	cbw
			}
			if (op2_reg != IR_REG_NONE) {
				|	ASM_REG_OP idiv, type, op2_reg
			} else {
				if (ir_rule(ctx, op2) & IR_FUSED) {
					mem = ir_fuse_load(ctx, def, op2);
				} else {
					mem = ir_ref_spill_slot(ctx, op2);
				}
				|	ASM_MEM_OP idiv, type, mem
			}
		} else {
			/* zero-extend the dividend: AL into AX for 1-byte types, otherwise clear RDX */
			if (ir_type_size[type] == 1) {
				|	movzx ax, al
			} else {
				|	ASM_REG_REG_OP xor, type, IR_REG_RDX, IR_REG_RDX
			}
			if (op2_reg != IR_REG_NONE) {
				|	ASM_REG_OP div, type, op2_reg
			} else {
				if (ir_rule(ctx, op2) & IR_FUSED) {
					mem = ir_fuse_load(ctx, def, op2);
				} else {
					mem = ir_ref_spill_slot(ctx, op2);
				}
				|	ASM_MEM_OP div, type, mem
			}
		}
	}

	if (insn->op == IR_MUL || insn->op == IR_MUL_OV || insn->op == IR_DIV) {
		/* product and quotient are taken from RAX */
		if (def_reg != IR_REG_NONE) {
			if (def_reg != IR_REG_RAX) {
				ir_emit_mov(ctx, type, def_reg, IR_REG_RAX);
			}
			if (IR_REG_SPILLED(ctx->regs[def][0])) {
				ir_emit_store(ctx, type, def, def_reg);
			}
		} else {
			ir_emit_store(ctx, type, def, IR_REG_RAX);
		}
	} else {
		IR_ASSERT(insn->op == IR_MOD);
		if (ir_type_size[type] == 1) {
			/* for 1-byte types the remainder is taken from AH */
			if (def_reg != IR_REG_NONE) {
				|	mov al, ah
				if (def_reg != IR_REG_RAX) {
					|	mov Rb(def_reg), al
				}
				if (IR_REG_SPILLED(ctx->regs[def][0])) {
					ir_emit_store(ctx, type, def, def_reg);
				}
			} else {
				ir_reg fp;
				int32_t offset = ir_ref_spill_slot_offset(ctx, def, &fp);

				/*
				 * NOTE(review): AH is stored directly to the spill slot. AH cannot
				 * be encoded in an instruction that needs a REX prefix, so this
				 * presumably relies on "fp" being a legacy base register - confirm.
				 */
				|	mov byte [Ra(fp)+offset], ah
			}
		} else {
			/* for wider types the remainder is taken from RDX */
			if (def_reg != IR_REG_NONE) {
				if (def_reg != IR_REG_RDX) {
					ir_emit_mov(ctx, type, def_reg, IR_REG_RDX);
				}
				if (IR_REG_SPILLED(ctx->regs[def][0])) {
					ir_emit_store(ctx, type, def, def_reg);
				}
			} else {
				ir_emit_store(ctx, type, def, IR_REG_RDX);
			}
		}
	}
}

/*
 * Switches DynAsm output to the "rodata" section. On the first call only it
 * also picks the rodata label number (cfg_blocks_count + consts_count + 2),
 * remembers it in data->rodata_label and defines that label.
 */
static void ir_rodata(ir_ctx *ctx)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	int label;

	|.rodata
	if (data->rodata_label) {
		/* the label has already been defined by an earlier call */
		return;
	}

	data->rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;
	label = data->rodata_label;
	|=>label:
}

/*
 * Emits a floating-point unary operation (NEG or ABS) for "def".
 *
 * ctx  - code generation context (register assignments, DynAsm state)
 * def  - reference of the instruction being emitted
 * insn - the instruction itself; insn->op selects NEG/ABS, insn->type selects
 *        double/float
 *
 * op1 is first brought into the result register. NEG then XORs it with a mask
 * that has only the sign bit set, ABS ANDs it with a mask that has only the
 * sign bit cleared. Each 16-byte aligned mask is emitted into the rodata
 * section the first time it is needed; the data->*_const flags remember that.
 */
static void ir_emit_op_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];

	IR_ASSERT(def_reg != IR_REG_NONE);

	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	/* the operation works in place on def_reg, so op1 has to be there */
	if (def_reg != op1_reg) {
		if (op1_reg != IR_REG_NONE) {
			ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
		} else {
			ir_emit_load(ctx, type, def_reg, op1);
		}
	}
	if (insn->op == IR_NEG) {
		if (insn->type == IR_DOUBLE) {
			if (!data->double_neg_const) {
				/* first use: emit the mask with bit 63 set, then return to the code section */
				data->double_neg_const = 1;
				ir_rodata(ctx);
				|.align 16
				|->double_neg_const:
				|.dword 0, 0x80000000, 0, 0
				|.code
			}
			if (ctx->mflags & IR_X86_AVX) {
				|	vxorpd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->double_neg_const]
			} else {
				|	xorpd xmm(def_reg-IR_REG_FP_FIRST), [->double_neg_const]
			}
		} else {
			IR_ASSERT(insn->type == IR_FLOAT);
			if (!data->float_neg_const) {
				/* first use: emit the mask with bit 31 set */
				data->float_neg_const = 1;
				ir_rodata(ctx);
				|.align 16
				|->float_neg_const:
				|.dword 0x80000000, 0, 0, 0
				|.code
			}
			if (ctx->mflags & IR_X86_AVX) {
				|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->float_neg_const]
			} else {
				|	xorps xmm(def_reg-IR_REG_FP_FIRST), [->float_neg_const]
			}
		}
	} else {
		IR_ASSERT(insn->op == IR_ABS);
		if (insn->type == IR_DOUBLE) {
			if (!data->double_abs_const) {
				/* first use: emit the mask with every bit of the low 64 bits set except bit 63 */
				data->double_abs_const = 1;
				ir_rodata(ctx);
				|.align 16
				|->double_abs_const:
				|.dword 0xffffffff, 0x7fffffff, 0, 0
				|.code
			}
			if (ctx->mflags & IR_X86_AVX) {
				|	vandpd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->double_abs_const]
			} else {
				|	andpd xmm(def_reg-IR_REG_FP_FIRST), [->double_abs_const]
			}
		} else {
			IR_ASSERT(insn->type == IR_FLOAT);
			if (!data->float_abs_const) {
				/* first use: emit the mask with every bit of the low 32 bits set except bit 31 */
				data->float_abs_const = 1;
				ir_rodata(ctx);
				|.align 16
				|->float_abs_const:
				|.dword 0x7fffffff, 0, 0, 0
				|.code
			}
			if (ctx->mflags & IR_X86_AVX) {
				|	vandps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), [->float_abs_const]
			} else {
				|	andps xmm(def_reg-IR_REG_FP_FIRST), [->float_abs_const]
			}
		}
	}
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, insn->type, def, def_reg);
	}
}

/*
 * Emits a floating-point binary operation (ADD, SUB, MUL, DIV, MIN, MAX) in
 * the two-operand SSE2 form, where the result register is also the first
 * source operand.
 *
 * ctx  - code generation context (register assignments, DynAsm state)
 * def  - reference of the instruction being emitted
 * insn - the instruction itself (op1 and op2 are the two operands)
 *
 * op1 is first brought into def_reg. The second operand is then taken from a
 * register, from the label of a constant, or from memory (a fused load or the
 * spill slot of op2).
 *
 * In builds where IR_ASSERT() does not abort, an unsupported opcode falls
 * through from "default" into the IR_ADD case.
 */
static void ir_emit_binop_sse2(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_ref op2 = insn->op2;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	ir_reg op2_reg = ctx->regs[def][2];

	IR_ASSERT(def_reg != IR_REG_NONE);

	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	if (def_reg != op1_reg) {
		if (op1_reg != IR_REG_NONE) {
			ir_emit_fp_mov(ctx, type, def_reg, op1_reg);
		} else {
			ir_emit_load(ctx, type, def_reg, op1);
		}
		if (op1 == op2) {
			/* both operands are the same value, which is now in def_reg */
			op2_reg = def_reg;
		}
	}
	if (op2_reg != IR_REG_NONE) {
		/* second operand in a register */
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			if (op1 != op2) {
				ir_emit_load(ctx, type, op2_reg, op2);
			}
		}
		switch (insn->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
				|	ASM_SSE2_REG_REG_OP adds, type, def_reg, op2_reg
				break;
			case IR_SUB:
				|	ASM_SSE2_REG_REG_OP subs, type, def_reg, op2_reg
				break;
			case IR_MUL:
				|	ASM_SSE2_REG_REG_OP muls, type, def_reg, op2_reg
				break;
			case IR_DIV:
				|	ASM_SSE2_REG_REG_OP divs, type, def_reg, op2_reg
				break;
			case IR_MIN:
				|	ASM_SSE2_REG_REG_OP mins, type, def_reg, op2_reg
				break;
			case IR_MAX:
				|	ASM_SSE2_REG_REG_OP maxs, type, def_reg, op2_reg
				break;
		}
	} else if (IR_IS_CONST_REF(op2)) {
		/* second operand is a constant addressed through its label */
		int label = ir_const_label(ctx, op2);

		switch (insn->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
				|	ASM_SSE2_REG_TXT_OP adds, type, def_reg, [=>label]
				break;
			case IR_SUB:
				|	ASM_SSE2_REG_TXT_OP subs, type, def_reg, [=>label]
				break;
			case IR_MUL:
				|	ASM_SSE2_REG_TXT_OP muls, type, def_reg, [=>label]
				break;
			case IR_DIV:
				|	ASM_SSE2_REG_TXT_OP divs, type, def_reg, [=>label]
				break;
			case IR_MIN:
				|	ASM_SSE2_REG_TXT_OP mins, type, def_reg, [=>label]
				break;
			case IR_MAX:
				|	ASM_SSE2_REG_TXT_OP maxs, type, def_reg, [=>label]
				break;
		}
	} else {
		/* second operand in memory: a fused load or the spill slot of op2 */
		ir_mem mem;

		if (ir_rule(ctx, op2) & IR_FUSED) {
			mem = ir_fuse_load(ctx, def, op2);
		} else {
			mem = ir_ref_spill_slot(ctx, op2);
		}
		switch (insn->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
				|	ASM_SSE2_REG_MEM_OP adds, type, def_reg, mem
				break;
			case IR_SUB:
				|	ASM_SSE2_REG_MEM_OP subs, type, def_reg, mem
				break;
			case IR_MUL:
				|	ASM_SSE2_REG_MEM_OP muls, type, def_reg, mem
				break;
			case IR_DIV:
				|	ASM_SSE2_REG_MEM_OP divs, type, def_reg, mem
				break;
			case IR_MIN:
				|	ASM_SSE2_REG_MEM_OP mins, type, def_reg, mem
				break;
			case IR_MAX:
				|	ASM_SSE2_REG_MEM_OP maxs, type, def_reg, mem
				break;
		}
	}
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, insn->type, def, def_reg);
	}
}

/*
 * Emits a floating-point binary operation (ADD, SUB, MUL, DIV, MIN, MAX) in
 * the three-operand AVX form: def_reg = op1_reg <op> op2.
 *
 * ctx  - code generation context (register assignments, DynAsm state)
 * def  - reference of the instruction being emitted
 * insn - the instruction itself (op1 and op2 are the two operands)
 *
 * Unlike the SSE2 variant, op1 is not copied into the result register; it
 * must have a register of its own (asserted below). The second operand is
 * taken from a register, from the label of a constant, or from memory (a
 * fused load or the spill slot of op2).
 *
 * In builds where IR_ASSERT() does not abort, an unsupported opcode falls
 * through from "default" into the IR_ADD case.
 */
static void ir_emit_binop_avx(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_ref op2 = insn->op2;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	ir_reg op2_reg = ctx->regs[def][2];

	IR_ASSERT(def_reg != IR_REG_NONE && op1_reg != IR_REG_NONE);

	if (IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	if (op2_reg != IR_REG_NONE) {
		/* second operand in a register */
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			if (op1 != op2) {
				ir_emit_load(ctx, type, op2_reg, op2);
			}
		}
		switch (insn->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
				|	ASM_AVX_REG_REG_REG_OP vadds, type, def_reg, op1_reg, op2_reg
				break;
			case IR_SUB:
				|	ASM_AVX_REG_REG_REG_OP vsubs, type, def_reg, op1_reg, op2_reg
				break;
			case IR_MUL:
				|	ASM_AVX_REG_REG_REG_OP vmuls, type, def_reg, op1_reg, op2_reg
				break;
			case IR_DIV:
				|	ASM_AVX_REG_REG_REG_OP vdivs, type, def_reg, op1_reg, op2_reg
				break;
			case IR_MIN:
				|	ASM_AVX_REG_REG_REG_OP vmins, type, def_reg, op1_reg, op2_reg
				break;
			case IR_MAX:
				|	ASM_AVX_REG_REG_REG_OP vmaxs, type, def_reg, op1_reg, op2_reg
				break;
		}
	} else if (IR_IS_CONST_REF(op2)) {
		/* second operand is a constant addressed through its label */
		int label = ir_const_label(ctx, op2);

		switch (insn->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
				|	ASM_AVX_REG_REG_TXT_OP vadds, type, def_reg, op1_reg, [=>label]
				break;
			case IR_SUB:
				|	ASM_AVX_REG_REG_TXT_OP vsubs, type, def_reg, op1_reg, [=>label]
				break;
			case IR_MUL:
				|	ASM_AVX_REG_REG_TXT_OP vmuls, type, def_reg, op1_reg, [=>label]
				break;
			case IR_DIV:
				|	ASM_AVX_REG_REG_TXT_OP vdivs, type, def_reg, op1_reg, [=>label]
				break;
			case IR_MIN:
				|	ASM_AVX_REG_REG_TXT_OP vmins, type, def_reg, op1_reg, [=>label]
				break;
			case IR_MAX:
				|	ASM_AVX_REG_REG_TXT_OP vmaxs, type, def_reg, op1_reg, [=>label]
				break;
		}
	} else {
		/* second operand in memory: a fused load or the spill slot of op2 */
		ir_mem mem;

		if (ir_rule(ctx, op2) & IR_FUSED) {
			mem = ir_fuse_load(ctx, def, op2);
		} else {
			mem = ir_ref_spill_slot(ctx, op2);
		}
		switch (insn->op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_ADD:
				|	ASM_AVX_REG_REG_MEM_OP vadds, type, def_reg, op1_reg, mem
				break;
			case IR_SUB:
				|	ASM_AVX_REG_REG_MEM_OP vsubs, type, def_reg, op1_reg, mem
				break;
			case IR_MUL:
				|	ASM_AVX_REG_REG_MEM_OP vmuls, type, def_reg, op1_reg, mem
				break;
			case IR_DIV:
				|	ASM_AVX_REG_REG_MEM_OP vdivs, type, def_reg, op1_reg, mem
				break;
			case IR_MIN:
				|	ASM_AVX_REG_REG_MEM_OP vmins, type, def_reg, op1_reg, mem
				break;
			case IR_MAX:
				|	ASM_AVX_REG_REG_MEM_OP vmaxs, type, def_reg, op1_reg, mem
				break;
		}
	}
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, insn->type, def, def_reg);
	}
}

/*
 * Emits the flag-setting instruction of an integer comparison "op1 ? op2".
 *
 * The operand registers are passed in already resolved (IR_REG_NONE when the
 * operand has no register). Depending on where the operands live one of the
 * following is emitted:
 *   - cmp reg, reg
 *   - test reg, reg   (op2 is a non-symbolic constant equal to zero)
 *   - cmp reg, imm
 *   - cmp reg, mem
 *   - cmp mem, reg / cmp mem, imm   (op1 has no register)
 * Memory operands are either fused loads (relative to "root") or spill slots.
 * A constant op1 without a register is not supported (asserted).
 */
static void ir_emit_cmp_int_common(ir_ctx *ctx, ir_type type, ir_ref root, ir_insn *insn, ir_reg op1_reg, ir_ref op1, ir_reg op2_reg, ir_ref op2)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_mem mem;
	int32_t val;

	if (op1_reg == IR_REG_NONE) {
		if (IR_IS_CONST_REF(op1)) {
			IR_ASSERT(0);
			return;
		}

		/* the left operand is compared directly in memory */
		mem = (ir_rule(ctx, op1) & IR_FUSED)
			? ir_fuse_load(ctx, root, op1)
			: ir_ref_spill_slot(ctx, op1);
		if (op2_reg != IR_REG_NONE) {
			|	ASM_MEM_REG_OP cmp, type, mem, op2_reg
		} else {
			val = ir_fuse_imm(ctx, op2);
			|	ASM_MEM_IMM_OP cmp, type, mem, val
		}
		return;
	}

	/* the left operand is in a register */
	if (op2_reg != IR_REG_NONE) {
		|	ASM_REG_REG_OP cmp, type, op1_reg, op2_reg
	} else if (!IR_IS_CONST_REF(op2)) {
		mem = (ir_rule(ctx, op2) & IR_FUSED)
			? ir_fuse_load(ctx, root, op2)
			: ir_ref_spill_slot(ctx, op2);
		|	ASM_REG_MEM_OP cmp, type, op1_reg, mem
	} else if (!IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
		/* comparison with zero */
		|	ASM_REG_REG_OP test, type, op1_reg, op1_reg
	} else {
		val = ir_fuse_imm(ctx, op2);
		|	ASM_REG_IMM_OP cmp, type, op1_reg, val
	}
}

/*
 * Emits the flag-setting part of the integer comparison "cmp_insn" whose
 * register assignment is stored under "ref".
 *
 * Operands whose registers are flagged as spilled are reloaded first (the
 * right one only when it is a different value than the left one); the actual
 * comparison is delegated to ir_emit_cmp_int_common().
 */
static void ir_emit_cmp_int_common2(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_insn *cmp_insn)
{
	ir_ref lhs = cmp_insn->op1;
	ir_ref rhs = cmp_insn->op2;
	ir_type cmp_type = ctx->ir_base[lhs].type;
	ir_reg lhs_reg = ctx->regs[ref][1];
	ir_reg rhs_reg = ctx->regs[ref][2];

	if (lhs_reg != IR_REG_NONE && IR_REG_SPILLED(lhs_reg)) {
		lhs_reg = IR_REG_NUM(lhs_reg);
		ir_emit_load(ctx, cmp_type, lhs_reg, lhs);
	}

	if (rhs_reg != IR_REG_NONE && IR_REG_SPILLED(rhs_reg)) {
		rhs_reg = IR_REG_NUM(rhs_reg);
		if (lhs != rhs) {
			ir_emit_load(ctx, cmp_type, rhs_reg, rhs);
		}
	}

	ir_emit_cmp_int_common(ctx, cmp_type, root, cmp_insn, lhs_reg, lhs, rhs_reg, rhs);
}

/*
 * Emits the SETcc instruction that turns the current flags into the boolean
 * result of the integer comparison "op" in the low byte of def_reg.
 *
 * With after_op set, IR_LT and IR_GE are decided by the sign flag alone
 * (sets/setns) instead of setl/setge; all other conditions are unaffected.
 * In builds where IR_ASSERT() does not abort, an unknown "op" is treated
 * like IR_EQ.
 */
static void _ir_emit_setcc_int(ir_ctx *ctx, uint8_t op, ir_reg def_reg, bool after_op)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	if (after_op) {
		/* sign-flag based variants */
		if (op == IR_LT) {
			|	sets Rb(def_reg)
			return;
		}
		if (op == IR_GE) {
			|	setns Rb(def_reg)
			return;
		}
	}

	switch (op) {
		case IR_NE:
			|	setne Rb(def_reg)
			break;
		case IR_LT:
			|	setl Rb(def_reg)
			break;
		case IR_GE:
			|	setge Rb(def_reg)
			break;
		case IR_LE:
			|	setle Rb(def_reg)
			break;
		case IR_GT:
			|	setg Rb(def_reg)
			break;
		case IR_ULT:
			|	setb Rb(def_reg)
			break;
		case IR_UGE:
			|	setae Rb(def_reg)
			break;
		case IR_ULE:
			|	setbe Rb(def_reg)
			break;
		case IR_UGT:
			|	seta Rb(def_reg)
			break;
		default:
			IR_ASSERT(0 && "NIY binary op");
			/* fallthrough */
		case IR_EQ:
			|	sete Rb(def_reg)
			break;
	}
}

/*
 * Emits the SETcc instruction that stores the boolean result of the integer
 * comparison "op" as a byte directly at the memory location "mem".
 *
 * In builds where IR_ASSERT() does not abort, an unknown "op" is treated
 * like IR_EQ.
 */
static void _ir_emit_setcc_int_mem(ir_ctx *ctx, uint8_t op, ir_mem mem)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	switch (op) {
		/* signed conditions */
		case IR_LT:
			|	ASM_TMEM_OP setl, byte, mem
			break;
		case IR_GE:
			|	ASM_TMEM_OP setge, byte, mem
			break;
		case IR_LE:
			|	ASM_TMEM_OP setle, byte, mem
			break;
		case IR_GT:
			|	ASM_TMEM_OP setg, byte, mem
			break;

		/* unsigned conditions */
		case IR_ULT:
			|	ASM_TMEM_OP setb, byte, mem
			break;
		case IR_UGE:
			|	ASM_TMEM_OP setae, byte, mem
			break;
		case IR_ULE:
			|	ASM_TMEM_OP setbe, byte, mem
			break;
		case IR_UGT:
			|	ASM_TMEM_OP seta, byte, mem
			break;

		/* equality */
		case IR_NE:
			|	ASM_TMEM_OP setne, byte, mem
			break;
		default:
			IR_ASSERT(0 && "NIY binary op");
			/* fallthrough */
		case IR_EQ:
			|	ASM_TMEM_OP sete, byte, mem
			break;
	}
}

/*
 * Emits an integer comparison that produces a boolean value in a register.
 *
 * Unsigned comparisons against a non-symbolic zero constant are simplified:
 *   x <  0  is always false (the result register is cleared),
 *   x >= 0  is always true  (the result register is set to 1),
 *   x <= 0  becomes x == 0,
 *   x >  0  becomes x != 0.
 * In all other cases the flags are set by ir_emit_cmp_int_common() and
 * converted to a value by _ir_emit_setcc_int(). The result is stored back
 * when the result register is flagged as spilled.
 */
static void ir_emit_cmp_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_op op = insn->op;
	ir_ref op1 = insn->op1;
	ir_ref op2 = insn->op2;
	ir_type type = ctx->ir_base[op1].type;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	ir_reg op2_reg = ctx->regs[def][2];
	bool cmp_with_zero;

	IR_ASSERT(def_reg != IR_REG_NONE);

	/* reload operands whose registers are flagged as spilled */
	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
		op2_reg = IR_REG_NUM(op2_reg);
		if (op1 != op2) {
			ir_emit_load(ctx, type, op2_reg, op2);
		}
	}

	cmp_with_zero = IR_IS_CONST_REF(op2)
		&& !IR_IS_SYM_CONST(ctx->ir_base[op2].op)
		&& ctx->ir_base[op2].val.u64 == 0;

	if (cmp_with_zero && op == IR_ULT) {
		/* always false */
		|	xor Ra(def_reg), Ra(def_reg)
	} else if (cmp_with_zero && op == IR_UGE) {
		/* always true */
		|	ASM_REG_IMM_OP mov, insn->type, def_reg, 1
	} else {
		if (cmp_with_zero) {
			if (op == IR_ULE) {
				op = IR_EQ;
			} else if (op == IR_UGT) {
				op = IR_NE;
			}
		}
		ir_emit_cmp_int_common(ctx, type, def, insn, op1_reg, op1, op2_reg, op2);
		_ir_emit_setcc_int(ctx, op, def_reg, 0);
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, insn->type, def, def_reg);
	}
}

/*
 * Emits the flag-setting "test" instruction for the AND instruction "ref",
 * whose result is only used by a following comparison/branch ("op").
 *
 * ctx  - code generation context (register assignments, DynAsm state)
 * root - instruction on whose behalf loads are fused
 * ref  - reference of the IR_AND instruction
 * op   - condition that will consume the flags
 *
 * For (in)equality tests against some all-ones masks a narrower
 * "test reg, reg" of the matching sub-register is emitted instead of a test
 * with an immediate:
 *   0xff   -> test r8, r8    (on 32-bit targets only for the first four registers)
 *   0xff00 -> test ah/bh/ch/dh (first four registers only)
 *   0xffff -> test r16, r16
 *   -1     -> test r32, r32  (only when the operand itself is 4 bytes wide)
 * The last shortcut is limited to 4-byte operands because a 32-bit test does
 * not cover exactly the bits of a value of any other width; those fall back
 * to the generic "test reg, imm" of the operand's own size.
 */
static void ir_emit_test_int_common(ir_ctx *ctx, ir_ref root, ir_ref ref, ir_op op)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_insn *binop_insn = &ctx->ir_base[ref];
	ir_type type = binop_insn->type;
	ir_ref op1 = binop_insn->op1;
	ir_ref op2 = binop_insn->op2;
	ir_reg op1_reg = ctx->regs[ref][1];
	ir_reg op2_reg = ctx->regs[ref][2];

	IR_ASSERT(binop_insn->op == IR_AND);
	if (op1_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op1_reg)) {
			op1_reg = IR_REG_NUM(op1_reg);
			ir_emit_load(ctx, type, op1_reg, op1);
		}
		if (op2_reg != IR_REG_NONE) {
			if (IR_REG_SPILLED(op2_reg)) {
				op2_reg = IR_REG_NUM(op2_reg);
				if (op1 != op2) {
					ir_emit_load(ctx, type, op2_reg, op2);
				}
			}
			|	ASM_REG_REG_OP test, type, op1_reg, op2_reg
		} else if (IR_IS_CONST_REF(op2)) {
			int32_t val = ir_fuse_imm(ctx, op2);
			bool eq_or_ne = (op == IR_EQ || op == IR_NE);

			if (eq_or_ne && val == 0xff && (sizeof(void*) == 8 || op1_reg <= IR_REG_R3)) {
				|	test Rb(op1_reg), Rb(op1_reg)
			} else if (eq_or_ne && val == 0xff00 && op1_reg <= IR_REG_R3) {
				if (op1_reg == IR_REG_RAX) {
					|	test ah, ah
				} else if (op1_reg == IR_REG_RBX) {
					|	test bh, bh
				} else if (op1_reg == IR_REG_RCX) {
					|	test ch, ch
				} else if (op1_reg == IR_REG_RDX) {
					|	test dh, dh
				} else {
					IR_ASSERT(0);
				}
			} else if (eq_or_ne && val == 0xffff) {
				|	test Rw(op1_reg), Rw(op1_reg)
			} else if (eq_or_ne && val == -1 && ir_type_size[type] == 4) {
				/* all-ones mask of a 4-byte operand: test the whole 32-bit register */
				|	test Rd(op1_reg), Rd(op1_reg)
			} else {
				|	ASM_REG_IMM_OP test, type, op1_reg, val
			}
		} else {
			ir_mem mem;

			if (ir_rule(ctx, op2) & IR_FUSED) {
				mem = ir_fuse_load(ctx, root, op2);
			} else {
				mem = ir_ref_spill_slot(ctx, op2);
			}
			|	ASM_REG_MEM_OP test, type, op1_reg, mem
		}
	} else if (IR_IS_CONST_REF(op1)) {
		IR_ASSERT(0);
	} else {
		/* the left operand is tested directly in memory */
		ir_mem mem;

		if (ir_rule(ctx, op1) & IR_FUSED) {
			mem = ir_fuse_load(ctx, root, op1);
		} else {
			mem = ir_ref_spill_slot(ctx, op1);
		}
		if (op2_reg != IR_REG_NONE) {
			if (IR_REG_SPILLED(op2_reg)) {
				op2_reg = IR_REG_NUM(op2_reg);
				if (op1 != op2) {
					ir_emit_load(ctx, type, op2_reg, op2);
				}
			}
			|	ASM_MEM_REG_OP test, type, mem, op2_reg
		} else {
			IR_ASSERT(!IR_IS_CONST_REF(op1));
			int32_t val = ir_fuse_imm(ctx, op2);
			|	ASM_MEM_IMM_OP test, type, mem, val
		}
	}
}

/*
 * Emits a comparison whose operand (insn->op1) is an AND instruction: the
 * flags are produced by ir_emit_test_int_common() and turned into a boolean
 * in the result register by _ir_emit_setcc_int(). The result is stored back
 * when the result register is flagged as spilled.
 */
static void ir_emit_testcc_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_reg result_reg = IR_REG_NUM(ctx->regs[def][0]);

	IR_ASSERT(result_reg != IR_REG_NONE);

	ir_emit_test_int_common(ctx, def, insn->op1, insn->op);
	_ir_emit_setcc_int(ctx, insn->op, result_reg, 0);

	if (!IR_REG_SPILLED(ctx->regs[def][0])) {
		return;
	}
	ir_emit_store(ctx, insn->type, def, result_reg);
}

/*
 * Emits only the SETcc part of a comparison: no flag-setting instruction is
 * emitted here, and _ir_emit_setcc_int() is called with after_op set. The
 * result is stored back when the result register is flagged as spilled.
 */
static void ir_emit_setcc_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_reg result_reg = IR_REG_NUM(ctx->regs[def][0]);

	IR_ASSERT(result_reg != IR_REG_NONE);

	_ir_emit_setcc_int(ctx, insn->op, result_reg, 1);

	if (!IR_REG_SPILLED(ctx->regs[def][0])) {
		return;
	}
	ir_emit_store(ctx, insn->type, def, result_reg);
}

/*
 * Emits the flag-setting "ucomis" instruction of the floating-point
 * comparison "cmp_insn" (register assignment stored under "cmp_ref") and
 * returns the comparison opcode for the caller to act on.
 *
 * The left operand must be in a register. For EQ/NE the operands are swapped
 * when only the right one has a register. The right operand may be a
 * register, the label of a constant, or memory (a load fused relative to
 * "root" or the spill slot of the operand).
 */
static ir_op ir_emit_cmp_fp_common(ir_ctx *ctx, ir_ref root, ir_ref cmp_ref, ir_insn *cmp_insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = ctx->ir_base[cmp_insn->op1].type;
	ir_op op = cmp_insn->op;
	ir_ref op1 = cmp_insn->op1;
	ir_ref op2 = cmp_insn->op2;
	ir_reg op1_reg = ctx->regs[cmp_ref][1];
	ir_reg op2_reg = ctx->regs[cmp_ref][2];

	if ((op == IR_EQ || op == IR_NE) && op1_reg == IR_REG_NONE && op2_reg != IR_REG_NONE) {
		/* symmetric comparison: make the register operand the left one */
		SWAP_REFS(op1, op2);
		op1_reg = op2_reg;
		op2_reg = IR_REG_NONE;
	}

	IR_ASSERT(op1_reg != IR_REG_NONE);
	if (IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}

	if (op2_reg == IR_REG_NONE) {
		if (IR_IS_CONST_REF(op2)) {
			int label = ir_const_label(ctx, op2);

			|	ASM_FP_REG_TXT_OP ucomis, type, op1_reg, [=>label]
		} else {
			ir_mem mem = (ir_rule(ctx, op2) & IR_FUSED)
				? ir_fuse_load(ctx, root, op2)
				: ir_ref_spill_slot(ctx, op2);

			|	ASM_FP_REG_MEM_OP ucomis, type, op1_reg, mem
		}
	} else {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			if (op1 != op2) {
				ir_emit_load(ctx, type, op2_reg, op2);
			}
		}
		|	ASM_FP_REG_REG_OP ucomis, type, op1_reg, op2_reg
	}

	return op;
}

/*
 * Emits a floating-point comparison that produces a boolean value in a
 * register.
 *
 * ctx  - code generation context (register assignments, DynAsm state)
 * def  - reference of the instruction being emitted
 * insn - the comparison instruction
 *
 * The flags are set by ir_emit_cmp_fp_common(). Conditions that also have to
 * take the parity flag into account are built in two steps: a SETcc on the
 * parity flag gives the preliminary result, then a CMOVcc overrides it with
 * the constant held in the temporary register ctx->regs[def][3].
 * NOTE(review): this relies on ucomis reporting an unordered result through
 * the parity flag (see the UCOMISS/UCOMISD documentation).
 *
 * In builds where IR_ASSERT() does not abort, an unknown opcode falls through
 * from "default" into the IR_EQ case.
 */
static void ir_emit_cmp_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_op op = ir_emit_cmp_fp_common(ctx, def, def, insn);
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg tmp_reg = ctx->regs[def][3];

	IR_ASSERT(def_reg != IR_REG_NONE);
	switch (op) {
		default:
			IR_ASSERT(0 && "NIY binary op");
		case IR_EQ:
			/* start with "parity clear", then force 0 when "not equal" */
			|	setnp Rb(def_reg)
			|	mov Rd(tmp_reg), 0
			|	cmovne Rd(def_reg), Rd(tmp_reg)
			break;
		case IR_NE:
			/* start with "parity set", then force 1 when "not equal" */
			|	setp Rb(def_reg)
			|	mov Rd(tmp_reg), 1
			|	cmovne Rd(def_reg), Rd(tmp_reg)
			break;
		case IR_LT:
			/* start with "parity clear", then force 0 when "above or equal" */
			|	setnp Rb(def_reg)
			|	mov Rd(tmp_reg), 0
			|	cmovae Rd(def_reg), Rd(tmp_reg)
			break;
		case IR_GE:
			|	setae Rb(def_reg)
			break;
		case IR_LE:
			/* start with "parity clear", then force 0 when "above" */
			|	setnp Rb(def_reg)
			|	mov Rd(tmp_reg), 0
			|	cmova Rd(def_reg), Rd(tmp_reg)
			break;
		case IR_GT:
			|	seta Rb(def_reg)
			break;
		case IR_ULT:
			|	setb Rb(def_reg)
			break;
		case IR_UGE:
			/* start with "parity set", then force 1 when "above or equal" */
			|	setp Rb(def_reg)
			|	mov Rd(tmp_reg), 1
			|	cmovae Rd(def_reg), Rd(tmp_reg)
			break;
		case IR_ULE:
			|	setbe Rb(def_reg)
			break;
		case IR_UGT:
			/* start with "parity set", then force 1 when "above" */
			|	setp Rb(def_reg)
			|	mov Rd(tmp_reg), 1
			|	cmova Rd(def_reg), Rd(tmp_reg)
			break;
		case IR_ORDERED:
			|	setnp Rb(def_reg)
			break;
		case IR_UNORDERED:
			|	setp Rb(def_reg)
			break;
	}
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, insn->type, def, def_reg);
	}
}

/*
 * Emits an unconditional jump to the "true" successor of block "b", unless
 * that successor is the block that is emitted next (next_block).
 */
static void ir_emit_jmp_true(ir_ctx *ctx, uint32_t b, ir_ref def, uint32_t next_block)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	uint32_t true_block, false_block;

	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
	if (true_block == next_block) {
		/* the target directly follows: no jump needed */
		return;
	}
	|	jmp =>true_block
}

/*
 * Emits an unconditional jump to the "false" successor of block "b", unless
 * that successor is the block that is emitted next (next_block).
 */
static void ir_emit_jmp_false(ir_ctx *ctx, uint32_t b, ir_ref def, uint32_t next_block)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	uint32_t true_block, false_block;

	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
	if (false_block == next_block) {
		/* the target directly follows: no jump needed */
		return;
	}
	|	jmp =>false_block
}

/*
 * Emits the conditional jump(s) that end block "b", based on flags that were
 * set by a preceding comparison.
 *
 * ctx        - code generation context (DynAsm state)
 * b          - number of the block being terminated
 * def, insn  - not used in this function
 * next_block - block that is emitted right after "b"
 * op         - comparison opcode that selects the condition
 * int_cmp    - true for integer flags, false for flags set by a
 *              floating-point comparison
 * after_op   - for integer IR_LT/IR_GE: use the sign flag (js/jns) instead
 *              of jl/jge
 *
 * If the "true" successor is the next block, the condition is reversed and
 * the jump goes to the "false" successor instead, so no unconditional jump
 * is required. If neither successor is the next block, an unconditional jump
 * to the "false" successor follows the conditional one.
 *
 * Floating-point conditions additionally test the parity flag (jp/jnp).
 *
 * In builds where IR_ASSERT() does not abort, an unknown opcode falls through
 * from "default" into the IR_EQ case.
 */
static void ir_emit_jcc(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block, uint8_t op, bool int_cmp, bool after_op)
{
	uint32_t true_block, false_block;
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	ir_get_true_false_blocks(ctx, b, &true_block, &false_block);
	if (true_block == next_block) {
		/* swap to avoid unconditional JMP */
		/* NOTE(review): the XOR masks presumably rely on the numbering of the ir_op comparison opcodes - confirm */
		if (int_cmp || op == IR_EQ || op == IR_NE || op == IR_ORDERED || op == IR_UNORDERED) {
			op ^= 1; // reverse
		} else {
			op ^= 5; // reverse
		}
		true_block = false_block;
		false_block = 0;
	} else if (false_block == next_block) {
		/* the "false" successor follows directly: no explicit jump to it */
		false_block = 0;
	}

	if (int_cmp) {
		switch (op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_EQ:
				|	je =>true_block
				break;
			case IR_NE:
				|	jne =>true_block
				break;
			case IR_LT:
				if (after_op) {
					|	js =>true_block
				} else {
					|	jl =>true_block
				}
				break;
			case IR_GE:
				if (after_op) {
					|	jns =>true_block
				} else {
					|	jge =>true_block
				}
				break;
			case IR_LE:
				|	jle =>true_block
				break;
			case IR_GT:
				|	jg =>true_block
				break;
			case IR_ULT:
				|	jb =>true_block
				break;
			case IR_UGE:
				|	jae =>true_block
				break;
			case IR_ULE:
				|	jbe =>true_block
				break;
			case IR_UGT:
				|	ja =>true_block
				break;
		}
	} else {
		switch (op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_EQ:
				/* with parity set the jump to true_block is skipped */
				if (!false_block) {
					|	jp >1
					|	je =>true_block
					|1:
				} else {
					|	jp =>false_block
					|	je =>true_block
				}
				break;
			case IR_NE:
				|	jne =>true_block
				|	jp =>true_block
				break;
			case IR_LT:
				/* with parity set the jump to true_block is skipped */
				if (!false_block) {
					|	jp >1
					|	jb =>true_block
					|1:
				} else {
					|	jp =>false_block
					|	jb =>true_block
				}
				break;
			case IR_GE:
				|	jae =>true_block
				break;
			case IR_LE:
				/* with parity set the jump to true_block is skipped */
				if (!false_block) {
					|	jp >1
					|	jbe =>true_block
					|1:
				} else {
					|	jp =>false_block
					|	jbe =>true_block
				}
				break;
			case IR_GT:
				|	ja =>true_block
				break;
			case IR_ULT:
				|	jb =>true_block
				break;
			case IR_UGE:
				|	jp =>true_block
				|	jae =>true_block
				break;
			case IR_ULE:
				|	jbe =>true_block
				break;
			case IR_UGT:
				|	jp =>true_block
				|	ja =>true_block
				break;
			case IR_ORDERED:
				|	jnp =>true_block
				break;
			case IR_UNORDERED:
				|	jp =>true_block
				break;
		}
	}
	if (false_block) {
		|	jmp =>false_block
	}
}

/*
 * Emits a fused integer compare-and-branch: "insn" is the branch, insn->op2
 * the comparison it depends on.
 *
 * Unsigned comparisons against a non-symbolic zero constant are simplified:
 * "x < 0" and "x >= 0" become unconditional jumps, "x <= 0" becomes "x == 0"
 * and "x > 0" becomes "x != 0".
 *
 * The compare instruction itself is skipped when this block is entered
 * through IF_TRUE/IF_FALSE of another compare-and-branch on the very same
 * two operands; only the conditional jump is emitted in that case.
 */
static void ir_emit_cmp_and_branch_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
{
	ir_ref cmp_ref = insn->op2;
	ir_insn *cmp_insn = &ctx->ir_base[cmp_ref];
	ir_op op = cmp_insn->op;
	ir_ref op1 = cmp_insn->op1;
	ir_ref op2 = cmp_insn->op2;
	ir_type type = ctx->ir_base[op1].type;
	ir_reg op1_reg = ctx->regs[cmp_ref][1];
	ir_reg op2_reg = ctx->regs[cmp_ref][2];
	ir_insn *prev_insn;
	bool same_comparison = false;

	/* reload operands whose registers are flagged as spilled */
	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
		op2_reg = IR_REG_NUM(op2_reg);
		if (op1 != op2) {
			ir_emit_load(ctx, type, op2_reg, op2);
		}
	}

	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
		switch (op) {
			case IR_ULT:
				/* always false */
				ir_emit_jmp_false(ctx, b, def, next_block);
				return;
			case IR_UGE:
				/* always true */
				ir_emit_jmp_true(ctx, b, def, next_block);
				return;
			case IR_ULE:
				op = IR_EQ;
				break;
			case IR_UGT:
				op = IR_NE;
				break;
			default:
				break;
		}
	}

	/* was the same comparison already done by the branch leading here? */
	prev_insn = &ctx->ir_base[insn->op1];
	if ((prev_insn->op == IR_IF_TRUE || prev_insn->op == IR_IF_FALSE)
	 && ir_rule(ctx, prev_insn->op1) == IR_CMP_AND_BRANCH_INT) {
		ir_insn *prev_if = &ctx->ir_base[prev_insn->op1];
		ir_insn *prev_cmp = &ctx->ir_base[prev_if->op2];

		same_comparison = (prev_cmp->op1 == cmp_insn->op1 && prev_cmp->op2 == cmp_insn->op2);
	}

	if (!same_comparison) {
		ir_emit_cmp_int_common(ctx, type, def, cmp_insn, op1_reg, op1, op2_reg, op2);
	}
	ir_emit_jcc(ctx, b, def, insn, next_block, op, 1, 0);
}

/*
 * Emits a fused test-and-branch. The branch condition (insn->op2) is either
 * a comparison (IR_EQ..IR_UGT) whose first operand is the AND to be tested,
 * or the AND itself, in which case the branch is taken when the result is
 * non-zero (IR_NE).
 */
static void ir_emit_test_and_branch_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
{
	ir_insn *cond_insn = &ctx->ir_base[insn->op2];
	ir_op op = cond_insn->op;
	ir_ref and_ref;

	if (op < IR_EQ || op > IR_UGT) {
		/* the condition is the AND itself */
		IR_ASSERT(op == IR_AND);
		and_ref = insn->op2;
		op = IR_NE;
	} else {
		/* the condition is a comparison on top of the AND */
		and_ref = cond_insn->op1;
	}

	ir_emit_test_int_common(ctx, def, and_ref, op);
	ir_emit_jcc(ctx, b, def, insn, next_block, op, 1, 0);
}

/*
 * Emits a fused floating-point compare-and-branch: the comparison referenced
 * by insn->op2 sets the flags, then ir_emit_jcc() emits the jump(s) for the
 * returned condition using the floating-point (non-integer) rules.
 */
static void ir_emit_cmp_and_branch_fp(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
{
	ir_ref cmp_ref = insn->op2;
	ir_insn *cmp_insn = &ctx->ir_base[cmp_ref];
	ir_op cond = ir_emit_cmp_fp_common(ctx, def, cmp_ref, cmp_insn);

	ir_emit_jcc(ctx, b, def, insn, next_block, cond, 0, 0);
}

/*
 * Emits a branch on an integer condition value (insn->op2), i.e. "jump to
 * the true block when the value is non-zero".
 *
 * A constant condition and a condition produced by a static ALLOCA need no
 * test at all: only an unconditional jump to the known successor is emitted
 * (and none when that successor is the next block). Otherwise the value is
 * tested in its register, or compared with zero in memory, and the jump is
 * emitted by ir_emit_jcc() with the IR_NE condition.
 */
static void ir_emit_if_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_ref cond = insn->op2;
	ir_type type = ctx->ir_base[cond].type;
	ir_reg op2_reg = ctx->regs[def][2];

	if (op2_reg == IR_REG_NONE) {
		if (IR_IS_CONST_REF(cond)) {
			/* the outcome is known at compile time */
			if (ir_const_is_true(&ctx->ir_base[cond])) {
				ir_emit_jmp_true(ctx, b, def, next_block);
			} else {
				ir_emit_jmp_false(ctx, b, def, next_block);
			}
			return;
		}
		if (ir_rule(ctx, cond) == IR_STATIC_ALLOCA) {
			/* handled as always true */
			ir_emit_jmp_true(ctx, b, def, next_block);
			return;
		}

		/* the condition lives in memory */
		ir_mem mem = (ir_rule(ctx, cond) & IR_FUSED)
			? ir_fuse_load(ctx, def, cond)
			: ir_ref_spill_slot(ctx, cond);

		|	ASM_MEM_IMM_OP cmp, type, mem, 0
	} else {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, type, op2_reg, cond);
		}
		|	ASM_REG_REG_OP test, type, op2_reg, op2_reg
	}

	ir_emit_jcc(ctx, b, def, insn, next_block, IR_NE, 1, 0);
}

/*
 * Emits a conditional select: def = op1 ? op2 : op3.
 *
 * ctx  - code generation context (register assignments, DynAsm state)
 * def  - reference of the instruction being emitted
 * insn - the COND instruction (op1 = condition, op2/op3 = the two values)
 *
 * Spilled operands are reloaded first. When the condition is the same value
 * as op2 or op3, the register that value was reloaded into is also used for
 * the condition, so the value is not loaded twice.
 *
 * An integer condition with an integer result is emitted without branches
 * (test/cmp followed by cmov). Every other combination uses a compare and
 * jumps around the two assignments; a floating-point condition is compared
 * against a zero constant kept in the rodata section and is treated as true
 * when the comparison is unordered.
 */
static void ir_emit_cond(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op1 = insn->op1;
	ir_ref op2 = insn->op2;
	ir_ref op3 = insn->op3;
	ir_type op1_type = ctx->ir_base[op1].type;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	ir_reg op2_reg = ctx->regs[def][2];
	ir_reg op3_reg = ctx->regs[def][3];

	IR_ASSERT(def_reg != IR_REG_NONE);

	if (op2 != op3) {
		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, type, op2_reg, op2);
			if (op1 == op2) {
				op1_reg = op2_reg;
			}
		}
		if (op3_reg != IR_REG_NONE && IR_REG_SPILLED(op3_reg)) {
			op3_reg = IR_REG_NUM(op3_reg);
			ir_emit_load(ctx, type, op3_reg, op3);
			/* the condition shares the register of op3 only when it IS op3 */
			if (op1 == op3) {
				op1_reg = op3_reg;
			}
		}
	} else if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
		/* op2 and op3 are the same value: one load serves both */
		op2_reg = IR_REG_NUM(op2_reg);
		ir_emit_load(ctx, type, op2_reg, op2);
		op3_reg = op2_reg;
		if (op1 == op2) {
			op1_reg = op2_reg;
		}
	} else if (op3_reg != IR_REG_NONE && IR_REG_SPILLED(op3_reg)) {
		op3_reg = IR_REG_NUM(op3_reg);
		ir_emit_load(ctx, type, op3_reg, op3);
		op2_reg = op3_reg;
		if (op1 == op3) {
			op1_reg = op3_reg;
		}
	}
	if (op1_reg != IR_REG_NONE && op1 != op2 && op1 != op3 && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, op1_type, op1_reg, op1);
	}

	if (IR_IS_TYPE_INT(op1_type)) {
		/* integer condition: compare it with zero */
		if (op1_reg != IR_REG_NONE) {
			|	ASM_REG_REG_OP test, op1_type, op1_reg, op1_reg
		} else {
			ir_mem mem = ir_ref_spill_slot(ctx, op1);

			|	ASM_MEM_IMM_OP cmp, op1_type, mem, 0
		}
		if (IR_IS_TYPE_INT(type)) {
			/* integer result: branch-free selection through cmov */
			IR_ASSERT(op2_reg != IR_REG_NONE || op3_reg != IR_REG_NONE);
			if (op3_reg != IR_REG_NONE) {
				if (op3_reg == def_reg) {
					/* def already holds op3: replace it by op2 when the condition is true */
					IR_ASSERT(op2_reg != IR_REG_NONE);
					|	ASM_REG_REG_OP2 cmovne, type, def_reg, op2_reg
				} else {
					/* put op2 into def, then replace it by op3 when the condition is false */
					if (op2_reg != IR_REG_NONE) {
						if (def_reg != op2_reg) {
							ir_emit_mov(ctx, type, def_reg, op2_reg);
						}
					} else if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op)) {
						/* prevent "xor" and flags clobbering */
						ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op2].val.i64);
					} else {
						ir_emit_load_ex(ctx, type, def_reg, op2, def);
					}
					|	ASM_REG_REG_OP2 cmove, type, def_reg, op3_reg
				}
			} else {
				/* put op3 into def, then replace it by op2 when the condition is true */
				IR_ASSERT(op2_reg != IR_REG_NONE && op2_reg != def_reg);
				if (IR_IS_CONST_REF(op3) && !IR_IS_SYM_CONST(ctx->ir_base[op3].op)) {
					/* prevent "xor" and flags clobbering */
					ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op3].val.i64);
				} else {
					ir_emit_load_ex(ctx, type, def_reg, op3, def);
				}
				|	ASM_REG_REG_OP2 cmovne, type, def_reg, op2_reg
			}

			if (IR_REG_SPILLED(ctx->regs[def][0])) {
				ir_emit_store(ctx, type, def, def_reg);
			}
			return;
		}
		|	je >2
	} else {
		/* floating-point condition: compare it with the zero constant */
		if (!data->double_zero_const) {
			data->double_zero_const = 1;
			ir_rodata(ctx);
			|.align 16
			|->double_zero_const:
			|.dword 0, 0
			|.code
		}
		|	ASM_FP_REG_TXT_OP ucomis, op1_type, op1_reg, [->double_zero_const]
		|	jp >1
		|	je >2
		|1:
	}

	/* condition is true: def = op2 */
	if (op2_reg != IR_REG_NONE) {
		if (def_reg != op2_reg) {
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov(ctx, type, def_reg, op2_reg);
			} else {
				ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
			}
		}
	} else {
		ir_emit_load_ex(ctx, type, def_reg, op2, def);
	}
	|	jmp >3
	|2:
	/* condition is false: def = op3 */
	if (op3_reg != IR_REG_NONE) {
		if (def_reg != op3_reg) {
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov(ctx, type, def_reg, op3_reg);
			} else {
				ir_emit_fp_mov(ctx, type, def_reg, op3_reg);
			}
		}
	} else {
		ir_emit_load_ex(ctx, type, def_reg, op3, def);
	}
	|3:

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emit a conditional select whose condition (insn->op1) is an integer
 * comparison: the result becomes op2 when the comparison holds and op3
 * otherwise.
 *
 * ctx  - compilation context
 * def  - reference of the select instruction (index into ctx->regs[])
 * insn - the select instruction; op1 is the comparison, op2/op3 the values
 *
 * Integer results are selected with CMOVcc. For any other result type the
 * code branches around two moves/loads (local label 2 = take op3, 3 = join).
 */
static void ir_emit_cond_cmp_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op2 = insn->op2;
	ir_ref op3 = insn->op3;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op2_reg = ctx->regs[def][2];
	ir_reg op3_reg = ctx->regs[def][3];
	ir_op op;

	/* Reload operands whose registers are flagged as spilled. When both
	 * operands are the same value, a single load is emitted and shared. */
	if (op2 != op3) {
		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, type, op2_reg, op2);
		}
		if (op3_reg != IR_REG_NONE && IR_REG_SPILLED(op3_reg)) {
			op3_reg = IR_REG_NUM(op3_reg);
			ir_emit_load(ctx, type, op3_reg, op3);
		}
	} else if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
		op2_reg = IR_REG_NUM(op2_reg);
		ir_emit_load(ctx, type, op2_reg, op2);
		op3_reg = op2_reg;
	} else if (op3_reg != IR_REG_NONE && IR_REG_SPILLED(op3_reg)) {
		op3_reg = IR_REG_NUM(op3_reg);
		ir_emit_load(ctx, type, op3_reg, op3);
		op2_reg = op3_reg;
	}

	/* Emit the compare. Everything emitted between here and the cmov/jcc
	 * below has to leave the CPU flags untouched. */
	ir_emit_cmp_int_common2(ctx, def, insn->op1, &ctx->ir_base[insn->op1]);
	op = ctx->ir_base[insn->op1].op;

	if (IR_IS_TYPE_INT(type)) {
		/*
		 * Place one of the two values into def_reg and conditionally
		 * overwrite it with the other one (left in op3_reg).
		 */
		if (op3_reg != IR_REG_NONE) {
			if (op3_reg == def_reg) {
				/* op3 already sits in the result register: cmov op2 over it
				 * using the opposite condition. */
				IR_ASSERT(op2_reg != IR_REG_NONE);
				op3_reg = op2_reg;
				/* NOTE(review): relies on a comparison opcode and its negation
				 * differing only in the lowest bit (defined outside this chunk). */
				op ^= 1; // reverse
			} else if (op2_reg != IR_REG_NONE) {
				if (def_reg != op2_reg) {
					ir_emit_mov(ctx, type, def_reg, op2_reg);
				}
			} else if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op)) {
				/* prevent "xor" and flags clobbering */
				ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op2].val.i64);
			} else {
				ir_emit_load_ex(ctx, type, def_reg, op2, def);
			}
		} else {
			/* op3 has no register: materialize it in def_reg and cmov op2
			 * over it using the opposite condition. */
			IR_ASSERT(op2_reg != IR_REG_NONE && op2_reg != def_reg);
			if (IR_IS_CONST_REF(op3) && !IR_IS_SYM_CONST(ctx->ir_base[op3].op)) {
				/* prevent "xor" and flags clobbering */
				ir_emit_mov_imm_int(ctx, type, def_reg, ctx->ir_base[op3].val.i64);
			} else {
				ir_emit_load_ex(ctx, type, def_reg, op3, def);
			}
			op3_reg = op2_reg;
			op ^= 1; // reverse
		}

		/* def_reg holds the "condition true" value; replace it when the
		 * (possibly reversed) condition is false. */
		switch (op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
				/* fallthrough */
			case IR_EQ:
				|	ASM_REG_REG_OP2 cmovne, type, def_reg, op3_reg
				break;
			case IR_NE:
				|	ASM_REG_REG_OP2 cmove, type, def_reg, op3_reg
				break;
			case IR_LT:
				|	ASM_REG_REG_OP2 cmovge, type, def_reg, op3_reg
				break;
			case IR_GE:
				|	ASM_REG_REG_OP2 cmovl, type, def_reg, op3_reg
				break;
			case IR_LE:
				|	ASM_REG_REG_OP2 cmovg, type, def_reg, op3_reg
				break;
			case IR_GT:
				|	ASM_REG_REG_OP2 cmovle, type, def_reg, op3_reg
				break;
			case IR_ULT:
				|	ASM_REG_REG_OP2 cmovae, type, def_reg, op3_reg
				break;
			case IR_UGE:
				|	ASM_REG_REG_OP2 cmovb, type, def_reg, op3_reg
				break;
			case IR_ULE:
				|	ASM_REG_REG_OP2 cmova, type, def_reg, op3_reg
				break;
			case IR_UGT:
				|	ASM_REG_REG_OP2 cmovbe, type, def_reg, op3_reg
				break;
		}
	} else {
		/* Non-integer result: jump to label 2 (op3) when the condition is
		 * false, otherwise fall through to the op2 move. */
		switch (op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
				/* fallthrough */
			case IR_EQ:
				|	jne >2
				break;
			case IR_NE:
				|	je >2
				break;
			case IR_LT:
				|	jge >2
				break;
			case IR_GE:
				|	jl >2
				break;
			case IR_LE:
				|	jg >2
				break;
			case IR_GT:
				|	jle >2
				break;
			case IR_ULT:
				|	jae >2
				break;
			case IR_UGE:
				|	jb >2
				break;
			case IR_ULE:
				|	ja >2
				break;
			case IR_UGT:
				|	jbe >2
				break;
		}

		/* "type" is known not to be an integer type in this branch, so only
		 * the FP register move is needed. */
		if (op2_reg != IR_REG_NONE) {
			if (def_reg != op2_reg) {
				ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
			}
		} else {
			ir_emit_load_ex(ctx, type, def_reg, op2, def);
		}
		|	jmp >3
		|2:
		if (op3_reg != IR_REG_NONE) {
			if (def_reg != op3_reg) {
				ir_emit_fp_mov(ctx, type, def_reg, op3_reg);
			}
		} else {
			ir_emit_load_ex(ctx, type, def_reg, op3, def);
		}
		|3:
	}

	/* Write the result back when the result register is flagged as spilled. */
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emit a conditional select whose condition (insn->op1) is a floating-point
 * comparison: the result becomes op2 when the comparison holds and op3
 * otherwise.
 *
 * ctx  - compilation context
 * def  - reference of the select instruction (index into ctx->regs[])
 * insn - the select instruction; op1 is the comparison, op2/op3 the values
 *
 * ir_emit_cmp_fp_common() emits the compare and returns the comparison
 * opcode to branch on. The select is done with conditional jumps using the
 * local labels 1 (take op2), 2 (take op3) and 3 (join). The result type may
 * be integer or floating point, hence the IR_IS_TYPE_INT() checks below.
 */
static void ir_emit_cond_cmp_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_ref op2 = insn->op2;
	ir_ref op3 = insn->op3;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op2_reg = ctx->regs[def][2];
	ir_reg op3_reg = ctx->regs[def][3];
	ir_op op;

	/* Reload operands whose registers are flagged as spilled. When both
	 * operands are the same value, a single load is emitted and shared. */
	if (op2 != op3) {
		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, type, op2_reg, op2);
		}
		if (op3_reg != IR_REG_NONE && IR_REG_SPILLED(op3_reg)) {
			op3_reg = IR_REG_NUM(op3_reg);
			ir_emit_load(ctx, type, op3_reg, op3);
		}
	} else if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
		op2_reg = IR_REG_NUM(op2_reg);
		ir_emit_load(ctx, type, op2_reg, op2);
		op3_reg = op2_reg;
	} else if (op3_reg != IR_REG_NONE && IR_REG_SPILLED(op3_reg)) {
		op3_reg = IR_REG_NUM(op3_reg);
		ir_emit_load(ctx, type, op3_reg, op3);
		op2_reg = op3_reg;
	}

	op = ir_emit_cmp_fp_common(ctx, def, insn->op1, &ctx->ir_base[insn->op1]);

	/*
	 * Jump to label 2 when the condition is false; fall through (or jump to
	 * label 1) when it is true. The parity flag marks an unordered compare
	 * (see the IR_ORDERED/IR_UNORDERED cases): "jp >2" makes unordered count
	 * as false, "jp >1" makes it count as true.
	 * NOTE(review): the cases without an explicit "jp" presumably rely on the
	 * carry/zero flags produced by an unordered compare - confirm against the
	 * compare instruction emitted by ir_emit_cmp_fp_common().
	 */
	switch (op) {
		default:
			IR_ASSERT(0 && "NIY binary op");
		case IR_EQ:
			|	jne >2
			|	jp >2
			break;
		case IR_NE:
			|	jp >1
			|	je >2
			break;
		case IR_LT:
			|	jp >2
			|	jae >2
			break;
		case IR_GE:
			|	jb >2
			break;
		case IR_LE:
			|	jp >2
			|	ja >2
			break;
		case IR_GT:
			|	jbe >2
			break;
		case IR_ULT:
			|	jae >2
			break;
		case IR_UGE:
			|	jp >1
			|	jb >2
			break;
		case IR_ULE:
			|	ja >2
			break;
		case IR_UGT:
			|	jp >1
			|	jbe >2
			break;
		case IR_ORDERED:
			|	jp >2
			break;
		case IR_UNORDERED:
			|	jnp >2
			break;
	}
	|1:

	/* Condition true: result = op2 */
	if (op2_reg != IR_REG_NONE) {
		if (def_reg != op2_reg) {
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov(ctx, type, def_reg, op2_reg);
			} else {
				ir_emit_fp_mov(ctx, type, def_reg, op2_reg);
			}
		}
	} else {
		ir_emit_load_ex(ctx, type, def_reg, op2, def);
	}
	|	jmp >3
	|2:
	/* Condition false: result = op3 */
	if (op3_reg != IR_REG_NONE) {
		if (def_reg != op3_reg) {
			if (IR_IS_TYPE_INT(type)) {
				ir_emit_mov(ctx, type, def_reg, op3_reg);
			} else {
				ir_emit_fp_mov(ctx, type, def_reg, op3_reg);
			}
		}
	} else {
		ir_emit_load_ex(ctx, type, def_reg, op3, def);
	}
	|3:

	/* Write the result back when the result register is flagged as spilled. */
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emit the function epilogue followed by the return instruction.
 *
 * On 32-bit x86 builds a fastcall function that received stack parameters
 * returns with "ret imm" so that ctx->param_stack_size bytes are popped;
 * every other case uses a plain "ret".
 */
static void ir_emit_return_void(ir_ctx *ctx)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	ir_emit_epilogue(ctx);

#ifdef IR_TARGET_X86
	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC) && ctx->param_stack_size) {
		|	ret ctx->param_stack_size
	} else {
		|	ret
	}
#else
	|	ret
#endif
}

/*
 * Emit a return of an integer value (insn->op2): bring the value into
 * IR_REG_INT_RET1 unless it is already there, then emit the common
 * epilogue/return sequence.
 */
static void ir_emit_return_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
{
	ir_reg src_reg = ctx->regs[ref][2];

	if (src_reg != IR_REG_INT_RET1) {
		ir_ref value = insn->op2;
		ir_type type = ctx->ir_base[value].type;

		if (src_reg == IR_REG_NONE || IR_REG_SPILLED(src_reg)) {
			/* no usable register copy: load the value directly into the
			 * return register */
			ir_emit_load(ctx, type, IR_REG_INT_RET1, value);
		} else {
			ir_emit_mov(ctx, type, IR_REG_INT_RET1, src_reg);
		}
	}

	ir_emit_return_void(ctx);
}

/*
 * Emit a return of a floating-point value (insn->op2), then the common
 * epilogue/return sequence.
 *
 * When the target defines IR_REG_FP_RET1 the value is moved or loaded into
 * that register. Otherwise the value is pushed onto the x87 stack with
 * fld/fldz/fld1.
 */
static void ir_emit_return_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
{
	ir_reg op2_reg = ctx->regs[ref][2];
	ir_type type = ctx->ir_base[insn->op2].type;

#ifdef IR_REG_FP_RET1
	if (op2_reg != IR_REG_FP_RET1) {
		if (op2_reg != IR_REG_NONE && !IR_REG_SPILLED(op2_reg)) {
			ir_emit_fp_mov(ctx, type, IR_REG_FP_RET1, op2_reg);
		} else {
			/* no usable register copy: load directly into the return register */
			ir_emit_load(ctx, type, IR_REG_FP_RET1, insn->op2);
		}
	}
#else
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	if (IR_IS_CONST_REF(insn->op2)) {
		ir_insn *value = &ctx->ir_base[insn->op2];

		/* 0.0 and 1.0 have dedicated x87 load instructions; any other
		 * constant is loaded from its constant label. */
		if ((type == IR_FLOAT && value->val.f == 0.0) || (type == IR_DOUBLE && value->val.d == 0.0)) {
			|	fldz
		} else if ((type == IR_FLOAT && value->val.f == 1.0) || (type == IR_DOUBLE && value->val.d == 1.0)) {
			|	fld1
		} else {
			int label = ir_const_label(ctx, insn->op2);

			if (type == IR_DOUBLE) {
				|	fld qword [=>label]
			} else {
				IR_ASSERT(type == IR_FLOAT);
				|	fld dword [=>label]
			}
		}
	} else if (op2_reg == IR_REG_NONE || IR_REG_SPILLED(op2_reg)) {
		/* value lives in memory: load it straight from its spill slot */
		ir_reg fp;
		int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op2, &fp);

		if (type == IR_DOUBLE) {
			|	fld qword [Ra(fp)+offset]
		} else {
			IR_ASSERT(type == IR_FLOAT);
			|	fld dword [Ra(fp)+offset]
		}
	} else {
		/* value lives in a register: store it into the stack slot recorded
		 * in ctx->ret_slot and load it from there with fld */
		int32_t offset = ctx->ret_slot;
		ir_reg fp;

		IR_ASSERT(offset != -1);
		offset = IR_SPILL_POS_TO_OFFSET(offset);
		fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
		ir_emit_store_mem_fp(ctx, type, IR_MEM_BO(fp, offset), op2_reg);
		if (type == IR_DOUBLE) {
			|	fld qword [Ra(fp)+offset]
		} else {
			IR_ASSERT(type == IR_FLOAT);
			|	fld dword [Ra(fp)+offset]
		}
	}
#endif
	ir_emit_return_void(ctx);
}

/*
 * Emit sign extension of the integer operand insn->op1 to the wider integer
 * type insn->type.
 *
 * ctx  - compilation context
 * def  - reference of the instruction (index into ctx->regs[])
 * insn - the instruction; op1 is the value to extend
 *
 * The operand is taken from a register, folded at code-generation time when
 * it is a constant, or read from memory (a fused load or its spill slot).
 * Instructions producing a 64-bit result are only assembled in X64 builds.
 */
static void ir_emit_sext(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_type dst_type = insn->type;
	ir_type src_type = ctx->ir_base[insn->op1].type;
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];

	IR_ASSERT(IR_IS_TYPE_INT(src_type));
	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
	IR_ASSERT(ir_type_size[dst_type] > ir_type_size[src_type]);
	IR_ASSERT(def_reg != IR_REG_NONE);

	if (op1_reg != IR_REG_NONE) {
		/* register operand (reloaded first when flagged as spilled) */
		if (IR_REG_SPILLED(op1_reg)) {
			op1_reg = IR_REG_NUM(op1_reg);
			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
		}
		if (ir_type_size[src_type] == 1) {
			if (ir_type_size[dst_type] == 2) {
				/* cbw/cwde/cdqe are used instead of movsx when both source
				 * and destination are RAX */
				if (def_reg == IR_REG_RAX && op1_reg == IR_REG_RAX) {
					|	cbw
				} else {
					|	movsx Rw(def_reg), Rb(op1_reg)
				}
			} else if (ir_type_size[dst_type] == 4) {
				|	movsx Rd(def_reg), Rb(op1_reg)
			} else {
				IR_ASSERT(ir_type_size[dst_type] == 8);
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	movsx Rq(def_reg), Rb(op1_reg)
|.endif
			}
		} else if (ir_type_size[src_type] == 2) {
			if (ir_type_size[dst_type] == 4) {
				if (def_reg == IR_REG_RAX && op1_reg == IR_REG_RAX) {
					|	cwde
				} else {
					|	movsx Rd(def_reg), Rw(op1_reg)
				}
			} else {
				IR_ASSERT(ir_type_size[dst_type] == 8);
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	movsx Rq(def_reg), Rw(op1_reg)
|.endif
			}
		} else {
			IR_ASSERT(ir_type_size[src_type] == 4);
			IR_ASSERT(ir_type_size[dst_type] == 8);
			IR_ASSERT(sizeof(void*) == 8);
|.if X64
			if (def_reg == IR_REG_RAX && op1_reg == IR_REG_RAX) {
				|	cdqe
			} else {
				|	movsxd Rq(def_reg), Rd(op1_reg)
			}
|.endif
		}
	} else if (IR_IS_CONST_REF(insn->op1)) {
		/* constant operand: read it through the signed field of the source
		 * width so the C conversion to int64_t performs the sign extension,
		 * then load the result as an immediate */
		int64_t val;

		if (ir_type_size[src_type] == 1) {
			val = ctx->ir_base[insn->op1].val.i8;
		} else if (ir_type_size[src_type] == 2) {
			val = ctx->ir_base[insn->op1].val.i16;
		} else if (ir_type_size[src_type] == 4) {
			val = ctx->ir_base[insn->op1].val.i32;
		} else {
			IR_ASSERT(ir_type_size[src_type] == 8);
			val = ctx->ir_base[insn->op1].val.i64;
		}
		ir_emit_mov_imm_int(ctx, dst_type, def_reg, val);
	} else {
		/* memory operand: either a load fused into this instruction or the
		 * operand's spill slot */
		ir_mem mem;

		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
			mem = ir_fuse_load(ctx, def, insn->op1);
		} else {
			mem = ir_ref_spill_slot(ctx, insn->op1);
		}

		if (ir_type_size[src_type] == 1) {
			if (ir_type_size[dst_type] == 2) {
				|	ASM_TXT_TMEM_OP movsx, Rw(def_reg), byte, mem
			} else if (ir_type_size[dst_type] == 4) {
				|	ASM_TXT_TMEM_OP movsx, Rd(def_reg), byte, mem
			} else {
				IR_ASSERT(ir_type_size[dst_type] == 8);
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	ASM_TXT_TMEM_OP movsx, Rq(def_reg), byte, mem
|.endif
			}
		} else if (ir_type_size[src_type] == 2) {
			if (ir_type_size[dst_type] == 4) {
				|	ASM_TXT_TMEM_OP movsx, Rd(def_reg), word, mem
			} else {
				IR_ASSERT(ir_type_size[dst_type] == 8);
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	ASM_TXT_TMEM_OP movsx, Rq(def_reg), word, mem
|.endif
			}
		} else {
			IR_ASSERT(ir_type_size[src_type] == 4);
			IR_ASSERT(ir_type_size[dst_type] == 8);
			IR_ASSERT(sizeof(void*) == 8);
|.if X64
			|	ASM_TXT_TMEM_OP movsxd, Rq(def_reg), dword, mem
|.endif
		}
	}
	/* Write the result back when the result register is flagged as spilled. */
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, dst_type, def, def_reg);
	}
}

/*
 * Emit zero extension of the integer operand insn->op1 to the wider integer
 * type insn->type.
 *
 * ctx  - compilation context
 * def  - reference of the instruction (index into ctx->regs[])
 * insn - the instruction; op1 is the value to extend
 *
 * The operand is taken from a register, folded at code-generation time when
 * it is a constant, or read from memory (a fused load or its spill slot).
 * Instructions producing a 64-bit result are only assembled in X64 builds,
 * so every such branch asserts a 64-bit host first.
 */
static void ir_emit_zext(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_type dst_type = insn->type;
	ir_type src_type = ctx->ir_base[insn->op1].type;
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];

	IR_ASSERT(IR_IS_TYPE_INT(src_type));
	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
	IR_ASSERT(ir_type_size[dst_type] > ir_type_size[src_type]);
	IR_ASSERT(def_reg != IR_REG_NONE);

	if (op1_reg != IR_REG_NONE) {
		/* register operand (reloaded first when flagged as spilled) */
		if (IR_REG_SPILLED(op1_reg)) {
			op1_reg = IR_REG_NUM(op1_reg);
			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
		}
		if (ir_type_size[src_type] == 1) {
			if (ir_type_size[dst_type] == 2) {
				|	movzx Rw(def_reg), Rb(op1_reg)
			} else if (ir_type_size[dst_type] == 4) {
				|	movzx Rd(def_reg), Rb(op1_reg)
			} else {
				IR_ASSERT(ir_type_size[dst_type] == 8);
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	movzx Rq(def_reg), Rb(op1_reg)
|.endif
			}
		} else if (ir_type_size[src_type] == 2) {
			if (ir_type_size[dst_type] == 4) {
				|	movzx Rd(def_reg), Rw(op1_reg)
			} else {
				IR_ASSERT(ir_type_size[dst_type] == 8);
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	movzx Rq(def_reg), Rw(op1_reg)
|.endif
			}
		} else {
			IR_ASSERT(ir_type_size[src_type] == 4);
			IR_ASSERT(ir_type_size[dst_type] == 8);
			IR_ASSERT(sizeof(void*) == 8);
|.if X64
			/* Avoid zero extension to the same register. This may be not always safe ??? */
			if (op1_reg != def_reg) {
				|	mov Rd(def_reg), Rd(op1_reg)
			}
|.endif
		}
	} else if (IR_IS_CONST_REF(insn->op1)) {
		/* constant operand: read it through the unsigned field of the source
		 * width so the C conversion to uint64_t performs the zero extension,
		 * then load the result as an immediate */
		uint64_t val;

		if (ir_type_size[src_type] == 1)  {
			val = ctx->ir_base[insn->op1].val.u8;
		} else if (ir_type_size[src_type] == 2) {
			val = ctx->ir_base[insn->op1].val.u16;
		} else if (ir_type_size[src_type] == 4) {
			val = ctx->ir_base[insn->op1].val.u32;
		} else {
			IR_ASSERT(ir_type_size[src_type] == 8);
			val = ctx->ir_base[insn->op1].val.u64;
		}
		ir_emit_mov_imm_int(ctx, dst_type, def_reg, val);
	} else {
		/* memory operand: either a load fused into this instruction or the
		 * operand's spill slot */
		ir_mem mem;

		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
			mem = ir_fuse_load(ctx, def, insn->op1);
		} else {
			mem = ir_ref_spill_slot(ctx, insn->op1);
		}

		if (ir_type_size[src_type] == 1) {
			if (ir_type_size[dst_type] == 2) {
				|	ASM_TXT_TMEM_OP movzx, Rw(def_reg), byte, mem
			} else if (ir_type_size[dst_type] == 4) {
				|	ASM_TXT_TMEM_OP movzx, Rd(def_reg), byte, mem
			} else {
				IR_ASSERT(ir_type_size[dst_type] == 8);
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	ASM_TXT_TMEM_OP movzx, Rq(def_reg), byte, mem
|.endif
			}
		} else if (ir_type_size[src_type] == 2) {
			if (ir_type_size[dst_type] == 4) {
				|	ASM_TXT_TMEM_OP movzx, Rd(def_reg), word, mem
			} else {
				IR_ASSERT(ir_type_size[dst_type] == 8);
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	ASM_TXT_TMEM_OP movzx, Rq(def_reg), word, mem
|.endif
			}
		} else {
			IR_ASSERT(ir_type_size[src_type] == 4);
			IR_ASSERT(ir_type_size[dst_type] == 8);
			/* Same guard as the other 64-bit branches: without X64 the load
			 * below is not assembled and nothing would be emitted. */
			IR_ASSERT(sizeof(void*) == 8);
|.if X64
			|	ASM_TXT_TMEM_OP mov, Rd(def_reg), dword, mem
|.endif
		}
	}
	/* Write the result back when the result register is flagged as spilled. */
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, dst_type, def, def_reg);
	}
}

/*
 * Emit truncation of the integer operand insn->op1 to the narrower integer
 * type insn->type. No dedicated instruction is needed: the value is simply
 * moved or loaded using the destination type, and nothing is emitted when
 * source and destination already share a register.
 */
static void ir_emit_trunc(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_ref src = insn->op1;
	ir_type dst_type = insn->type;
	ir_type src_type = ctx->ir_base[src].type;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg src_reg = ctx->regs[def][1];

	IR_ASSERT(IR_IS_TYPE_INT(src_type));
	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
	IR_ASSERT(ir_type_size[dst_type] < ir_type_size[src_type]);
	IR_ASSERT(def_reg != IR_REG_NONE);

	if (src_reg == IR_REG_NONE) {
		/* operand has no register: load it using the destination type */
		ir_emit_load_ex(ctx, dst_type, def_reg, src, def);
	} else {
		if (IR_REG_SPILLED(src_reg)) {
			src_reg = IR_REG_NUM(src_reg);
			ir_emit_load(ctx, src_type, src_reg, src);
		}
		if (def_reg != src_reg) {
			ir_emit_mov(ctx, dst_type, def_reg, src_reg);
		}
	}

	/* Write the result back when the result register is flagged as spilled. */
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, dst_type, def, def_reg);
	}
}

/*
 * Emit a bit-preserving conversion of insn->op1 to the equally sized type
 * insn->type.
 *
 * ctx  - compilation context
 * def  - reference of the instruction (index into ctx->regs[])
 * insn - the instruction; op1 is the value to reinterpret
 *
 * Four cases are handled: int->int and fp->fp are plain moves/loads, while
 * fp->int and int->fp transfer the bits between an XMM and a general-purpose
 * register (or read them from a constant or memory). The 64-bit transfers
 * are only assembled in X64 builds.
 */
static void ir_emit_bitcast(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_type dst_type = insn->type;
	ir_type src_type = ctx->ir_base[insn->op1].type;
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];

	IR_ASSERT(ir_type_size[dst_type] == ir_type_size[src_type]);
	IR_ASSERT(def_reg != IR_REG_NONE);
	if (IR_IS_TYPE_INT(src_type) && IR_IS_TYPE_INT(dst_type)) {
		/* int -> int: register move or load */
		if (op1_reg != IR_REG_NONE) {
			if (IR_REG_SPILLED(op1_reg)) {
				op1_reg = IR_REG_NUM(op1_reg);
				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
			}
			if (op1_reg != def_reg) {
				ir_emit_mov(ctx, dst_type, def_reg, op1_reg);
			}
		} else {
			ir_emit_load_ex(ctx, dst_type, def_reg, insn->op1, def);
		}
	} else if (IR_IS_TYPE_FP(src_type) && IR_IS_TYPE_FP(dst_type)) {
		/* fp -> fp: register move or load */
		if (op1_reg != IR_REG_NONE) {
			if (IR_REG_SPILLED(op1_reg)) {
				op1_reg = IR_REG_NUM(op1_reg);
				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
			}
			if (op1_reg != def_reg) {
				ir_emit_fp_mov(ctx, dst_type, def_reg, op1_reg);
			}
		} else {
			ir_emit_load_ex(ctx, dst_type, def_reg, insn->op1, def);
		}
	} else if (IR_IS_TYPE_FP(src_type)) {
		/* fp -> int */
		IR_ASSERT(IR_IS_TYPE_INT(dst_type));
		if (op1_reg != IR_REG_NONE) {
			if (IR_REG_SPILLED(op1_reg)) {
				op1_reg = IR_REG_NUM(op1_reg);
				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
			}
			/* XMM -> general-purpose register; the AVX-encoded form is used
			 * when IR_X86_AVX is set in ctx->mflags */
			if (src_type == IR_DOUBLE) {
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				if (ctx->mflags & IR_X86_AVX) {
					|	vmovd Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				} else {
					|	movd Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				}
|.endif
			} else {
				IR_ASSERT(src_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	vmovd Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				} else {
					|	movd Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				}
			}
		} else if (IR_IS_CONST_REF(insn->op1)) {
			/* constant: load the constant's stored value as an integer
			 * immediate (read through the val.i64 / val.i32 fields) */
			ir_insn *_insn = &ctx->ir_base[insn->op1];
			IR_ASSERT(!IR_IS_SYM_CONST(_insn->op));
			if (src_type == IR_DOUBLE) {
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	mov64 Rq(def_reg), _insn->val.i64
|.endif
			} else {
				IR_ASSERT(src_type == IR_FLOAT);
				|	mov Rd(def_reg), _insn->val.i32
			}
		} else {
			/* memory operand (fused load or spill slot): an integer load of
			 * the same width reads the bits directly */
			ir_mem mem;

			if (ir_rule(ctx, insn->op1) & IR_FUSED) {
				mem = ir_fuse_load(ctx, def, insn->op1);
			} else {
				mem = ir_ref_spill_slot(ctx, insn->op1);
			}

			if (src_type == IR_DOUBLE) {
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	ASM_TXT_TMEM_OP mov, Rq(def_reg), qword, mem
|.endif
			} else {
				IR_ASSERT(src_type == IR_FLOAT);
				|	ASM_TXT_TMEM_OP mov, Rd(def_reg), dword, mem
			}
		}
	} else if (IR_IS_TYPE_FP(dst_type)) {
		/* int -> fp */
		IR_ASSERT(IR_IS_TYPE_INT(src_type));
		if (op1_reg != IR_REG_NONE) {
			if (IR_REG_SPILLED(op1_reg)) {
				op1_reg = IR_REG_NUM(op1_reg);
				ir_emit_load(ctx, src_type, op1_reg, insn->op1);
			}
			/* general-purpose register -> XMM */
			if (dst_type == IR_DOUBLE) {
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				if (ctx->mflags & IR_X86_AVX) {
					|	vmovd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
				} else {
					|	movd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
				}
|.endif
			} else {
				IR_ASSERT(dst_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	vmovd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
				} else {
					|	movd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
				}
			}
		} else if (IR_IS_CONST_REF(insn->op1)) {
			/* constant: scalar FP load from the constant's label */
			int label = ir_const_label(ctx, insn->op1);

			|	ASM_FP_REG_TXT_OP movs, dst_type, def_reg, [=>label]
		} else {
			/* memory operand (fused load or spill slot): scalar FP load */
			ir_mem mem;

			if (ir_rule(ctx, insn->op1) & IR_FUSED) {
				mem = ir_fuse_load(ctx, def, insn->op1);
			} else {
				mem = ir_ref_spill_slot(ctx, insn->op1);
			}

			|	ASM_FP_REG_MEM_OP movs, dst_type, def_reg, mem
		}
	}
	/* Write the result back when the result register is flagged as spilled. */
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, dst_type, def, def_reg);
	}
}

/*
 * Emit conversion of the integer operand insn->op1 to the floating-point
 * type insn->type using cvtsi2sd/cvtsi2ss (or their AVX forms when
 * IR_X86_AVX is set in ctx->mflags).
 *
 * ctx  - compilation context
 * def  - reference of the instruction (index into ctx->regs[])
 * insn - the instruction; op1 is the integer value
 *
 * Register operands narrower than the conversion width are first widened in
 * place; "src64" selects the 64-bit form of the conversion, which is only
 * assembled in X64 builds. Constant operands are not supported here.
 */
static void ir_emit_int2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_type dst_type = insn->type;
	ir_type src_type = ctx->ir_base[insn->op1].type;
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];

	IR_ASSERT(IR_IS_TYPE_INT(src_type));
	IR_ASSERT(IR_IS_TYPE_FP(dst_type));
	IR_ASSERT(def_reg != IR_REG_NONE);
	if (op1_reg != IR_REG_NONE) {
		bool src64 = 0;

		if (IR_REG_SPILLED(op1_reg)) {
			op1_reg = IR_REG_NUM(op1_reg);
			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
		}
		if (IR_IS_TYPE_SIGNED(src_type)) {
			/* signed 8/16-bit: sign-extend the source register in place
			 * (to 64 bits on X64, to 32 bits otherwise) */
			if (ir_type_size[src_type] < 4) {
|.if X64
||				if (ir_type_size[src_type] == 1) {
					| movsx Rq(op1_reg), Rb(op1_reg)
||				} else {
					| movsx Rq(op1_reg), Rw(op1_reg)
||				}
||				src64 = 1;
|.else
||				if (ir_type_size[src_type] == 1) {
					| movsx Rd(op1_reg), Rb(op1_reg)
||				} else if (op1_reg == IR_REG_RAX) {
					| cwde
||				} else {
					| movsx Rd(op1_reg), Rw(op1_reg)
||				}
|.endif
			} else if (ir_type_size[src_type] > 4) {
				src64 = 1;
			}
		} else {
			/* unsigned 8/16-bit: zero-extend the source register in place.
			 * NOTE(review): no instruction is emitted for a 32-bit unsigned
			 * source. On X64 the 64-bit conversion then appears to rely on
			 * the upper half of the register already being zero; on x86 the
			 * signed 32-bit conversion is used - confirm both are intended. */
			if (ir_type_size[src_type] < 8) {
|.if X64
||				if (ir_type_size[src_type] == 1) {
					| movzx Rq(op1_reg), Rb(op1_reg)
||				} else if (ir_type_size[src_type] == 2) {
					| movzx Rq(op1_reg), Rw(op1_reg)
||				}
||				src64 = 1;
|.else
||				if (ir_type_size[src_type] == 1) {
					| movzx Rd(op1_reg), Rb(op1_reg)
||				} else if (ir_type_size[src_type] == 2) {
					| movzx Rd(op1_reg), Rw(op1_reg)
||				}
|.endif
			} else {
				// TODO: uint64_t -> double
				src64 = 1;
			}
		}
		/* The destination register is cleared before each conversion
		 * (presumably to avoid a dependency on its previous contents). */
		if (!src64) {
			/* 32-bit integer source */
			if (dst_type == IR_DOUBLE) {
				if (ctx->mflags & IR_X86_AVX) {
					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	vcvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
				} else {
					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	cvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
				}
			} else {
				IR_ASSERT(dst_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	vcvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
				} else {
					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	cvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), Rd(op1_reg)
				}
			}
		} else {
			/* 64-bit integer source */
			IR_ASSERT(sizeof(void*) == 8);
|.if X64
			if (dst_type == IR_DOUBLE) {
				if (ctx->mflags & IR_X86_AVX) {
					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	vcvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
				} else {
					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	cvtsi2sd xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
				}
			} else {
				IR_ASSERT(dst_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	vcvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
				} else {
					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	cvtsi2ss xmm(def_reg-IR_REG_FP_FIRST), Rq(op1_reg)
				}
			}
|.endif
		}
	} else if (IR_IS_CONST_REF(insn->op1)) {
		/* a constant operand is not expected to reach this emitter */
		IR_ASSERT(0);
	} else {
		/* memory operand (fused load or spill slot): convert straight from
		 * memory. Only the operand size picks dword vs. qword here.
		 * NOTE(review): sources narrower than 4 bytes are read as a dword and
		 * unsigned 32-bit sources use the signed conversion - confirm that
		 * such operands never take this path. */
		ir_mem mem;
		bool src64 = ir_type_size[src_type] == 8;

		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
			mem = ir_fuse_load(ctx, def, insn->op1);
		} else {
			mem = ir_ref_spill_slot(ctx, insn->op1);
		}

		if (!src64) {
			if (dst_type == IR_DOUBLE) {
				if (ctx->mflags & IR_X86_AVX) {
					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	ASM_TXT_TXT_TMEM_OP vcvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
				} else {
					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	ASM_TXT_TMEM_OP cvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
				}
			} else {
				IR_ASSERT(dst_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	ASM_TXT_TXT_TMEM_OP vcvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
				} else {
					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	ASM_TXT_TMEM_OP cvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
				}
			}
		} else {
			IR_ASSERT(sizeof(void*) == 8);
|.if X64
			if (dst_type == IR_DOUBLE) {
				if (ctx->mflags & IR_X86_AVX) {
					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	ASM_TXT_TXT_TMEM_OP vcvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
				} else {
					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	ASM_TXT_TMEM_OP cvtsi2sd, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
				}
			} else {
				IR_ASSERT(dst_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	vxorps xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	ASM_TXT_TXT_TMEM_OP vcvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
				} else {
					|	pxor xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST)
					|	ASM_TXT_TMEM_OP cvtsi2ss, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
				}
			}
|.endif
		}
	}
	/* Write the result back when the result register is flagged as spilled. */
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, dst_type, def, def_reg);
	}
}

/*
 * Emit conversion of the floating-point operand insn->op1 to the integer
 * type insn->type using the truncating cvttsd2si/cvttss2si instructions
 * (or their AVX forms when IR_X86_AVX is set in ctx->mflags).
 *
 * ctx  - compilation context
 * def  - reference of the instruction (index into ctx->regs[])
 * insn - the instruction; op1 is the floating-point value
 *
 * The operand may be a register, a constant (read from its constant label)
 * or memory (a fused load or its spill slot). "dst64" selects a 64-bit
 * destination register, which is only assembled in X64 builds.
 */
static void ir_emit_fp2int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_type dst_type = insn->type;
	ir_type src_type = ctx->ir_base[insn->op1].type;
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];
	bool dst64 = 0;

	IR_ASSERT(IR_IS_TYPE_FP(src_type));
	IR_ASSERT(IR_IS_TYPE_INT(dst_type));
	IR_ASSERT(def_reg != IR_REG_NONE);
	/* A 64-bit destination is used for signed 64-bit results and for
	 * unsigned results of 4 bytes or more (presumably so that unsigned
	 * 32-bit values above INT32_MAX still fit the signed conversion). */
	if (IR_IS_TYPE_SIGNED(dst_type) ? ir_type_size[dst_type] == 8 : ir_type_size[dst_type] >= 4) {
		// TODO: we might need to perform truncation from 32/64 bit integer
		dst64 = 1;
	}
	if (op1_reg != IR_REG_NONE) {
		/* register operand (reloaded first when flagged as spilled) */
		if (IR_REG_SPILLED(op1_reg)) {
			op1_reg = IR_REG_NUM(op1_reg);
			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
		}
		if (!dst64) {
			if (src_type == IR_DOUBLE) {
				if (ctx->mflags & IR_X86_AVX) {
					|	vcvttsd2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				} else {
					|	cvttsd2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				}
			} else {
				IR_ASSERT(src_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	vcvttss2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				} else {
					|	cvttss2si Rd(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				}
			}
		} else {
			IR_ASSERT(sizeof(void*) == 8);
|.if X64
			if (src_type == IR_DOUBLE) {
				if (ctx->mflags & IR_X86_AVX) {
					|	vcvttsd2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				} else {
					|	cvttsd2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				}
			} else {
				IR_ASSERT(src_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	vcvttss2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				} else {
					|	cvttss2si Rq(def_reg), xmm(op1_reg-IR_REG_FP_FIRST)
				}
			}
|.endif
		}
	} else if (IR_IS_CONST_REF(insn->op1)) {
		/* constant operand: convert directly from the constant's label */
		int label = ir_const_label(ctx, insn->op1);

		if (!dst64) {
			if (src_type == IR_DOUBLE) {
				if (ctx->mflags & IR_X86_AVX) {
					|	vcvttsd2si Rd(def_reg), qword [=>label]
				} else {
					|	cvttsd2si Rd(def_reg), qword [=>label]
				}
			} else {
				IR_ASSERT(src_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	vcvttss2si Rd(def_reg), dword [=>label]
				} else {
					|	cvttss2si Rd(def_reg), dword [=>label]
				}
			}
		} else {
			IR_ASSERT(sizeof(void*) == 8);
|.if X64
			if (src_type == IR_DOUBLE) {
				if (ctx->mflags & IR_X86_AVX) {
					|	vcvttsd2si Rq(def_reg), qword [=>label]
				} else {
					|	cvttsd2si Rq(def_reg), qword [=>label]
				}
			} else {
				IR_ASSERT(src_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	vcvttss2si Rq(def_reg), dword [=>label]
				} else {
					|	cvttss2si Rq(def_reg), dword [=>label]
				}
			}
|.endif
		}
	} else {
		/* memory operand: either a load fused into this instruction or the
		 * operand's spill slot */
		ir_mem mem;

		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
			mem = ir_fuse_load(ctx, def, insn->op1);
		} else {
			mem = ir_ref_spill_slot(ctx, insn->op1);
		}

		if (!dst64) {
			if (src_type == IR_DOUBLE) {
				if (ctx->mflags & IR_X86_AVX) {
					|	ASM_TXT_TMEM_OP vcvttsd2si, Rd(def_reg), qword, mem
				} else {
					|	ASM_TXT_TMEM_OP cvttsd2si, Rd(def_reg), qword, mem
				}
			} else {
				IR_ASSERT(src_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	ASM_TXT_TMEM_OP vcvttss2si, Rd(def_reg), dword, mem
				} else {
					|	ASM_TXT_TMEM_OP cvttss2si, Rd(def_reg), dword, mem
				}
			}
		} else {
			IR_ASSERT(sizeof(void*) == 8);
|.if X64
			if (src_type == IR_DOUBLE) {
				if (ctx->mflags & IR_X86_AVX) {
					|	ASM_TXT_TMEM_OP vcvttsd2si, Rq(def_reg), qword, mem
				} else {
					|	ASM_TXT_TMEM_OP cvttsd2si, Rq(def_reg), qword, mem
				}
			} else {
				IR_ASSERT(src_type == IR_FLOAT);
				if (ctx->mflags & IR_X86_AVX) {
					|	ASM_TXT_TMEM_OP vcvttss2si, Rq(def_reg), dword, mem
				} else {
					|	ASM_TXT_TMEM_OP cvttss2si, Rq(def_reg), dword, mem
				}
			}
|.endif
		}
	}
	/* Write the result back when the result register is flagged as spilled. */
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, dst_type, def, def_reg);
	}
}

/*
 * Emit conversion between the floating-point types: double -> float with
 * cvtsd2ss, float -> double with cvtss2sd (AVX forms when IR_X86_AVX is set
 * in ctx->mflags).
 *
 * ctx  - compilation context
 * def  - reference of the instruction (index into ctx->regs[])
 * insn - the instruction; op1 is the value to convert
 *
 * The operand may be a register, a constant (read from its constant label)
 * or memory (a fused load or its spill slot). Only the register path handles
 * equal source and destination types (as a plain move); the other paths
 * select the conversion from the source type alone.
 */
static void ir_emit_fp2fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_type dst_type = insn->type;
	ir_type src_type = ctx->ir_base[insn->op1].type;
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op1_reg = ctx->regs[def][1];

	IR_ASSERT(IR_IS_TYPE_FP(src_type));
	IR_ASSERT(IR_IS_TYPE_FP(dst_type));
	IR_ASSERT(def_reg != IR_REG_NONE);
	if (op1_reg != IR_REG_NONE) {
		/* register operand (reloaded first when flagged as spilled) */
		if (IR_REG_SPILLED(op1_reg)) {
			op1_reg = IR_REG_NUM(op1_reg);
			ir_emit_load(ctx, src_type, op1_reg, insn->op1);
		}
		if (src_type == dst_type) {
			/* same type: nothing to convert, just move if needed */
			if (op1_reg != def_reg) {
				ir_emit_fp_mov(ctx, dst_type, def_reg, op1_reg);
			}
		} else if (src_type == IR_DOUBLE) {
			if (ctx->mflags & IR_X86_AVX) {
				|	vcvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
			} else {
				|	cvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
			}
		} else {
			IR_ASSERT(src_type == IR_FLOAT);
			if (ctx->mflags & IR_X86_AVX) {
				|	vcvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
			} else {
				|	cvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(op1_reg-IR_REG_FP_FIRST)
			}
		}
	} else if (IR_IS_CONST_REF(insn->op1)) {
		/* constant operand: convert directly from the constant's label */
		int label = ir_const_label(ctx, insn->op1);

		if (src_type == IR_DOUBLE) {
			if (ctx->mflags & IR_X86_AVX) {
				|	vcvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword [=>label]
			} else {
				|	cvtsd2ss xmm(def_reg-IR_REG_FP_FIRST), qword [=>label]
			}
		} else {
			IR_ASSERT(src_type == IR_FLOAT);
			if (ctx->mflags & IR_X86_AVX) {
				|	vcvtss2sd xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword [=>label]
			} else {
				|	cvtss2sd xmm(def_reg-IR_REG_FP_FIRST), dword [=>label]
			}
		}
	} else {
		/* memory operand: either a load fused into this instruction or the
		 * operand's spill slot */
		ir_mem mem;

		if (ir_rule(ctx, insn->op1) & IR_FUSED) {
			mem = ir_fuse_load(ctx, def, insn->op1);
		} else {
			mem = ir_ref_spill_slot(ctx, insn->op1);
		}

		if (src_type == IR_DOUBLE) {
			if (ctx->mflags & IR_X86_AVX) {
				|	ASM_TXT_TXT_TMEM_OP vcvtsd2ss, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), qword, mem
			} else {
				|	ASM_TXT_TMEM_OP cvtsd2ss, xmm(def_reg-IR_REG_FP_FIRST), qword, mem
			}
		} else {
			IR_ASSERT(src_type == IR_FLOAT);
			if (ctx->mflags & IR_X86_AVX) {
				|	ASM_TXT_TXT_TMEM_OP vcvtss2sd, xmm(def_reg-IR_REG_FP_FIRST), xmm(def_reg-IR_REG_FP_FIRST), dword, mem
			} else {
				|	ASM_TXT_TMEM_OP cvtss2sd, xmm(def_reg-IR_REG_FP_FIRST), dword, mem
			}
		}
	}
	/* Write the result back when the result register is flagged as spilled. */
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, dst_type, def, def_reg);
	}
}

/* Emit an integer COPY of insn->op1 into "def".
 *
 * Depending on which side owns a register this becomes a register move,
 * a load into the result register, or a store of the source register into
 * the result's memory location. A result register marked as spilled is
 * written back at the end.
 */
static void ir_emit_copy_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_ref type = insn->type;
	ir_reg dst_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg src_reg = ctx->regs[def][1];

	IR_ASSERT(dst_reg != IR_REG_NONE || src_reg != IR_REG_NONE);

	/* A spilled source has to be reloaded into its register first. */
	if (src_reg != IR_REG_NONE && IR_REG_SPILLED(src_reg)) {
		src_reg = IR_REG_NUM(src_reg);
		ir_emit_load(ctx, type, src_reg, insn->op1);
	}

	/* Nothing to move when both sides share the same register. */
	if (dst_reg != src_reg) {
		if (dst_reg == IR_REG_NONE) {
			/* registers differ and dst is absent, so src_reg is a real register */
			ir_emit_store(ctx, type, def, src_reg);
		} else if (src_reg == IR_REG_NONE) {
			ir_emit_load(ctx, type, dst_reg, insn->op1);
		} else {
			ir_emit_mov(ctx, type, dst_reg, src_reg);
		}
	}

	if (dst_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, dst_reg);
	}
}

/* Emit a floating point COPY of insn->op1 into "def".
 *
 * Mirrors the integer variant, but register-to-register transfers go
 * through ir_emit_fp_mov(). A result register marked as spilled is
 * written back at the end.
 */
static void ir_emit_copy_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_type type = insn->type;
	ir_reg dst_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg src_reg = ctx->regs[def][1];

	IR_ASSERT(dst_reg != IR_REG_NONE || src_reg != IR_REG_NONE);

	/* A spilled source has to be reloaded into its register first. */
	if (src_reg != IR_REG_NONE && IR_REG_SPILLED(src_reg)) {
		src_reg = IR_REG_NUM(src_reg);
		ir_emit_load(ctx, type, src_reg, insn->op1);
	}

	/* Nothing to move when both sides share the same register. */
	if (dst_reg != src_reg) {
		if (dst_reg == IR_REG_NONE) {
			/* registers differ and dst is absent, so src_reg is a real register */
			ir_emit_store(ctx, type, def, src_reg);
		} else if (src_reg == IR_REG_NONE) {
			ir_emit_load(ctx, type, dst_reg, insn->op1);
		} else {
			ir_emit_fp_mov(ctx, type, dst_reg, src_reg);
		}
	}

	if (dst_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, dst_reg);
	}
}

/* Emit VADDR: put the address of the memory slot returned by
 * ir_var_spill_slot() for insn->op1 into the result register.
 */
static void ir_emit_vaddr(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_mem slot;
	ir_reg base_reg;
	int32_t offset;

	IR_ASSERT(def_reg != IR_REG_NONE);

	slot = ir_var_spill_slot(ctx, insn->op1);
	base_reg = IR_MEM_BASE(slot);
	offset = IR_MEM_OFFSET(slot);

	if (offset != 0) {
		|	lea Ra(def_reg), aword [Ra(base_reg)+offset]
	} else {
		/* zero displacement: copying the base register is sufficient */
		|	mov Ra(def_reg), Ra(base_reg)
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, insn->type, def, def_reg);
	}
}

/* Emit VLOAD: read the variable insn->op2 (an IR_VAR) from its slot,
 * addressed relative to the frame or stack pointer, into the result register.
 */
static void ir_emit_vload(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_insn *var = &ctx->ir_base[insn->op2];
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg base_reg;
	ir_mem slot;

	if (ctx->use_lists[def].count == 1) {
		/* dead load */
		return;
	}

	IR_ASSERT(var->op == IR_VAR);
	if (ctx->flags & IR_USE_FRAME_POINTER) {
		base_reg = IR_REG_FRAME_POINTER;
	} else {
		base_reg = IR_REG_STACK_POINTER;
	}
	slot = IR_MEM_BO(base_reg, IR_SPILL_POS_TO_OFFSET(var->op3));

	if (def_reg == IR_REG_NONE && ir_is_same_mem_var(ctx, def, var->op3)) {
		/* fake load: no result register and ir_is_same_mem_var() matched */
		return;
	}
	IR_ASSERT(def_reg != IR_REG_NONE);

	ir_emit_load_mem(ctx, insn->type, def_reg, slot);
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, insn->type, def, def_reg);
	}
}

/* Emit an integer VSTORE: write the value insn->op3 into the slot of
 * the variable insn->op2 (an IR_VAR), addressed relative to the frame or
 * stack pointer.
 */
static void ir_emit_vstore_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
{
	ir_insn *var = &ctx->ir_base[insn->op2];
	ir_ref type = ctx->ir_base[insn->op3].type;
	ir_reg val_reg = ctx->regs[ref][3];
	bool val_is_const = IR_IS_CONST_REF(insn->op3);
	ir_reg base_reg;
	ir_mem slot;

	IR_ASSERT(var->op == IR_VAR);
	if (ctx->flags & IR_USE_FRAME_POINTER) {
		base_reg = IR_REG_FRAME_POINTER;
	} else {
		base_reg = IR_REG_STACK_POINTER;
	}
	slot = IR_MEM_BO(base_reg, IR_SPILL_POS_TO_OFFSET(var->op3));

	if (!val_is_const
	 && (val_reg == IR_REG_NONE || IR_REG_SPILLED(val_reg))
	 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
	 && ir_is_same_mem_var(ctx, insn->op3, var->op3)) {
		/* fake store: ir_is_same_mem_var() matched the value and the variable */
		return;
	}

	if (val_is_const) {
		ir_emit_store_mem_int_const(ctx, type, slot, insn->op3, val_reg, 0);
		return;
	}

	IR_ASSERT(val_reg != IR_REG_NONE);
	if (IR_REG_SPILLED(val_reg)) {
		/* reload the spilled value before storing it */
		val_reg = IR_REG_NUM(val_reg);
		ir_emit_load(ctx, type, val_reg, insn->op3);
	}
	ir_emit_store_mem_int(ctx, type, slot, val_reg);
}

/* Emit a floating point VSTORE: write the value insn->op3 into the slot
 * of the variable insn->op2 (an IR_VAR), addressed relative to the frame
 * or stack pointer.
 */
static void ir_emit_vstore_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
{
	ir_insn *var = &ctx->ir_base[insn->op2];
	ir_ref type = ctx->ir_base[insn->op3].type;
	ir_reg val_reg = ctx->regs[ref][3];
	bool val_is_const = IR_IS_CONST_REF(insn->op3);
	ir_reg base_reg;
	ir_mem slot;

	IR_ASSERT(var->op == IR_VAR);
	if (ctx->flags & IR_USE_FRAME_POINTER) {
		base_reg = IR_REG_FRAME_POINTER;
	} else {
		base_reg = IR_REG_STACK_POINTER;
	}
	slot = IR_MEM_BO(base_reg, IR_SPILL_POS_TO_OFFSET(var->op3));

	if (!val_is_const
	 && (val_reg == IR_REG_NONE || IR_REG_SPILLED(val_reg))
	 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
	 && ir_is_same_mem_var(ctx, insn->op3, var->op3)) {
		/* fake store: ir_is_same_mem_var() matched the value and the variable */
		return;
	}

	if (val_is_const) {
		ir_emit_store_mem_fp_const(ctx, type, slot, insn->op3, IR_REG_NONE, val_reg);
		return;
	}

	IR_ASSERT(val_reg != IR_REG_NONE);
	if (IR_REG_SPILLED(val_reg)) {
		/* reload the spilled value before storing it */
		val_reg = IR_REG_NUM(val_reg);
		ir_emit_load(ctx, type, val_reg, insn->op3);
	}
	ir_emit_store_mem_fp(ctx, type, slot, val_reg);
}

/* Emit an integer LOAD of "def" from the address insn->op2.
 *
 * The address is taken from a register, from a constant, or from an
 * address expression fused into this instruction.
 */
static void ir_emit_load_int(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_ref type = insn->type;
	ir_reg addr_reg = ctx->regs[def][2];
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_mem mem;

	if (ctx->use_lists[def].count == 1) {
		/* dead load */
		return;
	}
	IR_ASSERT(def_reg != IR_REG_NONE);

	if (addr_reg == IR_REG_NONE) {
		if (IR_IS_CONST_REF(insn->op2)) {
			mem = ir_fuse_addr_const(ctx, insn->op2);
		} else {
			IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
			mem = ir_fuse_addr(ctx, def, insn->op2);
			if (IR_REG_SPILLED(ctx->regs[def][0]) && ir_is_same_spill_slot(ctx, def, mem)) {
				/* The source is the result's own spill slot: never store back,
				 * and load only if the load cannot be avoided
				 * (valid only when the register is not reused).
				 */
				if (!ir_may_avoid_spill_load(ctx, def, def)) {
					ir_emit_load_mem_int(ctx, type, def_reg, mem);
				}
				return;
			}
		}
	} else {
		if (IR_REG_SPILLED(addr_reg)) {
			/* the address itself was spilled: reload it first */
			addr_reg = IR_REG_NUM(addr_reg);
			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
			ir_emit_load(ctx, IR_ADDR, addr_reg, insn->op2);
		}
		mem = IR_MEM_B(addr_reg);
	}

	ir_emit_load_mem_int(ctx, type, def_reg, mem);
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/* Emit a floating point LOAD of "def" from the address insn->op2.
 *
 * The address is taken from a register, from a constant, or from an
 * address expression fused into this instruction.
 */
static void ir_emit_load_fp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_ref type = insn->type;
	ir_reg addr_reg = ctx->regs[def][2];
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_mem mem;

	if (ctx->use_lists[def].count == 1) {
		/* dead load */
		return;
	}
	IR_ASSERT(def_reg != IR_REG_NONE);

	if (addr_reg == IR_REG_NONE) {
		if (IR_IS_CONST_REF(insn->op2)) {
			mem = ir_fuse_addr_const(ctx, insn->op2);
		} else {
			IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
			mem = ir_fuse_addr(ctx, def, insn->op2);
			if (IR_REG_SPILLED(ctx->regs[def][0]) && ir_is_same_spill_slot(ctx, def, mem)) {
				/* The source is the result's own spill slot: never store back,
				 * and load only if the load cannot be avoided
				 * (valid only when the register is not reused).
				 */
				if (!ir_may_avoid_spill_load(ctx, def, def)) {
					ir_emit_load_mem_fp(ctx, type, def_reg, mem);
				}
				return;
			}
		}
	} else {
		if (IR_REG_SPILLED(addr_reg)) {
			/* the address itself was spilled: reload it first */
			addr_reg = IR_REG_NUM(addr_reg);
			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
			ir_emit_load(ctx, IR_ADDR, addr_reg, insn->op2);
		}
		mem = IR_MEM_B(addr_reg);
	}

	ir_emit_load_mem_fp(ctx, type, def_reg, mem);
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/* Emit an integer STORE of the value insn->op3 to the address insn->op2.
 *
 * The address is taken from a register, from a constant, or from an
 * address expression fused into this instruction. The value is either a
 * constant or a (possibly spilled) register.
 */
static void ir_emit_store_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
{
	ir_ref type = ctx->ir_base[insn->op3].type;
	ir_reg addr_reg = ctx->regs[ref][2];
	ir_reg val_reg = ctx->regs[ref][3];
	ir_mem mem;

	if (addr_reg == IR_REG_NONE) {
		if (IR_IS_CONST_REF(insn->op2)) {
			mem = ir_fuse_addr_const(ctx, insn->op2);
		} else {
			IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
			mem = ir_fuse_addr(ctx, ref, insn->op2);
			if (!IR_IS_CONST_REF(insn->op3)
			 && IR_REG_SPILLED(val_reg)
			 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
			 && ir_is_same_spill_slot(ctx, insn->op3, mem)) {
				/* The target is the value's own spill slot: skip the store,
				 * only reload the value if the load cannot be avoided.
				 */
				if (!ir_may_avoid_spill_load(ctx, insn->op3, ref)) {
					ir_emit_load(ctx, type, IR_REG_NUM(val_reg), insn->op3);
				}
				return;
			}
		}
	} else {
		if (IR_REG_SPILLED(addr_reg)) {
			/* the address itself was spilled: reload it first */
			addr_reg = IR_REG_NUM(addr_reg);
			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
			ir_emit_load(ctx, IR_ADDR, addr_reg, insn->op2);
		}
		mem = IR_MEM_B(addr_reg);
	}

	if (IR_IS_CONST_REF(insn->op3)) {
		ir_emit_store_mem_int_const(ctx, type, mem, insn->op3, val_reg, 0);
		return;
	}

	IR_ASSERT(val_reg != IR_REG_NONE);
	if (IR_REG_SPILLED(val_reg)) {
		val_reg = IR_REG_NUM(val_reg);
		ir_emit_load(ctx, type, val_reg, insn->op3);
	}
	ir_emit_store_mem_int(ctx, type, mem, val_reg);
}

/* Emit a fused "compare and store": evaluate the integer comparison
 * insn->op3 and write its result straight to the memory addressed by
 * insn->op2, using a memory form of setcc.
 */
static void ir_emit_cmp_and_store_int(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
{
	ir_insn *cmp_insn = &ctx->ir_base[insn->op3];
	ir_op op = cmp_insn->op;
	ir_ref lhs = cmp_insn->op1;
	ir_ref rhs = cmp_insn->op2;
	ir_type type = ctx->ir_base[lhs].type;
	ir_reg lhs_reg = ctx->regs[insn->op3][1];
	ir_reg rhs_reg = ctx->regs[insn->op3][2];
	ir_reg addr_reg = ctx->regs[ref][2];
	ir_mem mem;

	/* Resolve the destination address first. */
	if (addr_reg == IR_REG_NONE) {
		if (IR_IS_CONST_REF(insn->op2)) {
			mem = ir_fuse_addr_const(ctx, insn->op2);
		} else {
			IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
			mem = ir_fuse_addr(ctx, ref, insn->op2);
		}
	} else {
		if (IR_REG_SPILLED(addr_reg)) {
			addr_reg = IR_REG_NUM(addr_reg);
			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
			ir_emit_load(ctx, IR_ADDR, addr_reg, insn->op2);
		}
		mem = IR_MEM_B(addr_reg);
	}

	/* Reload spilled comparison operands. */
	if (lhs_reg != IR_REG_NONE && IR_REG_SPILLED(lhs_reg)) {
		lhs_reg = IR_REG_NUM(lhs_reg);
		ir_emit_load(ctx, type, lhs_reg, lhs);
	}
	if (rhs_reg != IR_REG_NONE && IR_REG_SPILLED(rhs_reg)) {
		rhs_reg = IR_REG_NUM(rhs_reg);
		/* no second load is emitted when both operands are the same value */
		if (lhs != rhs) {
			ir_emit_load(ctx, type, rhs_reg, rhs);
		}
	}

	ir_emit_cmp_int_common(ctx, type, ref, cmp_insn, lhs_reg, lhs, rhs_reg, rhs);
	_ir_emit_setcc_int_mem(ctx, op, mem);
}

/* Emit a floating point STORE of the value insn->op3 to the address
 * insn->op2.
 *
 * The address is taken from a register, from a constant, or from an
 * address expression fused into this instruction. A register must have
 * been assigned for operand 3 in every case (asserted on entry).
 */
static void ir_emit_store_fp(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
{
	ir_ref type = ctx->ir_base[insn->op3].type;
	ir_reg addr_reg = ctx->regs[ref][2];
	ir_reg val_reg = ctx->regs[ref][3];
	ir_mem mem;

	IR_ASSERT(val_reg != IR_REG_NONE);

	if (addr_reg == IR_REG_NONE) {
		if (IR_IS_CONST_REF(insn->op2)) {
			mem = ir_fuse_addr_const(ctx, insn->op2);
		} else {
			IR_ASSERT(ir_rule(ctx, insn->op2) & IR_FUSED);
			mem = ir_fuse_addr(ctx, ref, insn->op2);
			if (!IR_IS_CONST_REF(insn->op3)
			 && IR_REG_SPILLED(val_reg)
			 && ir_rule(ctx, insn->op3) != IR_STATIC_ALLOCA
			 && ir_is_same_spill_slot(ctx, insn->op3, mem)) {
				/* The target is the value's own spill slot: skip the store,
				 * only reload the value if the load cannot be avoided.
				 */
				if (!ir_may_avoid_spill_load(ctx, insn->op3, ref)) {
					ir_emit_load(ctx, type, IR_REG_NUM(val_reg), insn->op3);
				}
				return;
			}
		}
	} else {
		if (IR_REG_SPILLED(addr_reg)) {
			/* the address itself was spilled: reload it first */
			addr_reg = IR_REG_NUM(addr_reg);
			IR_ASSERT(ctx->ir_base[insn->op2].type == IR_ADDR);
			ir_emit_load(ctx, IR_ADDR, addr_reg, insn->op2);
		}
		mem = IR_MEM_B(addr_reg);
	}

	if (IR_IS_CONST_REF(insn->op3)) {
		ir_emit_store_mem_fp_const(ctx, type, mem, insn->op3, IR_REG_NONE, val_reg);
		return;
	}

	IR_ASSERT(val_reg != IR_REG_NONE);
	if (IR_REG_SPILLED(val_reg)) {
		val_reg = IR_REG_NUM(val_reg);
		ir_emit_load(ctx, type, val_reg, insn->op3);
	}
	ir_emit_store_mem_fp(ctx, type, mem, val_reg);
}

/* Emit RLOAD: make the value of the physical register insn->op2
 * available as "def".
 *
 * insn->op3 is used as a flag that the value is already stored in memory;
 * when it designates the result's own spill slot the store is skipped.
 */
static void ir_emit_rload(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_reg src_reg = insn->op2;
	ir_type type = insn->type;
	ir_reg def_reg;

	if (IR_REGSET_IN(IR_REGSET_UNION((ir_regset)ctx->fixed_regset, IR_REGSET_FIXED), src_reg)) {
		/* Source is a fixed register: only a store is ever emitted, and only
		 * when the result has a live interval with a stack spill position.
		 */
		if (ctx->vregs[def]
		 && ctx->live_intervals[ctx->vregs[def]]
		 && ctx->live_intervals[ctx->vregs[def]]->stack_spill_pos != -1) {
			ir_emit_store(ctx, type, def, src_reg);
		}
		return;
	}

	def_reg = IR_REG_NUM(ctx->regs[def][0]);

	if (def_reg == IR_REG_NONE) {
		/* No result register: store unless the value is already in its slot. */
		if (!(insn->op3 && ir_is_same_spill_slot(ctx, def, IR_MEM_BO(ctx->spill_base, insn->op3)))) {
			ir_emit_store(ctx, type, def, src_reg);
		}
		return;
	}

	if (src_reg != def_reg) {
		if (IR_IS_TYPE_INT(type)) {
			ir_emit_mov(ctx, type, def_reg, src_reg);
		} else {
			IR_ASSERT(IR_IS_TYPE_FP(type));
			ir_emit_fp_mov(ctx, type, def_reg, src_reg);
		}
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])
	 && !(insn->op3 && ir_is_same_spill_slot(ctx, def, IR_MEM_BO(ctx->spill_base, insn->op3)))) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/* Emit RSTORE: place the value insn->op2 into the physical register
 * insn->op3.
 */
static void ir_emit_rstore(ir_ctx *ctx, ir_ref ref, ir_insn *insn)
{
	ir_ref type = ctx->ir_base[insn->op2].type;
	ir_reg val_reg = ctx->regs[ref][2];
	ir_reg dst_reg = insn->op3;

	if (val_reg == IR_REG_NONE) {
		/* the value has no register: load it directly into the target */
		ir_emit_load_ex(ctx, type, dst_reg, insn->op2, ref);
		return;
	}

	if (IR_REG_SPILLED(val_reg)) {
		val_reg = IR_REG_NUM(val_reg);
		ir_emit_load(ctx, type, val_reg, insn->op2);
	}

	if (val_reg == dst_reg) {
		/* already in place */
		return;
	}

	if (IR_IS_TYPE_INT(type)) {
		ir_emit_mov(ctx, type, dst_reg, val_reg);
	} else {
		IR_ASSERT(IR_IS_TYPE_FP(type));
		ir_emit_fp_mov(ctx, type, dst_reg, val_reg);
	}
}

/* Emit ALLOCA: reserve insn->op2 bytes (rounded up to a multiple of 16)
 * on the machine stack and yield the new stack pointer as the result.
 *
 * A constant size is subtracted from RSP directly; without a frame pointer
 * the reserved size is also added to ctx->call_stack_size. A dynamic size
 * is rounded up in the result register and asserts that a frame pointer
 * is in use.
 */
static void ir_emit_alloca(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);

	if (ctx->use_lists[def].count == 1) {
		/* dead alloca */
		return;
	}

	if (!IR_IS_CONST_REF(insn->op2)) {
		int32_t alignment = 16;
		ir_reg op2_reg = ctx->regs[def][2];
		ir_type type = ctx->ir_base[insn->op2].type;

		IR_ASSERT(ctx->flags & IR_FUNCTION);
		IR_ASSERT(ctx->flags & IR_USE_FRAME_POINTER);
		IR_ASSERT(def_reg != IR_REG_NONE);

		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, type, op2_reg, insn->op2);
		}

		/* Get the requested size into the result register. */
		if (def_reg != op2_reg) {
			if (op2_reg == IR_REG_NONE) {
				ir_emit_load(ctx, type, def_reg, insn->op2);
			} else {
				ir_emit_mov(ctx, type, def_reg, op2_reg);
			}
		}

		/* Round the size up to the alignment, then move the stack pointer. */
		|	ASM_REG_IMM_OP add, IR_ADDR, def_reg, (alignment-1)
		|	ASM_REG_IMM_OP and, IR_ADDR, def_reg, ~(alignment-1)
		|	ASM_REG_REG_OP sub, IR_ADDR, IR_REG_RSP, def_reg
	} else {
		ir_insn *val = &ctx->ir_base[insn->op2];
		int32_t size = val->val.i32;

		IR_ASSERT(IR_IS_TYPE_INT(val->type));
		IR_ASSERT(!IR_IS_SYM_CONST(val->op));
		IR_ASSERT(IR_IS_TYPE_UNSIGNED(val->type) || val->val.i64 >= 0);
		IR_ASSERT(IR_IS_SIGNED_32BIT(val->val.i64));

		/* Stack must be 16 byte aligned */
		size = IR_ALIGNED_SIZE(size, 16);
		|	ASM_REG_IMM_OP sub, IR_ADDR, IR_REG_RSP, size
		if (!(ctx->flags & IR_USE_FRAME_POINTER)) {
			ctx->call_stack_size += size;
		}
	}

	if (def_reg == IR_REG_NONE) {
		/* no result register: store the stack pointer itself */
		ir_emit_store(ctx, IR_ADDR, def, IR_REG_STACK_POINTER);
	} else {
		|	mov Ra(def_reg), Ra(IR_REG_RSP)
		if (IR_REG_SPILLED(ctx->regs[def][0])) {
			ir_emit_store(ctx, insn->type, def, def_reg);
		}
	}
}

/* Emit AFREE: release insn->op2 bytes of machine stack by adding the
 * size back to RSP.
 *
 * A constant size is rounded up to a multiple of 16 before the addition;
 * without a frame pointer ctx->call_stack_size is reduced accordingly.
 * A dynamic size must be available in a register and is added as is.
 */
static void ir_emit_afree(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;

	if (IR_IS_CONST_REF(insn->op2)) {
		ir_insn *val = &ctx->ir_base[insn->op2];
		int32_t size = val->val.i32;

		IR_ASSERT(IR_IS_TYPE_INT(val->type));
		IR_ASSERT(!IR_IS_SYM_CONST(val->op));
		IR_ASSERT(IR_IS_TYPE_UNSIGNED(val->type) || val->val.i64 > 0);
		IR_ASSERT(IR_IS_SIGNED_32BIT(val->val.i64));

		/* Stack must be 16 byte aligned */
		size = IR_ALIGNED_SIZE(size, 16);
		|	ASM_REG_IMM_OP add, IR_ADDR, IR_REG_RSP, size
		if (!(ctx->flags & IR_USE_FRAME_POINTER)) {
			ctx->call_stack_size -= size;
		}
	} else {
		ir_reg op2_reg = ctx->regs[def][2];
		ir_type type = ctx->ir_base[insn->op2].type;

		IR_ASSERT(ctx->flags & IR_FUNCTION);
		/* The size is encoded as a register operand below, so it must have one. */
		IR_ASSERT(op2_reg != IR_REG_NONE);
		if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, type, op2_reg, insn->op2);
		}

		/* TODO: alignment ???
		 * NOTE(review): the size is added back without rounding here, while the
		 * dynamic path of ir_emit_alloca() rounds it up to 16 - confirm that
		 * callers only pass sizes that are already multiples of 16.
		 */

		|	ASM_REG_REG_OP add, IR_ADDR, IR_REG_RSP, op2_reg
	}
}

/* Emit BLOCK_BEGIN: capture the current stack pointer in the result
 * register (ir_emit_block_end() moves such a value back into RSP).
 */
static void ir_emit_block_begin(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg sp_copy_reg = IR_REG_NUM(ctx->regs[def][0]);

	/* With a single entry in the use list nothing is emitted. */
	if (ctx->use_lists[def].count != 1) {
		|	mov Ra(sp_copy_reg), Ra(IR_REG_RSP)

		if (IR_REG_SPILLED(ctx->regs[def][0])) {
			ir_emit_store(ctx, IR_ADDR, def, sp_copy_reg);
		}
	}
}

/* Emit BLOCK_END: load the stack pointer from the value insn->op2. */
static void ir_emit_block_end(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg saved_sp_reg = ctx->regs[def][2];

	IR_ASSERT(saved_sp_reg != IR_REG_NONE);

	/* A spilled value must be reloaded before it can be moved into RSP. */
	if (IR_REG_SPILLED(saved_sp_reg)) {
		saved_sp_reg = IR_REG_NUM(saved_sp_reg);
		ir_emit_load(ctx, IR_ADDR, saved_sp_reg, insn->op2);
	}

	|	mov Ra(IR_REG_RSP), Ra(saved_sp_reg)
}

/* Emit FRAME_ADDR: compute the frame address into the result register.
 *
 * With a frame pointer this is a copy of RBP; otherwise it is RSP plus the
 * current stack frame size and call stack size.
 */
static void ir_emit_frame_addr(ir_ctx *ctx, ir_ref def)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);

	if (!(ctx->flags & IR_USE_FRAME_POINTER)) {
		|	lea Ra(def_reg), [Ra(IR_REG_RSP)+(ctx->stack_frame_size + ctx->call_stack_size)]
	} else {
		|	mov Ra(def_reg), Ra(IR_REG_RBP)
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, IR_ADDR, def, def_reg);
	}
}

/* Emit VA_START: initialize the va_list object addressed by insn->op2.
 *
 * The va_list address is either held in a register (ctx->regs[def][2]) or,
 * for a statically allocated ALLOCA, formed from the frame/stack pointer
 * plus ir_local_offset(). ctx->regs[def][3] is a mandatory scratch register.
 *
 * Two layouts are handled:
 *  - Win64 and 32-bit x86: the va_list is a single pointer that is set to
 *    the address just past the named stack parameters.
 *  - other x86-64 targets: the ir_va_list fields gp_offset, fp_offset,
 *    reg_save_area and overflow_arg_area are filled in.
 */
static void ir_emit_va_start(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
#if defined(_WIN64) || defined(IR_TARGET_X86)
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg fp;
	int arg_area_offset;
	ir_reg op2_reg = ctx->regs[def][2];
	ir_reg tmp_reg = ctx->regs[def][3];
	int32_t offset;

	IR_ASSERT(tmp_reg != IR_REG_NONE);
	/* Resolve the va_list location as base register + offset. */
	if (op2_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
		}
		offset = 0;
	} else {
		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
		offset = ir_local_offset(ctx, &ctx->ir_base[insn->op2]);
	}

	/* Offset of the first variadic argument relative to "fp": two pointer
	 * sized slots are skipped with a frame pointer, otherwise the whole frame,
	 * the call stack area and one pointer sized slot; the named stack
	 * parameters follow in both cases.
	 */
	if (ctx->flags & IR_USE_FRAME_POINTER) {
		fp = IR_REG_FRAME_POINTER;
		arg_area_offset = sizeof(void*) * 2 + ctx->param_stack_size;
	} else {
		fp = IR_REG_STACK_POINTER;
		arg_area_offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*) + ctx->param_stack_size;
	}
	|	lea Ra(tmp_reg), aword [Ra(fp)+arg_area_offset]
	|	mov aword [Ra(op2_reg)+offset], Ra(tmp_reg)
#elif defined(IR_TARGET_X64)
|.if X64
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg fp;
	int reg_save_area_offset;
	int overflow_arg_area_offset;
	ir_reg op2_reg = ctx->regs[def][2];
	ir_reg tmp_reg = ctx->regs[def][3];
	bool have_reg_save_area = 0;
	int32_t offset;

	IR_ASSERT(tmp_reg != IR_REG_NONE);
	/* Resolve the va_list location as base register + offset. */
	if (op2_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
		}
		offset = 0;
	} else {
		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
		offset = ir_local_offset(ctx, &ctx->ir_base[insn->op2]);
	}

	/* Locate the register save area and the stack (overflow) argument area
	 * relative to the frame or stack pointer.
	 */
	if (ctx->flags & IR_USE_FRAME_POINTER) {
		fp = IR_REG_FRAME_POINTER;
		reg_save_area_offset = -(ctx->stack_frame_size - ctx->stack_frame_alignment - ctx->locals_area_size);
		overflow_arg_area_offset = sizeof(void*) * 2 + ctx->param_stack_size;
	} else {
		fp = IR_REG_STACK_POINTER;
		reg_save_area_offset = ctx->locals_area_size + ctx->call_stack_size;
		overflow_arg_area_offset = ctx->stack_frame_size + ctx->call_stack_size + sizeof(void*) + ctx->param_stack_size;
	}

	/* gp_offset starts after the GP registers taken by named parameters.
	 * Otherwise it is set to the limit ir_emit_va_arg() compares against,
	 * so every integer va_arg takes the overflow path.
	 */
	if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
		|	lea Ra(tmp_reg), aword [Ra(fp)+reg_save_area_offset]
		have_reg_save_area = 1;
		/* Set va_list.gp_offset */
		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))], sizeof(void*) * ctx->gp_reg_params
	} else {
		/* Rebase the save area address by the size of the GP part; it is
		 * only used below if the FP part is needed.
		 */
		reg_save_area_offset -= sizeof(void*) * IR_REG_INT_ARGS;
		/* Set va_list.gp_offset */
		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))], sizeof(void*) * IR_REG_INT_ARGS
	}
	/* fp_offset follows the same scheme with 16 byte slots placed after the
	 * GP part of the save area.
	 */
	if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
		if (!have_reg_save_area) {
			|	lea Ra(tmp_reg), aword [Ra(fp)+reg_save_area_offset]
			have_reg_save_area = 1;
		}
		/* Set va_list.fp_offset */
		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))], sizeof(void*) * IR_REG_INT_ARGS + 16 * ctx->fp_reg_params
	} else {
		/* Set va_list.fp_offset */
		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))], sizeof(void*) * IR_REG_INT_ARGS + 16 * IR_REG_FP_ARGS
	}
	/* reg_save_area is left untouched when neither register class needs it. */
	if (have_reg_save_area) {
		/* Set va_list.reg_save_area */
		|	mov qword [Ra(op2_reg)+(offset+offsetof(ir_va_list, reg_save_area))], Ra(tmp_reg)
	}
	|	lea Ra(tmp_reg), aword [Ra(fp)+overflow_arg_area_offset]
	/* Set va_list.overflow_arg_area */
	|	mov qword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
|.endif
#else
	IR_ASSERT(0 && "NIY va_start");
#endif
}

/* Emit VA_COPY: copy the va_list addressed by insn->op3 into the
 * va_list addressed by insn->op2.
 *
 * Each va_list address is either held in a register or, for a statically
 * allocated ALLOCA, formed from the frame/stack pointer plus
 * ir_local_offset(). ctx->regs[def][1] is a mandatory scratch register.
 *
 * On Win64 and 32-bit x86 the va_list is a single pointer. On other x86-64
 * targets the four ir_va_list fields are copied one by one, each with a
 * store of the same width as its load.
 */
static void ir_emit_va_copy(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
#if defined(_WIN64) || defined(IR_TARGET_X86)
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg tmp_reg = ctx->regs[def][1];
	ir_reg op2_reg = ctx->regs[def][2];
	ir_reg op3_reg = ctx->regs[def][3];
	int32_t op2_offset, op3_offset;

	IR_ASSERT(tmp_reg != IR_REG_NONE);
	/* Resolve the destination va_list as base register + offset. */
	if (op2_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
		}
		op2_offset = 0;
	} else {
		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
		op2_offset = ir_local_offset(ctx, &ctx->ir_base[insn->op2]);
	}
	/* Resolve the source va_list the same way. */
	if (op3_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op3_reg)) {
			op3_reg = IR_REG_NUM(op3_reg);
			ir_emit_load(ctx, IR_ADDR, op3_reg, insn->op3);
		}
		op3_offset = 0;
	} else {
		IR_ASSERT(ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA);
		op3_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
		op3_offset = ir_local_offset(ctx, &ctx->ir_base[insn->op3]);
	}
	|	mov Ra(tmp_reg), aword [Ra(op3_reg)+op3_offset]
	|	mov aword [Ra(op2_reg)+op2_offset], Ra(tmp_reg)
#elif defined(IR_TARGET_X64)
|.if X64
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg tmp_reg = ctx->regs[def][1];
	ir_reg op2_reg = ctx->regs[def][2];
	ir_reg op3_reg = ctx->regs[def][3];
	int32_t op2_offset, op3_offset;

	IR_ASSERT(tmp_reg != IR_REG_NONE);
	/* Resolve the destination va_list as base register + offset. */
	if (op2_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
		}
		op2_offset = 0;
	} else {
		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
		op2_offset = ir_local_offset(ctx, &ctx->ir_base[insn->op2]);
	}
	/* Resolve the source va_list the same way. */
	if (op3_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op3_reg)) {
			op3_reg = IR_REG_NUM(op3_reg);
			ir_emit_load(ctx, IR_ADDR, op3_reg, insn->op3);
		}
		op3_offset = 0;
	} else {
		IR_ASSERT(ir_rule(ctx, insn->op3) == IR_STATIC_ALLOCA);
		op3_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
		op3_offset = ir_local_offset(ctx, &ctx->ir_base[insn->op3]);
	}
	/* gp_offset and fp_offset are accessed as dwords; a wider store to
	 * fp_offset would also overwrite part of the field that follows it.
	 */
	|	mov Rd(tmp_reg), dword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, gp_offset))]
	|	mov dword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, gp_offset))], Rd(tmp_reg)
	|	mov Rd(tmp_reg), dword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, fp_offset))]
	|	mov dword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, fp_offset))], Rd(tmp_reg)
	|	mov Ra(tmp_reg), aword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, overflow_arg_area))]
	|	mov aword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
	|	mov Ra(tmp_reg), aword [Ra(op3_reg)+(op3_offset+offsetof(ir_va_list, reg_save_area))]
	|	mov aword [Ra(op2_reg)+(op2_offset+offsetof(ir_va_list, reg_save_area))], Ra(tmp_reg)
|.endif
#else
	IR_ASSERT(0 && "NIY va_copy");
#endif
}

/* Emit VA_ARG: fetch the next variadic argument of type insn->type from
 * the va_list addressed by insn->op2 and advance the va_list.
 *
 * The va_list address is either held in a register or, for a statically
 * allocated ALLOCA, formed from the frame/stack pointer plus
 * ir_local_offset(). ctx->regs[def][3] is a mandatory scratch register.
 * The result register may be absent only when the result is unused; the
 * va_list is advanced in that case too.
 *
 * A non-zero insn->op3 describes an aggregate argument: the low 3 bits hold
 * log2 of its alignment, the remaining bits its size. The result is then the
 * address of the argument (type IR_ADDR) rather than a loaded value.
 */
static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
#if defined(_WIN64) || defined(IR_TARGET_X86)
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op2_reg = ctx->regs[def][2];
	ir_reg tmp_reg = ctx->regs[def][3];
	int32_t offset;

	IR_ASSERT((def_reg != IR_REG_NONE || ctx->use_lists[def].count == 1) && tmp_reg != IR_REG_NONE);
	/* Resolve the va_list location as base register + offset. */
	if (op2_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
		}
		offset = 0;
	} else {
		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
		offset = ir_local_offset(ctx, &ctx->ir_base[insn->op2]);
	}
	/* The va_list is a single pointer to the next argument. */
	|	mov Ra(tmp_reg), aword [Ra(op2_reg)+offset]
#ifdef _WIN64
	if (def_reg != IR_REG_NONE) {
		ir_emit_load_mem(ctx, type, def_reg, IR_MEM_B(tmp_reg));
	}
	|	add Ra(tmp_reg), IR_MAX(ir_type_size[type], sizeof(void*))
#else
	if (!insn->op3) {
		if (def_reg != IR_REG_NONE) {
			ir_emit_load_mem(ctx, type, def_reg, IR_MEM_B(tmp_reg));
		}
		|	add Ra(tmp_reg), IR_MAX(ir_type_size[type], sizeof(void*))
	} else {
		int size = (uint32_t)insn->op3 >> 3;

		/* NOTE(review): the alignment adjustment is applied only when the
		 * result has a register, whereas the x86-64 path below always
		 * applies it - confirm this difference is intended.
		 */
		if (def_reg != IR_REG_NONE) {
			IR_ASSERT(type == IR_ADDR);
			int align = 1U << (insn->op3 & 0x7);

			if (align > (int)sizeof(void*)) {
				|	add Ra(tmp_reg), (align-1)
				|	and Ra(tmp_reg), ~(align-1)
			}
			|	mov Ra(def_reg), Ra(tmp_reg)
		}
		|	add Ra(tmp_reg), IR_ALIGNED_SIZE(size, sizeof(void*))
	}
#endif
	/* Write the advanced pointer back into the va_list. */
	|	mov aword [Ra(op2_reg)+offset], Ra(tmp_reg)
	/* def_reg must be compared with IR_REG_NONE explicitly (as the x86-64
	 * path below does); a plain truthiness test is not equivalent.
	 */
	if (def_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
#elif defined(IR_TARGET_X64)
|.if X64
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg op2_reg = ctx->regs[def][2];
	ir_reg tmp_reg = ctx->regs[def][3];
	int32_t offset;

	IR_ASSERT((def_reg != IR_REG_NONE || ctx->use_lists[def].count == 1) && tmp_reg != IR_REG_NONE);
	/* Resolve the va_list location as base register + offset. */
	if (op2_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
		}
		offset = 0;
	} else {
		IR_ASSERT(ir_rule(ctx, insn->op2) == IR_STATIC_ALLOCA);
		op2_reg = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
		offset = ir_local_offset(ctx, &ctx->ir_base[insn->op2]);
	}
	if (insn->op3) {
		/* long struct argument: always taken from the overflow area */
		IR_ASSERT(type == IR_ADDR);
		int align = 1U << (insn->op3 & 0x7);
		int size = (uint32_t)insn->op3 >> 3;

		|	mov Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))]
		if (align > (int)sizeof(void*)) {
			|	add Ra(tmp_reg), (align-1)
			|	and Ra(tmp_reg), ~(align-1)
		}
		if (def_reg != IR_REG_NONE) {
			|	mov Ra(def_reg), Ra(tmp_reg)
		}
		|	add Ra(tmp_reg), IR_ALIGNED_SIZE(size, sizeof(void*))
		|	mov aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
	} else if (IR_IS_TYPE_INT(type)) {
		/* Take the argument from the register save area while gp_offset is
		 * below the GP limit, otherwise from the overflow area. Both paths
		 * leave tmp_reg pointing just past the argument.
		 */
		|	mov Rd(tmp_reg), dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))]
		|	cmp Rd(tmp_reg), sizeof(void*)*IR_REG_INT_ARGS
		|	jge >1
		|	add Rd(tmp_reg), sizeof(void*)
		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, gp_offset))], Rd(tmp_reg)
		|	add Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, reg_save_area))]
		|	jmp >2
		|1:
		|	mov Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))]
		|	add Ra(tmp_reg), sizeof(void*)
		|	mov aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
		|2:
		if (def_reg != IR_REG_NONE) {
			/* tmp_reg is one slot past the argument, hence the negative offset */
			if (ir_type_size[type] == 8) {
				|	mov Rq(def_reg), qword [Ra(tmp_reg)-sizeof(void*)]
			} else {
				|	mov Rd(def_reg), dword [Ra(tmp_reg)-sizeof(void*)]
			}
		}
	} else {
		/* Floating point: 16 byte slots in the register save area, 8 byte
		 * slots in the overflow area.
		 */
		|	mov Rd(tmp_reg), dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))]
		|	cmp Rd(tmp_reg), sizeof(void*) * IR_REG_INT_ARGS + 16 * IR_REG_FP_ARGS
		|	jge >1
		|	add Rd(tmp_reg), 16
		|	mov dword [Ra(op2_reg)+(offset+offsetof(ir_va_list, fp_offset))], Rd(tmp_reg)
		|	add Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, reg_save_area))]
		if (def_reg != IR_REG_NONE) {
			/* tmp_reg already points past the slot that was just consumed */
			ir_emit_load_mem_fp(ctx, type, def_reg, IR_MEM_BO(tmp_reg, -16));
		}
		|	jmp >2
		|1:
		|	mov Ra(tmp_reg), aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))]
		if (def_reg != IR_REG_NONE) {
			ir_emit_load_mem_fp(ctx, type, def_reg, IR_MEM_BO(tmp_reg, 0));
		}
		|	add Ra(tmp_reg), 8
		|	mov aword [Ra(op2_reg)+(offset+offsetof(ir_va_list, overflow_arg_area))], Ra(tmp_reg)
		|2:
	}
	if (def_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
|.endif
#else
	IR_ASSERT(0 && "NIY va_arg");
#endif
}

/*
 * Emit native code for the SWITCH instruction `insn` that terminates basic
 * block `b`.
 *
 * The successors of the block are scanned once to find the smallest and the
 * largest case value, to count the CASE_VAL successors, to detect CASE_RANGE
 * successors and to remember the label of the CASE_DEFAULT successor.
 *
 * Two code shapes are produced:
 *  - a jump table (placed in the jmp_table section) when there are no ranges,
 *    more than two case values, and the value span is smaller than 8 * count;
 *  - otherwise a linear chain of compare + conditional jump instructions,
 *    followed by an unconditional jump to the default label (if any).
 *
 * The switch operand register (regs[def][2]) and a temporary register
 * (regs[def][3]) must both be allocated.
 */
static void ir_emit_switch(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type;
	ir_block *bb;
	ir_insn *use_insn, *val;
	uint32_t n, *p, use_block;
	int i;
	int label, default_label = 0;
	int count = 0;
	ir_val min, max;
	ir_reg op2_reg = ctx->regs[def][2];
	ir_reg tmp_reg = ctx->regs[def][3];
	bool has_case_range = 0;

	type = ctx->ir_base[insn->op2].type;
	IR_ASSERT(tmp_reg != IR_REG_NONE);
	/* Start with an "empty" interval: min is the largest and max the smallest
	 * representable value for the signedness of the switch operand. */
	if (IR_IS_TYPE_SIGNED(type)) {
		min.u64 = 0x7fffffffffffffff;
		max.u64 = 0x8000000000000000;
	} else {
		min.u64 = 0xffffffffffffffff;
		max.u64 = 0x0;
	}

	/* First scan: collect min/max/count and the default label */
	bb = &ctx->cfg_blocks[b];
	p = &ctx->cfg_edges[bb->successors];
	for (n = bb->successors_count; n != 0; p++, n--) {
		use_block = *p;
		use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
		if (use_insn->op == IR_CASE_VAL) {
			val = &ctx->ir_base[use_insn->op2];
			IR_ASSERT(!IR_IS_SYM_CONST(val->op));
			if (IR_IS_TYPE_SIGNED(type)) {
				IR_ASSERT(IR_IS_TYPE_SIGNED(val->type));
				min.i64 = IR_MIN(min.i64, val->val.i64);
				max.i64 = IR_MAX(max.i64, val->val.i64);
			} else {
				IR_ASSERT(!IR_IS_TYPE_SIGNED(val->type));
				min.u64 = (int64_t)IR_MIN(min.u64, val->val.u64);
				max.u64 = (int64_t)IR_MAX(max.u64, val->val.u64);
			}
			count++;
		} else if (use_insn->op == IR_CASE_RANGE) {
			/* op2 is the lower and op3 the upper bound of the range */
			has_case_range = 1;
			val = &ctx->ir_base[use_insn->op2];
			IR_ASSERT(!IR_IS_SYM_CONST(val->op));
			ir_insn *val2 = &ctx->ir_base[use_insn->op3];
			IR_ASSERT(!IR_IS_SYM_CONST(val2->op));
			if (IR_IS_TYPE_SIGNED(type)) {
				IR_ASSERT(IR_IS_TYPE_SIGNED(val->type));
				min.i64 = IR_MIN(min.i64, val->val.i64);
				max.i64 = IR_MAX(max.i64, val2->val.i64);
			} else {
				IR_ASSERT(!IR_IS_TYPE_SIGNED(val->type));
				min.u64 = (int64_t)IR_MIN(min.u64, val->val.u64);
				max.u64 = (int64_t)IR_MAX(max.u64, val2->val.u64);
			}
		} else {
			IR_ASSERT(use_insn->op == IR_CASE_DEFAULT);
			default_label = ir_skip_empty_target_blocks(ctx, use_block);
		}
	}

	IR_ASSERT(op2_reg != IR_REG_NONE);
	if (IR_REG_SPILLED(op2_reg)) {
		op2_reg = IR_REG_NUM(op2_reg);
		ir_emit_load(ctx, type, op2_reg, insn->op2);
	}

	/* Generate a jump table or a sequence of compare-and-branch instructions */
	/* NOTE(review): the density check is done in signed 64-bit arithmetic;
	 * it presumably relies on the case span never overflowing int64_t. */
	if (!has_case_range && count > 2 && (max.i64-min.i64) < count * 8) {
		/* labels[i] is the target label for the value (min + i) */
		int *labels = ir_mem_malloc(sizeof(int) * (size_t)(max.i64 - min.i64 + 1));

		for (i = 0; i <= (max.i64 - min.i64); i++) {
			labels[i] = default_label;
		}
		p = &ctx->cfg_edges[bb->successors];
		for (n = bb->successors_count; n != 0; p++, n--) {
			use_block = *p;
			use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
			if (use_insn->op == IR_CASE_VAL) {
				val = &ctx->ir_base[use_insn->op2];
				IR_ASSERT(!IR_IS_SYM_CONST(val->op));
				label = ir_skip_empty_target_blocks(ctx, use_block);
				labels[val->val.i64 - min.i64] = label;
			}
		}

		/* Widen the switch operand to the address size so that it can be
		 * used as a table index. */
		switch (ir_type_size[type]) {
			default:
				IR_ASSERT(0 && "Unsupported type size");
			case 1:
				if (IR_IS_TYPE_SIGNED(type)) {
					|	movsx Ra(op2_reg), Rb(op2_reg)
				} else {
					|	movzx Ra(op2_reg), Rb(op2_reg)
				}
				break;
			case 2:
				if (IR_IS_TYPE_SIGNED(type)) {
					|	movsx Ra(op2_reg), Rw(op2_reg)
				} else {
					|	movzx Ra(op2_reg), Rw(op2_reg)
				}
				break;
			case 4:
|.if X64
				if (IR_IS_TYPE_SIGNED(type)) {
					if (op2_reg == IR_REG_RAX) {
						|	cdqe
					} else {
						|	movsxd Ra(op2_reg), Rd(op2_reg)
					}
				} else {
					|	mov Rd(op2_reg), Rd(op2_reg)
				}
				break;
||			case 8:
|.endif
				break;
		}

		if (min.i64 != 0) {
			/* tmp_reg = op2_reg - min (zero-based table index) */
			int64_t offset = -min.i64;

			if (IR_IS_SIGNED_32BIT(offset)) {
				|	lea Ra(tmp_reg), [Ra(op2_reg)+(int32_t)offset]
			} else {
				IR_ASSERT(sizeof(void*) == 8);
|.if X64
				|	mov64 Rq(tmp_reg), offset
				|	add Ra(tmp_reg), Ra(op2_reg)
|.endif
			}
			if (default_label) {
				/* a single unsigned compare rejects indexes outside the table */
				offset = max.i64 - min.i64;

				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
				|	cmp Ra(tmp_reg), (int32_t)offset
				|	ja =>default_label
			}
|.if X64
			if (ctx->code_buffer
			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->start)
			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->end)) {
				|	jmp aword [Ra(tmp_reg)*8+>1]
			} else {
				/* the table address is loaded with LEA; the "-min" bias is
				 * folded into the displacement of the indirect jump */
				int64_t offset = -min.i64;

				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
				offset *= 8;
				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
				|	lea Ra(tmp_reg), aword [>1]
				|	jmp aword [Ra(tmp_reg)+Ra(op2_reg)*8+offset]
			}
|.else
			|	jmp aword [Ra(tmp_reg)*4+>1]
|.endif
		} else {
			if (default_label) {
				int64_t offset = max.i64;

				IR_ASSERT(IR_IS_SIGNED_32BIT(offset));
				|	cmp Ra(op2_reg), (int32_t)offset
				|	ja =>default_label
			}
|.if X64
			if (ctx->code_buffer
			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->start)
			 && IR_IS_SIGNED_32BIT((char*)ctx->code_buffer->end)) {
				|	jmp aword [Ra(op2_reg)*8+>1]
			} else {
				|	lea Ra(tmp_reg), aword [>1]
				|	jmp aword [Ra(tmp_reg)+Ra(op2_reg)*8]
			}
|.else
			|	jmp aword [Ra(op2_reg)*4+>1]
|.endif
		}

		/* Emit the table itself into the jmp_table section */
		|.jmp_table
		if (!data->jmp_table_label) {
			data->jmp_table_label = ctx->cfg_blocks_count + ctx->consts_count + 3;
			|=>data->jmp_table_label:
		}
		|.align aword
		|1:
		for (i = 0; i <= (max.i64 - min.i64); i++) {
			int b = labels[i];
			if (b) {
				ir_block *bb = &ctx->cfg_blocks[b];
				ir_insn *insn = &ctx->ir_base[bb->end];

				/* A target block that only performs a constant IJMP
				 * (optionally preceded by a SNAPSHOT) is bypassed: the table
				 * entry points directly at the final address. */
				if (insn->op == IR_IJMP && IR_IS_CONST_REF(insn->op2)) {
					ir_ref prev = ctx->prev_ref[bb->end];
					if (prev != bb->start && ctx->ir_base[prev].op == IR_SNAPSHOT) {
						prev = ctx->prev_ref[prev];
					}
					if (prev == bb->start) {
						void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op2]);

						|	.aword &addr
						if (ctx->ir_base[bb->start].op != IR_CASE_DEFAULT) {
							bb->flags |= IR_BB_EMPTY;
						}
						continue;
					}
				}
				|	.aword =>b
			} else {
				|	.aword 0
			}
		}
		|.code
		ir_mem_free(labels);
	} else {
		/* Linear search: one compare (or a pair for ranges) per successor */
		p = &ctx->cfg_edges[bb->successors];
		for (n = bb->successors_count; n != 0; p++, n--) {
			use_block = *p;
			use_insn = &ctx->ir_base[ctx->cfg_blocks[use_block].start];
			if (use_insn->op == IR_CASE_VAL) {
				val = &ctx->ir_base[use_insn->op2];
				IR_ASSERT(!IR_IS_SYM_CONST(val->op));
				label = ir_skip_empty_target_blocks(ctx, use_block);
				if (IR_IS_32BIT(type, val->val)) {
					|	ASM_REG_IMM_OP cmp, type, op2_reg, val->val.i32
				} else {
					IR_ASSERT(sizeof(void*) == 8);
|.if X64
					|	mov64 Ra(tmp_reg), val->val.i64
					|	ASM_REG_REG_OP cmp, type, op2_reg, tmp_reg
|.endif
				}
				|	je =>label
			} else if (use_insn->op == IR_CASE_RANGE) {
				/* lower bound: skip this range if the operand is below it */
				val = &ctx->ir_base[use_insn->op2];
				IR_ASSERT(!IR_IS_SYM_CONST(val->op));
				label = ir_skip_empty_target_blocks(ctx, use_block);
				if (IR_IS_32BIT(type, val->val)) {
					|	ASM_REG_IMM_OP cmp, type, op2_reg, val->val.i32
				} else {
					IR_ASSERT(sizeof(void*) == 8);
|.if X64
					|	mov64 Ra(tmp_reg), val->val.i64
					|	ASM_REG_REG_OP cmp, type, op2_reg, tmp_reg
|.endif
				}
				if (IR_IS_TYPE_SIGNED(type)) {
					|	jl >1
				} else {
					|	jb >1
				}
				/* upper bound: take the branch if the operand is not above it.
				 * The assertion checks the opcode of the bound constant, the
				 * same way as for every other case value. */
				val = &ctx->ir_base[use_insn->op3];
				IR_ASSERT(!IR_IS_SYM_CONST(val->op));
				label = ir_skip_empty_target_blocks(ctx, use_block);
				if (IR_IS_32BIT(type, val->val)) {
					|	ASM_REG_IMM_OP cmp, type, op2_reg, val->val.i32
				} else {
					IR_ASSERT(sizeof(void*) == 8);
|.if X64
					|	mov64 Ra(tmp_reg), val->val.i64
					|	ASM_REG_REG_OP cmp, type, op2_reg, tmp_reg
|.endif
				}
				if (IR_IS_TYPE_SIGNED(type)) {
					|	jle =>label
				} else {
					|	jbe =>label
				}
				|1:
			}
		}
		if (default_label) {
			|	jmp =>default_label
		}
	}
}

/*
 * Compute the number of stack bytes needed for the outgoing arguments of the
 * call instruction `insn` (arguments are operands 3..inputs_count).
 *
 * Arguments that do not fit into the available integer/FP argument registers
 * take max(sizeof(void*), type size) bytes each.  IR_SHADOW_ARGS is always
 * added on top.
 *
 * IR_ARGVAL (by-value aggregate) arguments:
 *  - without _WIN64 they are placed directly into the argument area, aligned
 *    to max(sizeof(void*), requested alignment) and padded to sizeof(void*);
 *  - with _WIN64 they are accounted for in a separate "copy" area (rounded up
 *    to 16 bytes) and the argument itself is then counted as an IR_ADDR.
 *
 * The size of the copy area is stored into *copy_stack_ptr (0 without
 * _WIN64).  Returns the total number of bytes, including the copy area.
 */
static int32_t ir_call_used_stack(ir_ctx *ctx, ir_insn *insn, int *copy_stack_ptr)
{
	int32_t stack_bytes = 0;
	int gp_seen = 0;
	int fp_seen = 0;
	int gp_reg_limit = IR_REG_INT_ARGS;
	int fp_reg_limit = IR_REG_FP_ARGS;
	int idx, last;
#ifdef _WIN64
	int32_t copy_bytes = 0;
#endif

#ifdef IR_HAVE_FASTCALL
	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
		gp_reg_limit = IR_REG_INT_FCARGS;
		fp_reg_limit = IR_REG_FP_FCARGS;
	}
#endif

	last = insn->inputs_count;
	for (idx = 3; idx <= last; idx++) {
		ir_insn *arg = &ctx->ir_base[ir_insn_op(insn, idx)];
		ir_type arg_type = arg->type;

		if (!IR_IS_TYPE_INT(arg_type)) {
			/* floating point argument */
			IR_ASSERT(IR_IS_TYPE_FP(arg_type));
			if (fp_seen >= fp_reg_limit) {
				stack_bytes += IR_MAX(sizeof(void*), ir_type_size[arg_type]);
			}
			fp_seen++;
#ifdef _WIN64
			/* WIN64 calling convention uses a common counter for int and fp registers */
			gp_seen++;
#endif
			continue;
		}

		if (arg->op == IR_ARGVAL) {
			int size = arg->op2;
			int align = arg->op3;

			align = IR_MAX((int)sizeof(void*), align);
#ifdef _WIN64
			/* the aggregate is copied aside; only its address is passed */
			copy_bytes += size;
			copy_bytes = IR_ALIGNED_SIZE(copy_bytes, align);
			arg_type = IR_ADDR;
#else
			/* the aggregate lives directly in the argument area */
			stack_bytes = IR_ALIGNED_SIZE(stack_bytes, align);
			stack_bytes += size;
			stack_bytes = IR_ALIGNED_SIZE(stack_bytes, sizeof(void*));
			continue;
#endif
		}

		/* integer (or address) argument */
		if (gp_seen >= gp_reg_limit) {
			stack_bytes += IR_MAX(sizeof(void*), ir_type_size[arg_type]);
		}
		gp_seen++;
#ifdef _WIN64
		/* WIN64 calling convention uses a common counter for int and fp registers */
		fp_seen++;
#endif
	}

	/* Reserved "home space" or "shadow store" for register arguments (used in Windows64 ABI) */
	stack_bytes += IR_SHADOW_ARGS;

#ifdef _WIN64
	copy_bytes = IR_ALIGNED_SIZE(copy_bytes, 16);
	stack_bytes += copy_bytes;
	*copy_stack_ptr = copy_bytes;
#else
	*copy_stack_ptr = 0;
#endif

	return stack_bytes;
}

/*
 * Emit the code that places the arguments of the CALL/TAILCALL instruction
 * `insn` (operands 3..inputs_count) into argument registers and onto the
 * stack.
 *
 * `tmp_reg` is an integer scratch register (IR_REG_RAX is used when it is
 * IR_REG_NONE); IR_REG_FP_LAST is used as the FP scratch register.
 *
 * The work is done in up to three passes:
 *  1. register-held values that must go to the stack are stored immediately;
 *     register-to-register moves are collected; by-value aggregates
 *     (IR_ARGVAL) are copied with "rep movsb";
 *  2. the collected register-to-register moves are performed as one parallel
 *     copy;
 *  3. constants and memory-resident values are loaded into their argument
 *     registers or stored into their stack slots.
 *
 * Returns the number of stack bytes reserved here that the caller has to
 * release after the call (0 when nothing was reserved).
 */
static int32_t ir_emit_arguments(ir_ctx *ctx, ir_ref def, ir_insn *insn, ir_reg tmp_reg)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	int j, n;
	ir_ref arg;
	ir_insn *arg_insn;
	uint8_t type;
	ir_reg src_reg, dst_reg;
	int int_param = 0;
	int fp_param = 0;
	int count = 0;
	int int_reg_params_count = IR_REG_INT_ARGS;
	int fp_reg_params_count = IR_REG_FP_ARGS;
	const int8_t *int_reg_params = _ir_int_reg_params;
	const int8_t *fp_reg_params = _ir_fp_reg_params;
	int32_t used_stack, copy_stack = 0, stack_offset = IR_SHADOW_ARGS;
	ir_copy *copies;
	bool do_pass3 = 0;
	/* For temporaries we may use any scratch registers except for registers used for parameters */
	ir_reg tmp_fp_reg = IR_REG_FP_LAST; /* Temporary register for FP loads and swap */

	n = insn->inputs_count;
	if (n < 3) {
		/* no arguments at all */
		return 0;
	}

	if (tmp_reg == IR_REG_NONE) {
		tmp_reg = IR_REG_RAX;
	}

#ifdef IR_HAVE_FASTCALL
	if (sizeof(void*) == 4 && ir_is_fastcall(ctx, insn)) {
		int_reg_params_count = IR_REG_INT_FCARGS;
		fp_reg_params_count = IR_REG_FP_FCARGS;
		int_reg_params = _ir_int_fc_reg_params;
		fp_reg_params = _ir_fp_fc_reg_params;
	}
#endif

	/* Reserve the stack area for the outgoing arguments (if necessary) */
	if (insn->op == IR_CALL
	 && (ctx->flags & IR_PREALLOCATED_STACK)
#ifdef IR_HAVE_FASTCALL
	 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
#endif
	) {
		// TODO: support for preallocated stack
#ifdef _WIN64
		used_stack = ir_call_used_stack(ctx, insn, &copy_stack);
#else
		used_stack = 0;
#endif
	} else {
		used_stack = ir_call_used_stack(ctx, insn, &copy_stack);
		if (IR_SHADOW_ARGS
		 && insn->op == IR_TAILCALL
		 && used_stack == IR_SHADOW_ARGS) {
			used_stack = 0;
		}
		if (ctx->fixed_call_stack_size
		 && used_stack <= ctx->fixed_call_stack_size
#ifdef IR_HAVE_FASTCALL
		 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
#endif
		) {
			used_stack = 0;
		} else {
			/* Stack must be 16 byte aligned */
			int32_t aligned_stack = IR_ALIGNED_SIZE(used_stack, 16);
			ctx->call_stack_size += aligned_stack;
			if (aligned_stack) {
				|	sub Ra(IR_REG_RSP), aligned_stack
			}
		}
	}

#ifdef _WIN64
|.if X64
	if (copy_stack) {
		/* Copy struct arguments */
		int copy_stack_offset = 0;

		for (j = 3; j <= n; j++) {
			arg = ir_insn_op(insn, j);
			src_reg = ir_get_alocated_reg(ctx, def, j);
			arg_insn = &ctx->ir_base[arg];
			type = arg_insn->type;

			if (arg_insn->op == IR_ARGVAL) {
				/* make a stack copy */
				int size = arg_insn->op2;
				int align = arg_insn->op3;

				copy_stack_offset += size;
				align = IR_MAX((int)sizeof(void*), align);
				copy_stack_offset = IR_ALIGNED_SIZE(copy_stack_offset, align);
				src_reg = ctx->regs[arg][1];

				|	lea	rdi, [rsp + (used_stack - copy_stack_offset)]
				if (src_reg != IR_REG_NONE) {
					if (IR_REG_SPILLED(src_reg)) {
						src_reg = IR_REG_NUM(src_reg);
						ir_emit_load(ctx, IR_ADDR, src_reg, arg_insn->op1);
					}
					|	mov rsi, Ra(src_reg)
				} else {
					ir_emit_load(ctx, IR_ADDR, IR_REG_RSI, arg_insn->op1);
				}
				ir_emit_load_imm_int(ctx, IR_ADDR, IR_REG_RCX, size);
				|	rep; movsb
			}
		}
	}
|.endif
#endif

	/* 1. move all register arguments that should be passed through stack
	 *    and collect arguments that should be passed through registers */
	copies = ir_mem_malloc((n - 2) * sizeof(ir_copy));
	for (j = 3; j <= n; j++) {
		arg = ir_insn_op(insn, j);
		src_reg = ir_get_alocated_reg(ctx, def, j);
		arg_insn = &ctx->ir_base[arg];
		type = arg_insn->type;
		if (IR_IS_TYPE_INT(type)) {
#ifndef _WIN64
			if (arg_insn->op == IR_ARGVAL) {
				/* copy the aggregate directly into the argument area */
				int size = arg_insn->op2;
				int align = arg_insn->op3;
				align = IR_MAX((int)sizeof(void*), align);
				stack_offset = IR_ALIGNED_SIZE(stack_offset, align);
				if (size) {
					src_reg = ctx->regs[arg][1];
					if (src_reg != IR_REG_NONE) {
						if (IR_REG_SPILLED(src_reg)) {
							src_reg = IR_REG_NUM(src_reg);
							ir_emit_load(ctx, IR_ADDR, src_reg, arg_insn->op1);
						}
						if (src_reg != IR_REG_RSI) {
							|.if X64
							|	mov rsi, Ra(src_reg)
							|.else
							|	mov	esi, Ra(src_reg)
							|.endif
						}
					} else {
						ir_emit_load(ctx, IR_ADDR, IR_REG_RSI, arg_insn->op1);
					}
					if (stack_offset == 0) {
						|.if X64
						|	mov	rdi, rsp
						|.else
						|	mov	edi, esp
						|.endif
					} else {
						|.if X64
						|	lea	rdi, [rsp+stack_offset]
						|.else
						|	lea	edi, [esp+stack_offset]
						|.endif
					}
					|.if X64
					|	mov rcx, size
					|	rep; movsb
					|.else
					|	mov ecx, size
					|	rep; movsb
					|.endif
				}
				stack_offset += size;
				stack_offset = IR_ALIGNED_SIZE(stack_offset, sizeof(void*));
				continue;
			}
#endif
			if (int_param < int_reg_params_count) {
				dst_reg = int_reg_params[int_param];
			} else {
				dst_reg = IR_REG_NONE; /* pass argument through stack */
			}
			int_param++;
#ifdef _WIN64
			/* WIN64 calling convention uses a common counter for int and fp registers */
			fp_param++;
			if (arg_insn->op == IR_ARGVAL) {
				/* the pointer to the stack copy is passed in the third pass */
				do_pass3 = 1;
				continue;
			}
#endif
		} else {
			IR_ASSERT(IR_IS_TYPE_FP(type));
			if (fp_param < fp_reg_params_count) {
				dst_reg = fp_reg_params[fp_param];
			} else {
				dst_reg = IR_REG_NONE; /* pass argument through stack */
			}
			fp_param++;
#ifdef _WIN64
			/* WIN64 calling convention uses a common counter for int and fp registers */
			int_param++;
#endif
		}
		if (dst_reg != IR_REG_NONE) {
			if (IR_IS_CONST_REF(arg) ||
			    src_reg == IR_REG_NONE ||
			    (IR_REG_SPILLED(src_reg) && !IR_REGSET_IN(IR_REGSET_PRESERVED, IR_REG_NUM(src_reg)))) {
				/* delay CONST->REG and MEM->REG moves to third pass */
				do_pass3 = 1;
			} else {
				if (IR_REG_SPILLED(src_reg)) {
					src_reg = IR_REG_NUM(src_reg);
					ir_emit_load(ctx, type, src_reg, arg);
				}
				if (src_reg != dst_reg) {
					/* delay REG->REG moves to second pass */
					copies[count].type = type;
					copies[count].from = src_reg;
					copies[count].to = dst_reg;
					count++;
				}
			}
		} else {
			/* Pass register arguments to stack (REG->MEM moves) */
			if (!IR_IS_CONST_REF(arg) && src_reg != IR_REG_NONE && !IR_REG_SPILLED(src_reg)) {
				ir_emit_store_mem(ctx, type, IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset), src_reg);
			} else {
				do_pass3 = 1;
			}
			stack_offset += IR_MAX(sizeof(void*), ir_type_size[type]);
		}
	}

	/* 2. move all arguments that should be passed from one register to another (REG->REG movs) */
	if (count) {
		ir_parallel_copy(ctx, copies, count, tmp_reg, tmp_fp_reg);
	}
	ir_mem_free(copies);

	/* 3. move the remaining memory and immediate values */
	if (do_pass3) {
#ifdef _WIN64
		int copy_stack_offset = 0;
#endif

		/* restart the argument walk with the same counters as in pass 1 */
		stack_offset = IR_SHADOW_ARGS;
		int_param = 0;
		fp_param = 0;
		for (j = 3; j <= n; j++) {
			arg = ir_insn_op(insn, j);
			src_reg = ir_get_alocated_reg(ctx, def, j);
			arg_insn = &ctx->ir_base[arg];
			type = arg_insn->type;
			if (IR_IS_TYPE_INT(type)) {
				if (arg_insn->op == IR_ARGVAL) {
					int size = arg_insn->op2;
					int align = arg_insn->op3;

#ifndef _WIN64
					/* already copied in pass 1; only advance the offset */
					align = IR_MAX((int)sizeof(void*), align);
					stack_offset = IR_ALIGNED_SIZE(stack_offset, align);
					stack_offset += size;
					stack_offset = IR_ALIGNED_SIZE(stack_offset, sizeof(void*));
					continue;
#else
|.if X64
					/* pass pointer to the copy on stack */
					copy_stack_offset += size;
					align = IR_MAX((int)sizeof(void*), align);
					copy_stack_offset = IR_ALIGNED_SIZE(copy_stack_offset, align);
					if (int_param < int_reg_params_count) {
						dst_reg = int_reg_params[int_param];
						|	lea Ra(dst_reg), [rsp + (used_stack - copy_stack_offset)]
					} else {
						|	lea Ra(tmp_reg), [rsp + (used_stack - copy_stack_offset)]
						ir_emit_store_mem_int(ctx, IR_ADDR, IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset), tmp_reg);
						stack_offset += sizeof(void*);
					}
					int_param++;
					/* WIN64 calling convention uses a common counter for int and fp registers */
					fp_param++;
					continue;
|.endif
#endif
				}
				if (int_param < int_reg_params_count) {
					dst_reg = int_reg_params[int_param];
				} else {
					dst_reg = IR_REG_NONE; /* argument already passed through stack */
				}
				int_param++;
#ifdef _WIN64
				/* WIN64 calling convention uses a common counter for int and fp registers */
				fp_param++;
#endif
			} else {
				IR_ASSERT(IR_IS_TYPE_FP(type));
				if (fp_param < fp_reg_params_count) {
					dst_reg = fp_reg_params[fp_param];
				} else {
					dst_reg = IR_REG_NONE; /* argument already passed through stack */
				}
				fp_param++;
#ifdef _WIN64
				/* WIN64 calling convention uses a common counter for int and fp registers */
				int_param++;
#endif
			}
			if (dst_reg != IR_REG_NONE) {
				/* same condition as in pass 1: only the delayed moves are done here */
				if (IR_IS_CONST_REF(arg) ||
				    src_reg == IR_REG_NONE ||
				    (IR_REG_SPILLED(src_reg) && !IR_REGSET_IN(IR_REGSET_PRESERVED, IR_REG_NUM(src_reg)))) {
					if (IR_IS_TYPE_INT(type)) {
						if (IR_IS_CONST_REF(arg)) {
							/* narrow integer constants are loaded as 32-bit values */
							if (type == IR_I8 || type == IR_I16) {
								type = IR_I32;
							} else if (type == IR_U8 || type == IR_U16) {
								type = IR_U32;
							}
							ir_emit_load(ctx, type, dst_reg, arg);
						} else if (ctx->vregs[arg]) {
							ir_mem mem = ir_ref_spill_slot(ctx, arg);

							/* 8 and 16-bit values are sign/zero extended to 32 bits */
							if (ir_type_size[type] > 2) {
								ir_emit_load_mem_int(ctx, type, dst_reg, mem);
							} else if (ir_type_size[type] == 2) {
								if (type == IR_I16) {
									|	ASM_TXT_TMEM_OP movsx, Rd(dst_reg), word, mem
								} else {
									|	ASM_TXT_TMEM_OP movzx, Rd(dst_reg), word, mem
								}
							} else {
								IR_ASSERT(ir_type_size[type] == 1);
								if (type == IR_I8) {
									|	ASM_TXT_TMEM_OP movsx, Rd(dst_reg), byte, mem
								} else {
									|	ASM_TXT_TMEM_OP movzx, Rd(dst_reg), byte, mem
								}
							}
						} else {
							ir_load_local_addr(ctx, dst_reg, arg);
						}
					} else {
						ir_emit_load(ctx, type, dst_reg, arg);
					}
				}
			} else {
				ir_mem mem = IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset);

				if (IR_IS_TYPE_INT(type)) {
					if (IR_IS_CONST_REF(arg)) {
						ir_emit_store_mem_int_const(ctx, type, mem, arg, tmp_reg, 1);
					} else if (src_reg == IR_REG_NONE) {
						IR_ASSERT(tmp_reg != IR_REG_NONE);
						ir_emit_load(ctx, type, tmp_reg, arg);
						ir_emit_store_mem_int(ctx, type, mem, tmp_reg);
					} else if (IR_REG_SPILLED(src_reg)) {
						src_reg = IR_REG_NUM(src_reg);
						ir_emit_load(ctx, type, src_reg, arg);
						ir_emit_store_mem_int(ctx, type, mem, src_reg);
					}
				} else {
					if (IR_IS_CONST_REF(arg)) {
						ir_emit_store_mem_fp_const(ctx, type, mem, arg, tmp_reg, tmp_fp_reg);
					} else if (src_reg == IR_REG_NONE) {
						IR_ASSERT(tmp_fp_reg != IR_REG_NONE);
						ir_emit_load(ctx, type, tmp_fp_reg, arg);
						/* Store with the argument's own type: the slot is only
						 * max(sizeof(void*), type size) bytes wide, so a
						 * double-sized store of a float could overwrite the
						 * following slot when sizeof(void*) is 4. */
						ir_emit_store_mem_fp(ctx, type, mem, tmp_fp_reg);
					} else if (IR_REG_SPILLED(src_reg)) {
						src_reg = IR_REG_NUM(src_reg);
						ir_emit_load(ctx, type, src_reg, arg);
						ir_emit_store_mem_fp(ctx, type, mem, src_reg);
					}
				}
				stack_offset += IR_MAX(sizeof(void*), ir_type_size[type]);
			}
		}
	}

#ifdef _WIN64
	/* WIN64 calling convention requires duplication of parameters passed in FP register into GP ones */
	if (ir_is_vararg(ctx, insn)) {
		n = IR_MIN(n, IR_MAX_REG_ARGS + 2);
		for (j = 3; j <= n; j++) {
			arg = ir_insn_op(insn, j);
			arg_insn = &ctx->ir_base[arg];
			type = arg_insn->type;
			if (IR_IS_TYPE_FP(type)) {
				src_reg = fp_reg_params[j-3];
				dst_reg = int_reg_params[j-3];
|.if X64
				if (ctx->mflags & IR_X86_AVX) {
					|	vmovd Rq(dst_reg), xmm(src_reg-IR_REG_FP_FIRST)
				} else {
					|	movd Rq(dst_reg), xmm(src_reg-IR_REG_FP_FIRST)
				}
|.endif
			}
		}
	}
	if (insn->op == IR_CALL && (ctx->flags & IR_PREALLOCATED_STACK)) {
		used_stack = 0;
	}
#endif
#ifdef IR_REG_VARARG_FP_REGS
	/* set hidden argument to specify the number of vector registers used */
	if (ir_is_vararg(ctx, insn)) {
		fp_param = IR_MIN(fp_param, fp_reg_params_count);
		|	mov Rd(IR_REG_VARARG_FP_REGS), fp_param
	}
#endif

	return used_stack;
}

/*
 * Emit the call instruction itself, release the argument stack area and move
 * the returned value to the place allocated for `def`.
 *
 * The call target (insn->op2) may be a constant address, a register, a fused
 * memory load or a spill slot.  `used_stack` is the number of argument bytes
 * previously reserved on the stack (0 if nothing has to be released).
 */
static void ir_emit_call_ex(ir_ctx *ctx, ir_ref def, ir_insn *insn, int32_t used_stack)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg;

	if (IR_IS_CONST_REF(insn->op2)) {
		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);

		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
			|	call aword &addr
		} else {
			/* the target cannot be encoded in the call instruction:
			 * load its address into a register and call through it */
|.if X64
||			ir_reg tmp_reg = IR_REG_RAX;

#ifdef IR_REG_VARARG_FP_REGS
			/* for vararg calls R11 is used instead of RAX */
||			if (ir_is_vararg(ctx, insn)) {
||				tmp_reg = IR_REG_R11;
||			}
#endif
||			if (IR_IS_SIGNED_32BIT(addr)) {
				|	mov Rq(tmp_reg), ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
||			} else {
				|	mov64 Rq(tmp_reg), ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
||			}
			|	call Rq(tmp_reg)
|.endif
		}
    } else {
		ir_reg op2_reg = ctx->regs[def][2];

		if (op2_reg != IR_REG_NONE) {
			/* indirect call through a register */
			if (IR_REG_SPILLED(op2_reg)) {
				op2_reg = IR_REG_NUM(op2_reg);
				ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
			}
			|	call Ra(op2_reg)
		} else {
			/* indirect call through memory (fused load or spill slot) */
			ir_mem mem;

			if (ir_rule(ctx, insn->op2) & IR_FUSED) {
				mem = ir_fuse_load(ctx, def, insn->op2);
			} else {
				mem = ir_ref_spill_slot(ctx, insn->op2);
			}

			|	ASM_TMEM_OP call, aword, mem
		}
    }

	if (used_stack) {
		/* release the 16-byte aligned argument area */
		int32_t aligned_stack = IR_ALIGNED_SIZE(used_stack, 16);

		ctx->call_stack_size -= aligned_stack;
		if (ir_is_fastcall(ctx, insn)) {
			/* only the alignment padding is released here */
			aligned_stack -= used_stack;
			if (aligned_stack) {
				|	add Ra(IR_REG_RSP), aligned_stack
			}
		} else {
			|	add Ra(IR_REG_RSP), aligned_stack
		}
	}

	if (insn->type != IR_VOID) {
		if (IR_IS_TYPE_INT(insn->type)) {
			def_reg = IR_REG_NUM(ctx->regs[def][0]);
			if (def_reg != IR_REG_NONE) {
				/* move the result out of the integer return register */
				if (def_reg != IR_REG_INT_RET1) {
					ir_emit_mov(ctx, insn->type, def_reg, IR_REG_INT_RET1);
				}
				if (IR_REG_SPILLED(ctx->regs[def][0])) {
					ir_emit_store(ctx, insn->type, def, def_reg);
				}
			} else if (ctx->use_lists[def].count > 1) {
				/* no register allocated: store the result into memory */
				ir_emit_store(ctx, insn->type, def, IR_REG_INT_RET1);
			}
		} else {
			IR_ASSERT(IR_IS_TYPE_FP(insn->type));
			def_reg = IR_REG_NUM(ctx->regs[def][0]);
#ifdef IR_REG_FP_RET1
			if (def_reg != IR_REG_NONE) {
				if (def_reg != IR_REG_FP_RET1) {
					ir_emit_fp_mov(ctx, insn->type, def_reg, IR_REG_FP_RET1);
				}
				if (IR_REG_SPILLED(ctx->regs[def][0])) {
					ir_emit_store(ctx, insn->type, def, def_reg);
				}
			} else if (ctx->use_lists[def].count > 1) {
				ir_emit_store(ctx, insn->type, def, IR_REG_FP_RET1);
			}
#else
			/* No FP return register: the result is popped with "fstp".
			 * NOTE(review): nothing is emitted when use_lists[def].count <= 1,
			 * so the returned value is presumably left on the x87 stack in
			 * that case - confirm that this is intended. */
			if (ctx->use_lists[def].count > 1) {
				int32_t offset;
				ir_reg fp;

				if (def_reg == IR_REG_NONE) {
					/* pop the result directly into the spill slot of `def` */
					offset = ir_ref_spill_slot_offset(ctx, def, &fp);
					if (insn->type == IR_DOUBLE) {
						|	fstp qword [Ra(fp)+offset]
					} else {
						IR_ASSERT(insn->type == IR_FLOAT);
						|	fstp dword [Ra(fp)+offset]
					}
				} else {
					/* pop the result into the ctx->ret_slot stack slot and
					 * reload it into the allocated register */
					offset = ctx->ret_slot;
					IR_ASSERT(offset != -1);
					offset = IR_SPILL_POS_TO_OFFSET(offset);
					fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;
					if (insn->type == IR_DOUBLE) {
						|	fstp qword [Ra(fp)+offset]
					} else {
						IR_ASSERT(insn->type == IR_FLOAT);
						|	fstp dword [Ra(fp)+offset]
					}
					ir_emit_load_mem_fp(ctx, insn->type, def_reg, IR_MEM_BO(fp, offset));
					if (IR_REG_SPILLED(ctx->regs[def][0])) {
						ir_emit_store(ctx, insn->type, def, def_reg);
					}
				}
			}
#endif
		}
	}
}

/*
 * Emit a complete CALL: first place the arguments (using regs[def][1] as the
 * scratch register), then emit the call and the handling of its result.
 */
static void ir_emit_call(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_reg scratch_reg = ctx->regs[def][1];
	int32_t stack_bytes;

	stack_bytes = ir_emit_arguments(ctx, def, insn, scratch_reg);
	ir_emit_call_ex(ctx, def, insn, stack_bytes);
}

/*
 * Emit a TAILCALL.
 *
 * When the arguments need stack space (ir_emit_arguments() returns non-zero)
 * the tail call is emitted as a regular call followed by a void return.
 * Otherwise the function epilogue is emitted and control is transferred to
 * the target with a jump.
 *
 * The epilogue restores the preserved registers and the stack/frame pointer,
 * so a target address that lives in (or is addressed through) one of these
 * registers is moved into RAX before the epilogue is emitted.
 */
static void ir_emit_tailcall(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	int32_t used_stack = ir_emit_arguments(ctx, def, insn, ctx->regs[def][1]);

	if (used_stack != 0) {
		ir_emit_call_ex(ctx, def, insn, used_stack);
		ir_emit_return_void(ctx);
		return;
	}

	/* Move op2 to a tmp register before epilogue if it's in
	 * used_preserved_regs, because it will be overridden. */

	ir_reg op2_reg = IR_REG_NONE;
	ir_mem mem = IR_MEM_B(IR_REG_NONE);
	if (!IR_IS_CONST_REF(insn->op2)) {
		op2_reg = ctx->regs[def][2];

		/* registers whose value changes in the epilogue */
		ir_regset preserved_regs = (ir_regset)ctx->used_preserved_regs | IR_REGSET(IR_REG_STACK_POINTER);
		if (ctx->flags & IR_USE_FRAME_POINTER) {
			preserved_regs |= IR_REGSET(IR_REG_FRAME_POINTER);
		}

		bool is_spill_slot = op2_reg != IR_REG_NONE
			&& IR_REG_SPILLED(op2_reg)
			&& ctx->vregs[insn->op2];

		if (op2_reg != IR_REG_NONE && !is_spill_slot) {
			if (IR_REGSET_IN(preserved_regs, IR_REG_NUM(op2_reg))) {
				ir_reg orig_op2_reg = op2_reg;
				op2_reg = IR_REG_RAX;

				if (IR_REG_SPILLED(orig_op2_reg)) {
					ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
				} else {
					ir_type type = ctx->ir_base[insn->op2].type;
					| ASM_REG_REG_OP mov, type, op2_reg, IR_REG_NUM(orig_op2_reg)
				}
			} else if (IR_REG_SPILLED(op2_reg)) {
				/* The register does not hold the value yet: materialize it
				 * now, before the epilogue (the same way as it is done for
				 * the preserved register case above). */
				op2_reg = IR_REG_NUM(op2_reg);
				ir_emit_load(ctx, IR_ADDR, op2_reg, insn->op2);
			} else {
				op2_reg = IR_REG_NUM(op2_reg);
			}
		} else {
			/* the target address is in memory (fused load or spill slot) */
			if (ir_rule(ctx, insn->op2) & IR_FUSED) {
				IR_ASSERT(op2_reg == IR_REG_NONE);
				mem = ir_fuse_load(ctx, def, insn->op2);
			} else {
				mem = ir_ref_spill_slot(ctx, insn->op2);
			}
			ir_reg base = IR_MEM_BASE(mem);
			ir_reg index = IR_MEM_INDEX(mem);
			if ((base != IR_REG_NONE && IR_REGSET_IN(preserved_regs, base)) ||
					(index != IR_REG_NONE && IR_REGSET_IN(preserved_regs, index))) {
				/* the address expression uses a register that is changed by
				 * the epilogue: load the target into RAX first */
				op2_reg = IR_REG_RAX;

				ir_type type = ctx->ir_base[insn->op2].type;
				ir_emit_load_mem_int(ctx, type, op2_reg, mem);
			} else {
				/* jump through memory after the epilogue */
				op2_reg = IR_REG_NONE;
			}
		}
	}

	ir_emit_epilogue(ctx);

	if (IR_IS_CONST_REF(insn->op2)) {
		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);

		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
			|	jmp aword &addr
		} else {
|.if X64
||			ir_reg tmp_reg = IR_REG_RAX;

#ifdef IR_REG_VARARG_FP_REGS
			/* for vararg calls R11 is used instead of RAX */
||			if (ir_is_vararg(ctx, insn)) {
||				tmp_reg = IR_REG_R11;
||			}
#endif
||			if (IR_IS_SIGNED_32BIT(addr)) {
				|	mov Rq(tmp_reg), ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
||			} else {
				|	mov64 Rq(tmp_reg), ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
||			}
			|	jmp Rq(tmp_reg)
|.endif
		}
    } else {
		if (op2_reg != IR_REG_NONE) {
			IR_ASSERT(!IR_REGSET_IN((ir_regset)ctx->used_preserved_regs, op2_reg));
			|	jmp Ra(op2_reg)
		} else {
			|	ASM_TMEM_OP jmp, aword, mem
		}
    }
}

/*
 * Emit an indirect jump (IJMP) to the address given by insn->op2.
 *
 * The target is, in order of precedence: a constant address, a fused memory
 * load, a register (loaded first when it is marked as spilled) or the spill
 * slot of the operand.
 */
static void ir_emit_ijmp(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg target_reg = ctx->regs[def][2];
	ir_mem mem;

	if (IR_IS_CONST_REF(insn->op2)) {
		void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op2]);

		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
			|	jmp aword &addr
		} else {
			/* the address has to be loaded into RAX first */
|.if X64
			if (IR_IS_SIGNED_32BIT(addr)) {
				|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
			} else {
				|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
			}
			|	jmp rax
|.endif
		}
		return;
	}

	if (ir_rule(ctx, insn->op2) & IR_FUSED) {
		mem = ir_fuse_load(ctx, def, insn->op2);
	} else if (target_reg != IR_REG_NONE) {
		/* jump through a register */
		if (IR_REG_SPILLED(target_reg)) {
			target_reg = IR_REG_NUM(target_reg);
			ir_emit_load(ctx, IR_ADDR, target_reg, insn->op2);
		}
		|	jmp Ra(target_reg)
		return;
	} else {
		mem = ir_ref_spill_slot(ctx, insn->op2);
	}

	/* jump through memory (fused load or spill slot) */
	|	ASM_TMEM_OP jmp, aword, mem
}

/*
 * Emit the conditional jump(s) of a guard, after the flags have already been
 * set by a preceding compare/test.
 *
 * `op` is the condition under which control must be transferred to the side
 * exit `addr`; `int_cmp` selects the integer or the floating point jump
 * sequences; `after_op` selects the sign-flag based jumps (js/jns) for
 * IR_LT/IR_GE instead of jl/jge.
 *
 * Returns 1 when the emitted code also covers the instruction that follows
 * the guard (def + 1), i.e. the block ends with an unconditional jump to
 * `addr`; returns 0 when only the conditional jump to `addr` was emitted.
 *
 * NOTE(review): for IR_LT/IR_GE/IR_LE/IR_GT the floating point sequences of
 * the two combined paths and of the plain fallback at the end differ in how
 * the PF=1 case is routed (e.g. IR_LT: "jae target; jmp addr" versus
 * "jp >1; jb addr").  Assuming the flags come from a ucomiss/ucomisd style
 * compare, the unordered case reaches `addr` in the former and not in the
 * latter - confirm which one is intended.
 */
static bool ir_emit_guard_jcc(ir_ctx *ctx, uint32_t b, ir_ref def, uint32_t next_block, uint8_t op, void *addr, bool int_cmp, bool after_op)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_insn *next_insn = &ctx->ir_base[def + 1];

	if (next_insn->op == IR_END || next_insn->op == IR_LOOP_END) {
		/* The guard is the last real instruction of the block: combine it
		 * with the jump to the successor. */
		ir_block *bb = &ctx->cfg_blocks[b];
		uint32_t target;

		if (!(bb->flags & IR_BB_DESSA_MOVES)) {
			target = ctx->cfg_edges[bb->successors];
			if (UNEXPECTED(bb->successors_count == 2)) {
				/* one of the two successors must be an ENTRY block; use the other one */
				if (ctx->cfg_blocks[target].flags & IR_BB_ENTRY) {
					target = ctx->cfg_edges[bb->successors + 1];
				} else {
					IR_ASSERT(ctx->cfg_blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
				}
			} else {
				IR_ASSERT(bb->successors_count == 1);
			}
			target = ir_skip_empty_target_blocks(ctx, target);
			if (target != next_block) {
				/* jump to the successor on the inverted condition and
				 * fall into an unconditional jump to the side exit */
				if (int_cmp) {
					switch (op) {
						default:
							IR_ASSERT(0 && "NIY binary op");
						case IR_EQ:
							|	jne =>target
							break;
						case IR_NE:
							|	je =>target
							break;
						case IR_LT:
							if (after_op) {
								|	jns =>target
							} else {
								|	jge =>target
							}
							break;
						case IR_GE:
							if (after_op) {
								|	js =>target
							} else {
								|	jl =>target
							}
							break;
						case IR_LE:
							|	jg =>target
							break;
						case IR_GT:
							|	jle =>target
							break;
						case IR_ULT:
							|	jae =>target
							break;
						case IR_UGE:
							|	jb =>target
							break;
						case IR_ULE:
							|	ja =>target
							break;
						case IR_UGT:
							|	jbe =>target
							break;
					}
				} else {
					switch (op) {
						default:
							IR_ASSERT(0 && "NIY binary op");
						case IR_EQ:
							|	jne =>target
							|	jp =>target
							break;
						case IR_NE:
							|	jp &addr
							|	je =>target
							break;
						case IR_LT:
							|	jae =>target
							break;
						case IR_GE:
							|	jp &addr
							|	jb =>target
							break;
						case IR_LE:
							|	ja =>target
							break;
						case IR_GT:
							|	jp &addr
							|	jbe =>target
							break;
						case IR_ORDERED:
							|	jnp =>target
							break;
						case IR_UNORDERED:
							|	jp =>target
							break;
					}
				}
				|	jmp &addr
				return 1;
			}
		}
	} else if (next_insn->op == IR_IJMP && IR_IS_CONST_REF(next_insn->op2)) {
		/* The guard is followed by a jump to a constant address: combine
		 * both, using the same sequences as above with that address as the
		 * target. */
		void *target_addr = ir_jmp_addr(ctx, next_insn, &ctx->ir_base[next_insn->op2]);

		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, target_addr)) {
			if (int_cmp) {
				switch (op) {
					default:
						IR_ASSERT(0 && "NIY binary op");
					case IR_EQ:
						|	jne &target_addr
						break;
					case IR_NE:
						|	je &target_addr
						break;
					case IR_LT:
						if (after_op) {
							|	jns &target_addr
						} else {
							|	jge &target_addr
						}
						break;
					case IR_GE:
						if (after_op) {
							|	js &target_addr
						} else {
							|	jl &target_addr
						}
						break;
					case IR_LE:
						|	jg &target_addr
						break;
					case IR_GT:
						|	jle &target_addr
						break;
					case IR_ULT:
						|	jae &target_addr
						break;
					case IR_UGE:
						|	jb &target_addr
						break;
					case IR_ULE:
						|	ja &target_addr
						break;
					case IR_UGT:
						|	jbe &target_addr
						break;
				}
			} else {
				switch (op) {
					default:
						IR_ASSERT(0 && "NIY binary op");
					case IR_EQ:
						|	jne &target_addr
						|	jp &target_addr
						break;
					case IR_NE:
						|	jp &addr
						|	je &target_addr
						break;
					case IR_LT:
						|	jae &target_addr
						break;
					case IR_GE:
						|	jp &addr
						|	jb &target_addr
						break;
					case IR_LE:
						|	ja &target_addr
						break;
					case IR_GT:
						|	jp &addr
						|	jbe &target_addr
						break;
					case IR_ORDERED:
						|	jnp &target_addr
						break;
					case IR_UNORDERED:
						|	jp &target_addr
						break;
				}
			}
			|	jmp &addr
			return 1;
		}
	}

	/* Fallback: only the conditional jump to the side exit is emitted */
	if (int_cmp) {
		switch (op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_EQ:
				|	je &addr
				break;
			case IR_NE:
				|	jne &addr
				break;
			case IR_LT:
				if (after_op) {
					|	js &addr
				} else {
					|	jl &addr
				}
				break;
			case IR_GE:
				if (after_op) {
					|	jns &addr
				} else {
					|	jge &addr
				}
				break;
			case IR_LE:
				|	jle &addr
				break;
			case IR_GT:
				|	jg &addr
				break;
			case IR_ULT:
				|	jb &addr
				break;
			case IR_UGE:
				|	jae &addr
				break;
			case IR_ULE:
				|	jbe &addr
				break;
			case IR_UGT:
				|	ja &addr
				break;
		}
	} else {
		/* floating point: "jp >1" skips the following jump when PF is set */
		switch (op) {
			default:
				IR_ASSERT(0 && "NIY binary op");
			case IR_EQ:
				|	jp >1
				|	je &addr
				|1:
				break;
			case IR_NE:
				|	jne &addr
				|	jp &addr
				break;
			case IR_LT:
				|	jp >1
				|	jb &addr
				|1:
				break;
			case IR_GE:
				|	jae &addr
				break;
			case IR_LE:
				|	jp >1
				|	jbe &addr
				|1:
				break;
			case IR_GT:
				|	ja &addr
				break;
			case IR_ORDERED:
				|	jp &addr
				break;
			case IR_UNORDERED:
				|	jnp &addr
				break;
//			case IR_ULT: fprintf(stderr, "\tjb .LL%d\n", true_block); break;
//			case IR_UGE: fprintf(stderr, "\tjae .LL%d\n", true_block); break;
//			case IR_ULE: fprintf(stderr, "\tjbe .LL%d\n", true_block); break;
//			case IR_UGT: fprintf(stderr, "\tja .LL%d\n", true_block); break;
		}
	}
	return 0;
}

/*
 * Emits a GUARD / GUARD_NOT whose condition (insn->op2) is a plain integer value.
 *
 * - Constant condition: an unconditional jump to the exit address is emitted
 *   when the guard is statically known to fail (GUARD of a false value or
 *   GUARD_NOT of a true value); otherwise nothing is emitted.
 * - Non-constant condition: the value is compared against zero ("test reg, reg"
 *   or "cmp mem, 0") and a conditional side exit follows.
 *
 * The exit address comes from ir_jmp_addr(). When it can't be encoded as a
 * direct jump (64-bit target and IR_MAY_USE_32BIT_ADDR() is false), it is
 * loaded into rax and reached through a register-indirect "jmp rax".
 * Note that this fallback clobbers rax.
 *
 * Returns the result of ir_emit_guard_jcc() when the conditional jump is
 * delegated to it, and 0 in all other cases.
 */
static bool ir_emit_guard(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg op2_reg = ctx->regs[def][2];
	ir_type type = ctx->ir_base[insn->op2].type;
	void *addr;

	IR_ASSERT(IR_IS_TYPE_INT(type));
	if (IR_IS_CONST_REF(insn->op2)) {
		bool is_true = ir_ref_is_true(ctx, insn->op2);

		/* The guard outcome is known at compile time: emit either an
		 * unconditional exit or nothing at all. */
		if ((insn->op == IR_GUARD && !is_true) || (insn->op == IR_GUARD_NOT && is_true)) {
			addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
			if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
				|	jmp aword &addr
			} else {
|.if X64
				if (IR_IS_SIGNED_32BIT(addr)) {
					|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
				} else {
					|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
				}
				/* rax holds the target itself, so jump to the register value
				 * (a memory operand would load the target from *addr). */
				|	jmp rax
|.endif
			}
		}
		return 0;
	}

	/* Set the flags according to "value == 0". */
	if (op2_reg != IR_REG_NONE) {
		if (IR_REG_SPILLED(op2_reg)) {
			op2_reg = IR_REG_NUM(op2_reg);
			ir_emit_load(ctx, type, op2_reg, insn->op2);
		}
		|	ASM_REG_REG_OP test, type, op2_reg, op2_reg
	} else {
		ir_mem mem;

		if (ir_rule(ctx, insn->op2) & IR_FUSED) {
			mem = ir_fuse_load(ctx, def, insn->op2);
		} else {
			mem = ir_ref_spill_slot(ctx, insn->op2);
		}
		|	ASM_MEM_IMM_OP cmp, type, mem, 0
	}

	addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
	if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
		ir_op op;

		/* GUARD exits when the value is zero, GUARD_NOT when it is non-zero. */
		if (insn->op == IR_GUARD) {
			op = IR_EQ;
		} else {
			op = IR_NE;
		}
		return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 1, 0);
	} else {
|.if X64
		/* Far exit: branch to a stub in the cold section that performs the
		 * indirect jump through rax. */
		if (insn->op == IR_GUARD) {
			|	je >1
		} else {
			|	jne >1
		}
		|.cold_code
		|1:
		if (IR_IS_SIGNED_32BIT(addr)) {
			|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
		} else {
			|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
		}
		|	jmp rax
		|.code
|.endif
		return 0;
	}
}

/*
 * Emits a GUARD / GUARD_NOT fused with the integer comparison that produces
 * its condition (insn->op2 refers to the comparison instruction).
 *
 * Operands of the comparison are reloaded from their spill slots when needed,
 * the comparison is emitted by ir_emit_cmp_int_common() and the conditional
 * side exit by ir_emit_guard_jcc().
 *
 * Unsigned comparisons against the constant 0 are simplified first:
 *   x <u  0  is always false, x >=u 0 is always true (resolved statically),
 *   x <=u 0  becomes x == 0,  x >u  0 becomes x != 0.
 *
 * Returns the result of ir_emit_guard_jcc(), or 0 when the guard was
 * resolved statically.
 */
static bool ir_emit_guard_cmp_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_insn *cmp_insn = &ctx->ir_base[insn->op2];
	ir_op op = cmp_insn->op;
	ir_type type = ctx->ir_base[cmp_insn->op1].type;
	ir_ref op1 = cmp_insn->op1;
	ir_ref op2 = cmp_insn->op2;
	ir_reg op1_reg = ctx->regs[insn->op2][1];
	ir_reg op2_reg = ctx->regs[insn->op2][2];
	void *addr;

	if (op1_reg != IR_REG_NONE && IR_REG_SPILLED(op1_reg)) {
		op1_reg = IR_REG_NUM(op1_reg);
		ir_emit_load(ctx, type, op1_reg, op1);
	}
	if (op2_reg != IR_REG_NONE && IR_REG_SPILLED(op2_reg)) {
		op2_reg = IR_REG_NUM(op2_reg);
		/* When both operands are the same value, the load emitted for op1 is reused. */
		if (op1 != op2) {
			ir_emit_load(ctx, type, op2_reg, op2);
		}
	}

	addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
	if (IR_IS_CONST_REF(op2) && !IR_IS_SYM_CONST(ctx->ir_base[op2].op) && ctx->ir_base[op2].val.u64 == 0) {
		if (op == IR_ULT || op == IR_UGE) {
			/* "x <u 0" is always false and "x >=u 0" is always true.
			 * GUARD leaves through the exit when its condition is false,
			 * GUARD_NOT when it is true. */
			bool cond_is_true = (op == IR_UGE);
			bool always_exits = (insn->op == IR_GUARD) ? !cond_is_true : cond_is_true;

			if (always_exits) {
				if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
					|	jmp aword &addr
				} else {
|.if X64
					if (IR_IS_SIGNED_32BIT(addr)) {
						|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
					} else {
						|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
					}
					/* rax holds the target itself, so jump to the register value. */
					|	jmp rax
|.endif
				}
			}
			return 0;
		} else if (op == IR_ULE) {
			op = IR_EQ;
		} else if (op == IR_UGT) {
			op = IR_NE;
		}
	}
	ir_emit_cmp_int_common(ctx, type, def, cmp_insn, op1_reg, op1, op2_reg, op2);

	/* ir_emit_guard_jcc() jumps to the exit when "op" holds; GUARD must exit
	 * when the comparison fails, so its condition is inverted. */
	if (insn->op == IR_GUARD) {
		op ^= 1; // reverse
	}

	return ir_emit_guard_jcc(ctx, b, def, next_block, op, addr, 1, 0);
}

/*
 * Emits a GUARD / GUARD_NOT fused with the floating point comparison that
 * produces its condition. The comparison is emitted first, then the
 * conditional side exit. Returns the result of ir_emit_guard_jcc().
 */
static bool ir_emit_guard_cmp_fp(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
{
	ir_insn *cmp_insn = &ctx->ir_base[insn->op2];
	ir_op cond = ir_emit_cmp_fp_common(ctx, def, insn->op2, cmp_insn);
	void *exit_addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);

	/* GUARD exits when the comparison fails: flip the condition. */
	if (insn->op == IR_GUARD) {
		cond ^= 1;
	}

	return ir_emit_guard_jcc(ctx, b, def, next_block, cond, exit_addr, 0, 0);
}

/*
 * Emits a GUARD / GUARD_NOT fused with an integer bit test. GUARD uses the
 * IR_EQ condition, GUARD_NOT uses IR_NE; the same condition is passed both to
 * the test emitter and to ir_emit_guard_jcc(), whose result is returned.
 */
static bool ir_emit_guard_test_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
{
	void *exit_addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
	ir_op cond;

	if (insn->op == IR_GUARD) {
		cond = IR_EQ;
	} else {
		cond = IR_NE;
	}

	ir_emit_test_int_common(ctx, def, insn->op2, cond);

	return ir_emit_guard_jcc(ctx, b, def, next_block, cond, exit_addr, 1, 0);
}

/*
 * Emits only the conditional side exit of a GUARD / GUARD_NOT; no comparison
 * is emitted here. The condition is the opcode of the instruction referenced
 * by insn->op2, inverted for GUARD. ir_emit_guard_jcc() is called with its
 * last argument set to 1. Returns the result of ir_emit_guard_jcc().
 */
static bool ir_emit_guard_jcc_int(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn, uint32_t next_block)
{
	void *exit_addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
	ir_insn *cond_insn = &ctx->ir_base[insn->op2];
	ir_op cond = cond_insn->op;

	if (insn->op == IR_GUARD) {
		/* GUARD exits when the condition does not hold. */
		cond ^= 1;
	}

	return ir_emit_guard_jcc(ctx, b, def, next_block, cond, exit_addr, 1, 1);
}

/*
 * Emits the side exit of a GUARD / GUARD_NOT on an overflow condition.
 *
 * The type is taken from the first operand of the instruction referenced by
 * insn->op2. Signed types branch on the overflow flag (jo/jno), unsigned types
 * on the carry flag (jc/jnc). GUARD exits when the flag is clear, GUARD_NOT
 * when it is set. Always returns 0.
 */
static bool ir_emit_guard_overflow(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	void *addr = ir_jmp_addr(ctx, insn, &ctx->ir_base[insn->op3]);
	ir_insn *ov_insn = &ctx->ir_base[insn->op2];
	ir_type type = ctx->ir_base[ov_insn->op1].type;

	IR_ASSERT(IR_IS_TYPE_INT(type));
	if (insn->op == IR_GUARD) {
		/* exit when no overflow happened */
		if (IR_IS_TYPE_SIGNED(type)) {
			|	jno &addr
		} else {
			|	jnc &addr
		}
	} else {
		/* exit when overflow happened */
		if (IR_IS_TYPE_SIGNED(type)) {
			|	jo &addr
		} else {
			|	jc &addr
		}
	}
	return 0;
}

/*
 * Emits the address computation fused into "def" (see ir_fuse_addr()) into
 * the register allocated for "def".
 *
 * When the destination register is already one of the address components and
 * there is no displacement, a two-operand ADD is used instead of LEA:
 *   [def + index]   -> add def, index
 *   [base + def]    -> add def, base
 *   [def * 2]       -> add def, def
 * In the remaining cases LEA is emitted; an "[index * 2 + offset]" address
 * without base is rewritten as "[index + index + offset]" first.
 *
 * 4-byte types use 32-bit registers, everything else uses address-sized
 * registers. The result is stored to the spill slot if "def" is spilled.
 */
static void ir_emit_lea(ir_ctx *ctx, ir_ref def, ir_type type)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_mem mem = ir_fuse_addr(ctx, def, def);
	ir_reg base = IR_MEM_BASE(mem);
	ir_reg index = IR_MEM_INDEX(mem);
	int32_t offset = IR_MEM_OFFSET(mem);
	int32_t scale = IR_MEM_SCALE(mem);
	ir_reg add_reg = IR_REG_NONE; /* second operand of ADD, or IR_REG_NONE to use LEA */

	IR_ASSERT(def_reg != IR_REG_NONE);

	if (offset == 0) {
		if (scale == 1) {
			if (base == def_reg && index != IR_REG_NONE) {
				add_reg = index;
			} else if (index == def_reg && base != IR_REG_NONE) {
				add_reg = base;
			}
		} else if (scale == 2 && index == def_reg && base == IR_REG_NONE) {
			add_reg = def_reg;
		}
	}

	if (add_reg == IR_REG_NONE && scale == 2 && base == IR_REG_NONE) {
		/* reg*2 => reg+reg */
		mem = IR_MEM(index, offset, index, 1);
	}

	if (ir_type_size[type] == 4) {
		if (add_reg != IR_REG_NONE) {
			|	add Rd(def_reg), Rd(add_reg)
		} else {
			|	ASM_TXT_TMEM_OP lea, Rd(def_reg), dword, mem
		}
	} else {
		if (add_reg != IR_REG_NONE) {
			|	add Ra(def_reg), Ra(add_reg)
		} else {
			|	ASM_TXT_TMEM_OP lea, Ra(def_reg), aword, mem
		}
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, def_reg);
	}
}

/*
 * Emits a load of a thread-local address into the register allocated for "def".
 *
 * The sequence is selected at DynASM preprocessing time by target:
 *   - Windows (64/32-bit): a gs:/fs:-relative load from a fixed offset
 *     (0x58 / 0x2c), followed by two dependent loads using insn->op2 and
 *     insn->op3 as displacements.
 *   - Other targets: when insn->op3 is IR_NULL a single segment-relative load
 *     from offset insn->op2; otherwise a chain of loads using op2 and op3 as
 *     displacements (on non-Apple targets the chain starts from the
 *     segment-relative slot at offset 0x8 / 0x4).
 * NOTE(review): the fixed offsets presumably address the platform's thread
 * control block fields (TLS array / DTV pointer) - confirm against the ABI.
 *
 * Nothing is emitted when "def" has exactly one entry in its use list (treated
 * as a dead load). The result is stored to the spill slot if "def" is spilled.
 */
static void ir_emit_tls(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg reg = IR_REG_NUM(ctx->regs[def][0]);

	if (ctx->use_lists[def].count == 1) {
		/* dead load */
		return;
	}

	/* The segment override prefix (gs/fs) is emitted separately and applies to
	 * the first load that follows it. */
|.if X64WIN
|	gs
|	mov Ra(reg), aword [0x58]
|	mov Ra(reg), aword [Ra(reg)+insn->op2]
|	mov Ra(reg), aword [Ra(reg)+insn->op3]
|.elif WIN
|	fs
|	mov Ra(reg), aword [0x2c]
|	mov Ra(reg), aword [Ra(reg)+insn->op2]
|	mov Ra(reg), aword [Ra(reg)+insn->op3]
|.elif X64APPLE
|	gs
||	if (insn->op3 == IR_NULL) {
|		mov Ra(reg), aword [insn->op2]
||	} else {
|		mov Ra(reg), aword [insn->op2]
|		mov Ra(reg), aword [Ra(reg)+insn->op3]
||	}
|.elif X64
|	fs
||	if (insn->op3 == IR_NULL) {
|		mov Ra(reg), aword [insn->op2]
||	} else {
|		mov Ra(reg), [0x8]
|		mov Ra(reg), aword [Ra(reg)+insn->op2]
|		mov Ra(reg), aword [Ra(reg)+insn->op3]
||	}
|.else
|	gs
||	if (insn->op3 == IR_NULL) {
|		mov Ra(reg), aword [insn->op2]
||	} else {
|		mov Ra(reg), [0x4]
|		mov Ra(reg), aword [Ra(reg)+insn->op2]
|		mov Ra(reg), aword [Ra(reg)+insn->op3]
||	}
|	.endif
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, IR_ADDR, def, reg);
	}
}

/*
 * Emits a scalar SSE square root of operand 3 into the register allocated for
 * "def". The operand is reloaded first if it is spilled, and the result is
 * stored to the spill slot if "def" is spilled.
 */
static void ir_emit_sse_sqrt(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_reg dst_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg src_reg = ctx->regs[def][3];

	IR_ASSERT(IR_IS_TYPE_FP(type));
	IR_ASSERT(dst_reg != IR_REG_NONE && src_reg != IR_REG_NONE);

	if (IR_REG_SPILLED(src_reg)) {
		/* bring the operand back from its spill slot */
		src_reg = IR_REG_NUM(src_reg);
		ir_emit_load(ctx, type, src_reg, insn->op3);
	}

	|	ASM_FP_REG_REG_OP sqrts, type, dst_reg, src_reg

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, dst_reg);
	}
}

/*
 * Emits a scalar SSE rounding of operand 3 into the register allocated for
 * "def", using "round_op" as the immediate of the rounding instruction.
 * The three-operand AVX form is used when IR_X86_AVX is set in ctx->mflags.
 * The operand is reloaded first if it is spilled, and the result is stored to
 * the spill slot if "def" is spilled.
 */
static void ir_emit_sse_round(ir_ctx *ctx, ir_ref def, ir_insn *insn, int round_op)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_type type = insn->type;
	ir_reg dst_reg = IR_REG_NUM(ctx->regs[def][0]);
	ir_reg src_reg = ctx->regs[def][3];

	IR_ASSERT(IR_IS_TYPE_FP(type));
	IR_ASSERT(dst_reg != IR_REG_NONE && src_reg != IR_REG_NONE);

	if (IR_REG_SPILLED(src_reg)) {
		/* bring the operand back from its spill slot */
		src_reg = IR_REG_NUM(src_reg);
		ir_emit_load(ctx, type, src_reg, insn->op3);
	}

	if (!(ctx->mflags & IR_X86_AVX)) {
		|	ASM_SSE2_REG_REG_TXT_OP rounds, type, dst_reg, src_reg, round_op
	} else {
		|	ASM_SSE2_REG_REG_REG_TXT_OP vrounds, type, dst_reg, dst_reg, src_reg, round_op
	}

	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, type, def, dst_reg);
	}
}

/*
 * Emits the EXITCALL sequence: dumps the CPU state to the stack, calls the
 * handler given by the constant insn->op2 and moves its return value into the
 * register allocated for "def".
 *
 * Stack layout built here (x86_64; the 32-bit variant is analogous with
 * 8 GP registers of 4 bytes and 8 XMM registers):
 *   [rsp + i*8]         saved GP register i (i = 0..15); slot 4 (the stack
 *                       pointer) receives the address 16 bytes above the
 *                       save area instead of the current rsp
 *   [rsp + 16*8 + i*8]  low 64 bits of xmm<i> (i = 0..15)
 *
 * The handler receives the word located right above the reserved area in its
 * first argument register and a pointer to the save area in the second one
 * (fastcall argument registers on 32-bit).
 * NOTE(review): that word is presumably the exit number pushed by the exit
 * stub before jumping here - confirm against the stub generator.
 *
 * After the call the stack pointer is advanced by the reserved size plus the
 * size of that extra word (plus the shadow space on Win64), i.e. the word is
 * popped as well. Only a constant call target is supported.
 */
static void ir_emit_exitcall(ir_ctx *ctx, ir_ref def, ir_insn *insn)
{
	ir_backend_data *data = ctx->data;
	dasm_State **Dst = &data->dasm_state;
	ir_reg def_reg = IR_REG_NUM(ctx->regs[def][0]);

	IR_ASSERT(def_reg != IR_REG_NONE);

	/* Save all GP registers (except the stack pointer slot, filled below) and
	 * the low halves of the SSE registers, then set up the two arguments. */
	|.if X64
	|	sub rsp, 16*8+16*8+8 /* CPU regs + SSE regs */
	|	mov aword [rsp+0*8], rax
	|	mov aword [rsp+1*8], rcx
	|	mov aword [rsp+2*8], rdx
	|	mov aword [rsp+3*8], rbx
	|	mov aword [rsp+5*8], rbp
	|	mov aword [rsp+6*8], rsi
	|	mov aword [rsp+7*8], rdi
	|	mov aword [rsp+8*8], r8
	|	mov aword [rsp+9*8], r9
	|	mov aword [rsp+10*8], r10
	|	mov aword [rsp+11*8], r11
	|	mov aword [rsp+12*8], r12
	|	mov aword [rsp+13*8], r13
	|	mov aword [rsp+14*8], r14
	|	mov aword [rsp+15*8], r15
	|	movsd qword [rsp+16*8+0*8], xmm0
	|	movsd qword [rsp+16*8+1*8], xmm1
	|	movsd qword [rsp+16*8+2*8], xmm2
	|	movsd qword [rsp+16*8+3*8], xmm3
	|	movsd qword [rsp+16*8+4*8], xmm4
	|	movsd qword [rsp+16*8+5*8], xmm5
	|	movsd qword [rsp+16*8+6*8], xmm6
	|	movsd qword [rsp+16*8+7*8], xmm7
	|	movsd qword [rsp+16*8+8*8], xmm8
	|	movsd qword [rsp+16*8+9*8], xmm9
	|	movsd qword [rsp+16*8+10*8], xmm10
	|	movsd qword [rsp+16*8+11*8], xmm11
	|	movsd qword [rsp+16*8+12*8], xmm12
	|	movsd qword [rsp+16*8+13*8], xmm13
	|	movsd qword [rsp+16*8+14*8], xmm14
	|	movsd qword [rsp+16*8+15*8], xmm15
	|
	|	mov Ra(IR_REG_INT_ARG2), rsp
	|	lea Ra(IR_REG_INT_ARG1), [rsp+16*8+16*8+16]
	|	mov aword [rsp+4*8], Ra(IR_REG_INT_ARG1)
	|	mov Ra(IR_REG_INT_ARG1), [rsp+16*8+16*8+8]
	|.if X64WIN
	|	sub rsp, 32 /* shadow space */
	|.endif
	|.else
	|	sub esp, 8*4+8*8+12 /* CPU regs + SSE regs */
	|	mov aword [esp+0*4], eax
	|	mov aword [esp+1*4], ecx
	|	mov aword [esp+2*4], edx
	|	mov aword [esp+3*4], ebx
	|	mov aword [esp+5*4], ebp
	|	mov aword [esp+6*4], esi
	|	mov aword [esp+7*4], edi
	|	movsd qword [esp+8*4+0*8], xmm0
	|	movsd qword [esp+8*4+1*8], xmm1
	|	movsd qword [esp+8*4+2*8], xmm2
	|	movsd qword [esp+8*4+3*8], xmm3
	|	movsd qword [esp+8*4+4*8], xmm4
	|	movsd qword [esp+8*4+5*8], xmm5
	|	movsd qword [esp+8*4+6*8], xmm6
	|	movsd qword [esp+8*4+7*8], xmm7
	|
	|	mov Ra(IR_REG_INT_FCARG2), esp
	|	lea Ra(IR_REG_INT_FCARG1), [esp+8*4+8*8+16]
	|	mov aword [esp+4*4], Ra(IR_REG_INT_FCARG1)
	|	mov Ra(IR_REG_INT_FCARG1), [esp+8*4+8*8+12]
	|.endif

	if (IR_IS_CONST_REF(insn->op2)) {
		void *addr = ir_call_addr(ctx, insn, &ctx->ir_base[insn->op2]);

		/* Direct call when the target can be encoded directly, otherwise an
		 * indirect call through rax. */
		if (sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(ctx->code_buffer, addr)) {
			|	call aword &addr
		} else {
|.if X64
			if (IR_IS_SIGNED_32BIT(addr)) {
				|	mov rax, ((ptrdiff_t)addr)    // 0x48 0xc7 0xc0 <imm-32-bit>
			} else {
				|	mov64 rax, ((ptrdiff_t)addr)  // 0x48 0xb8 <imm-64-bit>
			}
			|	call rax
|.endif
		}
	} else {
		/* non-constant handlers are not supported */
		IR_ASSERT(0);
	}

	//  restore SP
	|.if X64WIN
	|	add rsp, 32+16*8+16*8+16 /* shadow space + CPU regs + SSE regs */
	|.elif X64
	|	add rsp, 16*8+16*8+16 /* CPU regs + SSE regs */
	|.else
	|	add esp, 8*4+8*8+16 /* CPU regs + SSE regs */
	|.endif

	/* The handler's result arrives in the integer return register. */
	if (def_reg != IR_REG_INT_RET1) {
		ir_emit_mov(ctx, insn->type, def_reg, IR_REG_INT_RET1);
	}
	if (IR_REG_SPILLED(ctx->regs[def][0])) {
		ir_emit_store(ctx, insn->type, def, def_reg);
	}
}

/*
 * Moves an incoming parameter to the place chosen by the register allocator.
 *
 *   from_reg  register the parameter arrives in, or IR_REG_NONE when it was
 *             passed on the stack at "offset" from the frame/stack pointer
 *   to_reg    destination register, or IR_REG_NONE when the value must be
 *             stored to the spill slot of "to"
 *
 * At least one of from_reg / to_reg must be a real register.
 */
static void ir_emit_param_move(ir_ctx *ctx, uint8_t type, ir_reg from_reg, ir_reg to_reg, ir_ref to, int32_t offset)
{
	ir_reg fp = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER;

	IR_ASSERT(from_reg != IR_REG_NONE || to_reg != IR_REG_NONE);

	if (from_reg == IR_REG_NONE) {
		/* stack -> register */
		if (IR_IS_TYPE_INT(type)) {
			ir_emit_load_mem_int(ctx, type, to_reg, IR_MEM_BO(fp, offset));
		} else {
			ir_emit_load_mem_fp(ctx, type, to_reg, IR_MEM_BO(fp, offset));
		}
		return;
	}

	if (to_reg == IR_REG_NONE) {
		/* register -> spill slot */
		ir_emit_store(ctx, type, to, from_reg);
		return;
	}

	/* register -> register */
	if (IR_IS_TYPE_INT(type)) {
		ir_emit_mov(ctx, type, to_reg, from_reg);
	} else {
		ir_emit_fp_mov(ctx, type, to_reg, from_reg);
	}
}

/*
 * Emits the moves that bring incoming PARAMs from their calling-convention
 * locations (argument registers or caller's stack) to the registers / spill
 * slots assigned to them.
 *
 * The users of instruction 1 are scanned for PARAM instructions; integer and
 * floating point parameters are counted separately (with a common counter on
 * Win64) to find the source register, and a running stack offset tracks
 * parameters passed in memory.
 */
static void ir_emit_load_params(ir_ctx *ctx)
{
	ir_use_list *use_list = &ctx->use_lists[1];
	ir_ref n, *p;
	int int_param_num = 0;
	int fp_param_num = 0;
	// TODO: Calling convention specific
	int int_reg_params_count = IR_REG_INT_ARGS;
	int fp_reg_params_count = IR_REG_FP_ARGS;
	const int8_t *int_reg_params = _ir_int_reg_params;
	const int8_t *fp_reg_params = _ir_fp_reg_params;
	int32_t stack_offset = 0;

#ifdef IR_TARGET_X86
	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
		int_reg_params_count = IR_REG_INT_FCARGS;
		fp_reg_params_count = IR_REG_FP_FCARGS;
		int_reg_params = _ir_int_fc_reg_params;
		fp_reg_params = _ir_fp_fc_reg_params;
	}
#endif

	/* Offset of the first stack argument relative to the frame pointer
	 * (skip old frame pointer and return address) or to the stack pointer
	 * (skip the whole frame, the call area and the return address). */
	stack_offset = (ctx->flags & IR_USE_FRAME_POINTER)
		? sizeof(void*) * 2
		: sizeof(void*) + ctx->stack_frame_size + ctx->call_stack_size;

	for (n = use_list->count, p = &ctx->use_edges[use_list->refs]; n > 0; n--, p++) {
		ir_ref use = *p;
		ir_insn *insn = &ctx->ir_base[use];
		ir_reg src_reg;

		if (insn->op != IR_PARAM) {
			continue;
		}

		if (IR_IS_TYPE_INT(insn->type)) {
			/* NOTE(review): ir_fix_param_spills() wraps the equivalent check
			 * in "#ifndef _WIN64" while this function does not - confirm
			 * that the difference is intentional. */
			if (ctx->value_params && ctx->value_params[insn->op3 - 1].align) {
				/* struct passed by value on stack: only account for its size */
				size_t align = ctx->value_params[insn->op3 - 1].align;

				align = IR_MAX(sizeof(void*), align);
				stack_offset = IR_ALIGNED_SIZE(stack_offset, align);
				stack_offset += ctx->value_params[insn->op3 - 1].size;
				stack_offset = IR_ALIGNED_SIZE(stack_offset, sizeof(void*));
				continue;
			}
			src_reg = (int_param_num < int_reg_params_count) ? int_reg_params[int_param_num] : IR_REG_NONE;
			int_param_num++;
#ifdef _WIN64
			/* WIN64 calling convention uses a common counter for int and fp registers */
			fp_param_num++;
#endif
		} else {
			src_reg = (fp_param_num < fp_reg_params_count) ? fp_reg_params[fp_param_num] : IR_REG_NONE;
			fp_param_num++;
#ifdef _WIN64
			/* WIN64 calling convention uses a common counter for int and fp registers */
			int_param_num++;
#endif
		}

		if (ctx->vregs[use]) {
			ir_reg dst_reg = IR_REG_NUM(ctx->regs[use][0]);

			/* A parameter that lives only in memory must already be bound to
			 * its incoming stack location. */
			IR_ASSERT(src_reg != IR_REG_NONE || dst_reg != IR_REG_NONE ||
				stack_offset == ctx->live_intervals[ctx->vregs[use]]->stack_spill_pos +
					((ctx->flags & IR_USE_FRAME_POINTER) ?
						-(ctx->stack_frame_size - ctx->stack_frame_alignment) :
						ctx->call_stack_size));
			if (src_reg != dst_reg) {
				ir_emit_param_move(ctx, insn->type, src_reg, dst_reg, use, stack_offset);
			}
			if (dst_reg != IR_REG_NONE && IR_REG_SPILLED(ctx->regs[use][0])) {
				ir_emit_store(ctx, insn->type, use, dst_reg);
			}
		}

		if (src_reg == IR_REG_NONE) {
			/* the parameter consumed a stack slot */
			stack_offset += (sizeof(void*) == 8)
				? sizeof(void*)
				: IR_MAX(sizeof(void*), ir_type_size[insn->type]);
		}
	}
}

/*
 * Picks the first register of the class matching "type" (general purpose for
 * integer types, FP otherwise) from the "available" set. Asserts that such a
 * register exists; the set itself is not modified.
 */
static ir_reg ir_get_free_reg(ir_type type, ir_regset available)
{
	ir_regset candidates;

	if (!IR_IS_TYPE_INT(type)) {
		IR_ASSERT(IR_IS_TYPE_FP(type));
		candidates = IR_REGSET_INTERSECTION(available, IR_REGSET_FP);
	} else {
		candidates = IR_REGSET_INTERSECTION(available, IR_REGSET_GP);
	}

	IR_ASSERT(!IR_REGSET_IS_EMPTY(candidates));
	return IR_REGSET_FIRST(candidates);
}

/*
 * Callback for ir_gen_dessa_moves(): makes sure the instruction that ends the
 * block being processed (data->dessa_from_block) has a temporary register
 * reserved for the move.
 *
 * A temporary is reserved when "to" is 0 or "from" is not 0: RAX in register
 * slot 0 for integer moves, XMM0 in register slot 1 for floating point moves.
 * An already assigned slot is left untouched. Always returns 1.
 */
static int ir_fix_dessa_tmps(ir_ctx *ctx, uint8_t type, ir_ref from, ir_ref to)
{
	ir_backend_data *data = ctx->data;
	ir_ref ref = ctx->cfg_blocks[data->dessa_from_block].end;

	if (to == 0 || from != 0) {
		int slot;
		ir_reg tmp_reg;

		if (IR_IS_TYPE_INT(type)) {
			slot = 0;
			tmp_reg = IR_REG_RAX;
		} else {
			IR_ASSERT(IR_IS_TYPE_FP(type));
			slot = 1;
			tmp_reg = IR_REG_XMM0;
		}
		if (ctx->regs[ref][slot] == IR_REG_NONE) {
			ctx->regs[ref][slot] = tmp_reg;
		}
	}
	return 1;
}

/*
 * Binds parameters that are passed on the stack to their incoming stack
 * locations, so that they don't need a separate spill slot, and records the
 * parameter passing summary in the context:
 *   ctx->gp_reg_params / ctx->fp_reg_params  number of register parameters
 *   ctx->param_stack_size                    bytes of stack used by parameters
 *
 * Offsets are computed relative to the start of the current frame
 * ("stack_start" skips the frame and the return address, plus the old frame
 * pointer when a frame pointer is used).
 */
static void ir_fix_param_spills(ir_ctx *ctx)
{
	ir_use_list *use_list = &ctx->use_lists[1];
	ir_ref n, *p;
	int int_param_num = 0;
	int fp_param_num = 0;
	// TODO: Calling convention specific
	int int_reg_params_count = IR_REG_INT_ARGS;
	int fp_reg_params_count = IR_REG_FP_ARGS;
	const int8_t *int_reg_params = _ir_int_reg_params;
	const int8_t *fp_reg_params = _ir_fp_reg_params;
	int32_t stack_start = 0;
	int32_t stack_offset = 0;

#ifdef IR_TARGET_X86
	if (sizeof(void*) == 4 && (ctx->flags & IR_FASTCALL_FUNC)) {
		int_reg_params_count = IR_REG_INT_FCARGS;
		fp_reg_params_count = IR_REG_FP_FCARGS;
		int_reg_params = _ir_int_fc_reg_params;
		fp_reg_params = _ir_fp_fc_reg_params;
	}
#endif

	stack_start = (ctx->flags & IR_USE_FRAME_POINTER)
		/* skip old frame pointer and return address */
		? sizeof(void*) * 2 + (ctx->stack_frame_size - ctx->stack_frame_alignment)
		/* skip return address */
		: sizeof(void*) + ctx->stack_frame_size;

	for (n = use_list->count, p = &ctx->use_edges[use_list->refs]; n > 0; n--, p++) {
		ir_ref use = *p;
		ir_insn *insn = &ctx->ir_base[use];
		ir_reg src_reg;

		if (insn->op != IR_PARAM) {
			continue;
		}

		if (IR_IS_TYPE_INT(insn->type)) {
#ifndef _WIN64
			if (ctx->value_params && ctx->value_params[insn->op3 - 1].align) {
				/* struct passed by value on stack: remember where it lives */
				size_t align = ctx->value_params[insn->op3 - 1].align;

				align = IR_MAX(sizeof(void*), align);
				stack_offset = IR_ALIGNED_SIZE(stack_offset, align);
				ctx->value_params[insn->op3 - 1].offset = stack_start + stack_offset;
				stack_offset += ctx->value_params[insn->op3 - 1].size;
				stack_offset = IR_ALIGNED_SIZE(stack_offset, sizeof(void*));
				continue;
			}
#endif
			src_reg = (int_param_num < int_reg_params_count) ? int_reg_params[int_param_num] : IR_REG_NONE;
			int_param_num++;
#ifdef _WIN64
			/* WIN64 calling convention uses a common counter for int and fp registers */
			fp_param_num++;
#endif
		} else {
			src_reg = (fp_param_num < fp_reg_params_count) ? fp_reg_params[fp_param_num] : IR_REG_NONE;
			fp_param_num++;
#ifdef _WIN64
			/* WIN64 calling convention uses a common counter for int and fp registers */
			int_param_num++;
#endif
		}

		if (src_reg != IR_REG_NONE) {
			/* register parameters don't occupy stack space here */
			continue;
		}

		if (ctx->vregs[use]) {
			ir_live_interval *ival = ctx->live_intervals[ctx->vregs[use]];

			/* Reuse the incoming stack slot as the spill slot of a memory
			 * parameter that has none yet and is split or has no register. */
			if ((ival->flags & IR_LIVE_INTERVAL_MEM_PARAM)
			 && ival->stack_spill_pos == -1
			 && (ival->next || ival->reg == IR_REG_NONE)) {
				ival->stack_spill_pos = stack_start + stack_offset;
			}
		}

		stack_offset += (sizeof(void*) == 8)
			? sizeof(void*)
			: IR_MAX(sizeof(void*), ir_type_size[insn->type]);
	}

#ifdef _WIN64
	/* WIN64 uses shadow area for registers */
	stack_offset += IR_MIN(int_param_num, int_reg_params_count) * sizeof(void*);
#endif
	ctx->gp_reg_params = IR_MIN(int_param_num, int_reg_params_count);
	ctx->fp_reg_params = IR_MIN(fp_param_num, fp_reg_params_count);
	ctx->param_stack_size = stack_offset;
}

/*
 * Fallback "register allocation" used by ir_emit_code() when no live
 * intervals were computed: every virtual register gets its own stack slot and
 * registers are assigned only locally, per instruction.
 *
 * For each instruction (control-flow markers are skipped):
 *   - the target constraints are queried and the set of usable registers is
 *     reset to the scratch registers (except for the fused compare/branch and
 *     guard rules, which keep the set left by the previous instruction);
 *   - the result gets a register flagged IR_REG_SPILL_STORE when one is
 *     required, and a live interval with a dedicated spill slot is created
 *     (a PARAM without register is only flagged IR_LIVE_INTERVAL_MEM_PARAM);
 *   - a VAR with uses gets one slot (recorded in insn->op3) that is shared
 *     with the values loaded from / stored to it;
 *   - temporary registers and input registers (flagged IR_REG_SPILL_LOAD)
 *     are picked from the remaining available registers.
 *
 * Finally the temporaries needed by de-SSA moves are reserved, the fixed
 * save register set is recorded as used and the stack frame is finalized
 * with stack combining disabled.
 */
static void ir_allocate_unique_spill_slots(ir_ctx *ctx)
{
	uint32_t b;
	ir_block *bb;
	ir_insn *insn;
	ir_ref i, n, j, *p;
	uint32_t *rule, insn_flags;
	ir_backend_data *data = ctx->data;
	ir_regset available = 0;
	ir_target_constraints constraints;
	uint32_t def_flags;
	ir_reg reg;

#ifndef IR_REG_FP_RET1
	/* Without an FP return register a stack slot is needed to transfer
	 * floating point return values. */
	if (ctx->flags2 & IR_HAS_FP_RET_SLOT) {
		ctx->ret_slot = ir_allocate_spill_slot(ctx, IR_DOUBLE, &data->ra_data);
	} else if (ctx->ret_type == IR_FLOAT || ctx->ret_type == IR_DOUBLE) {
		ctx->ret_slot = ir_allocate_spill_slot(ctx, ctx->ret_type, &data->ra_data);
	} else {
		ctx->ret_slot = -1;
	}
#endif

	/* Every byte is set to IR_REG_NONE, i.e. no register assigned anywhere. */
	ctx->regs = ir_mem_malloc(sizeof(ir_regs) * ctx->insns_count);
	memset(ctx->regs, IR_REG_NONE, sizeof(ir_regs) * ctx->insns_count);

	/* vregs + tmp + fixed + SCRATCH + ALL */
	ctx->live_intervals = ir_mem_calloc(ctx->vregs_count + 1 + IR_REG_NUM + 2, sizeof(ir_live_interval*));

    if (!ctx->arena) {
		ctx->arena = ir_arena_create(16 * 1024);
	}

	for (b = 1, bb = ctx->cfg_blocks + b; b <= ctx->cfg_blocks_count; b++, bb++) {
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		for (i = bb->start, insn = ctx->ir_base + i, rule = ctx->rules + i; i <= bb->end;) {
			/* Dispatch on the selected rule when rules exist, on the opcode otherwise. */
			switch (ctx->rules ? *rule : insn->op) {
				case IR_START:
				case IR_BEGIN:
				case IR_END:
				case IR_IF_TRUE:
				case IR_IF_FALSE:
				case IR_CASE_VAL:
				case IR_CASE_RANGE:
				case IR_CASE_DEFAULT:
				case IR_MERGE:
				case IR_LOOP_BEGIN:
				case IR_LOOP_END:
					break;
#ifndef IR_REG_FP_RET1
				case IR_CALL:
					/* Lazily allocate the FP return slot on the first call returning FP. */
					if (ctx->ret_slot == -1 && (insn->type == IR_FLOAT || insn->type == IR_DOUBLE)) {
						ctx->ret_slot = ir_allocate_spill_slot(ctx, IR_DOUBLE, &data->ra_data);
					}
#endif
					IR_FALLTHROUGH;
				default:
					def_flags = ir_get_target_constraints(ctx, i, &constraints);
					/* Start from all scratch registers, except for rules fused
					 * with a preceding instruction, which keep the current set. */
					if (ctx->rules
					 && *rule != IR_CMP_AND_BRANCH_INT
					 && *rule != IR_CMP_AND_BRANCH_FP
					 && *rule != IR_TEST_AND_BRANCH_INT
					 && *rule != IR_GUARD_CMP_INT
					 && *rule != IR_GUARD_CMP_FP) {
						available = IR_REGSET_SCRATCH;
					}
					if (ctx->vregs[i]) {
						/* Result register: the fixed one required by the target
						 * if it is free, otherwise any free register when the
						 * result must be in a register. */
						reg = constraints.def_reg;
						if (reg != IR_REG_NONE && IR_REGSET_IN(available, reg)) {
							IR_REGSET_EXCL(available, reg);
							ctx->regs[i][0] = reg | IR_REG_SPILL_STORE;
						} else if (def_flags & IR_USE_MUST_BE_IN_REG) {
							if (insn->op == IR_VLOAD
							 && ctx->live_intervals[ctx->vregs[i]]
							 && ctx->live_intervals[ctx->vregs[i]]->stack_spill_pos != -1
							 && ir_is_same_mem_var(ctx, i, ctx->ir_base[insn->op2].op3)) {
								/* pass */
							} else if (insn->op != IR_PARAM) {
								reg = ir_get_free_reg(insn->type, available);
								IR_REGSET_EXCL(available, reg);
								ctx->regs[i][0] = reg | IR_REG_SPILL_STORE;
							}
						}
						if (!ctx->live_intervals[ctx->vregs[i]]) {
							ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
							memset(ival, 0, sizeof(ir_live_interval));
							ctx->live_intervals[ctx->vregs[i]] = ival;
							ival->type = insn->type;
							ival->reg = IR_REG_NONE;
							ival->vreg = ctx->vregs[i];
							ival->stack_spill_pos = -1;
							/* A PARAM without register keeps spill position -1
							 * and is only marked as a memory parameter. */
							if (insn->op == IR_PARAM && reg == IR_REG_NONE) {
								ival->flags |= IR_LIVE_INTERVAL_MEM_PARAM;
							} else {
								ival->stack_spill_pos = ir_allocate_spill_slot(ctx, ival->type, &data->ra_data);
							}
						} else if (insn->op == IR_PARAM) {
							IR_ASSERT(0 && "unexpected PARAM");
							return;
						}
					} else if (insn->op == IR_VAR) {
						ir_use_list *use_list = &ctx->use_lists[i];
						ir_ref n = use_list->count;

						if (n > 0) {
							/* One slot for the variable (also recorded in op3); note that
							 * "i" and "p" below shadow the outer loop variables. */
							int32_t stack_spill_pos = insn->op3 = ir_allocate_spill_slot(ctx, insn->type, &data->ra_data);
							ir_ref i, *p, use;
							ir_insn *use_insn;

							for (i = 0, p = &ctx->use_edges[use_list->refs]; i < n; i++, p++) {
								use = *p;
								use_insn = &ctx->ir_base[use];
								if (use_insn->op == IR_VLOAD) {
									/* The loaded value shares the slot of the variable. */
									if (ctx->vregs[use]
									 && !ctx->live_intervals[ctx->vregs[use]]) {
										ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
										memset(ival, 0, sizeof(ir_live_interval));
										ctx->live_intervals[ctx->vregs[use]] = ival;
										ival->type = insn->type;
										ival->reg = IR_REG_NONE;
										ival->vreg = ctx->vregs[use];
										ival->stack_spill_pos = stack_spill_pos;
									}
								} else if (use_insn->op == IR_VSTORE) {
									/* The stored value shares the slot of the variable. */
									if (!IR_IS_CONST_REF(use_insn->op3)
									 && ctx->vregs[use_insn->op3]
									 && !ctx->live_intervals[ctx->vregs[use_insn->op3]]) {
										ir_live_interval *ival = ir_arena_alloc(&ctx->arena, sizeof(ir_live_interval));
										memset(ival, 0, sizeof(ir_live_interval));
										ctx->live_intervals[ctx->vregs[use_insn->op3]] = ival;
										ival->type = insn->type;
										ival->reg = IR_REG_NONE;
										ival->vreg = ctx->vregs[use_insn->op3];
										ival->stack_spill_pos = stack_spill_pos;
									}
								}
							}
						}
					}

					/* Temporary registers requested by the target constraints
					 * (processed from the last one to the first one). */
					insn_flags = ir_op_flags[insn->op];
					n = constraints.tmps_count;
					if (n) {
						do {
							n--;
							if (constraints.tmp_regs[n].type) {
								ir_reg reg = ir_get_free_reg(constraints.tmp_regs[n].type, available);
								ir_ref *ops = insn->ops;
								IR_REGSET_EXCL(available, reg);
								if (constraints.tmp_regs[n].num > 0) {
									if (IR_IS_CONST_REF(ops[constraints.tmp_regs[n].num])) {
										/* rematerialization */
										reg |= IR_REG_SPILL_LOAD;
									} else if (ctx->ir_base[ops[constraints.tmp_regs[n].num]].op == IR_ALLOCA ||
											ctx->ir_base[ops[constraints.tmp_regs[n].num]].op == IR_VADDR) {
										/* local address rematerialization */
										reg |= IR_REG_SPILL_LOAD;
									}
								}
								ctx->regs[i][constraints.tmp_regs[n].num] = reg;
							} else if (constraints.tmp_regs[n].reg == IR_REG_SCRATCH) {
								/* the instruction clobbers all scratch registers */
								available = IR_REGSET_DIFFERENCE(available, IR_REGSET_SCRATCH);
							} else {
								/* the instruction clobbers one fixed register */
								IR_REGSET_EXCL(available, constraints.tmp_regs[n].reg);
							}
						} while (n);
					}
					/* Registers for data inputs; all of them are loaded from
					 * their spill slots right before the instruction. */
					n = insn->inputs_count;
					for (j = 1, p = insn->ops + 1; j <= n; j++, p++) {
						ir_ref input = *p;
						if (IR_OPND_KIND(insn_flags, j) == IR_OPND_DATA && input > 0 && ctx->vregs[input]) {
							if ((def_flags & IR_DEF_REUSES_OP1_REG) && j == 1) {
								/* op1 is loaded directly into the result register */
								ir_reg reg = IR_REG_NUM(ctx->regs[i][0]);
								ctx->regs[i][1] = reg | IR_REG_SPILL_LOAD;
							} else {
								uint8_t use_flags = IR_USE_FLAGS(def_flags, j);
								ir_reg reg = (j < constraints.hints_count) ? constraints.hints[j] : IR_REG_NONE;

								if (reg != IR_REG_NONE && IR_REGSET_IN(available, reg)) {
									IR_REGSET_EXCL(available, reg);
									ctx->regs[i][j] = reg | IR_REG_SPILL_LOAD;
								} else if (IR_IS_FOLDABLE_OP(insn->op) && j > 1 && input == insn->op1 && ctx->regs[i][1] != IR_REG_NONE) {
									/* same value as op1: reuse its register */
									ctx->regs[i][j] = ctx->regs[i][1];
								} else if (use_flags & IR_USE_MUST_BE_IN_REG) {
									reg = ir_get_free_reg(ctx->ir_base[input].type, available);
									IR_REGSET_EXCL(available, reg);
									ctx->regs[i][j] = reg | IR_REG_SPILL_LOAD;
								}
							}
						}
					}
					break;
			}
			/* Advance by the number of slots the instruction occupies. */
			n = ir_insn_len(insn);
			i += n;
			insn += n;
			rule += n;
		}
		if (bb->flags & IR_BB_DESSA_MOVES) {
			data->dessa_from_block = b;
			ir_gen_dessa_moves(ctx, b, ir_fix_dessa_tmps);
		}
	}

	ctx->used_preserved_regs = ctx->fixed_save_regset;
	ctx->flags |= IR_NO_STACK_COMBINE;
	ir_fix_stack_frame(ctx);
}

/*
 * Scans all instructions for CALLs and finds the largest amount of stack used
 * for call arguments. When it is non-zero, it is recorded in
 * ctx->call_stack_size and IR_PREALLOCATED_STACK is set in ctx->flags.
 */
static void ir_preallocate_call_stack(ir_ctx *ctx)
{
	int peak_size = 0;
	int copy_stack;
	ir_ref ref = 1;
	ir_insn *insn = ctx->ir_base + 1;

	while (ref < ctx->insns_count) {
		ir_ref len;

		if (insn->op == IR_CALL) {
			int used = ir_call_used_stack(ctx, insn, &copy_stack);

			if (used > peak_size
#ifdef IR_HAVE_FASTCALL
			 && !ir_is_fastcall(ctx, insn) /* fast call functions restore stack pointer */
#endif
			) {
				peak_size = used;
			}
		}

		/* step over the instruction, which may occupy several slots */
		len = ir_insn_len(insn);
		ref += len;
		insn += len;
	}

	if (peak_size) {
		ctx->call_stack_size = peak_size;
		ctx->flags |= IR_PREALLOCATED_STACK;
	}
}

/*
 * Computes the final stack frame layout:
 *   - ctx->locals_area_size is the size of the spill/locals area;
 *   - the frame is extended by the register save area of vararg functions
 *     (non-Windows x86_64 only) and by one word per used preserved register;
 *   - when IR_16B_FRAME_ALIGNMENT is set, the frame is padded word by word
 *     (the padding is accumulated in ctx->stack_frame_alignment) until the
 *     frame plus the words that accompany it on the stack is a multiple
 *     of 16;
 *   - finally incoming stack parameters are bound by ir_fix_param_spills().
 */
void ir_fix_stack_frame(ir_ctx *ctx)
{
	uint32_t extra_size = 0;

	ctx->locals_area_size = ctx->stack_frame_size;

#if defined(IR_TARGET_X64) && !defined(_WIN64)
	if ((ctx->flags & IR_VARARG_FUNC) && (ctx->flags2 & IR_HAS_VA_START)) {
		ctx->flags2 |= IR_16B_FRAME_ALIGNMENT;
		ctx->stack_frame_size = IR_ALIGNED_SIZE(ctx->stack_frame_size, 16);
		ctx->locals_area_size = ctx->stack_frame_size;
		/* register save area for GP argument registers */
		if ((ctx->flags2 & (IR_HAS_VA_ARG_GP|IR_HAS_VA_COPY)) && ctx->gp_reg_params < IR_REG_INT_ARGS) {
			extra_size += sizeof(void*) * IR_REG_INT_ARGS;
		}
		/* register save area for FP argument registers */
		if ((ctx->flags2 & (IR_HAS_VA_ARG_FP|IR_HAS_VA_COPY)) && ctx->fp_reg_params < IR_REG_FP_ARGS) {
			extra_size += 16 * IR_REG_FP_ARGS;
		}
	}
#endif

	if (ctx->used_preserved_regs) {
		ir_regset used_preserved_regs = (ir_regset)ctx->used_preserved_regs;
		ir_reg reg;
		(void) reg;

		/* one word for each callee-saved register that has to be preserved */
		IR_REGSET_FOREACH(used_preserved_regs, reg) {
			extra_size += sizeof(void*);
		} IR_REGSET_FOREACH_END();
	}

	ctx->stack_frame_size = IR_ALIGNED_SIZE(ctx->stack_frame_size, sizeof(void*));
	ctx->stack_frame_size += extra_size;
	ctx->stack_frame_alignment = 0;
	ctx->call_stack_size = 0;

	if (ctx->flags2 & IR_16B_FRAME_ALIGNMENT) {
		/* Stack must be 16 byte aligned */
		size_t companion = 0; /* bytes on the stack that are aligned together with the frame */

		if (ctx->flags & IR_FUNCTION) {
			if (ctx->flags & IR_USE_FRAME_POINTER) {
				/* return address + saved frame pointer */
				companion = sizeof(void*) * 2;
			} else {
				if (!(ctx->flags & IR_NO_STACK_COMBINE)) {
					ir_preallocate_call_stack(ctx);
				}
				/* preallocated call area + return address */
				companion = ctx->call_stack_size + sizeof(void*);
			}
		}

		while (IR_ALIGNED_SIZE(ctx->stack_frame_size + companion, 16) != ctx->stack_frame_size + companion) {
			ctx->stack_frame_size += sizeof(void*);
			ctx->stack_frame_alignment += sizeof(void*);
		}
	}

	ir_fix_param_spills(ctx);
}

/* Storage for the addresses of DynASM global labels (one entry per "ir_lb"
 * global); handed to dasm_setupglobal() by ir_emit_code(). Being a file-scope
 * array, it is shared by all code generation runs in the process. */
static void* dasm_labels[ir_lb_MAX];

/*
 * Returns the block that follows position "_b" in ctx->cfg_schedule, skipping
 * blocks that are flagged IR_BB_EMPTY without IR_BB_START (empty ENTRY
 * blocks). Returns 0 when the end of the schedule is reached.
 */
static uint32_t _ir_next_block(ir_ctx *ctx, uint32_t _b)
{
	uint32_t next;

	do {
		next = ctx->cfg_schedule[++_b];
	} while (next && ((ctx->cfg_blocks[next].flags & (IR_BB_START|IR_BB_EMPTY)) == IR_BB_EMPTY));

	return next;
}

/*
 * Generate native code for the whole IR function held in `ctx`.
 *
 * The function walks the scheduled basic blocks, dispatches every instruction
 * to the emitter selected by its rule, appends the collected FP/string
 * constants to the rodata section, then links and encodes the result.
 *
 * When ctx->code_buffer is set, the code is placed at the next 16-byte aligned
 * position of that buffer and the buffer position is advanced; otherwise the
 * memory is obtained through ir_mem_mmap() and protected again before return.
 *
 * On success the entry address is returned and *size_ptr holds the size
 * reported by dasm_link(). On failure NULL is returned with ctx->status set
 * to an error code; the DynAsm state is released and ctx->data is reset on
 * every exit path.
 */
void *ir_emit_code(ir_ctx *ctx, size_t *size_ptr)
{
	uint32_t _b, b, n, target;
	ir_block *bb;
	ir_ref i;
	ir_insn *insn;
	uint32_t *rule;
	ir_backend_data data;
	dasm_State **Dst;
	int ret;
	void *entry;
	void *saved_pos = NULL; /* code buffer position before allocation (for rollback) */
	size_t size;

	data.ra_data.unused_slot_4 = 0;
	data.ra_data.unused_slot_2 = 0;
	data.ra_data.unused_slot_1 = 0;
	data.ra_data.handled = NULL;
	data.rodata_label = 0;
	data.jmp_table_label = 0;
	data.double_neg_const = 0;
	data.float_neg_const = 0;
	data.double_abs_const = 0;
	data.float_abs_const = 0;
	data.double_zero_const = 0;
	ctx->data = &data;

	if (!ctx->live_intervals) {
		/* No register allocation results: give every value its own spill slot */
		ctx->stack_frame_size = 0;
		ctx->stack_frame_alignment = 0;
		ctx->call_stack_size = 0;
		ctx->used_preserved_regs = 0;
		ir_allocate_unique_spill_slots(ctx);
	}

	if (ctx->fixed_stack_frame_size != -1) {
		if (ctx->fixed_stack_red_zone) {
			IR_ASSERT(ctx->fixed_stack_red_zone == ctx->fixed_stack_frame_size + ctx->fixed_call_stack_size);
		}
		if (ctx->stack_frame_size > ctx->fixed_stack_frame_size) {
			// TODO: report error to caller
#ifdef IR_DEBUG_MESSAGES
			fprintf(stderr, "IR Compilation Aborted: ctx->stack_frame_size > ctx->fixed_stack_frame_size at %s:%d\n",
				__FILE__, __LINE__);
#endif
			ctx->data = NULL;
			ctx->status = IR_ERROR_FIXED_STACK_FRAME_OVERFLOW;
			return NULL;
		}
		ctx->stack_frame_size = ctx->fixed_stack_frame_size;
		ctx->call_stack_size = ctx->fixed_call_stack_size;
		ctx->stack_frame_alignment = 0;
	}

	Dst = &data.dasm_state;
	data.dasm_state = NULL;
	dasm_init(&data.dasm_state, DASM_MAXSECTION);
	dasm_setupglobal(&data.dasm_state, dasm_labels, ir_lb_MAX);
	dasm_setup(&data.dasm_state, dasm_actions);
	/* labels for each block + for each constant + rodata label + jmp_table label + for each entry */
	dasm_growpc(&data.dasm_state, ctx->cfg_blocks_count + 1 + ctx->consts_count + 1 + 1 + 1 + ctx->entries_count);
	if (data.dasm_state->status != DASM_S_OK) {
		IR_ASSERT(data.dasm_state->status == DASM_S_NOMEM);
		dasm_free(&data.dasm_state);
		ctx->data = NULL;
		ctx->status = IR_ERROR_TOO_LARGE;
		return NULL;
	}

	data.emit_constants = ir_bitset_malloc(ctx->consts_count);

	if ((ctx->flags & IR_GEN_ENDBR) && (ctx->flags & IR_START_BR_TARGET)) {
		|.if X64
		|	endbr64
		|.else
		|	endbr32
		|.endif
	}

	if (!(ctx->flags & IR_SKIP_PROLOGUE)) {
		ir_emit_prologue(ctx);
	}
	if (ctx->flags & IR_FUNCTION) {
		ir_emit_load_params(ctx);
	}

	if (UNEXPECTED(!ctx->cfg_schedule)) {
		/* No explicit schedule: emit blocks in their natural order, 0-terminated */
		uint32_t *list = ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2));
		for (b = 0; b <= ctx->cfg_blocks_count; b++) {
			list[b] = b;
		}
		list[ctx->cfg_blocks_count + 1] = 0;
	}

	for (_b = 1; _b <= ctx->cfg_blocks_count; _b++) {
		b = ctx->cfg_schedule[_b];
		bb = &ctx->cfg_blocks[b];
		IR_ASSERT(!(bb->flags & IR_BB_UNREACHABLE));
		if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) {
			continue;
		}
		if (bb->flags & IR_BB_ALIGN_LOOP) {
			|	.align IR_LOOP_ALIGNMENT
		}
		|=>b:

		i = bb->start;
		insn = ctx->ir_base + i;
		if (bb->flags & IR_BB_ENTRY) {
			/* Entry labels are numbered after block, constant, rodata and jmp_table labels */
			uint32_t label = ctx->cfg_blocks_count + ctx->consts_count + 4 + insn->op3;

			|=>label:
			if ((ctx->flags & IR_GEN_ENDBR) && (ctx->flags & IR_ENTRY_BR_TARGET)) {
				|.if X64
				|	endbr64
				|.else
				|	endbr32
				|.endif
			}
			ir_emit_prologue(ctx);
			ctx->entries[insn->op3] = i;
		}

		/* skip first instruction */
		n = ir_insn_len(insn);
		i += n;
		insn += n;
		rule = ctx->rules + i;

		while (i <= bb->end) {
			if (!((*rule) & (IR_FUSED|IR_SKIPPED)))
			switch ((*rule) & IR_RULE_MASK) {
				case IR_VAR:
				case IR_PARAM:
				case IR_PI:
				case IR_PHI:
				case IR_SNAPSHOT:
				case IR_VA_END:
					break;
				case IR_LEA_OB:
				case IR_LEA_SI:
				case IR_LEA_SIB:
				case IR_LEA_IB:
				case IR_LEA_OB_I:
				case IR_LEA_I_OB:
				case IR_LEA_SI_O:
				case IR_LEA_SIB_O:
				case IR_LEA_IB_O:
				case IR_LEA_OB_SI:
				case IR_LEA_SI_OB:
				case IR_LEA_B_SI:
				case IR_LEA_SI_B:
				case IR_LEA_B_SI_O:
				case IR_LEA_SI_B_O:
				case IR_LEA_SYM_O:
				case IR_LEA_O_SYM:
					ir_emit_lea(ctx, i, insn->type);
					break;
				case IR_MUL_PWR2:
				case IR_DIV_PWR2:
				case IR_MOD_PWR2:
					ir_emit_mul_div_mod_pwr2(ctx, i, insn);
					break;
				case IR_BIT_OP:
					ir_emit_bit_op(ctx, i, insn);
					break;
				case IR_SDIV_PWR2:
					ir_emit_sdiv_pwr2(ctx, i, insn);
					break;
				case IR_SMOD_PWR2:
					ir_emit_smod_pwr2(ctx, i, insn);
					break;
				case IR_SHIFT:
					ir_emit_shift(ctx, i, insn);
					break;
				case IR_SHIFT_CONST:
					ir_emit_shift_const(ctx, i, insn);
					break;
				case IR_BIT_COUNT:
					ir_emit_bit_count(ctx, i, insn);
					break;
				case IR_CTPOP:
					ir_emit_ctpop(ctx, i, insn);
					break;
				case IR_INC:
				case IR_DEC:
				case IR_OP_INT:
					ir_emit_op_int(ctx, i, insn, *rule);
					break;
				case IR_ABS_INT:
					ir_emit_abs_int(ctx, i, insn);
					break;
				case IR_BOOL_NOT:
					ir_emit_bool_not(ctx, i, insn);
					break;
				case IR_BOOL_NOT_INT:
					ir_emit_bool_not_int(ctx, i, insn);
					break;
				case IR_OP_FP:
					ir_emit_op_fp(ctx, i, insn);
					break;
				case IR_IMUL3:
					ir_emit_imul3(ctx, i, insn);
					break;
				case IR_BINOP_INT:
					ir_emit_binop_int(ctx, i, insn);
					break;
				case IR_BINOP_SSE2:
					ir_emit_binop_sse2(ctx, i, insn);
					break;
				case IR_BINOP_AVX:
					ir_emit_binop_avx(ctx, i, insn);
					break;
				case IR_MUL_INT:
				case IR_DIV_INT:
				case IR_MOD_INT:
					ir_emit_mul_div_mod(ctx, i, insn);
					break;
				case IR_CMP_INT:
					ir_emit_cmp_int(ctx, i, insn);
					break;
				case IR_TESTCC_INT:
					ir_emit_testcc_int(ctx, i, insn);
					break;
				case IR_SETCC_INT:
					ir_emit_setcc_int(ctx, i, insn);
					break;
				case IR_CMP_FP:
					ir_emit_cmp_fp(ctx, i, insn);
					break;
				case IR_SEXT:
					ir_emit_sext(ctx, i, insn);
					break;
				case IR_ZEXT:
					ir_emit_zext(ctx, i, insn);
					break;
				case IR_TRUNC:
					ir_emit_trunc(ctx, i, insn);
					break;
				case IR_BITCAST:
				case IR_PROTO:
					ir_emit_bitcast(ctx, i, insn);
					break;
				case IR_INT2FP:
					ir_emit_int2fp(ctx, i, insn);
					break;
				case IR_FP2INT:
					ir_emit_fp2int(ctx, i, insn);
					break;
				case IR_FP2FP:
					ir_emit_fp2fp(ctx, i, insn);
					break;
				case IR_COPY_INT:
					ir_emit_copy_int(ctx, i, insn);
					break;
				case IR_COPY_FP:
					ir_emit_copy_fp(ctx, i, insn);
					break;
				case IR_CMP_AND_STORE_INT:
					ir_emit_cmp_and_store_int(ctx, i, insn);
					break;
				case IR_CMP_AND_BRANCH_INT:
					ir_emit_cmp_and_branch_int(ctx, b, i, insn, _ir_next_block(ctx, _b));
					break;
				case IR_CMP_AND_BRANCH_FP:
					ir_emit_cmp_and_branch_fp(ctx, b, i, insn, _ir_next_block(ctx, _b));
					break;
				case IR_TEST_AND_BRANCH_INT:
					ir_emit_test_and_branch_int(ctx, b, i, insn, _ir_next_block(ctx, _b));
					break;
				case IR_JCC_INT:
					{
						ir_op op = ctx->ir_base[insn->op2].op;

						if (op == IR_ADD ||
						    op == IR_SUB ||
//						    op == IR_MUL ||
						    op == IR_OR  ||
						    op == IR_AND ||
						    op == IR_XOR) {
							/* the branch tests the result of the arithmetic op against zero */
							op = IR_NE;
						} else {
							IR_ASSERT(op >= IR_EQ && op <= IR_UGT);
						}
						ir_emit_jcc(ctx, b, i, insn, _ir_next_block(ctx, _b), op, 1, 1);
					}
					break;
				case IR_GUARD_CMP_INT:
					if (ir_emit_guard_cmp_int(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
						goto next_block;
					}
					break;
				case IR_GUARD_CMP_FP:
					if (ir_emit_guard_cmp_fp(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
						goto next_block;
					}
					break;
				case IR_GUARD_TEST_INT:
					if (ir_emit_guard_test_int(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
						goto next_block;
					}
					break;
				case IR_GUARD_JCC_INT:
					if (ir_emit_guard_jcc_int(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
						goto next_block;
					}
					break;
				case IR_IF_INT:
					ir_emit_if_int(ctx, b, i, insn, _ir_next_block(ctx, _b));
					break;
				case IR_COND:
					ir_emit_cond(ctx, i, insn);
					break;
				case IR_COND_CMP_INT:
					ir_emit_cond_cmp_int(ctx, i, insn);
					break;
				case IR_COND_CMP_FP:
					ir_emit_cond_cmp_fp(ctx, i, insn);
					break;
				case IR_SWITCH:
					ir_emit_switch(ctx, b, i, insn);
					break;
				case IR_MIN_MAX_INT:
					ir_emit_min_max_int(ctx, i, insn);
					break;
				case IR_OVERFLOW:
					ir_emit_overflow(ctx, i, insn);
					break;
				case IR_OVERFLOW_AND_BRANCH:
					ir_emit_overflow_and_branch(ctx, b, i, insn, _ir_next_block(ctx, _b));
					break;
				case IR_END:
				case IR_LOOP_END:
					if (bb->flags & IR_BB_OSR_ENTRY_LOADS) {
						ir_emit_osr_entry_loads(ctx, b, bb);
					}
					if (bb->flags & IR_BB_DESSA_MOVES) {
						ir_emit_dessa_moves(ctx, b, bb);
					}
					do {
						ir_ref succ = ctx->cfg_edges[bb->successors];

						if (UNEXPECTED(bb->successors_count == 2)) {
							/* one of the two successors must be an ENTRY block; follow the other */
							if (ctx->cfg_blocks[succ].flags & IR_BB_ENTRY) {
								succ = ctx->cfg_edges[bb->successors + 1];
							} else {
								IR_ASSERT(ctx->cfg_blocks[ctx->cfg_edges[bb->successors + 1]].flags & IR_BB_ENTRY);
							}
						} else {
							IR_ASSERT(bb->successors_count == 1);
						}
						target = ir_skip_empty_target_blocks(ctx, succ);
						/* fall through when the target is emitted right after this block */
						if (target != _ir_next_block(ctx, _b)) {
							|	jmp =>target
						}
					} while (0);
					break;
				case IR_RETURN_VOID:
					ir_emit_return_void(ctx);
					break;
				case IR_RETURN_INT:
					ir_emit_return_int(ctx, i, insn);
					break;
				case IR_RETURN_FP:
					ir_emit_return_fp(ctx, i, insn);
					break;
				case IR_CALL:
					ir_emit_call(ctx, i, insn);
					break;
				case IR_TAILCALL:
					ir_emit_tailcall(ctx, i, insn);
					break;
				case IR_IJMP:
					ir_emit_ijmp(ctx, i, insn);
					break;
				case IR_MEM_OP_INT:
				case IR_MEM_INC:
				case IR_MEM_DEC:
					ir_emit_mem_op_int(ctx, i, insn, *rule);
					break;
				case IR_MEM_BINOP_INT:
					ir_emit_mem_binop_int(ctx, i, insn);
					break;
				case IR_MEM_MUL_PWR2:
				case IR_MEM_DIV_PWR2:
				case IR_MEM_MOD_PWR2:
					ir_emit_mem_mul_div_mod_pwr2(ctx, i, insn);
					break;
				case IR_MEM_SHIFT:
					ir_emit_mem_shift(ctx, i, insn);
					break;
				case IR_MEM_SHIFT_CONST:
					ir_emit_mem_shift_const(ctx, i, insn);
					break;
				case IR_REG_BINOP_INT:
					ir_emit_reg_binop_int(ctx, i, insn);
					break;
				case IR_VADDR:
					ir_emit_vaddr(ctx, i, insn);
					break;
				case IR_VLOAD:
					ir_emit_vload(ctx, i, insn);
					break;
				case IR_VSTORE_INT:
					ir_emit_vstore_int(ctx, i, insn);
					break;
				case IR_VSTORE_FP:
					ir_emit_vstore_fp(ctx, i, insn);
					break;
				case IR_RLOAD:
					ir_emit_rload(ctx, i, insn);
					break;
				case IR_RSTORE:
					ir_emit_rstore(ctx, i, insn);
					break;
				case IR_LOAD_INT:
					ir_emit_load_int(ctx, i, insn);
					break;
				case IR_LOAD_FP:
					ir_emit_load_fp(ctx, i, insn);
					break;
				case IR_STORE_INT:
					ir_emit_store_int(ctx, i, insn);
					break;
				case IR_STORE_FP:
					ir_emit_store_fp(ctx, i, insn);
					break;
				case IR_ALLOCA:
					ir_emit_alloca(ctx, i, insn);
					break;
				case IR_VA_START:
					ir_emit_va_start(ctx, i, insn);
					break;
				case IR_VA_COPY:
					ir_emit_va_copy(ctx, i, insn);
					break;
				case IR_VA_ARG:
					ir_emit_va_arg(ctx, i, insn);
					break;
				case IR_AFREE:
					ir_emit_afree(ctx, i, insn);
					break;
				case IR_BLOCK_BEGIN:
					ir_emit_block_begin(ctx, i, insn);
					break;
				case IR_BLOCK_END:
					ir_emit_block_end(ctx, i, insn);
					break;
				case IR_FRAME_ADDR:
					ir_emit_frame_addr(ctx, i);
					break;
				case IR_EXITCALL:
					ir_emit_exitcall(ctx, i, insn);
					break;
				case IR_GUARD:
				case IR_GUARD_NOT:
					if (ir_emit_guard(ctx, b, i, insn, _ir_next_block(ctx, _b))) {
						goto next_block;
					}
					break;
				case IR_GUARD_OVERFLOW:
					if (ir_emit_guard_overflow(ctx, b, i, insn)) {
						goto next_block;
					}
					break;
				case IR_SSE_SQRT:
					ir_emit_sse_sqrt(ctx, i, insn);
					break;
				case IR_SSE_RINT:
					ir_emit_sse_round(ctx, i, insn, 4);
					break;
				case IR_SSE_FLOOR:
					ir_emit_sse_round(ctx, i, insn, 9);
					break;
				case IR_SSE_CEIL:
					ir_emit_sse_round(ctx, i, insn, 10);
					break;
				case IR_SSE_TRUNC:
					ir_emit_sse_round(ctx, i, insn, 11);
					break;
				case IR_SSE_NEARBYINT:
					ir_emit_sse_round(ctx, i, insn, 12);
					break;
				case IR_TLS:
					ir_emit_tls(ctx, i, insn);
					break;
				case IR_TRAP:
					|	int3
					break;
				default:
					IR_ASSERT(0 && "NIY rule/instruction");
					ir_mem_free(data.emit_constants);
					dasm_free(&data.dasm_state);
					ctx->data = NULL;
					ctx->status = IR_ERROR_UNSUPPORTED_CODE_RULE;
					return NULL;
			}
			n = ir_insn_len(insn);
			i += n;
			insn += n;
			rule += n;
		}
next_block:;
	}

	/* Emit the constants requested by the instruction emitters into rodata */
	if (data.rodata_label) {
		|.rodata
	}
	IR_BITSET_FOREACH(data.emit_constants, ir_bitset_len(ctx->consts_count), i) {
		insn = &ctx->ir_base[-i];
		if (IR_IS_TYPE_FP(insn->type)) {
			int label = ctx->cfg_blocks_count + i;

			if (!data.rodata_label) {
				data.rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;

				|.rodata
				|=>data.rodata_label:
			}
			if (insn->type == IR_DOUBLE) {
				|.align 8
				|=>label:
				|.dword insn->val.u32, insn->val.u32_hi
			} else {
				IR_ASSERT(insn->type == IR_FLOAT);
				|.align 4
				|=>label:
				|.dword insn->val.u32
			}
		} else if (insn->op == IR_STR) {
			int label = ctx->cfg_blocks_count + i;
			const char *str = ir_get_str(ctx, insn->val.str);
			/* separate index: must not shadow the bitset iteration variable `i` */
			int j = 0;

			if (!data.rodata_label) {
				data.rodata_label = ctx->cfg_blocks_count + ctx->consts_count + 2;

				|.rodata
				|=>data.rodata_label:
			}
			|.align 8
			|=>label:
			while (str[j]) {
				char c = str[j];

				|.byte c
				j++;
			}
			|.byte 0

		} else {
			IR_ASSERT(0);
		}
	} IR_BITSET_FOREACH_END();
	if (data.rodata_label) {
		|.code
	}
	ir_mem_free(data.emit_constants);

	if (ctx->status) {
		/* one of the emitters already reported an error */
		dasm_free(&data.dasm_state);
		ctx->data = NULL;
		return NULL;
	}

	if (data.dasm_state->status != DASM_S_OK) {
		IR_ASSERT(data.dasm_state->status == DASM_S_NOMEM);
		dasm_free(&data.dasm_state);
		ctx->data = NULL;
		ctx->status = IR_ERROR_TOO_LARGE;
		return NULL;
	}

	ret = dasm_link(&data.dasm_state, size_ptr);
	if (ret != DASM_S_OK) {
		IR_ASSERT(ret == DASM_S_NOMEM);
		dasm_free(&data.dasm_state);
		ctx->data = NULL;
		ctx->status = (ret == DASM_S_NOMEM) ? IR_ERROR_TOO_LARGE : IR_ERROR_LINK;
		return NULL;
	}
	size = *size_ptr;

	if (ctx->code_buffer) {
		/* remember the untouched position so that a failed encode can be undone */
		saved_pos = ctx->code_buffer->pos;
		entry = (void*)IR_ALIGNED_SIZE(((size_t)(saved_pos)), 16);
		/* the alignment itself may step past the end of the buffer */
		if ((char*)entry > (char*)ctx->code_buffer->end
		 || size > (size_t)((char*)ctx->code_buffer->end - (char*)entry)) {
			dasm_free(&data.dasm_state);
			ctx->data = NULL;
			ctx->status = IR_ERROR_CODE_MEM_OVERFLOW;
			return NULL;
		}
		ctx->code_buffer->pos = (char*)entry + size;
	} else {
		entry = ir_mem_mmap(size);
		if (!entry) {
			dasm_free(&data.dasm_state);
			ctx->data = NULL;
			ctx->status = IR_ERROR_CODE_MEM_OVERFLOW;
			return NULL;
		}
		ir_mem_unprotect(entry, size);
	}

	ret = dasm_encode(&data.dasm_state, entry);
	if (ret != DASM_S_OK) {
		IR_ASSERT(0);
		dasm_free(&data.dasm_state);
		if (ctx->code_buffer) {
			if (ctx->code_buffer->pos == (char*)entry + size) {
				/* rollback: give back exactly what was reserved above */
				ctx->code_buffer->pos = saved_pos;
			}
		} else {
			ir_mem_unmap(entry, size);
		}
		ctx->data = NULL;
		ctx->status = IR_ERROR_ENCODE;
		return NULL;
	}

	if (data.jmp_table_label) {
		uint32_t offset = dasm_getpclabel(&data.dasm_state, data.jmp_table_label);
		ctx->jmp_table_offset = offset;
	} else {
		ctx->jmp_table_offset = 0;
	}
	if (data.rodata_label) {
		uint32_t offset = dasm_getpclabel(&data.dasm_state, data.rodata_label);
		ctx->rodata_offset = offset;
	} else {
		ctx->rodata_offset = 0;
	}

	if (ctx->entries_count) {
		/* For all entries */
		i = ctx->entries_count;
		do {
			/* replace the entry number stored in op3 by the offset of its label */
			ir_insn *insn = &ctx->ir_base[ctx->entries[--i]];
			uint32_t offset = dasm_getpclabel(&data.dasm_state, ctx->cfg_blocks_count + ctx->consts_count + 4 + insn->op3);
			insn->op3 = offset;
		} while (i != 0);
	}

	dasm_free(&data.dasm_state);

	ir_mem_flush(entry, size);

#if defined(__GNUC__)
	if ((ctx->flags & IR_GEN_CACHE_DEMOTE) && (ctx->mflags & IR_X86_CLDEMOTE)) {
		uintptr_t start = (uintptr_t)entry;
		uintptr_t p = (uintptr_t)start & ~0x3F;

		/* walk the generated code in 64-byte steps */
		do {
			/* _cldemote(p); */
			asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p));
			p += 64;
		} while (p < start + size);
	}
#endif

	if (!ctx->code_buffer) {
		ir_mem_protect(entry, size);
	}

	ctx->data = NULL;
	return entry;
}

/*
 * Emit a group of `exit_points_per_group` side-exit stubs into `code_buffer`.
 *
 * Every stub pushes its index inside the group; all stubs then share a common
 * tail that adds `first_exit_point` to the word at [r4] and jumps to
 * `exit_addr`.
 *
 * The code is placed at the next 16-byte aligned position of the buffer and
 * the buffer position is advanced past it. Returns the address of the first
 * stub and stores the emitted size in *size_ptr, or returns NULL (leaving the
 * buffer position unchanged and *size_ptr untouched) when linking or encoding
 * fails or the buffer is too small.
 */
const void *ir_emit_exitgroup(uint32_t first_exit_point, uint32_t exit_points_per_group, const void *exit_addr, ir_code_buffer *code_buffer, size_t *size_ptr)
{
	void *entry;
	void *saved_pos; /* buffer position before allocation (for rollback) */
	size_t size;
	uint32_t i;
	dasm_State **Dst, *dasm_state;
	int ret;

	IR_ASSERT(code_buffer);
	IR_ASSERT(sizeof(void*) == 4 || IR_MAY_USE_32BIT_ADDR(code_buffer, exit_addr));

	Dst = &dasm_state;
	dasm_state = NULL;
	dasm_init(&dasm_state, DASM_MAXSECTION);
	dasm_setupglobal(&dasm_state, dasm_labels, ir_lb_MAX);
	dasm_setup(&dasm_state, dasm_actions);

	/*
	 * All stubs but the last: a push followed by a short jmp (opcode 0xeb)
	 * written as raw bytes. The displacement is computed by hand and
	 * presumably relies on every stub occupying exactly 4 bytes.
	 */
	for (i = 0; i < exit_points_per_group - 1; i++) {
		|	push byte i
		|	.byte 0xeb, (4*(exit_points_per_group-i)-6) // jmp >1
	}
	/* the last stub falls through into the shared tail */
	|	push byte i
	|// 1:
	|	add aword [r4], first_exit_point
	|	jmp aword &exit_addr

	ret = dasm_link(&dasm_state, &size);
	if (ret != DASM_S_OK) {
		IR_ASSERT(0);
		dasm_free(&dasm_state);
		return NULL;
	}

	saved_pos = code_buffer->pos;
	entry = (void*)IR_ALIGNED_SIZE(((size_t)(saved_pos)), 16);
	/* the alignment itself may step past the end of the buffer */
	if ((char*)entry > (char*)code_buffer->end
	 || size > (size_t)((char*)code_buffer->end - (char*)entry)) {
		dasm_free(&dasm_state);
		return NULL;
	}
	code_buffer->pos = (char*)entry + size;

	ret = dasm_encode(&dasm_state, entry);
	if (ret != DASM_S_OK) {
		IR_ASSERT(0);
		dasm_free(&dasm_state);
		if (code_buffer->pos == (char*)entry + size) {
			/* rollback: give back exactly what was reserved above */
			code_buffer->pos = saved_pos;
		}
		return NULL;
	}

	dasm_free(&dasm_state);

	ir_mem_flush(entry, size);

	*size_ptr = size;
	return entry;
}

/*
 * Tell whether `addr` has to be reached through a thunk: only on targets with
 * 8-byte pointers, and only when IR_MAY_USE_32BIT_ADDR() rejects the address
 * for the given code buffer.
 */
bool ir_needs_thunk(ir_code_buffer *code_buffer, void *addr)
{
	if (sizeof(void*) != 8) {
		return 0;
	}
	if (IR_MAY_USE_32BIT_ADDR(code_buffer, addr)) {
		return 0;
	}
	return 1;
}

/*
 * Emit a jump thunk to `addr` at the current position of `code_buffer`.
 *
 * On X64 the thunk is an indirect jmp through an address word stored right
 * behind the instruction; otherwise it is a direct jmp.
 *
 * On success the buffer position is advanced by the thunk size, the size is
 * stored in *size_ptr and the thunk address is returned. NULL is returned
 * (and the buffer is left untouched) when linking or encoding fails or the
 * thunk does not fit.
 */
void *ir_emit_thunk(ir_code_buffer *code_buffer, void *addr, size_t *size_ptr)
{
	dasm_State *dasm_state = NULL;
	dasm_State **Dst = &dasm_state;
	void *thunk = NULL;
	size_t thunk_size = 0;
	int emitted = 0;

	dasm_init(&dasm_state, DASM_MAXSECTION);
	dasm_setupglobal(&dasm_state, dasm_labels, ir_lb_MAX);
	dasm_setup(&dasm_state, dasm_actions);

	|.code
	|.if X64
	|	jmp aword [>1]
	|1:
	|	.aword &addr
	|.else
	|	jmp &addr
	|.endif

	if (dasm_link(&dasm_state, &thunk_size) != DASM_S_OK) {
		IR_ASSERT(0);
	} else if (thunk_size <= (size_t)((char*)code_buffer->end - (char*)code_buffer->pos)) {
		thunk = code_buffer->pos;
		if (dasm_encode(&dasm_state, thunk) == DASM_S_OK) {
			/* commit: publish the size and reserve the space only after a successful encode */
			*size_ptr = thunk_size;
			code_buffer->pos = (char*)code_buffer->pos + thunk_size;
			emitted = 1;
		}
	}

	/* single release point for the DynAsm state */
	dasm_free(&dasm_state);

	if (!emitted) {
		return NULL;
	}
	ir_mem_flush(thunk, thunk_size);
	return thunk;
}

/*
 * Redirect an existing thunk at `thunk_entry` to `addr`.
 *
 * If the distance from the end of a 5-byte jmp at the thunk start to `addr`
 * fits a signed 32-bit displacement (always assumed with 4-byte pointers),
 * the thunk start is overwritten with `e9 rel32`. Otherwise the thunk must
 * begin with the bytes `ff 25`, and the pointer-sized slot addressed by the
 * displacement of that instruction is replaced by `addr`.
 */
void ir_fix_thunk(void *thunk_entry, void *addr)
{
	unsigned char *code = thunk_entry;
	/* displacement measured from the byte following a 5-byte jmp */
	intptr_t disp = (intptr_t)((unsigned char*)addr - (code + 5));

	if (sizeof(void*) != 8 || IR_IS_SIGNED_32BIT(disp)) {
		/* near target: direct relative jmp */
		typedef IR_SET_ALIGNED(1, int32_t unaligned_int32_t);
		unaligned_int32_t *rel32 = (int32_t*)(code + 1);

		code[0] = 0xe9;
		*rel32 = (int32_t)disp;
	} else {
		/* far target: patch the address slot used by the indirect jmp */
		int32_t *slot_disp = (int32_t*)(code + 2);
		void **slot;

		IR_ASSERT(code[0] == 0xff && code[1] == 0x25);
		/* the slot is located relative to the end of the 6-byte instruction */
		slot = (void**)(code + 6 + *slot_disp);
		*slot = addr;
	}
}
