From 240008ec83731a47149803d4555a3b7c201829dc Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Mon, 1 Jun 2026 18:43:26 +0300 Subject: [PATCH 1/9] Fuse CMP+JCC --- src/felix86/v2/handlers.cpp | 103 +++++++++++++++++++++++++++++++++- src/felix86/v2/recompiler.cpp | 11 ++-- src/felix86/v2/recompiler.hpp | 2 +- 3 files changed, 109 insertions(+), 7 deletions(-) diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index e89952613..8297e8e43 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -1,5 +1,4 @@ #include -#include #include #include "Zydis/DecoderTypes.h" #include "Zydis/SharedTypes.h" @@ -135,7 +134,12 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd return false; } - auto [next_instruction, next_operands] = rec.getNextInstruction(); + auto opt = rec.getNextInstruction(); + if (!opt.has_value()) { + return false; + } + + auto [next_instruction, next_operands] = *opt; switch (next_instruction->mnemonic) { case ZYDIS_MNEMONIC_CMOVL: { biscuit::GPR cond = rec.scratch(); @@ -271,6 +275,101 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd rec.skipNext(); return true; } + case ZYDIS_MNEMONIC_JL: + case ZYDIS_MNEMONIC_JLE: + case ZYDIS_MNEMONIC_JNL: + case ZYDIS_MNEMONIC_JNLE: + case ZYDIS_MNEMONIC_JB: + case ZYDIS_MNEMONIC_JBE: + case ZYDIS_MNEMONIC_JNB: + case ZYDIS_MNEMONIC_JNBE: { + // The earlier check confirmed that no flags are needed after this jump, so we can freely fuse instructions here + biscuit::GPR op0 = rec.getGPR(&operands[0]); + biscuit::GPR op1 = rec.getGPR(&operands[1]); + biscuit::GPR lhs, rhs; + if (instruction.operand_width != 64) { + lhs = rec.scratch(); + rhs = rec.scratch(); + rec.sext(lhs, op0, rec.zydisToSize(instruction.operand_width)); + rec.sext(rhs, op1, rec.zydisToSize(instruction.operand_width)); + } else { + lhs = op0; + rhs = op1; + } + + if (g_config.auto_compress) { + 
as.DisableOptimization(Optimization::AutoCompress); + } + u64 immediate = rec.sextImmediate(rec.getImmediate(&next_operands[0]), next_operands[0].imm.size); + u64 rip_false = next_rip + next_instruction->length; + u64 rip_true = rip_false + immediate; + Label true_label; + switch (next_instruction->mnemonic) { + case ZYDIS_MNEMONIC_JL: { + as.BLT(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JLE: { + as.BLE(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JNL: { + as.BGE(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JNLE: { + as.BGT(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JB: { + as.BLTU(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JBE: { + as.BLEU(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JNB: { + as.BGEU(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JNBE: { + as.BGTU(lhs, rhs, &true_label); + break; + } + default: { + UNREACHABLE(); + } + } + + biscuit::GPR ripreg = rec.allocatedGPR(X86_REF_RIP); + u64 rip_false_offset = rip_false - rec.getCurrentRipregValue(); + rec.addi(ripreg, ripreg, rip_false_offset); + if (g_mode32) { + rec.zext(ripreg, ripreg, X86_SIZE_DWORD); + rip_false = (u32)rip_false; + } + + as.AUIPC(t5, 0); // <- must be before link point, see invalidate_caller_thunk + rec.jumpAndLink(rip_false); + + as.Bind(&true_label); + u64 rip_true_offset = rip_true - rec.getCurrentRipregValue(); + rec.addi(ripreg, ripreg, rip_true_offset); + if (g_mode32) { + rec.zext(ripreg, ripreg, X86_SIZE_DWORD); + rip_true = (u32)rip_true; + } + + as.AUIPC(t5, 0); // <- must be before link point, see invalidate_caller_thunk + rec.jumpAndLink(rip_true); + rec.skipNext(); + rec.stopCompiling(); + if (g_config.auto_compress) { + as.EnableOptimization(Optimization::AutoCompress); + } + return true; + } default: { break; } diff --git a/src/felix86/v2/recompiler.cpp b/src/felix86/v2/recompiler.cpp index 971d2a277..fb2f774c1 100644 --- a/src/felix86/v2/recompiler.cpp +++ 
b/src/felix86/v2/recompiler.cpp @@ -715,8 +715,10 @@ u64 Recompiler::compileSequence(u64 rip) { return rip; } -std::pair Recompiler::getNextInstruction() { - ASSERT(instructions.size() > current_instruction_index + 1); +std::optional> Recompiler::getNextInstruction() { + if (current_instruction_index + 1 >= instructions.size()) { + return std::nullopt; + } auto& [instruction, operands] = instructions[current_instruction_index + 1]; return std::make_pair(&instruction, operands); } @@ -2199,13 +2201,14 @@ void Recompiler::scanAhead(u64 rip) { // If all the landing places overwrite the flags (1 landing spot for jmp, 2 for jcc) // then we can skip those flag calculations if (is_jump && operands[0].type == ZYDIS_OPERAND_TYPE_IMMEDIATE) { + u32 flags_we_care_about = + ZYDIS_CPUFLAG_OF | ZYDIS_CPUFLAG_CF | ZYDIS_CPUFLAG_ZF | ZYDIS_CPUFLAG_SF | ZYDIS_CPUFLAG_AF | ZYDIS_CPUFLAG_PF; auto scan_landing_block = [&](u64 rip_ahead) { bool jump_to_self = rip_ahead == initial_rip; ZydisDecodedInstruction instruction_ahead; u32 changed_this_block = 0; u32 used_this_block = 0; - u32 flags_we_care_about = - ZYDIS_CPUFLAG_OF | ZYDIS_CPUFLAG_CF | ZYDIS_CPUFLAG_ZF | ZYDIS_CPUFLAG_SF | ZYDIS_CPUFLAG_AF | ZYDIS_CPUFLAG_PF; + // 10 is heuristically picked with no real reason // If we go too high we risk messing our performance // TODO: some benchmarking may be in order diff --git a/src/felix86/v2/recompiler.hpp b/src/felix86/v2/recompiler.hpp index 2e5368061..81b265ff1 100644 --- a/src/felix86/v2/recompiler.hpp +++ b/src/felix86/v2/recompiler.hpp @@ -702,7 +702,7 @@ struct Recompiler { current_ripreg_value = value; } - std::pair getNextInstruction(); + std::optional> getNextInstruction(); private: struct FlagAccess { From bb9234b5a1905dea75c62bdf9ee1b405d122085d Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Mon, 1 Jun 2026 18:51:31 +0300 Subject: [PATCH 2/9] . 
--- src/felix86/v2/handlers.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index 8297e8e43..c8c32d011 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -283,6 +283,7 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd case ZYDIS_MNEMONIC_JBE: case ZYDIS_MNEMONIC_JNB: case ZYDIS_MNEMONIC_JNBE: { + PLAIN("Fusing CMP+JCC"); // The earlier check confirmed that no flags are needed after this jump, so we can freely fuse instructions here biscuit::GPR op0 = rec.getGPR(&operands[0]); biscuit::GPR op1 = rec.getGPR(&operands[1]); From d7b92ad93d24b2411bb250158bf57190cafaf357 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Mon, 1 Jun 2026 18:56:32 +0300 Subject: [PATCH 3/9] . --- src/felix86/v2/recompiler.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/felix86/v2/recompiler.cpp b/src/felix86/v2/recompiler.cpp index fb2f774c1..72aed024b 100644 --- a/src/felix86/v2/recompiler.cpp +++ b/src/felix86/v2/recompiler.cpp @@ -2286,6 +2286,9 @@ void Recompiler::scanAhead(u64 rip) { u64 rip_ahead_true = rip_ahead_false + immediate; // For the flags to not be calculated they need to be overwritten in both paths thrashed_ahead = scan_landing_block(rip_ahead_false) & scan_landing_block(rip_ahead_true); + if (thrashed_ahead == flags_we_care_about) { + WARN("Block with no flags ahead: %lx", initial_rip); + } } else { break; } From 79297a67d21571b0d374dc2dfda0090cbe426a80 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Mon, 1 Jun 2026 18:59:38 +0300 Subject: [PATCH 4/9] . 
--- src/felix86/v2/handlers.cpp | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index c8c32d011..cf2a63ee9 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -129,17 +129,32 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd bool needs_sf = rec.shouldEmitFlag(next_rip, X86_REF_SF); bool needs_of = rec.shouldEmitFlag(next_rip, X86_REF_OF); bool needs_any_flag = needs_cf || needs_of || needs_pf || needs_sf || needs_zf || needs_af; - // If after the next instruction we need any flag, we can't fuse the CMP because the flags will be important later on - if (needs_any_flag) { - return false; - } - auto opt = rec.getNextInstruction(); if (!opt.has_value()) { return false; } auto [next_instruction, next_operands] = *opt; + switch (next_instruction->mnemonic) { + case ZYDIS_MNEMONIC_JL: + case ZYDIS_MNEMONIC_JLE: + case ZYDIS_MNEMONIC_JNL: + case ZYDIS_MNEMONIC_JNLE: + case ZYDIS_MNEMONIC_JB: + case ZYDIS_MNEMONIC_JBE: + case ZYDIS_MNEMONIC_JNB: + case ZYDIS_MNEMONIC_JNBE: { + WARN("Branch %lx, %d %d %d %d %d %d", next_rip, needs_cf, needs_of, needs_pf, needs_sf, needs_zf, needs_af); + break; + } + default: + break; + } + // If after the next instruction we need any flag, we can't fuse the CMP because the flags will be important later on + if (needs_any_flag) { + return false; + } + switch (next_instruction->mnemonic) { case ZYDIS_MNEMONIC_CMOVL: { biscuit::GPR cond = rec.scratch(); @@ -283,7 +298,6 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd case ZYDIS_MNEMONIC_JBE: case ZYDIS_MNEMONIC_JNB: case ZYDIS_MNEMONIC_JNBE: { - PLAIN("Fusing CMP+JCC"); // The earlier check confirmed that no flags are needed after this jump, so we can freely fuse instructions here biscuit::GPR op0 = rec.getGPR(&operands[0]); biscuit::GPR op1 = rec.getGPR(&operands[1]); From 
7a476db6cca7a4253c1f695666531b6f4c8ecef9 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Mon, 1 Jun 2026 19:08:00 +0300 Subject: [PATCH 5/9] . --- src/felix86/v2/recompiler.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/felix86/v2/recompiler.cpp b/src/felix86/v2/recompiler.cpp index 72aed024b..68421c87c 100644 --- a/src/felix86/v2/recompiler.cpp +++ b/src/felix86/v2/recompiler.cpp @@ -2298,27 +2298,27 @@ void Recompiler::scanAhead(u64 rip) { // If the JCC actually uses the flag, that's fine because the flag access will be after the usage // so the instruction handler will emit that flag if (thrashed_ahead & ZYDIS_CPUFLAG_CF) { - flag_access_cpazso[0].push_back({true, rip}); + flag_access_cpazso[0].push_back({true, UINT64_MAX}); } if (thrashed_ahead & ZYDIS_CPUFLAG_PF) { - flag_access_cpazso[1].push_back({true, rip}); + flag_access_cpazso[1].push_back({true, UINT64_MAX}); } if (thrashed_ahead & ZYDIS_CPUFLAG_AF) { - flag_access_cpazso[2].push_back({true, rip}); + flag_access_cpazso[2].push_back({true, UINT64_MAX}); } if (thrashed_ahead & ZYDIS_CPUFLAG_ZF) { - flag_access_cpazso[3].push_back({true, rip}); + flag_access_cpazso[3].push_back({true, UINT64_MAX}); } if (thrashed_ahead & ZYDIS_CPUFLAG_SF) { - flag_access_cpazso[4].push_back({true, rip}); + flag_access_cpazso[4].push_back({true, UINT64_MAX}); } if (thrashed_ahead & ZYDIS_CPUFLAG_OF) { - flag_access_cpazso[5].push_back({true, rip}); + flag_access_cpazso[5].push_back({true, UINT64_MAX}); } } } From 849827088c0105ce4634a4fa722784d043fbcc43 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Mon, 1 Jun 2026 19:09:36 +0300 Subject: [PATCH 6/9] . 
--- src/felix86/v2/recompiler.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/felix86/v2/recompiler.cpp b/src/felix86/v2/recompiler.cpp index 68421c87c..aa73cbbe4 100644 --- a/src/felix86/v2/recompiler.cpp +++ b/src/felix86/v2/recompiler.cpp @@ -2286,9 +2286,6 @@ void Recompiler::scanAhead(u64 rip) { u64 rip_ahead_true = rip_ahead_false + immediate; // For the flags to not be calculated they need to be overwritten in both paths thrashed_ahead = scan_landing_block(rip_ahead_false) & scan_landing_block(rip_ahead_true); - if (thrashed_ahead == flags_we_care_about) { - WARN("Block with no flags ahead: %lx", initial_rip); - } } else { break; } From 45296f8b7181f4d6b48fab4f038f50142b0e2144 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Mon, 1 Jun 2026 19:16:38 +0300 Subject: [PATCH 7/9] . --- src/felix86/v2/handlers.cpp | 42 ++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index cf2a63ee9..8c405afcb 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -129,32 +129,17 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd bool needs_sf = rec.shouldEmitFlag(next_rip, X86_REF_SF); bool needs_of = rec.shouldEmitFlag(next_rip, X86_REF_OF); bool needs_any_flag = needs_cf || needs_of || needs_pf || needs_sf || needs_zf || needs_af; - auto opt = rec.getNextInstruction(); - if (!opt.has_value()) { + // If after the next instruction we need any flag, we can't fuse the CMP because the flags will be important later on + if (needs_any_flag) { return false; } - auto [next_instruction, next_operands] = *opt; - switch (next_instruction->mnemonic) { - case ZYDIS_MNEMONIC_JL: - case ZYDIS_MNEMONIC_JLE: - case ZYDIS_MNEMONIC_JNL: - case ZYDIS_MNEMONIC_JNLE: - case ZYDIS_MNEMONIC_JB: - case ZYDIS_MNEMONIC_JBE: - case ZYDIS_MNEMONIC_JNB: - case ZYDIS_MNEMONIC_JNBE: { - 
WARN("Branch %lx, %d %d %d %d %d %d", next_rip, needs_cf, needs_of, needs_pf, needs_sf, needs_zf, needs_af); - break; - } - default: - break; - } - // If after the next instruction we need any flag, we can't fuse the CMP because the flags will be important later on - if (needs_any_flag) { + auto opt = rec.getNextInstruction(); + if (!opt.has_value()) { return false; } + auto [next_instruction, next_operands] = *opt; switch (next_instruction->mnemonic) { case ZYDIS_MNEMONIC_CMOVL: { biscuit::GPR cond = rec.scratch(); @@ -297,12 +282,17 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd case ZYDIS_MNEMONIC_JB: case ZYDIS_MNEMONIC_JBE: case ZYDIS_MNEMONIC_JNB: - case ZYDIS_MNEMONIC_JNBE: { + case ZYDIS_MNEMONIC_JNBE: + case ZYDIS_MNEMONIC_JZ: + case ZYDIS_MNEMONIC_JNZ: { // The earlier check confirmed that no flags are needed after this jump, so we can freely fuse instructions here biscuit::GPR op0 = rec.getGPR(&operands[0]); biscuit::GPR op1 = rec.getGPR(&operands[1]); biscuit::GPR lhs, rhs; - if (instruction.operand_width != 64) { + bool needs_sext = + instruction.operand_width != 64 && (instruction.mnemonic == ZYDIS_MNEMONIC_JL || instruction.mnemonic == ZYDIS_MNEMONIC_JLE || + instruction.mnemonic == ZYDIS_MNEMONIC_JNL || instruction.mnemonic == ZYDIS_MNEMONIC_JNLE); + if (&&needs_sext) { lhs = rec.scratch(); rhs = rec.scratch(); rec.sext(lhs, op0, rec.zydisToSize(instruction.operand_width)); @@ -352,6 +342,14 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd as.BGTU(lhs, rhs, &true_label); break; } + case ZYDIS_MNEMONIC_JZ: { + as.BEQ(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JNZ: { + as.BNE(lhs, rhs, &true_label); + break; + } default: { UNREACHABLE(); } From 60f958c0fe4029bad9f4211679889eb92ecad97d Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Mon, 1 Jun 2026 19:16:46 +0300 Subject: [PATCH 8/9] . 
--- src/felix86/v2/handlers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index 8c405afcb..25cdc918e 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -292,7 +292,7 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd bool needs_sext = instruction.operand_width != 64 && (instruction.mnemonic == ZYDIS_MNEMONIC_JL || instruction.mnemonic == ZYDIS_MNEMONIC_JLE || instruction.mnemonic == ZYDIS_MNEMONIC_JNL || instruction.mnemonic == ZYDIS_MNEMONIC_JNLE); - if (&&needs_sext) { + if (needs_sext) { lhs = rec.scratch(); rhs = rec.scratch(); rec.sext(lhs, op0, rec.zydisToSize(instruction.operand_width)); From b29421e579095ef490ec1893d94ed0a81f6eaaad Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Mon, 1 Jun 2026 19:19:43 +0300 Subject: [PATCH 9/9] . --- src/felix86/v2/handlers.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index 25cdc918e..f0886f50a 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -289,9 +289,10 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd biscuit::GPR op0 = rec.getGPR(&operands[0]); biscuit::GPR op1 = rec.getGPR(&operands[1]); biscuit::GPR lhs, rhs; - bool needs_sext = - instruction.operand_width != 64 && (instruction.mnemonic == ZYDIS_MNEMONIC_JL || instruction.mnemonic == ZYDIS_MNEMONIC_JLE || - instruction.mnemonic == ZYDIS_MNEMONIC_JNL || instruction.mnemonic == ZYDIS_MNEMONIC_JNLE); + bool needs_sext = instruction.operand_width != 64; + // TODO: zero-extend the immediate in op1 when not sign extending and add the below condition (on the jump, i.e. next_instruction, not the CMP) + // && (next_instruction->mnemonic == ZYDIS_MNEMONIC_JL || next_instruction->mnemonic == ZYDIS_MNEMONIC_JLE || + // next_instruction->mnemonic == ZYDIS_MNEMONIC_JNL || next_instruction->mnemonic ==
ZYDIS_MNEMONIC_JNLE); if (needs_sext) { lhs = rec.scratch(); rhs = rec.scratch();