diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index e89952613..f0886f50a 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -1,5 +1,4 @@ #include -#include #include #include "Zydis/DecoderTypes.h" #include "Zydis/SharedTypes.h" @@ -135,7 +134,12 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd return false; } - auto [next_instruction, next_operands] = rec.getNextInstruction(); + auto opt = rec.getNextInstruction(); + if (!opt.has_value()) { + return false; + } + + auto [next_instruction, next_operands] = *opt; switch (next_instruction->mnemonic) { case ZYDIS_MNEMONIC_CMOVL: { biscuit::GPR cond = rec.scratch(); @@ -271,6 +275,115 @@ static inline bool AttemptCmpFusing(Recompiler& rec, u64 rip, Assembler& as, Zyd rec.skipNext(); return true; } + case ZYDIS_MNEMONIC_JL: + case ZYDIS_MNEMONIC_JLE: + case ZYDIS_MNEMONIC_JNL: + case ZYDIS_MNEMONIC_JNLE: + case ZYDIS_MNEMONIC_JB: + case ZYDIS_MNEMONIC_JBE: + case ZYDIS_MNEMONIC_JNB: + case ZYDIS_MNEMONIC_JNBE: + case ZYDIS_MNEMONIC_JZ: + case ZYDIS_MNEMONIC_JNZ: { + // The earlier check confirmed that no flags are needed after this jump, so we can freely fuse instructions here + biscuit::GPR op0 = rec.getGPR(&operands[0]); + biscuit::GPR op1 = rec.getGPR(&operands[1]); + biscuit::GPR lhs, rhs; + bool needs_sext = instruction.operand_width != 64; + // TODO: zero-extend the immediate in op1 when not sign extending and add the below condition + // && (next_instruction->mnemonic == ZYDIS_MNEMONIC_JL || next_instruction->mnemonic == ZYDIS_MNEMONIC_JLE || + // next_instruction->mnemonic == ZYDIS_MNEMONIC_JNL || next_instruction->mnemonic == ZYDIS_MNEMONIC_JNLE); + if (needs_sext) { + lhs = rec.scratch(); + rhs = rec.scratch(); + rec.sext(lhs, op0, rec.zydisToSize(instruction.operand_width)); + rec.sext(rhs, op1, rec.zydisToSize(instruction.operand_width)); + } else { + lhs = op0; + rhs = op1; + } + + if (g_config.auto_compress) { + 
as.DisableOptimization(Optimization::AutoCompress); + } + u64 immediate = rec.sextImmediate(rec.getImmediate(&next_operands[0]), next_operands[0].imm.size); + u64 rip_false = next_rip + next_instruction->length; + u64 rip_true = rip_false + immediate; + Label true_label; + switch (next_instruction->mnemonic) { + case ZYDIS_MNEMONIC_JL: { + as.BLT(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JLE: { + as.BLE(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JNL: { + as.BGE(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JNLE: { + as.BGT(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JB: { + as.BLTU(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JBE: { + as.BLEU(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JNB: { + as.BGEU(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JNBE: { + as.BGTU(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JZ: { + as.BEQ(lhs, rhs, &true_label); + break; + } + case ZYDIS_MNEMONIC_JNZ: { + as.BNE(lhs, rhs, &true_label); + break; + } + default: { + UNREACHABLE(); + } + } + + biscuit::GPR ripreg = rec.allocatedGPR(X86_REF_RIP); + u64 rip_false_offset = rip_false - rec.getCurrentRipregValue(); + rec.addi(ripreg, ripreg, rip_false_offset); + if (g_mode32) { + rec.zext(ripreg, ripreg, X86_SIZE_DWORD); + rip_false = (u32)rip_false; + } + + as.AUIPC(t5, 0); // <- must be before link point, see invalidate_caller_thunk + rec.jumpAndLink(rip_false); + + as.Bind(&true_label); + u64 rip_true_offset = rip_true - rec.getCurrentRipregValue(); + rec.addi(ripreg, ripreg, rip_true_offset); + if (g_mode32) { + rec.zext(ripreg, ripreg, X86_SIZE_DWORD); + rip_true = (u32)rip_true; + } + + as.AUIPC(t5, 0); // <- must be before link point, see invalidate_caller_thunk + rec.jumpAndLink(rip_true); + rec.skipNext(); + rec.stopCompiling(); + if (g_config.auto_compress) { + as.EnableOptimization(Optimization::AutoCompress); + } + return true; + } default: { break; } diff --git 
a/src/felix86/v2/recompiler.cpp b/src/felix86/v2/recompiler.cpp index 971d2a277..aa73cbbe4 100644 --- a/src/felix86/v2/recompiler.cpp +++ b/src/felix86/v2/recompiler.cpp @@ -715,8 +715,10 @@ u64 Recompiler::compileSequence(u64 rip) { return rip; } -std::pair Recompiler::getNextInstruction() { - ASSERT(instructions.size() > current_instruction_index + 1); +std::optional> Recompiler::getNextInstruction() { + if (current_instruction_index + 1 >= instructions.size()) { + return std::nullopt; + } auto& [instruction, operands] = instructions[current_instruction_index + 1]; return std::make_pair(&instruction, operands); } @@ -2199,13 +2201,14 @@ void Recompiler::scanAhead(u64 rip) { // If all the landing places overwrite the flags (1 landing spot for jmp, 2 for jcc) // then we can skip those flag calculations if (is_jump && operands[0].type == ZYDIS_OPERAND_TYPE_IMMEDIATE) { + u32 flags_we_care_about = + ZYDIS_CPUFLAG_OF | ZYDIS_CPUFLAG_CF | ZYDIS_CPUFLAG_ZF | ZYDIS_CPUFLAG_SF | ZYDIS_CPUFLAG_AF | ZYDIS_CPUFLAG_PF; auto scan_landing_block = [&](u64 rip_ahead) { bool jump_to_self = rip_ahead == initial_rip; ZydisDecodedInstruction instruction_ahead; u32 changed_this_block = 0; u32 used_this_block = 0; - u32 flags_we_care_about = - ZYDIS_CPUFLAG_OF | ZYDIS_CPUFLAG_CF | ZYDIS_CPUFLAG_ZF | ZYDIS_CPUFLAG_SF | ZYDIS_CPUFLAG_AF | ZYDIS_CPUFLAG_PF; + // 10 is heuristically picked with no real reason // If we go too high we risk messing our performance // TODO: some benchmarking may be in order @@ -2292,27 +2295,27 @@ void Recompiler::scanAhead(u64 rip) { // If the JCC actually uses the flag, that's fine because the flag access will be after the usage // so the instruction handler will emit that flag if (thrashed_ahead & ZYDIS_CPUFLAG_CF) { - flag_access_cpazso[0].push_back({true, rip}); + flag_access_cpazso[0].push_back({true, UINT64_MAX}); } if (thrashed_ahead & ZYDIS_CPUFLAG_PF) { - flag_access_cpazso[1].push_back({true, rip}); + flag_access_cpazso[1].push_back({true, 
UINT64_MAX}); } if (thrashed_ahead & ZYDIS_CPUFLAG_AF) { - flag_access_cpazso[2].push_back({true, rip}); + flag_access_cpazso[2].push_back({true, UINT64_MAX}); } if (thrashed_ahead & ZYDIS_CPUFLAG_ZF) { - flag_access_cpazso[3].push_back({true, rip}); + flag_access_cpazso[3].push_back({true, UINT64_MAX}); } if (thrashed_ahead & ZYDIS_CPUFLAG_SF) { - flag_access_cpazso[4].push_back({true, rip}); + flag_access_cpazso[4].push_back({true, UINT64_MAX}); } if (thrashed_ahead & ZYDIS_CPUFLAG_OF) { - flag_access_cpazso[5].push_back({true, rip}); + flag_access_cpazso[5].push_back({true, UINT64_MAX}); } } } diff --git a/src/felix86/v2/recompiler.hpp b/src/felix86/v2/recompiler.hpp index 2e5368061..81b265ff1 100644 --- a/src/felix86/v2/recompiler.hpp +++ b/src/felix86/v2/recompiler.hpp @@ -702,7 +702,7 @@ struct Recompiler { current_ripreg_value = value; } - std::pair getNextInstruction(); + std::optional> getNextInstruction(); private: struct FlagAccess {