diff --git a/counts/Base.json b/counts/Base.json index acba4f00a..26d174812 100644 --- a/counts/Base.json +++ b/counts/Base.json @@ -11607,13 +11607,12 @@ "disassembly": "mul [rdi]" }, "48f727": { - "instruction_count": 6, + "instruction_count": 5, "expected_asm": [ "LD ra, a0, 0x0(0)", - "MULHU t3, t0, ra", + "MULHU a2, t0, ra", "MUL t0, t0, ra", - "ADDI a2, t3, 0x0(0)", - "SLTU s5, zero, t3", + "SLTU s5, zero, a2", "ADDI s9, s5, 0x0(0)" ], "disassembly": "mul [rdi]" diff --git a/counts/Base_NoFlags.json b/counts/Base_NoFlags.json index e28e2ad3f..ff04e675d 100644 --- a/counts/Base_NoFlags.json +++ b/counts/Base_NoFlags.json @@ -5585,12 +5585,11 @@ "disassembly": "mul [rdi]" }, "48f727": { - "instruction_count": 4, + "instruction_count": 3, "expected_asm": [ "LD ra, a0, 0x0(0)", - "MULHU t3, t0, ra", - "MUL t0, t0, ra", - "ADDI a2, t3, 0x0(0)" + "MULHU a2, t0, ra", + "MUL t0, t0, ra" ], "disassembly": "mul [rdi]" }, diff --git a/external/biscuit/include/biscuit/literal.hpp b/external/biscuit/include/biscuit/literal.hpp index d62d00dcc..436f9e55c 100644 --- a/external/biscuit/include/biscuit/literal.hpp +++ b/external/biscuit/include/biscuit/literal.hpp @@ -35,7 +35,7 @@ namespace biscuit { * as.JR(x2); // Execution continues elsewhere * as.Place(&literal); // Place the literal at this location in the buffer * @endcode -*/ + */ template class Literal { public: @@ -104,6 +104,10 @@ class Literal { return m_location; } + [[nodiscard]] T GetValue() const noexcept { + return m_value; + } + private: // A literal instance is inherently bound to the assembler it's // used with, as the offsets within the literal set depend on diff --git a/src/felix86/common/config.inc b/src/felix86/common/config.inc index 95112cdeb..a1b5e317b 100644 --- a/src/felix86/common/config.inc +++ b/src/felix86/common/config.inc @@ -53,3 +53,4 @@ X(Performance, bool, auto_compress, false, FELIX86_AUTO_COMPRESS, "Automatically X(Performance, bool, scan_ahead_multi, true, FELIX86_SCAN_AHEAD_MULTI, "Scan ahead to multiple blocks when possible, avoiding even more flag calculations", false) X(Performance, bool, pclmulqdq, true, FELIX86_PCLMULQDQ, "Enable the PCLMULQDQ instruction, might improve performance in some applications", false) X(Performance, bool, no_address_overflow, true, FELIX86_NO_ADDRESS_OVERFLOW, "Assume addresses won't overflow in 32-bit apps, which allows for some optimizations", false) +X(Performance, bool, literal_pooling, true, FELIX86_LITERAL_POOLING, "Place 64-bit immediates in a literal pool after the block", false) diff --git a/src/felix86/repl.cpp b/src/felix86/repl.cpp index e1c909189..c2a08c6f1 100644 --- a/src/felix86/repl.cpp +++ b/src/felix86/repl.cpp @@ -169,6 +169,7 @@ void __attribute__((noreturn)) enter_repl() { g_config.quiet = true; g_config.inline_syscalls = false; g_config.scan_ahead_multi = false; + g_config.literal_pooling = false; Extensions::G = true; Extensions::B = true; Extensions::C = true; diff --git a/src/felix86/tools/generate_instruction_count.cpp b/src/felix86/tools/generate_instruction_count.cpp index 043b6f20a..c21e77cfd 100644 --- a/src/felix86/tools/generate_instruction_count.cpp +++ b/src/felix86/tools/generate_instruction_count.cpp @@ -287,6 +287,7 @@ u8 outlast_camera2[] = { int main() { g_config.inline_syscalls = false; g_config.scan_ahead_multi = false; + g_config.literal_pooling = false; Extensions::G = true; Extensions::B = true; Extensions::C = true; diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index c578dc367..c067c1759 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -574,7 +574,15 @@ FAST_HANDLE(MOV) { bool not_same = rec.zydisToRef(operands[0].reg.value) != rec.zydisToRef(operands[1].reg.value); bool mem_reg = operands[0].type == ZYDIS_OPERAND_TYPE_MEMORY && operands[1].type == ZYDIS_OPERAND_TYPE_REGISTER; bool reg_mem = operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER && operands[1].type == ZYDIS_OPERAND_TYPE_MEMORY; - if (not_same && reg_reg) { + bool reg_imm64 = g_config.literal_pooling && operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER && operands[0].size == 64 && + operands[1].type == ZYDIS_OPERAND_TYPE_IMMEDIATE && operands[1].size == 64; + if (reg_imm64) { + u64 immediate = operands[1].imm.value.u; + Literal* literal = rec.pushPendingLiteral(immediate); + biscuit::GPR reg = rec.getGPR(&operands[0]); + as.LD(reg, literal); + rec.setGPR(&operands[0], reg); + } else if (not_same && reg_reg) { // Save a mask by doing it this way biscuit::GPR src = rec.getGPR(&operands[1], X86_SIZE_QWORD); if (rec.zydisToSize(operands[1].reg.value) == X86_SIZE_BYTE_HIGH) { @@ -3716,7 +3724,8 @@ FAST_HANDLE(MUL) { break; } case X86_SIZE_QWORD: { - biscuit::GPR result = rec.scratch(); + bool is_src_rdx = operands[0].type == ZYDIS_OPERAND_TYPE_REGISTER && rec.zydisToRef(operands[0].reg.value) == X86_REF_RDX; + biscuit::GPR result = is_src_rdx ? rec.scratch() : rec.getGPR(X86_REF_RDX, X86_SIZE_QWORD); biscuit::GPR rax = rec.getGPR(X86_REF_RAX, X86_SIZE_QWORD); as.MULHU(result, rax, src); as.MUL(rax, rax, src); diff --git a/src/felix86/v2/recompiler.cpp b/src/felix86/v2/recompiler.cpp index 8bb55ca2e..fc6cf4f46 100644 --- a/src/felix86/v2/recompiler.cpp +++ b/src/felix86/v2/recompiler.cpp @@ -422,6 +422,9 @@ u64 Recompiler::compile(ThreadState* state, u64 rip) { u64 end = (u64)as.GetCursorPointer(); + // Place literal pool after the block + expirePendingLiterals(); + ASSERT(end - start >= 8); // At least 2 instructions, so that our unlinking logic works host_pc_map[block_meta.address_end - 1] = &block_meta; @@ -2728,6 +2731,15 @@ void Recompiler::expirePendingLinks(u64 rip) { block_meta.pending_links.clear(); } +void Recompiler::expirePendingLiterals() { + if (g_config.literal_pooling) { + for (auto& literal : pending_literals) { + as.Place(&literal); + } + pending_literals.clear(); + } +} + u64 Recompiler::zextImmediate(u64 imm, ZyanU8 size) { switch (size) { case 8: { diff --git a/src/felix86/v2/recompiler.hpp b/src/felix86/v2/recompiler.hpp index 288026bbf..8062d6482 100644 --- a/src/felix86/v2/recompiler.hpp +++ b/src/felix86/v2/recompiler.hpp @@ -662,6 +662,20 @@ struct Recompiler { std::pair getNextInstruction(); + biscuit::Literal* pushPendingLiteral(u64 value) { + for (auto& item : pending_literals) { + if (item.GetValue() == value) { + // Literal already exists, don't push it again + return &item; + } + } + + pending_literals.push_back(biscuit::Literal{value}); + return &pending_literals.back(); + } + + void expirePendingLiterals(); + private: struct FlagAccess { bool modification; // true if modified, false if used @@ -761,6 +775,8 @@ struct Recompiler { bool relocatable = false; + std::vector> pending_literals; + constexpr static std::array scratch_gprs = { x1, x6, x28, x29, x7, x30, x31, };