diff --git a/src/felix86/common/config.inc b/src/felix86/common/config.inc index 76b5d185b..365ff0994 100644 --- a/src/felix86/common/config.inc +++ b/src/felix86/common/config.inc @@ -70,3 +70,4 @@ X(Performance, bool, auto_compress, false, FELIX86_AUTO_COMPRESS, "Automatically X(Performance, bool, scan_ahead_multi, true, FELIX86_SCAN_AHEAD_MULTI, "Scan ahead to multiple blocks when possible, avoiding even more flag calculations") X(Performance, bool, no_address_overflow, true, FELIX86_NO_ADDRESS_OVERFLOW, "Assume addresses won't overflow in 32-bit apps, which allows for some optimizations") X(Performance, bool, group_loadstore, true, FELIX86_GROUP_LOADSTORE, "Load/store SIMD state in groups of eight, may improve performance") +X(Performance, bool, aligned_tso_optimizations, false, FELIX86_ALIGNED_TSO_OPTIMIZATIONS, "Use atomics for TSO emulation, replace with fenced load/stores if unaligned, may improve performance") diff --git a/src/felix86/hle/signals.cpp b/src/felix86/hle/signals.cpp index dfef33cd0..3d6a20a2b 100644 --- a/src/felix86/hle/signals.cpp +++ b/src/felix86/hle/signals.cpp @@ -1169,8 +1169,95 @@ bool handle_wild_sigabrt(ThreadState* current_state, siginfo_t* info, ucontext_t } } +bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, ucontext_t* context, u64 pc) { + if (!is_in_jit_code(current_state, (u8*)pc)) { + return false; + } + + if (!g_config.aligned_tso_optimizations) { + return false; + } + + u32 current_instruction; + current_instruction = *(u32*)pc; + + u32 mask_swap = (0b11111 << 27) | 0b1111111; + u32 expected_swap = (0b00001 << 27) | 0b0101111; + u32 mask_add = (0b11111 << 27) | 0b1111111; + u32 expected_add = (0b00000 << 27) | 0b0101111; + bool is_amoswap = (current_instruction & mask_swap) == expected_swap; + bool is_amoadd = (current_instruction & mask_add) == expected_add; + if (!is_amoswap && !is_amoadd) { + return false; + } + + u32 size = (current_instruction >> 12) & 0b111; + ASSERT(size == 0b001 || size == 0b010 || size == 0b011); + + u32 rd = (current_instruction >> 7) & 0b11111; + if (is_amoswap && rd != 0) { + return false; + } + + u32 rs = (current_instruction >> 20) & 0b11111; + if (is_amoadd && rs != 0) { + return false; + } + + // It's an unaligned AMOSWAP used for TSO emulation, replace it with store instruction + fence + u32 address = (current_instruction >> 15) & 0b11111; + if (is_amoadd) { + Assembler cas((u8*)pc + 4, sizeof(u32)); + switch (size) { + case 0b001: { + ASSERT(Extensions::Zabha); + cas.LHU(biscuit::GPR(rd), 0, biscuit::GPR(address)); + break; + } + case 0b010: { + cas.LWU(biscuit::GPR(rd), 0, biscuit::GPR(address)); + break; + } + case 0b011: { + cas.LD(biscuit::GPR(rd), 0, biscuit::GPR(address)); + break; + } + default: { + UNREACHABLE(); + } + } + Assembler pas((u8*)(pc), sizeof(u32)); + pas.FENCE(FenceOrder::RW, FenceOrder::W); + } else { + ASSERT(is_amoswap); + Assembler cas((u8*)pc, sizeof(u32)); + switch (size) { + case 0b001: { + ASSERT(Extensions::Zabha); + cas.SH(biscuit::GPR(rs), 0, biscuit::GPR(address)); + break; + } + case 0b010: { + cas.SW(biscuit::GPR(rs), 0, biscuit::GPR(address)); + break; + } + case 0b011: { + cas.SD(biscuit::GPR(rs), 0, biscuit::GPR(address)); + break; + } + default: { + UNREACHABLE(); + } + } + Assembler pas((u8*)(pc + 4), sizeof(u32)); + pas.FENCE(FenceOrder::RW, FenceOrder::W); + } + + flush_icache_global(pc, pc + 8); + return true; +} + bool handle_synchronous(ThreadState* current_state, siginfo_t* info, ucontext_t* context, u64 pc) { - // We can't cause a SIGSEGV SI_KERNEL from RISC-V, so fix up info->si_code to match x86 behavior if (!is_in_jit_code(current_state, (u8*)pc)) { return false; } @@ -1209,6 +1296,7 @@ bool handle_synchronous(ThreadState* current_state, siginfo_t* info, ucontext_t* u64 actual_rip = get_actual_rip(*current_block, pc); int sig; + // We can't cause a SIGSEGV SI_KERNEL from RISC-V, so fix up info->si_code to match x86 behavior if (next_instruction == expected_hlt) { sig = SIGSEGV; info->si_code = SI_KERNEL; @@ -1233,8 +1321,11 @@ bool handle_synchronous(ThreadState* current_state, siginfo_t* info, ucontext_t* return true; } -constexpr std::array host_signals = {{ +constexpr std::array host_signals = {{ {SIGSEGV, SEGV_ACCERR, handle_safepoint}, + // Note: Regrettably this causes the same fault as the SMC handling fault with no way to detect + // However, since we patch the atomics, if a fault happens again it will be properly handled by handle_smc + {SIGSEGV, SEGV_ACCERR, handle_unaligned_tso_atomic}, {SIGSEGV, SEGV_ACCERR, handle_smc}, {SIGSEGV, SEGV_MAPERR, handle_synchronous}, {SIGILL, 0, handle_breakpoint}, diff --git a/src/felix86/v2/recompiler.cpp b/src/felix86/v2/recompiler.cpp index f97032d48..ec9aa1125 100644 --- a/src/felix86/v2/recompiler.cpp +++ b/src/felix86/v2/recompiler.cpp @@ -2867,22 +2867,57 @@ biscuit::GPR Recompiler::getCond(int cond) { } void Recompiler::readMemory(biscuit::GPR dest, biscuit::GPR address, i64 offset, x86_size_e size) { - // Warning: Don't change the LBU->LB etc. here, they must zero extend + bool emulate_tso = g_config.always_tso && !Extensions::TSO && !(g_config.no_tso_stack && current_instruction_on_stack && !g_config.paranoid); + bool use_atomics = g_config.aligned_tso_optimizations && offset == 0; switch (size) { case X86_SIZE_BYTE: { - as.LBU(dest, offset, address); + use_atomics &= Extensions::Zabha; + if (emulate_tso && use_atomics) { + as.AMOADD_B(Ordering::AQ, dest, x0, address); + as.ANDI(dest, dest, 0xFF); + } else { + as.LBU(dest, offset, address); + if (emulate_tso) { + as.FENCE(FenceOrder::R, FenceOrder::RW); + } + } break; } case X86_SIZE_WORD: { - as.LHU(dest, offset, address); + use_atomics &= Extensions::Zabha; + if (emulate_tso && use_atomics) { + as.AMOADD_H(Ordering::AQ, dest, x0, address); + as.ZEXTH(dest, dest); + } else { + as.LHU(dest, offset, address); + if (emulate_tso) { + as.FENCE(FenceOrder::R, FenceOrder::RW); + } + } break; } case X86_SIZE_DWORD: { - as.LWU(dest, offset, address); + if (emulate_tso && use_atomics) { + as.AMOADD_W(Ordering::AQ, dest, x0, address); + as.ZEXTW(dest, dest); + } else { + as.LWU(dest, offset, address); + if (emulate_tso) { + as.FENCE(FenceOrder::R, FenceOrder::RW); + } + } break; } case X86_SIZE_QWORD: { - as.LD(dest, offset, address); + if (emulate_tso && use_atomics) { + as.AMOADD_D(Ordering::AQ, dest, x0, address); + as.NOP(); + } else { + as.LD(dest, offset, address); + if (emulate_tso) { + as.FENCE(FenceOrder::R, FenceOrder::RW); + } + } break; } default: { @@ -2890,10 +2925,6 @@ void Recompiler::readMemory(biscuit::GPR dest, biscuit::GPR address, i64 offset, break; } } - - if (g_config.always_tso && !Extensions::TSO && !(g_config.no_tso_stack && current_instruction_on_stack && !g_config.paranoid)) { - as.FENCE(FenceOrder::R, FenceOrder::RW); - } } void Recompiler::readMemory(biscuit::Vec vec, biscuit::GPR address, int size) { @@ -2936,25 +2967,56 @@ void Recompiler::readMemory(biscuit::Vec vec, biscuit::GPR address, int size) { } void Recompiler::writeMemory(biscuit::GPR src, biscuit::GPR address, i64 offset, x86_size_e size) { - if (g_config.always_tso && !Extensions::TSO && !(g_config.no_tso_stack && current_instruction_on_stack && !g_config.paranoid)) { - as.FENCE(FenceOrder::RW, FenceOrder::W); - } - + bool emulate_tso = g_config.always_tso && !Extensions::TSO && !(g_config.no_tso_stack && current_instruction_on_stack && !g_config.paranoid); + bool use_atomics = g_config.aligned_tso_optimizations && offset == 0; switch (size) { case X86_SIZE_BYTE: { - as.SB(src, offset, address); + use_atomics &= Extensions::Zabha; + if (emulate_tso && use_atomics) { + as.AMOSWAP_B(Ordering::RL, x0, src, address); + } else { + if (emulate_tso) { + as.FENCE(FenceOrder::RW, FenceOrder::W); + } + as.SB(src, offset, address); + } break; } case X86_SIZE_WORD: { - as.SH(src, offset, address); + use_atomics &= Extensions::Zabha; + if (emulate_tso && use_atomics) { + as.AMOSWAP_H(Ordering::RL, x0, src, address); + as.NOP(); + } else { + if (emulate_tso) { + as.FENCE(FenceOrder::RW, FenceOrder::W); + } + as.SH(src, offset, address); + } break; } case X86_SIZE_DWORD: { - as.SW(src, offset, address); + if (emulate_tso && use_atomics) { + as.AMOSWAP_W(Ordering::RL, x0, src, address); + as.NOP(); + } else { + if (emulate_tso) { + as.FENCE(FenceOrder::RW, FenceOrder::W); + } + as.SW(src, offset, address); + } break; } case X86_SIZE_QWORD: { - as.SD(src, offset, address); + if (emulate_tso && use_atomics) { + as.AMOSWAP_D(Ordering::RL, x0, src, address); + as.NOP(); + } else { + if (emulate_tso) { + as.FENCE(FenceOrder::RW, FenceOrder::W); + } + as.SD(src, offset, address); + } break; } default: {