From 0904836d69f522cc9a5e53f4aaa98a68a0b42cf9 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Tue, 26 May 2026 21:23:51 +0300 Subject: [PATCH 1/7] . --- src/felix86/common/config.inc | 1 + src/felix86/hle/signals.cpp | 97 ++++++++++++++++++++++++++++++++++- src/felix86/v2/recompiler.cpp | 96 ++++++++++++++++++++++++++++------ 3 files changed, 175 insertions(+), 19 deletions(-) diff --git a/src/felix86/common/config.inc b/src/felix86/common/config.inc index 76b5d185b..365ff0994 100644 --- a/src/felix86/common/config.inc +++ b/src/felix86/common/config.inc @@ -70,3 +70,4 @@ X(Performance, bool, auto_compress, false, FELIX86_AUTO_COMPRESS, "Automatically X(Performance, bool, scan_ahead_multi, true, FELIX86_SCAN_AHEAD_MULTI, "Scan ahead to multiple blocks when possible, avoiding even more flag calculations") X(Performance, bool, no_address_overflow, true, FELIX86_NO_ADDRESS_OVERFLOW, "Assume addresses won't overflow in 32-bit apps, which allows for some optimizations") X(Performance, bool, group_loadstore, true, FELIX86_GROUP_LOADSTORE, "Load/store SIMD state in groups of eight, may improve performance") +X(Performance, bool, aligned_tso_optimizations, false, FELIX86_ALIGNED_TSO_OPTIMIZATIONS, "Use atomics for TSO emulation, replace with fenced load/stores if unaligned, may improve performance") diff --git a/src/felix86/hle/signals.cpp b/src/felix86/hle/signals.cpp index dfef33cd0..695fe6b2b 100644 --- a/src/felix86/hle/signals.cpp +++ b/src/felix86/hle/signals.cpp @@ -1169,8 +1169,99 @@ bool handle_wild_sigabrt(ThreadState* current_state, siginfo_t* info, ucontext_t } } +bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, ucontext_t* context, u64 pc) { + if (!is_in_jit_code(current_state, (u8*)pc)) { + return false; + } + + if (!g_config.aligned_tso_optimizations) { + return false; + } + + u32 current_instruction; + current_instruction = *(u32*)pc; + + u32 mask_swap = (0b11111 << 27) | 0b1111111; + u32 expected_swap = (0b00001 << 27) | 0b0101111; + u32 mask_add = (0b11111 << 27) | 0b1111111; + u32 expected_add = (0b00000 << 27) | 0b0101111; + bool is_amoswap = (current_instruction & mask_swap) == expected_swap; + bool is_amoadd = (current_instruction & mask_add) != expected_add; + if (!is_amoswap && !is_amoadd) { + WARN("BUS_ADRALN caused but not by AMOSWAP or AMOADD"); + return false; + } + + u32 size = (current_instruction >> 12) & 0b111; + ASSERT(size == 0b001 || size == 0b010 || size == 0b011); + + u32 rd = (current_instruction >> 7) & 0b11111; + if (rd != 0) { + WARN("AMOSWAP or AMOADD caused BUS_ADRALN but rd isn't x0"); + return false; + } + + u32 nop; + { + Assembler tas((u8*)&nop, sizeof(u32)); + tas.NOP(); + } + + // It's an unaligned AMOSWAP used for TSO emulation, replace it with store instruction + fence + u32 rs = (current_instruction >> 20) & 0b11111; + u32 address = (current_instruction >> 15) & 0b11111; + if (is_amoadd) { + Assembler cas((u8*)pc + 4, sizeof(u32)); + switch (size) { + case 0b001: { + ASSERT(Extensions::Zabha); + cas.LH(biscuit::GPR(rs), 0, biscuit::GPR(address)); + break; + } + case 0b010: { + cas.LW(biscuit::GPR(rs), 0, biscuit::GPR(address)); + break; + } + case 0b011: { + cas.LD(biscuit::GPR(rs), 0, biscuit::GPR(address)); + break; + } + default: { + UNREACHABLE(); + } + } + Assembler pas((u8*)(pc), sizeof(u32)); + pas.FENCE(FenceOrder::RW, FenceOrder::W); + } else { + ASSERT(is_amoswap); + Assembler cas((u8*)pc, sizeof(u32)); + switch (size) { + case 0b001: { + ASSERT(Extensions::Zabha); + cas.SH(biscuit::GPR(rs), 0, biscuit::GPR(address)); + break; + } + case 0b010: { + cas.SW(biscuit::GPR(rs), 0, biscuit::GPR(address)); + break; + } + case 0b011: { + cas.SD(biscuit::GPR(rs), 0, biscuit::GPR(address)); + break; + } + default: { + UNREACHABLE(); + } + } + Assembler pas((u8*)(pc + 4), sizeof(u32)); + pas.FENCE(FenceOrder::RW, FenceOrder::W); + } + + flush_icache_global(pc, pc + 4); + return true; +} + bool handle_synchronous(ThreadState* current_state, siginfo_t* info, ucontext_t* context, u64 pc) { - // We can't cause a SIGSEGV SI_KERNEL from RISC-V, so fix up info->si_code to match x86 behavior if (!is_in_jit_code(current_state, (u8*)pc)) { return false; } @@ -1209,6 +1300,7 @@ bool handle_synchronous(ThreadState* current_state, siginfo_t* info, ucontext_t* u64 actual_rip = get_actual_rip(*current_block, pc); int sig; + // We can't cause a SIGSEGV SI_KERNEL from RISC-V, so fix up info->si_code to match x86 behavior if (next_instruction == expected_hlt) { sig = SIGSEGV; info->si_code = SI_KERNEL; @@ -1233,7 +1325,8 @@ bool handle_synchronous(ThreadState* current_state, siginfo_t* info, ucontext_t* return true; } -constexpr std::array host_signals = {{ +constexpr std::array host_signals = {{ + {SIGSEGV, SEGV_ACCERR, handle_unaligned_tso_atomic}, {SIGSEGV, SEGV_ACCERR, handle_safepoint}, {SIGSEGV, SEGV_ACCERR, handle_smc}, {SIGSEGV, SEGV_MAPERR, handle_synchronous}, diff --git a/src/felix86/v2/recompiler.cpp b/src/felix86/v2/recompiler.cpp index f97032d48..ec9aa1125 100644 --- a/src/felix86/v2/recompiler.cpp +++ b/src/felix86/v2/recompiler.cpp @@ -2867,22 +2867,57 @@ biscuit::GPR Recompiler::getCond(int cond) { } void Recompiler::readMemory(biscuit::GPR dest, biscuit::GPR address, i64 offset, x86_size_e size) { - // Warning: Don't change the LBU->LB etc. here, they must zero extend + bool emulate_tso = g_config.always_tso && !Extensions::TSO && !(g_config.no_tso_stack && current_instruction_on_stack && !g_config.paranoid); + bool use_atomics = g_config.aligned_tso_optimizations && offset == 0; switch (size) { case X86_SIZE_BYTE: { - as.LBU(dest, offset, address); + use_atomics &= Extensions::Zabha; + if (emulate_tso && use_atomics) { + as.AMOADD_B(Ordering::AQ, dest, x0, address); + as.ANDI(dest, dest, 0xFF); + } else { + as.LBU(dest, offset, address); + if (emulate_tso) { + as.FENCE(FenceOrder::R, FenceOrder::RW); + } + } break; } case X86_SIZE_WORD: { - as.LHU(dest, offset, address); + use_atomics &= Extensions::Zabha; + if (emulate_tso && use_atomics) { + as.AMOADD_H(Ordering::AQ, dest, x0, address); + as.ZEXTH(dest, dest); + } else { + as.LHU(dest, offset, address); + if (emulate_tso) { + as.FENCE(FenceOrder::R, FenceOrder::RW); + } + } break; } case X86_SIZE_DWORD: { - as.LWU(dest, offset, address); + if (emulate_tso && use_atomics) { + as.AMOADD_W(Ordering::AQ, dest, x0, address); + as.ZEXTW(dest, dest); + } else { + as.LWU(dest, offset, address); + if (emulate_tso) { + as.FENCE(FenceOrder::R, FenceOrder::RW); + } + } break; } case X86_SIZE_QWORD: { - as.LD(dest, offset, address); + if (emulate_tso && use_atomics) { + as.AMOADD_D(Ordering::AQ, dest, x0, address); + as.NOP(); + } else { + as.LD(dest, offset, address); + if (emulate_tso) { + as.FENCE(FenceOrder::R, FenceOrder::RW); + } + } break; } default: { @@ -2890,10 +2925,6 @@ void Recompiler::readMemory(biscuit::GPR dest, biscuit::GPR address, i64 offset, break; } } - - if (g_config.always_tso && !Extensions::TSO && !(g_config.no_tso_stack && current_instruction_on_stack && !g_config.paranoid)) { - as.FENCE(FenceOrder::R, FenceOrder::RW); - } } void Recompiler::readMemory(biscuit::Vec vec, biscuit::GPR address, int size) { @@ -2936,25 +2967,56 @@ void Recompiler::readMemory(biscuit::Vec vec, biscuit::GPR address, int size) { } void Recompiler::writeMemory(biscuit::GPR src, biscuit::GPR address, i64 offset, x86_size_e size) { - if (g_config.always_tso && !Extensions::TSO && !(g_config.no_tso_stack && current_instruction_on_stack && !g_config.paranoid)) { - as.FENCE(FenceOrder::RW, FenceOrder::W); - } - + bool emulate_tso = g_config.always_tso && !Extensions::TSO && !(g_config.no_tso_stack && current_instruction_on_stack && !g_config.paranoid); + bool use_atomics = g_config.aligned_tso_optimizations && offset == 0; switch (size) { case X86_SIZE_BYTE: { - as.SB(src, offset, address); + use_atomics &= Extensions::Zabha; + if (emulate_tso && use_atomics) { + as.AMOSWAP_B(Ordering::RL, x0, src, address); + } else { + if (emulate_tso) { + as.FENCE(FenceOrder::RW, FenceOrder::W); + } + as.SB(src, offset, address); + } break; } case X86_SIZE_WORD: { - as.SH(src, offset, address); + use_atomics &= Extensions::Zabha; + if (emulate_tso && use_atomics) { + as.AMOSWAP_H(Ordering::RL, x0, src, address); + as.NOP(); + } else { + if (emulate_tso) { + as.FENCE(FenceOrder::RW, FenceOrder::W); + } + as.SH(src, offset, address); + } break; } case X86_SIZE_DWORD: { - as.SW(src, offset, address); + if (emulate_tso && use_atomics) { + as.AMOSWAP_W(Ordering::RL, x0, src, address); + as.NOP(); + } else { + if (emulate_tso) { + as.FENCE(FenceOrder::RW, FenceOrder::W); + } + as.SW(src, offset, address); + } break; } case X86_SIZE_QWORD: { - as.SD(src, offset, address); + if (emulate_tso && use_atomics) { + as.AMOSWAP_D(Ordering::RL, x0, src, address); + as.NOP(); + } else { + if (emulate_tso) { + as.FENCE(FenceOrder::RW, FenceOrder::W); + } + as.SD(src, offset, address); + } break; } default: { From 914b93da28d0bceae1f0ed8973b471eae13b7915 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Tue, 26 May 2026 21:27:30 +0300 Subject: [PATCH 2/7] . --- src/felix86/hle/signals.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/felix86/hle/signals.cpp b/src/felix86/hle/signals.cpp index 695fe6b2b..2c95ccfb3 100644 --- a/src/felix86/hle/signals.cpp +++ b/src/felix86/hle/signals.cpp @@ -1186,7 +1186,7 @@ bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, uc u32 mask_add = (0b11111 << 27) | 0b1111111; u32 expected_add = (0b00000 << 27) | 0b0101111; bool is_amoswap = (current_instruction & mask_swap) == expected_swap; - bool is_amoadd = (current_instruction & mask_add) != expected_add; + bool is_amoadd = (current_instruction & mask_add) == expected_add; if (!is_amoswap && !is_amoadd) { WARN("BUS_ADRALN caused but not by AMOSWAP or AMOADD"); return false; From fdbdd842f0d7df89d4f175cfc0162a5f665af444 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Tue, 26 May 2026 21:29:15 +0300 Subject: [PATCH 3/7] . --- src/felix86/hle/signals.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/src/felix86/hle/signals.cpp b/src/felix86/hle/signals.cpp index 2c95ccfb3..85001119d 100644 --- a/src/felix86/hle/signals.cpp +++ b/src/felix86/hle/signals.cpp @@ -1196,34 +1196,33 @@ bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, uc ASSERT(size == 0b001 || size == 0b010 || size == 0b011); u32 rd = (current_instruction >> 7) & 0b11111; - if (rd != 0) { - WARN("AMOSWAP or AMOADD caused BUS_ADRALN but rd isn't x0"); + if (is_amoswap && rd != 0) { + WARN("AMOSWAP caused BUS_ADRALN but rd isn't x0"); return false; } - u32 nop; - { - Assembler tas((u8*)&nop, sizeof(u32)); - tas.NOP(); + u32 rs = (current_instruction >> 20) & 0b11111; + if (is_amoadd && rs != 0) { + WARN("AMOADD caused BUS_ADRALN but rs isn't x0"); + return false; } // It's an unaligned AMOSWAP used for TSO emulation, replace it with store instruction + fence - u32 rs = (current_instruction >> 20) & 0b11111; u32 address = (current_instruction >> 15) & 0b11111; if (is_amoadd) { Assembler cas((u8*)pc + 4, sizeof(u32)); switch (size) { case 0b001: { ASSERT(Extensions::Zabha); - cas.LH(biscuit::GPR(rs), 0, biscuit::GPR(address)); + cas.LH(biscuit::GPR(rd), 0, biscuit::GPR(address)); break; } case 0b010: { - cas.LW(biscuit::GPR(rs), 0, biscuit::GPR(address)); + cas.LW(biscuit::GPR(rd), 0, biscuit::GPR(address)); break; } case 0b011: { - cas.LD(biscuit::GPR(rs), 0, biscuit::GPR(address)); + cas.LD(biscuit::GPR(rd), 0, biscuit::GPR(address)); break; } default: { From 18b1f4570dd2b1e39c873ab2799409581f26cab8 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Tue, 26 May 2026 21:34:19 +0300 Subject: [PATCH 4/7] . --- src/felix86/hle/signals.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/felix86/hle/signals.cpp b/src/felix86/hle/signals.cpp index 85001119d..832c7d530 100644 --- a/src/felix86/hle/signals.cpp +++ b/src/felix86/hle/signals.cpp @@ -1214,11 +1214,11 @@ bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, uc switch (size) { case 0b001: { ASSERT(Extensions::Zabha); - cas.LH(biscuit::GPR(rd), 0, biscuit::GPR(address)); + cas.LHU(biscuit::GPR(rd), 0, biscuit::GPR(address)); break; } case 0b010: { - cas.LW(biscuit::GPR(rd), 0, biscuit::GPR(address)); + cas.LWU(biscuit::GPR(rd), 0, biscuit::GPR(address)); break; } case 0b011: { From 3f8557961ae7154e2ca445744cb405ec2a6cbe5b Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Tue, 26 May 2026 21:50:01 +0300 Subject: [PATCH 5/7] . --- src/felix86/hle/signals.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/felix86/hle/signals.cpp b/src/felix86/hle/signals.cpp index 832c7d530..ec94764c4 100644 --- a/src/felix86/hle/signals.cpp +++ b/src/felix86/hle/signals.cpp @@ -1188,7 +1188,7 @@ bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, uc bool is_amoswap = (current_instruction & mask_swap) == expected_swap; bool is_amoadd = (current_instruction & mask_add) == expected_add; if (!is_amoswap && !is_amoadd) { - WARN("BUS_ADRALN caused but not by AMOSWAP or AMOADD"); + WARN("BUS_ADRALN caused but not by AMOSWAP or AMOADD: %lx", current_instruction); return false; } @@ -1197,7 +1197,7 @@ bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, uc u32 rd = (current_instruction >> 7) & 0b11111; if (is_amoswap && rd != 0) { - WARN("AMOSWAP caused BUS_ADRALN but rd isn't x0"); + WARN("AMOSWAP caused BUS_ADRALN but rd isn't x0: %lx", current_instruction); return false; } From 98a7d44779a0fb91b222574258ee200c39d4e964 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Tue, 26 May 2026 21:57:11 +0300 Subject: [PATCH 6/7] . --- src/felix86/hle/signals.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/felix86/hle/signals.cpp b/src/felix86/hle/signals.cpp index ec94764c4..8834518ef 100644 --- a/src/felix86/hle/signals.cpp +++ b/src/felix86/hle/signals.cpp @@ -1188,7 +1188,7 @@ bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, uc bool is_amoswap = (current_instruction & mask_swap) == expected_swap; bool is_amoadd = (current_instruction & mask_add) == expected_add; if (!is_amoswap && !is_amoadd) { - WARN("BUS_ADRALN caused but not by AMOSWAP or AMOADD: %lx", current_instruction); + WARN("SEGV_ACCERR caused but not by AMOSWAP or AMOADD: %x", current_instruction); return false; } @@ -1197,13 +1197,13 @@ bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, uc u32 rd = (current_instruction >> 7) & 0b11111; if (is_amoswap && rd != 0) { - WARN("AMOSWAP caused BUS_ADRALN but rd isn't x0: %lx", current_instruction); + WARN("AMOSWAP caused SEGV_ACCERR but rd isn't x0: %x", current_instruction); return false; } u32 rs = (current_instruction >> 20) & 0b11111; if (is_amoadd && rs != 0) { - WARN("AMOADD caused BUS_ADRALN but rs isn't x0"); + WARN("AMOADD caused SEGV_ACCERR but rs isn't x0"); return false; } @@ -1256,7 +1256,7 @@ bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, uc pas.FENCE(FenceOrder::RW, FenceOrder::W); } - flush_icache_global(pc, pc + 4); + flush_icache_global(pc, pc + 8); return true; } @@ -1325,8 +1325,8 @@ bool handle_synchronous(ThreadState* current_state, siginfo_t* info, ucontext_t* } constexpr std::array host_signals = {{ - {SIGSEGV, SEGV_ACCERR, handle_unaligned_tso_atomic}, {SIGSEGV, SEGV_ACCERR, handle_safepoint}, + {SIGSEGV, SEGV_ACCERR, handle_unaligned_tso_atomic}, {SIGSEGV, SEGV_ACCERR, handle_smc}, {SIGSEGV, SEGV_MAPERR, handle_synchronous}, {SIGILL, 0, handle_breakpoint}, From 4d762505650aa4fe97988bc0ecfaefe4a4d2632d Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Tue, 26 May 2026 22:01:28 +0300 Subject: [PATCH 7/7] . --- src/felix86/hle/signals.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/felix86/hle/signals.cpp b/src/felix86/hle/signals.cpp index 8834518ef..3d6a20a2b 100644 --- a/src/felix86/hle/signals.cpp +++ b/src/felix86/hle/signals.cpp @@ -1188,7 +1188,6 @@ bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, uc bool is_amoswap = (current_instruction & mask_swap) == expected_swap; bool is_amoadd = (current_instruction & mask_add) == expected_add; if (!is_amoswap && !is_amoadd) { - WARN("SEGV_ACCERR caused but not by AMOSWAP or AMOADD: %x", current_instruction); return false; } @@ -1197,13 +1196,11 @@ bool handle_unaligned_tso_atomic(ThreadState* current_state, siginfo_t* info, uc u32 rd = (current_instruction >> 7) & 0b11111; if (is_amoswap && rd != 0) { - WARN("AMOSWAP caused SEGV_ACCERR but rd isn't x0: %x", current_instruction); return false; } u32 rs = (current_instruction >> 20) & 0b11111; if (is_amoadd && rs != 0) { - WARN("AMOADD caused SEGV_ACCERR but rs isn't x0"); return false; } @@ -1326,6 +1323,8 @@ bool handle_synchronous(ThreadState* current_state, siginfo_t* info, ucontext_t* constexpr std::array host_signals = {{ {SIGSEGV, SEGV_ACCERR, handle_safepoint}, + // Note: Regrettably this causes the same fault as the SMC handling fault with no way to detect + // However, since we patch the atomics, if a fault happens again it will be properly handled by handle_smc {SIGSEGV, SEGV_ACCERR, handle_unaligned_tso_atomic}, {SIGSEGV, SEGV_ACCERR, handle_smc}, {SIGSEGV, SEGV_MAPERR, handle_synchronous},