From 1af73fa0c38409c39fb799e1cc204d0457a3ece8 Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Wed, 27 May 2026 19:11:17 +0300 Subject: [PATCH 1/6] . --- src/felix86/common/config.inc | 1 + src/felix86/common/global.cpp | 2 +- src/felix86/common/global.hpp | 2 +- src/felix86/v2/handlers.cpp | 16 +++++++++++++++- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/felix86/common/config.inc b/src/felix86/common/config.inc index 76b5d185b..ed9a63c4d 100644 --- a/src/felix86/common/config.inc +++ b/src/felix86/common/config.inc @@ -70,3 +70,4 @@ X(Performance, bool, auto_compress, false, FELIX86_AUTO_COMPRESS, "Automatically X(Performance, bool, scan_ahead_multi, true, FELIX86_SCAN_AHEAD_MULTI, "Scan ahead to multiple blocks when possible, avoiding even more flag calculations") X(Performance, bool, no_address_overflow, true, FELIX86_NO_ADDRESS_OVERFLOW, "Assume addresses won't overflow in 32-bit apps, which allows for some optimizations") X(Performance, bool, group_loadstore, true, FELIX86_GROUP_LOADSTORE, "Load/store SIMD state in groups of eight, may improve performance") +X(Performance, bool, cas128_global, false, FELIX86_CAS128_GLOBAL, "Use a global lock for emulating CMPXCHG16B, may improve stability but worsen performance in some games") diff --git a/src/felix86/common/global.cpp b/src/felix86/common/global.cpp index 95db4c293..7a2ab32c3 100644 --- a/src/felix86/common/global.cpp +++ b/src/felix86/common/global.cpp @@ -98,7 +98,7 @@ void ProcessGlobals::initialize() { perf = std::make_unique(); - cas128_lock = 0; + memset(cas128_locks, 0, sizeof(cas128_locks)); // HACK: Don't clear as they get shared per mount namespace // TODO: proper mount namespacing when we need it diff --git a/src/felix86/common/global.hpp b/src/felix86/common/global.hpp index d4d0ec2ea..377a90a11 100644 --- a/src/felix86/common/global.hpp +++ b/src/felix86/common/global.hpp @@ -49,7 +49,7 @@ struct ProcessGlobals { std::unique_ptr perf; // For cmpxchg16b - u32 cas128_lock = 0; + u32 cas128_locks[256]; // TODO: this isn't per CLONE_VM but per mount namespace // But we don't care for now diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index 164b2137d..301fe98ed 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -10684,7 +10684,21 @@ FAST_HANDLE(CMPXCHG16B) { biscuit::Label spinloop, writeloop; biscuit::GPR lock_address = rec.scratch(); biscuit::GPR lock = rec.scratch(); - as.LI(lock_address, (u64)&g_process_globals.cas128_lock); + as.LI(lock_address, (u64)&g_process_globals.cas128_locks); + if (g_config.cas128_global) { + // Do nothing, use the first lock in the array + } else { + // We will pick one of 256 different spinlocks based on a hash created by our address + // This means that if two cmpxchg16b target the same address they will spin on the same lock + // but if they get a different one they will likely get a different one, which should decrease + // lock contention + constexpr u32 knuth_hash = 2654435761u; + as.LI(mem1, knuth_hash); + as.SRLI(mem0, address, 4); // shift out low bits since they are 0 to get a better hash + as.MULW(mem0, mem0, mem1); + as.ANDI(mem0, mem0, (sizeof(g_process_globals.cas128_locks) / sizeof(u32)) - 1); + as.ADD(lock_address, lock_address, mem0); + } as.Bind(&spinloop); as.LI(lock, 1); From 02d5976381fa39519d682209d1974c96b1b7a96b Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Wed, 27 May 2026 19:15:26 +0300 Subject: [PATCH 2/6] . --- src/felix86/v2/handlers.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index 301fe98ed..4d6407e5d 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -10696,7 +10696,9 @@ FAST_HANDLE(CMPXCHG16B) { as.LI(mem1, knuth_hash); as.SRLI(mem0, address, 4); // shift out low bits since they are 0 to get a better hash as.MULW(mem0, mem0, mem1); - as.ANDI(mem0, mem0, (sizeof(g_process_globals.cas128_locks) / sizeof(u32)) - 1); + as.ANDI(mem0, mem0, 0xFF); + as.SLLI(mem0, mem0, 2); + static_assert(sizeof(g_process_globals.cas128_locks) == 256 * sizeof(u32)); as.ADD(lock_address, lock_address, mem0); } From 60fae31af609a0f3c7f354d6c801f058e4234eaa Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Wed, 27 May 2026 19:24:46 +0300 Subject: [PATCH 3/6] . --- src/felix86/common/config.inc | 1 - src/felix86/v2/handlers.cpp | 28 ++++++++++++---------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/src/felix86/common/config.inc b/src/felix86/common/config.inc index ed9a63c4d..76b5d185b 100644 --- a/src/felix86/common/config.inc +++ b/src/felix86/common/config.inc @@ -70,4 +70,3 @@ X(Performance, bool, auto_compress, false, FELIX86_AUTO_COMPRESS, "Automatically X(Performance, bool, scan_ahead_multi, true, FELIX86_SCAN_AHEAD_MULTI, "Scan ahead to multiple blocks when possible, avoiding even more flag calculations") X(Performance, bool, no_address_overflow, true, FELIX86_NO_ADDRESS_OVERFLOW, "Assume addresses won't overflow in 32-bit apps, which allows for some optimizations") X(Performance, bool, group_loadstore, true, FELIX86_GROUP_LOADSTORE, "Load/store SIMD state in groups of eight, may improve performance") -X(Performance, bool, cas128_global, false, FELIX86_CAS128_GLOBAL, "Use a global lock for emulating CMPXCHG16B, may improve stability but worsen performance in some games") diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index 4d6407e5d..d4ac1ed1c 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -10685,22 +10685,18 @@ FAST_HANDLE(CMPXCHG16B) { biscuit::GPR lock_address = rec.scratch(); biscuit::GPR lock = rec.scratch(); as.LI(lock_address, (u64)&g_process_globals.cas128_locks); - if (g_config.cas128_global) { - // Do nothing, use the first lock in the array - } else { - // We will pick one of 256 different spinlocks based on a hash created by our address - // This means that if two cmpxchg16b target the same address they will spin on the same lock - // but if they get a different one they will likely get a different one, which should decrease - // lock contention - constexpr u32 knuth_hash = 2654435761u; - as.LI(mem1, knuth_hash); - as.SRLI(mem0, address, 4); // shift out low bits since they are 0 to get a better hash - as.MULW(mem0, mem0, mem1); - as.ANDI(mem0, mem0, 0xFF); - as.SLLI(mem0, mem0, 2); - static_assert(sizeof(g_process_globals.cas128_locks) == 256 * sizeof(u32)); - as.ADD(lock_address, lock_address, mem0); - } + // We will pick one of 256 different spinlocks based on a hash created by our address + // This means that if two cmpxchg16b target the same address they will spin on the same lock + // but if they get a different one they will likely get a different one, which should decrease + // lock contention + constexpr u32 knuth_hash = 2654435761u; + as.LI(mem1, knuth_hash); + as.SRLI(mem0, address, 4); // shift out low bits since they are 0 to get a better hash + as.MULW(mem0, mem0, mem1); + as.ANDI(mem0, mem0, 0xFF); + as.SLLI(mem0, mem0, 2); + static_assert(sizeof(g_process_globals.cas128_locks) == 256 * sizeof(u32)); + as.ADD(lock_address, lock_address, mem0); as.Bind(&spinloop); as.LI(lock, 1); From 02bcf80b7a3715ac31f6c0ed1d7e8b418a0e0c8e Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Wed, 27 May 2026 19:26:39 +0300 Subject: [PATCH 4/6] . --- src/felix86/v2/handlers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index d4ac1ed1c..8c89347cc 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -10698,8 +10698,8 @@ FAST_HANDLE(CMPXCHG16B) { static_assert(sizeof(g_process_globals.cas128_locks) == 256 * sizeof(u32)); as.ADD(lock_address, lock_address, mem0); - as.Bind(&spinloop); as.LI(lock, 1); + as.Bind(&spinloop); as.AMOSWAP_W(Ordering::AQRL, lock, lock, lock_address); as.BNEZ(lock, &spinloop); From f0d93612c1431a27ea674d955fab6e1150beed4e Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Wed, 27 May 2026 19:34:31 +0300 Subject: [PATCH 5/6] . --- src/felix86/v2/handlers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index 8c89347cc..663e58010 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -10687,7 +10687,7 @@ FAST_HANDLE(CMPXCHG16B) { as.LI(lock_address, (u64)&g_process_globals.cas128_locks); // We will pick one of 256 different spinlocks based on a hash created by our address // This means that if two cmpxchg16b target the same address they will spin on the same lock - // but if they get a different one they will likely get a different one, which should decrease + // but if they target a different address they will likely get a different lock, which should decrease // lock contention constexpr u32 knuth_hash = 2654435761u; as.LI(mem1, knuth_hash); From 0e0c722573519adeb2b1696dd9340753cc62375f Mon Sep 17 00:00:00 2001 From: Paris Oplopoios <21157395+OFFTKP@users.noreply.github.com> Date: Wed, 27 May 2026 19:36:32 +0300 Subject: [PATCH 6/6] . --- src/felix86/v2/handlers.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/felix86/v2/handlers.cpp b/src/felix86/v2/handlers.cpp index 663e58010..f6f635321 100644 --- a/src/felix86/v2/handlers.cpp +++ b/src/felix86/v2/handlers.cpp @@ -10694,9 +10694,8 @@ FAST_HANDLE(CMPXCHG16B) { as.SRLI(mem0, address, 4); // shift out low bits since they are 0 to get a better hash as.MULW(mem0, mem0, mem1); as.ANDI(mem0, mem0, 0xFF); - as.SLLI(mem0, mem0, 2); static_assert(sizeof(g_process_globals.cas128_locks) == 256 * sizeof(u32)); - as.ADD(lock_address, lock_address, mem0); + as.SH2ADD(lock_address, mem0, lock_address); as.LI(lock, 1); as.Bind(&spinloop);