From 672828e07750eaf6cd55dde953272413cc64391e Mon Sep 17 00:00:00 2001
From: justanotheranonymoususer
Date: Mon, 5 Jan 2026 17:55:54 +0200
Subject: [PATCH 1/6] Add missing mut to pin.rs docs

As far as I can tell, the `mut` bindings are needed for the mutable
accesses on the following lines.
---
 library/core/src/pin.rs | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/library/core/src/pin.rs b/library/core/src/pin.rs
index 74ecb5ee4946f..e49faf9eddbdc 100644
--- a/library/core/src/pin.rs
+++ b/library/core/src/pin.rs
@@ -831,15 +831,13 @@
 //!     fn get_pin_mut(self: [Pin]<[`&mut Self`]>) -> [Pin]<[`&mut T`]>.
 //! Then we could do the following:
 //! ```compile_fail
-//! # use std::cell::RefCell;
-//! # use std::pin::Pin;
-//! fn exploit_ref_cell<T>(rc: Pin<&mut RefCell<T>>) {
+//! fn exploit_ref_cell<T>(mut rc: Pin<&mut RefCell<T>>) {
 //!     // Here we get pinned access to the `T`.
 //!     let _: Pin<&mut T> = rc.as_mut().get_pin_mut();
 //!
 //!     // And here we have `&mut T` to the same data.
 //!     let shared: &RefCell<T> = rc.into_ref().get_ref();
-//!     let borrow = shared.borrow_mut();
+//!     let mut borrow = shared.borrow_mut();
 //!     let content = &mut *borrow;
 //! }
 //! ```

From 8c697128ebc92af06c69a66a5c5293ba774b38eb Mon Sep 17 00:00:00 2001
From: Andre Bogus
Date: Wed, 24 Dec 2025 17:37:16 +0100
Subject: [PATCH 2/6] refactor rustc-hash integration

---
 compiler/rustc_data_structures/src/fx.rs          | 6 ++----
 compiler/rustc_data_structures/src/unord.rs       | 2 +-
 compiler/rustc_type_ir/src/data_structures/mod.rs | 8 +++-----
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/compiler/rustc_data_structures/src/fx.rs b/compiler/rustc_data_structures/src/fx.rs
index 026ec5c230ec6..cad775cc98641 100644
--- a/compiler/rustc_data_structures/src/fx.rs
+++ b/compiler/rustc_data_structures/src/fx.rs
@@ -1,11 +1,9 @@
-use std::hash::BuildHasherDefault;
-
 pub use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet, FxHasher};
 
 pub type StdEntry<'a, K, V> = std::collections::hash_map::Entry<'a, K, V>;
 
-pub type FxIndexMap<K, V> = indexmap::IndexMap<K, V, BuildHasherDefault<FxHasher>>;
-pub type FxIndexSet<V> = indexmap::IndexSet<V, BuildHasherDefault<FxHasher>>;
+pub type FxIndexMap<K, V> = indexmap::IndexMap<K, V, FxBuildHasher>;
+pub type FxIndexSet<V> = indexmap::IndexSet<V, FxBuildHasher>;
 
 pub type IndexEntry<'a, K, V> = indexmap::map::Entry<'a, K, V>;
 pub type IndexOccupiedEntry<'a, K, V> = indexmap::map::OccupiedEntry<'a, K, V>;
diff --git a/compiler/rustc_data_structures/src/unord.rs b/compiler/rustc_data_structures/src/unord.rs
index 0a9a86d7a43b8..eb29ef3b4d0a5 100644
--- a/compiler/rustc_data_structures/src/unord.rs
+++ b/compiler/rustc_data_structures/src/unord.rs
@@ -8,10 +8,10 @@ use std::hash::Hash;
 use std::iter::{Product, Sum};
 use std::ops::Index;
 
-use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet};
 use rustc_macros::{Decodable_NoContext, Encodable_NoContext};
 
 use crate::fingerprint::Fingerprint;
+use crate::fx::{FxBuildHasher, FxHashMap, FxHashSet};
 use crate::stable_hasher::{HashStable, StableCompare, StableHasher, ToStableHashKey};
 
 /// `UnordItems` is the order-less version of `Iterator`. It only contains methods
diff --git a/compiler/rustc_type_ir/src/data_structures/mod.rs b/compiler/rustc_type_ir/src/data_structures/mod.rs
index a72669cbd189b..c2b629f1d11c4 100644
--- a/compiler/rustc_type_ir/src/data_structures/mod.rs
+++ b/compiler/rustc_type_ir/src/data_structures/mod.rs
@@ -1,11 +1,9 @@
-use std::hash::BuildHasherDefault;
-
 pub use ena::unify::{NoError, UnifyKey, UnifyValue};
-use rustc_hash::FxHasher;
+use rustc_hash::FxBuildHasher;
 pub use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet};
 
-pub type IndexMap<K, V> = indexmap::IndexMap<K, V, BuildHasherDefault<FxHasher>>;
-pub type IndexSet<V> = indexmap::IndexSet<V, BuildHasherDefault<FxHasher>>;
+pub type IndexMap<K, V> = indexmap::IndexMap<K, V, FxBuildHasher>;
+pub type IndexSet<V> = indexmap::IndexSet<V, FxBuildHasher>;
 
 mod delayed_map;
 

From a72f68e80154a208b85a3b80cea744b84b7b5d18 Mon Sep 17 00:00:00 2001
From: Andreas Liljeqvist
Date: Sat, 24 Jan 2026 20:05:01 +0100
Subject: [PATCH 3/6] Fix is_ascii performance on x86_64 with explicit SSE2 intrinsics

Use explicit SSE2 intrinsics to avoid LLVM's broken AVX-512
auto-vectorization, which generates ~31 `kshiftrd` instructions.

Performance:
- AVX-512: 34-48x faster
- SSE2: 1.5-2x faster

Improves on an earlier PR.
---
 library/core/src/slice/ascii.rs       | 61 +++++++++------------------
 tests/assembly-llvm/slice-is-ascii.rs |  5 +++
 2 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index 459c826f40646..de89d77e5e2ce 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -460,56 +460,37 @@ const fn is_ascii(s: &[u8]) -> bool {
     )
 }
 
-/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
+/// Chunk size for SSE2 vectorized ASCII checking (4x 16-byte loads).
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-const CHUNK_SIZE: usize = 32;
+const SSE2_CHUNK_SIZE: usize = 64;
 
-/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
-/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
-///
-/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
 fn is_ascii_sse2(bytes: &[u8]) -> bool {
     use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
 
-    let mut i = 0;
-
-    while i + CHUNK_SIZE <= bytes.len() {
-        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
-        let ptr = unsafe { bytes.as_ptr().add(i) };
-
-        // Load two 16-byte chunks and combine them.
-        // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
-        // `_mm_loadu_si128` allows unaligned loads.
-        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
-        // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
-        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
-
-        // OR them together - if any byte has the high bit set, the result will too.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
-
-        // Create a mask from the MSBs of each byte.
-        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let mask = unsafe { _mm_movemask_epi8(combined) };
-
+    let (chunks, rest) = bytes.as_chunks::<SSE2_CHUNK_SIZE>();
+
+    for chunk in chunks {
+        let ptr = chunk.as_ptr();
+        // SAFETY: chunk is 64 bytes. SSE2 is baseline on x86_64.
+        let mask = unsafe {
+            let a1 = _mm_loadu_si128(ptr as *const __m128i);
+            let a2 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
+            let b1 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
+            let b2 = _mm_loadu_si128(ptr.add(48) as *const __m128i);
+            // OR all chunks - if any byte has high bit set, combined will too.
+            let combined = _mm_or_si128(_mm_or_si128(a1, a2), _mm_or_si128(b1, b2));
+            // Create a mask from the MSBs of each byte.
+            // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
+            _mm_movemask_epi8(combined)
+        };
         if mask != 0 {
             return false;
         }
-
-        i += CHUNK_SIZE;
-    }
-
-    // Handle remaining bytes with simple loop
-    while i < bytes.len() {
-        if !bytes[i].is_ascii() {
-            return false;
-        }
-        i += 1;
     }
 
-    true
+    // Handle remaining bytes
+    rest.iter().all(|b| b.is_ascii())
 }
 
 /// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
@@ -529,7 +510,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
         is_ascii_simple(bytes)
     } else {
         // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
-        if bytes.len() < CHUNK_SIZE {
+        if bytes.len() < SSE2_CHUNK_SIZE {
             let chunks = bytes.chunks_exact(USIZE_SIZE);
             let remainder = chunks.remainder();
             for chunk in chunks {
diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs
index d01b321bf460a..3b782ab2cd827 100644
--- a/tests/assembly-llvm/slice-is-ascii.rs
+++ b/tests/assembly-llvm/slice-is-ascii.rs
@@ -22,6 +22,11 @@
 // X86_64-LABEL: test_is_ascii
 // X86_64-NOT: kshiftrd
 // X86_64-NOT: kshiftrq
+// Verify explicit SSE2/AVX intrinsics are used:
+// - pmovmskb/vpmovmskb: efficient mask extraction from the MSBs
+// - vpor/por: OR-combining of 4x 16-byte loads (2x unrolled, 64-byte chunks)
+// X86_64: {{vpmovmskb|pmovmskb}}
+// X86_64: {{vpor|por}}
 
 // LA64-LABEL: test_is_ascii
 // LA64: vmskltz.b

From 9e80b1ba7e751a8e16ce2ef1286278b3e66f728b Mon Sep 17 00:00:00 2001
From: zakie
Date: Sun, 25 Jan 2026 17:24:08 +0900
Subject: [PATCH 4/6] Fix broken WASIp1 reference link

---
 src/doc/rustc/src/platform-support/wasm32-wasip1.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/doc/rustc/src/platform-support/wasm32-wasip1.md b/src/doc/rustc/src/platform-support/wasm32-wasip1.md
index 958a34a86928c..eb74edda22de8 100644
--- a/src/doc/rustc/src/platform-support/wasm32-wasip1.md
+++ b/src/doc/rustc/src/platform-support/wasm32-wasip1.md
@@ -20,7 +20,7 @@ focused on the Component Model-based definition of WASI. At this point the
 `wasm32-wasip1` Rust target is intended for historical compatibility with
 [WASIp1] set of syscalls.
 
-[WASIp1]: https://github.com/WebAssembly/WASI/tree/main/legacy/preview1
+[WASIp1]: https://github.com/WebAssembly/WASI/tree/wasi-0.1/preview1
 [Component Model]: https://github.com/webassembly/component-model
 
 Today the `wasm32-wasip1` target will generate core WebAssembly modules

From cbcd8694c6e549c658901f010644fddcb7ffbce8 Mon Sep 17 00:00:00 2001
From: Andreas Liljeqvist
Date: Sun, 25 Jan 2026 09:44:04 +0100
Subject: [PATCH 5/6] Remove x86_64 assembly test for is_ascii

The SSE2 helper function is not inlined across crate boundaries, so we
cannot verify the codegen in an assembly test. The fix is still verified
by the absence of a performance regression.
---
 tests/assembly-llvm/slice-is-ascii.rs | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs
index 3b782ab2cd827..00deb23e9a6cd 100644
--- a/tests/assembly-llvm/slice-is-ascii.rs
+++ b/tests/assembly-llvm/slice-is-ascii.rs
@@ -1,32 +1,12 @@
-//@ revisions: X86_64 LA64
+//@ revisions: LA64
 //@ assembly-output: emit-asm
 //@ compile-flags: -C opt-level=3
 //
-//@ [X86_64] only-x86_64
-//@ [X86_64] compile-flags: -C target-cpu=znver4
-//@ [X86_64] compile-flags: -C llvm-args=-x86-asm-syntax=intel
-//
 //@ [LA64] only-loongarch64
 
 #![crate_type = "lib"]
 
-/// Verify `is_ascii` generates efficient code on different architectures:
-///
-/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
-///   The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
-///   See: https://github.com/llvm/llvm-project/issues/176906
-///
 /// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
-///   This architecture still relies on LLVM auto-vectorization.
-
-// X86_64-LABEL: test_is_ascii
-// X86_64-NOT: kshiftrd
-// X86_64-NOT: kshiftrq
-// Verify explicit SSE2/AVX intrinsics are used:
-// - pmovmskb/vpmovmskb: efficient mask extraction from the MSBs
-// - vpor/por: OR-combining of 4x 16-byte loads (2x unrolled, 64-byte chunks)
-// X86_64: {{vpmovmskb|pmovmskb}}
-// X86_64: {{vpor|por}}
 
 // LA64-LABEL: test_is_ascii
 // LA64: vmskltz.b

From dbc870afec91308b2e6a6c6ba16e8f3bb085e338 Mon Sep 17 00:00:00 2001
From: Andreas Liljeqvist
Date: Sun, 25 Jan 2026 20:03:32 +0100
Subject: [PATCH 6/6] Mark is_ascii_sse2 as #[inline]

Marking the helper #[inline] lets it be inlined across crate boundaries
again, so the x86_64 assembly checks removed in the previous patch can be
restored.
---
 library/core/src/slice/ascii.rs       |  1 +
 tests/assembly-llvm/slice-is-ascii.rs | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index de89d77e5e2ce..ae641871279b6 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -465,6 +465,7 @@ const fn is_ascii(s: &[u8]) -> bool {
 const SSE2_CHUNK_SIZE: usize = 64;
 
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+#[inline]
 fn is_ascii_sse2(bytes: &[u8]) -> bool {
     use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
 
diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs
index 00deb23e9a6cd..b9a5205054986 100644
--- a/tests/assembly-llvm/slice-is-ascii.rs
+++ b/tests/assembly-llvm/slice-is-ascii.rs
@@ -1,13 +1,28 @@
-//@ revisions: LA64
+//@ revisions: X86_64 LA64
 //@ assembly-output: emit-asm
 //@ compile-flags: -C opt-level=3
 //
+//@ [X86_64] only-x86_64
+//@ [X86_64] compile-flags: -C target-cpu=znver4
+//@ [X86_64] compile-flags: -C llvm-args=-x86-asm-syntax=intel
+//
 //@ [LA64] only-loongarch64
 
 #![crate_type = "lib"]
 
+/// Verify `is_ascii` generates efficient code on different architectures:
+///
+/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
+///   Good version uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
+///
 /// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
 
+// X86_64-LABEL: test_is_ascii
+// X86_64-NOT: kshiftrd
+// X86_64-NOT: kshiftrq
+// X86_64: {{vpor|por}}
+// X86_64: {{vpmovmskb|pmovmskb}}
+
 // LA64-LABEL: test_is_ascii
 // LA64: vmskltz.b
 
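
For reference, the idea behind PATCH 3/6 and 6/6 can be reproduced outside of `core` as a small standalone program against `std::arch`. This is only an illustrative sketch, not the library code: the function name `is_ascii_sse2_demo` and the test data are made up for this note, and it uses stable `chunks_exact` in place of `as_chunks`.

```rust
// Standalone sketch of the 64-byte SSE2 ASCII check (illustrative only).
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn is_ascii_sse2_demo(bytes: &[u8]) -> bool {
    use std::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

    let mut chunks = bytes.chunks_exact(64);
    for chunk in chunks.by_ref() {
        let ptr = chunk.as_ptr();
        // SAFETY: `chunk` is exactly 64 bytes, and SSE2 is baseline on x86_64.
        let mask = unsafe {
            let a = _mm_loadu_si128(ptr as *const __m128i);
            let b = _mm_loadu_si128(ptr.add(16) as *const __m128i);
            let c = _mm_loadu_si128(ptr.add(32) as *const __m128i);
            let d = _mm_loadu_si128(ptr.add(48) as *const __m128i);
            // OR the four 16-byte blocks; any byte >= 0x80 keeps its high bit set.
            let combined = _mm_or_si128(_mm_or_si128(a, b), _mm_or_si128(c, d));
            // pmovmskb gathers the high bit of every byte into an i32 mask.
            _mm_movemask_epi8(combined)
        };
        if mask != 0 {
            return false;
        }
    }
    // Scalar tail for the final < 64 bytes.
    chunks.remainder().iter().all(|b| b.is_ascii())
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn main() {
    let ascii = vec![b'a'; 1000];
    let mut non_ascii = ascii.clone();
    non_ascii[997] = 0xC3;
    assert_eq!(is_ascii_sse2_demo(&ascii), ascii.is_ascii());
    assert_eq!(is_ascii_sse2_demo(&non_ascii), non_ascii.is_ascii());
    println!("demo agrees with <[u8]>::is_ascii");
}

#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
fn main() {}
```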
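The reasoning in PATCH 1/6 can likewise be checked with a small self-contained program: `Pin::as_mut` takes `&mut self`, so the binding must be `mut`, and the `RefMut` guard must be bound mutably before `&mut *borrow` can be taken. The helper name `needs_mut` and the values below are invented for this sketch; unlike the doc example it avoids the hypothetical `get_pin_mut`, so it actually compiles.

```rust
use std::cell::RefCell;
use std::pin::Pin;

fn needs_mut(mut rc: Pin<&mut RefCell<u8>>) {
    // `as_mut` borrows the `Pin` mutably; without `mut rc` this fails to compile.
    let _reborrow: Pin<&mut RefCell<u8>> = rc.as_mut();

    let shared: &RefCell<u8> = rc.into_ref().get_ref();
    // Without `mut`, taking `&mut *borrow` below is rejected.
    let mut borrow = shared.borrow_mut();
    let content: &mut u8 = &mut *borrow;
    *content += 1;
}

fn main() {
    let mut cell = RefCell::new(41);
    needs_mut(Pin::new(&mut cell));
    assert_eq!(cell.into_inner(), 42);
}
```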