From cecdc6cfed0c1b0eebcfe3a19f01f5154131e11a Mon Sep 17 00:00:00 2001 From: Stephen Akinyemi Date: Mon, 13 Apr 2026 19:17:27 +0100 Subject: [PATCH 1/5] feat(krun): raise IRQ limits and fix GIC nr_irqs for 136+ virtio devices Raise IRQ_MAX from 15 to 223 on x86_64 and from 159 to 223 on aarch64 to support 136+ virtio-MMIO devices needed for block-backed EROFS OCI rootfs (one block device per OCI layer). - Raise IOAPIC_NUM_PINS from 24 to 256 in the userspace split irqchip to match the new IRQ range on x86_64 - Always use split irqchip on x86_64 since KVM's in-kernel IOAPIC is hardcoded to 24 pins (KVM_IOAPIC_NUM_PINS) and cannot be changed - Fix GIC nr_irqs calculation in kvmgicv2 and kvmgicv3: KVM interprets nr_irqs as total interrupts including 32 private ones (SGIs + PPIs), so the old `IRQ_MAX - IRQ_BASE + 1` formula under-allocated SPIs --- src/arch/src/aarch64/layout.rs | 2 +- src/arch/src/x86_64/layout.rs | 2 +- src/devices/src/legacy/ioapic.rs | 2 +- src/devices/src/legacy/kvmgicv2.rs | 4 +++- src/devices/src/legacy/kvmgicv3.rs | 4 +++- src/vmm/src/builder.rs | 15 ++++++--------- 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/src/arch/src/aarch64/layout.rs b/src/arch/src/aarch64/layout.rs index 401925d6e..1d0303924 100644 --- a/src/arch/src/aarch64/layout.rs +++ b/src/arch/src/aarch64/layout.rs @@ -75,7 +75,7 @@ pub const FDT_MAX_SIZE: usize = 0x20_0000; pub const IRQ_BASE: u32 = 32; /// Last usable interrupt on aarch64. -pub const IRQ_MAX: u32 = 159; +pub const IRQ_MAX: u32 = 223; /// Timer interrupts pub const GTIMER_SEC: u32 = 13; diff --git a/src/arch/src/x86_64/layout.rs b/src/arch/src/x86_64/layout.rs index c626d68d7..907e994bb 100644 --- a/src/arch/src/x86_64/layout.rs +++ b/src/arch/src/x86_64/layout.rs @@ -26,7 +26,7 @@ pub const HIMEM_START: u64 = 0x0010_0000; //1 MB. /// First usable IRQ ID for virtio device interrupts on x86_64. pub const IRQ_BASE: u32 = 5; /// Last usable IRQ ID for virtio device interrupts on x86_64. 
-pub const IRQ_MAX: u32 = 15; +pub const IRQ_MAX: u32 = 223; /// Address for the TSS setup. pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000; diff --git a/src/devices/src/legacy/ioapic.rs b/src/devices/src/legacy/ioapic.rs index af752c906..cb887b8b5 100644 --- a/src/devices/src/legacy/ioapic.rs +++ b/src/devices/src/legacy/ioapic.rs @@ -17,7 +17,7 @@ use crate::Error as DeviceError; const IOAPIC_BASE: u32 = 0xfec0_0000; const APIC_DEFAULT_ADDRESS: u32 = 0xfee0_0000; -const IOAPIC_NUM_PINS: usize = 24; +const IOAPIC_NUM_PINS: usize = 256; const IO_REG_SEL: u64 = 0x00; const IO_WIN: u64 = 0x10; diff --git a/src/devices/src/legacy/kvmgicv2.rs b/src/devices/src/legacy/kvmgicv2.rs index fcf764f8e..76bdab7bb 100644 --- a/src/devices/src/legacy/kvmgicv2.rs +++ b/src/devices/src/legacy/kvmgicv2.rs @@ -57,7 +57,9 @@ impl KvmGicV2 { }; device_fd.set_device_attr(&attr).unwrap(); - let nr_irqs: u32 = arch::aarch64::layout::IRQ_MAX - arch::aarch64::layout::IRQ_BASE + 1; + // GIC nr_irqs includes 32 private interrupts (SGIs + PPIs), so we need + // the SPI count plus 32, rounded up to a multiple of 32. + let nr_irqs: u32 = ((arch::aarch64::layout::IRQ_MAX + 1 + 31) / 32) * 32; let nr_irqs_ptr = &nr_irqs as *const u32; let attr = kvm_bindings::kvm_device_attr { group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_NR_IRQS, diff --git a/src/devices/src/legacy/kvmgicv3.rs b/src/devices/src/legacy/kvmgicv3.rs index a25bbb330..cbeee320c 100644 --- a/src/devices/src/legacy/kvmgicv3.rs +++ b/src/devices/src/legacy/kvmgicv3.rs @@ -57,7 +57,9 @@ impl KvmGicV3 { }; device_fd.set_device_attr(&attr)?; - let nr_irqs: u32 = arch::aarch64::layout::IRQ_MAX - arch::aarch64::layout::IRQ_BASE + 1; + // GIC nr_irqs includes 32 private interrupts (SGIs + PPIs), so we need + // the SPI count plus 32, rounded up to a multiple of 32. 
+ let nr_irqs: u32 = ((arch::aarch64::layout::IRQ_MAX + 1 + 31) / 32) * 32; let nr_irqs_ptr = &nr_irqs as *const u32; let attr = kvm_bindings::kvm_device_attr { group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_NR_IRQS, diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 000e88d61..ff70c067c 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -819,19 +819,16 @@ pub fn build_microvm( // while on aarch64 we need to do it the other way around. #[cfg(target_arch = "x86_64")] { - let ioapic: Box = if vm_resources.split_irqchip { - Box::new( - IoApic::new(vm.fd(), _sender.clone()) - .map_err(StartMicrovmError::CreateKvmIrqChip)?, - ) - } else { - Box::new(KvmIoapic::new(vm.fd()).map_err(StartMicrovmError::CreateKvmIrqChip)?) - }; + // Always use split irqchip on x86_64 to support 224 IRQ lines (0-223). + let ioapic: Box = Box::new( + IoApic::new(vm.fd(), _sender.clone()) + .map_err(StartMicrovmError::CreateKvmIrqChip)?, + ); intc = Arc::new(Mutex::new(IrqChipDevice::new(ioapic))); attach_legacy_devices( &vm, - vm_resources.split_irqchip, + true, &mut pio_device_manager, &mut mmio_device_manager, Some(intc.clone()), From c229a2883817dc51e0aead25da991eec2c3b88ca Mon Sep 17 00:00:00 2001 From: Stephen Akinyemi Date: Wed, 15 Apr 2026 09:26:55 +0100 Subject: [PATCH 2/5] fix(krun): address CI failures from IRQ limit raise - Fix rustfmt: collapse method chain in builder.rs to single line - Fix clippy: use .div_ceil(32) instead of manual ((x + 31) / 32) in kvmgicv2 and kvmgicv3 - Fix test: increase cmdline buffer from 4096 to 16384 in test_register_too_many_devices since registering 219 devices (~10KB of kernel cmdline) now overflows the old 4KB limit --- src/devices/src/legacy/kvmgicv2.rs | 2 +- src/devices/src/legacy/kvmgicv3.rs | 2 +- src/vmm/src/builder.rs | 3 +-- src/vmm/src/device_manager/kvm/mmio.rs | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/devices/src/legacy/kvmgicv2.rs b/src/devices/src/legacy/kvmgicv2.rs index 76bdab7bb..76fe2c031 100644 --- 
a/src/devices/src/legacy/kvmgicv2.rs +++ b/src/devices/src/legacy/kvmgicv2.rs @@ -59,7 +59,7 @@ impl KvmGicV2 { // GIC nr_irqs includes 32 private interrupts (SGIs + PPIs), so we need // the SPI count plus 32, rounded up to a multiple of 32. - let nr_irqs: u32 = ((arch::aarch64::layout::IRQ_MAX + 1 + 31) / 32) * 32; + let nr_irqs: u32 = (arch::aarch64::layout::IRQ_MAX + 1).div_ceil(32) * 32; let nr_irqs_ptr = &nr_irqs as *const u32; let attr = kvm_bindings::kvm_device_attr { group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_NR_IRQS, diff --git a/src/devices/src/legacy/kvmgicv3.rs b/src/devices/src/legacy/kvmgicv3.rs index cbeee320c..8c50319e4 100644 --- a/src/devices/src/legacy/kvmgicv3.rs +++ b/src/devices/src/legacy/kvmgicv3.rs @@ -59,7 +59,7 @@ impl KvmGicV3 { // GIC nr_irqs includes 32 private interrupts (SGIs + PPIs), so we need // the SPI count plus 32, rounded up to a multiple of 32. - let nr_irqs: u32 = ((arch::aarch64::layout::IRQ_MAX + 1 + 31) / 32) * 32; + let nr_irqs: u32 = (arch::aarch64::layout::IRQ_MAX + 1).div_ceil(32) * 32; let nr_irqs_ptr = &nr_irqs as *const u32; let attr = kvm_bindings::kvm_device_attr { group: kvm_bindings::KVM_DEV_ARM_VGIC_GRP_NR_IRQS, diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index ff70c067c..e3d0aad87 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -821,8 +821,7 @@ pub fn build_microvm( { // Always use split irqchip on x86_64 to support 224 IRQ lines (0-223). 
let ioapic: Box = Box::new( - IoApic::new(vm.fd(), _sender.clone()) - .map_err(StartMicrovmError::CreateKvmIrqChip)?, + IoApic::new(vm.fd(), _sender.clone()).map_err(StartMicrovmError::CreateKvmIrqChip)?, ); intc = Arc::new(Mutex::new(IrqChipDevice::new(ioapic))); diff --git a/src/vmm/src/device_manager/kvm/mmio.rs b/src/vmm/src/device_manager/kvm/mmio.rs index 8e97aedc4..4dbb93f46 100644 --- a/src/vmm/src/device_manager/kvm/mmio.rs +++ b/src/vmm/src/device_manager/kvm/mmio.rs @@ -443,7 +443,7 @@ mod tests { #[cfg(target_arch = "aarch64")] let _gic = KvmGicV3::new(vm.fd(), 1).unwrap(); - let mut cmdline = kernel_cmdline::Cmdline::new(4096); + let mut cmdline = kernel_cmdline::Cmdline::new(16384); for _i in arch::IRQ_BASE..=arch::IRQ_MAX { device_manager From 4bb9163b02b19be63de2347de6db4bfe1f1e9aa1 Mon Sep 17 00:00:00 2001 From: Stephen Akinyemi Date: Wed, 15 Apr 2026 09:32:50 +0100 Subject: [PATCH 3/5] fix(krun): remove unused KvmIoapic import on x86_64 No longer needed since split irqchip is now always used. 
--- src/vmm/src/builder.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index e3d0aad87..4a64a8fce 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -33,8 +33,6 @@ use crate::vmm_config::net::NetBuilder; use devices::legacy::Cmos; #[cfg(all(target_os = "linux", target_arch = "riscv64"))] use devices::legacy::KvmAia; -#[cfg(target_arch = "x86_64")] -use devices::legacy::KvmIoapic; use devices::legacy::Serial; #[cfg(target_os = "macos")] use devices::legacy::VcpuList; From b22c61010fff4bf2f7a7d08fb060c41397d8fbda Mon Sep 17 00:00:00 2001 From: Stephen Akinyemi Date: Sat, 18 Apr 2026 22:01:56 +0100 Subject: [PATCH 4/5] feat(krun): make split irqchip opt-in via MachineBuilder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous revision forced userspace split irqchip unconditionally on x86_64 to reach the raised IRQ cap, which changed runtime behavior for every caller — even those fine with the in-kernel IOAPIC's 11-IRQ budget. Restore the mode as opt-in while keeping the raised ceiling available to callers who need it. - src/arch/src/x86_64/layout.rs: keep IRQ_MAX at 15 for the in-kernel IOAPIC path; add IRQ_MAX_SPLIT = 223 for the userspace split irqchip. - src/vmm/src/builder.rs: restore the if/else on vm_resources.split_irqchip choosing IoApic vs KvmIoapic and the matching attach_legacy_devices arg. Size the MMIODeviceManager IRQ pool to IRQ_MAX_SPLIT only when split irqchip is selected. - src/krun/src/api/builders.rs + builder.rs: add MachineBuilder::split_irqchip(bool) and thread it through to vmr.split_irqchip. - src/devices/src/legacy/ioapic.rs: fix KVM_CAP_SPLIT_IRQCHIP args[0] to match the emulated IOAPIC's pin count (256) instead of the previous hardcoded 24. The old value was inconsistent with IOAPIC_NUM_PINS and only worked because libkrun installs MSI routes rather than pin routes. 
aarch64 GIC nr_irqs fix and the aarch64 IRQ_MAX bump from the prior commits remain in place — those stand on their own. --- src/arch/src/lib.rs | 4 ++-- src/arch/src/x86_64/layout.rs | 8 ++++++-- src/devices/src/legacy/ioapic.rs | 4 +++- src/krun/src/api/builder.rs | 1 + src/krun/src/api/builders.rs | 15 +++++++++++++++ src/vmm/src/builder.rs | 28 +++++++++++++++++++++------- 6 files changed, 48 insertions(+), 12 deletions(-) diff --git a/src/arch/src/lib.rs b/src/arch/src/lib.rs index 10f4de073..ce5bb8479 100644 --- a/src/arch/src/lib.rs +++ b/src/arch/src/lib.rs @@ -50,8 +50,8 @@ pub mod x86_64; #[cfg(target_arch = "x86_64")] pub use crate::x86_64::{ arch_memory_regions, configure_system, layout::CMDLINE_MAX_SIZE, layout::FIRMWARE_SIZE, - layout::FIRMWARE_START, layout::IRQ_BASE, layout::IRQ_MAX, layout::MMIO_MEM_START, - layout::RESET_VECTOR, Error, + layout::FIRMWARE_START, layout::IRQ_BASE, layout::IRQ_MAX, layout::IRQ_MAX_SPLIT, + layout::MMIO_MEM_START, layout::RESET_VECTOR, Error, }; /// Type for returning public functions outcome. diff --git a/src/arch/src/x86_64/layout.rs b/src/arch/src/x86_64/layout.rs index 907e994bb..488fe8dc0 100644 --- a/src/arch/src/x86_64/layout.rs +++ b/src/arch/src/x86_64/layout.rs @@ -25,8 +25,12 @@ pub const HIMEM_START: u64 = 0x0010_0000; //1 MB. // Typically, on x86 systems 16 IRQs are used (0-15). /// First usable IRQ ID for virtio device interrupts on x86_64. pub const IRQ_BASE: u32 = 5; -/// Last usable IRQ ID for virtio device interrupts on x86_64. -pub const IRQ_MAX: u32 = 223; +/// Last usable IRQ ID for virtio device interrupts on x86_64 when using +/// KVM's in-kernel IOAPIC (hardcoded to 24 pins by KVM_IOAPIC_NUM_PINS). +pub const IRQ_MAX: u32 = 15; +/// Last usable IRQ ID when using the userspace split irqchip, which +/// emulates an IOAPIC with `IOAPIC_NUM_PINS` redirection entries. +pub const IRQ_MAX_SPLIT: u32 = 223; /// Address for the TSS setup. 
pub const KVM_TSS_ADDRESS: u64 = 0xfffb_d000; diff --git a/src/devices/src/legacy/ioapic.rs b/src/devices/src/legacy/ioapic.rs index cb887b8b5..f586dec2b 100644 --- a/src/devices/src/legacy/ioapic.rs +++ b/src/devices/src/legacy/ioapic.rs @@ -114,7 +114,9 @@ impl IoApic { cap: KVM_CAP_SPLIT_IRQCHIP, ..Default::default() }; - cap.args[0] = 24; + // args[0] is the number of GSIs reserved for the userspace IOAPIC; + // must match the emulated IOAPIC's pin count. + cap.args[0] = IOAPIC_NUM_PINS as u64; vm.enable_cap(&cap)?; } diff --git a/src/krun/src/api/builder.rs b/src/krun/src/api/builder.rs index fb7387d1c..6a3319a41 100644 --- a/src/krun/src/api/builder.rs +++ b/src/krun/src/api/builder.rs @@ -332,6 +332,7 @@ impl VmBuilder { vmr.set_vm_config(&vm_config) .map_err(|err| map_vm_config_error(&self.machine, err))?; vmr.nested_enabled = self.machine.nested_virt; + vmr.split_irqchip = self.machine.split_irqchip; // Apply filesystem configuration #[cfg(not(feature = "tee"))] diff --git a/src/krun/src/api/builders.rs b/src/krun/src/api/builders.rs index c2dfb6e52..49315fecc 100644 --- a/src/krun/src/api/builders.rs +++ b/src/krun/src/api/builders.rs @@ -42,6 +42,7 @@ pub struct MachineBuilder { pub(crate) memory_mib: usize, pub(crate) hyperthreading: bool, pub(crate) nested_virt: bool, + pub(crate) split_irqchip: bool, } //-------------------------------------------------------------------------------------------------- @@ -296,6 +297,7 @@ impl MachineBuilder { memory_mib: 512, hyperthreading: false, nested_virt: false, + split_irqchip: false, } } @@ -322,6 +324,19 @@ impl MachineBuilder { self.nested_virt = enabled; self } + + /// Enable the userspace split irqchip on x86_64. + /// + /// The default in-kernel IOAPIC is hardcoded by KVM to 24 pins, leaving + /// only 11 IRQs for virtio-mmio devices. 
The userspace split irqchip + /// emulates a larger IOAPIC (256 pins) and raises the usable range to + /// 219 IRQs, which is needed for VMs with many virtio-mmio devices + /// (e.g. lots of virtio-fs mounts or block devices). No effect on + /// aarch64 or riscv64. + pub fn split_irqchip(mut self, enabled: bool) -> Self { + self.split_irqchip = enabled; + self + } } impl Default for MachineBuilder { diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 4a64a8fce..d053cb263 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -39,7 +39,7 @@ use devices::legacy::VcpuList; #[cfg(target_os = "macos")] use devices::legacy::{GicV3, HvfGicV3}; #[cfg(target_arch = "x86_64")] -use devices::legacy::{IoApic, IrqChipT}; +use devices::legacy::{IoApic, IrqChipT, KvmIoapic}; use devices::legacy::{IrqChip, IrqChipDevice}; #[cfg(all(target_os = "linux", target_arch = "aarch64"))] use devices::legacy::{KvmGicV2, KvmGicV3}; @@ -799,10 +799,18 @@ pub fn build_microvm( // Instantiate the MMIO device manager. // 'mmio_base' address has to be an address which is protected by the kernel // and is architectural specific. + #[cfg(target_arch = "x86_64")] + let irq_max = if vm_resources.split_irqchip { + arch::IRQ_MAX_SPLIT + } else { + arch::IRQ_MAX + }; + #[cfg(not(target_arch = "x86_64"))] + let irq_max = arch::IRQ_MAX; #[allow(unused_mut)] let mut mmio_device_manager = MMIODeviceManager::new( &mut (arch::MMIO_MEM_START.clone()), - (arch::IRQ_BASE, arch::IRQ_MAX), + (arch::IRQ_BASE, irq_max), ); #[cfg(target_os = "macos")] @@ -817,15 +825,21 @@ pub fn build_microvm( // while on aarch64 we need to do it the other way around. #[cfg(target_arch = "x86_64")] { - // Always use split irqchip on x86_64 to support 224 IRQ lines (0-223). 
- let ioapic: Box = Box::new( - IoApic::new(vm.fd(), _sender.clone()).map_err(StartMicrovmError::CreateKvmIrqChip)?, - ); + // Userspace split irqchip is required for >11 virtio IRQs, since KVM's + // in-kernel IOAPIC is hardcoded at 24 pins (KVM_IOAPIC_NUM_PINS). + let ioapic: Box = if vm_resources.split_irqchip { + Box::new( + IoApic::new(vm.fd(), _sender.clone()) + .map_err(StartMicrovmError::CreateKvmIrqChip)?, + ) + } else { + Box::new(KvmIoapic::new(vm.fd()).map_err(StartMicrovmError::CreateKvmIrqChip)?) + }; intc = Arc::new(Mutex::new(IrqChipDevice::new(ioapic))); attach_legacy_devices( &vm, - true, + vm_resources.split_irqchip, &mut pio_device_manager, &mut mmio_device_manager, Some(intc.clone()), From c77e491a217e15c3089ea59e39e3900df6e07319 Mon Sep 17 00:00:00 2001 From: Stephen Akinyemi Date: Sat, 18 Apr 2026 22:14:27 +0100 Subject: [PATCH 5/5] fix(ci): satisfy newer clippy and rustfmt Both lints are pre-existing on main and only surface because the CI runner uses the latest stable toolchain, which is ahead of the local dev toolchain. - src/devices/src/virtio/snd/worker.rs: drop a useless .into_iter() on the argument to Vec::extend (clippy::useless_conversion in 1.95+). - src/cpuid/src/common.rs: reorder the std::arch::* imports to put CpuidResult last, matching rustfmt 1.95+'s case-insensitive sort. 
--- src/cpuid/src/common.rs | 4 ++-- src/devices/src/virtio/snd/worker.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cpuid/src/common.rs b/src/cpuid/src/common.rs index b09232edb..80fab58fe 100644 --- a/src/cpuid/src/common.rs +++ b/src/cpuid/src/common.rs @@ -2,9 +2,9 @@ // SPDX-License-Identifier: Apache-2.0 #[cfg(target_arch = "x86")] -use std::arch::x86::{CpuidResult, __cpuid_count, __get_cpuid_max}; +use std::arch::x86::{__cpuid_count, __get_cpuid_max, CpuidResult}; #[cfg(target_arch = "x86_64")] -use std::arch::x86_64::{CpuidResult, __cpuid_count, __get_cpuid_max}; +use std::arch::x86_64::{__cpuid_count, __get_cpuid_max, CpuidResult}; use crate::cpu_leaf::*; diff --git a/src/devices/src/virtio/snd/worker.rs b/src/devices/src/virtio/snd/worker.rs index b269c9503..e7950d557 100644 --- a/src/devices/src/virtio/snd/worker.rs +++ b/src/devices/src/virtio/snd/worker.rs @@ -586,7 +586,7 @@ impl SndWorker { { self.streams.write().unwrap()[stream_id as usize] .buffers - .extend(std::mem::take(&mut buffers).into_iter()); + .extend(std::mem::take(&mut buffers)); state = IoState::Done; } IoState::Ready if descriptor.len as usize != size_of::() => {