From cae36b95c52566b2aa5000a9df13a18e4cf8fd16 Mon Sep 17 00:00:00 2001 From: Matt Evans Date: Mon, 11 May 2026 07:58:23 -0700 Subject: [PATCH 1/2] vfio/pci: Set up BAR resources and maps in vfio_pci_core_enable() Previously BAR resource requests and the corresponding pci_iomap() were performed on-demand and without synchronisation, which was racy. Rather than add synchronisation, it's simplest to address this by doing both activities from vfio_pci_core_enable(). The resource allocation and/or pci_iomap() can still fail; their status is tracked and existing calls to vfio_pci_core_setup_barmap() will fail in a similar way to before. This keeps the point of failure as observed by userspace the same, i.e. failures to request/map unused BARs are benign. Fixes: 89e1f7d4c66d ("vfio: Add PCI device driver") Signed-off-by: Matt Evans Link: https://lore.kernel.org/r/20260511145829.2993601-2-mattev@meta.com [ERR_PTR -> IOMEM_ERR_PTR per lkp report] Signed-off-by: Alex Williamson (cherry picked from commit 05f2a68b407a6817fe141dd64972c6ab8725312d) --- drivers/vfio/pci/vfio_pci_core.c | 37 +++++++++++++++++++++++++++++++- drivers/vfio/pci/vfio_pci_rdwr.c | 26 ++++++---------------- 2 files changed, 43 insertions(+), 20 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 595503fa9ca89..e297e2d34aed4 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -451,6 +451,40 @@ static int vfio_pci_core_runtime_resume(struct device *dev) } #endif /* CONFIG_PM */ +/* + * Eager-request BAR resources, and iomap them. Soft failures are + * allowed, and consumers must check the barmap before use in order to + * give compatible user-visible behaviour with the previous on-demand + * allocation method. 
+ */ +static void vfio_pci_core_map_bars(struct vfio_pci_core_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int i; + + for (i = 0; i < PCI_STD_NUM_BARS; i++) { + int bar = i + PCI_STD_RESOURCES; + + vdev->barmap[bar] = IOMEM_ERR_PTR(-ENODEV); + + if (!pci_resource_len(pdev, i)) + continue; + + if (pci_request_selected_regions(pdev, 1 << bar, "vfio")) { + pci_dbg(pdev, "Failed to reserve region %d\n", bar); + vdev->barmap[bar] = IOMEM_ERR_PTR(-EBUSY); + continue; + } + + vdev->barmap[bar] = pci_iomap(pdev, bar, 0); + if (!vdev->barmap[bar]) { + pci_dbg(pdev, "Failed to iomap region %d\n", bar); + pci_release_selected_regions(pdev, 1 << bar); + vdev->barmap[bar] = IOMEM_ERR_PTR(-ENOMEM); + } + } +} + /* * The pci-driver core runtime PM routines always save the device state * before going into suspended state. If the device is going into low power @@ -537,6 +571,7 @@ int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; + vfio_pci_core_map_bars(vdev); return 0; @@ -616,7 +651,7 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) for (i = 0; i < PCI_STD_NUM_BARS; i++) { bar = i + PCI_STD_RESOURCES; - if (!vdev->barmap[bar]) + if (IS_ERR_OR_NULL(vdev->barmap[bar])) continue; pci_iounmap(pdev, vdev->barmap[bar]); pci_release_selected_regions(pdev, 1 << bar); diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index a0595c745732a..36372163c5275 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -206,27 +206,15 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, } EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); +/* + * The barmap is set up in vfio_pci_core_enable(). Callers use this + * function to check that the BAR resources are requested or that the + * pci_iomap() was done. 
+ */ int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar) { - struct pci_dev *pdev = vdev->pdev; - int ret; - void __iomem *io; - - if (vdev->barmap[bar]) - return 0; - - ret = pci_request_selected_regions(pdev, 1 << bar, "vfio"); - if (ret) - return ret; - - io = pci_iomap(pdev, bar, 0); - if (!io) { - pci_release_selected_regions(pdev, 1 << bar); - return -ENOMEM; - } - - vdev->barmap[bar] = io; - + if (IS_ERR(vdev->barmap[bar])) + return PTR_ERR(vdev->barmap[bar]); return 0; } EXPORT_SYMBOL_GPL(vfio_pci_core_setup_barmap); From e3d0085fffd94a05ac95f013e93c3697dbfffe3c Mon Sep 17 00:00:00 2001 From: Mikhail Malyshev Date: Thu, 18 Jun 2026 23:15:04 +0000 Subject: [PATCH 2/2] KVM: x86/mmu: emulate (not -EFAULT) guest access to a disabled passthrough BAR A passed-through PCI device's BAR is mapped into the guest via a VM_IO/ VM_PFNMAP VMA whose fault handler (e.g. vfio_pci_mmap_fault) declines to install a PTE while the device's memory space is disabled, such as right after the guest clears PCI_COMMAND.MEM. If another vCPU accesses that BAR during the window, the gup in the page-fault path fails with an error pfn even though the memslot is still valid, and KVM_RUN returns -EFAULT to userspace, crashing the VM. A guest can trigger this at will, so it is a guest-triggerable host-side VM kill. On real hardware an access to a BAR with memory decoding disabled completes as an Unsupported Request (reads return all-ones, writes are dropped). KVM can present the same behaviour by treating the access as MMIO and emulating it, which is exactly what the noslot path already does. Distinguish the VM_IO/VM_PFNMAP fault-handler failure from other error pfns with a new KVM_PFN_ERR_PFNMAP value (in-range, so existing error-pfn range checks are unaffected) and route it to kvm_handle_noslot_fault() in the x86 TDP fault path. Genuine, non-pfnmap faults (e.g. a vanished anonymous backing) still take the fatal -EFAULT path, so real errors are not masked. 
The MMIO mapping self-heals: once the device memory is re-enabled and the memslot is updated, the MMIO generation is bumped. Fixes: abafbc551fdd ("vfio-pci: Invalidate mmaps and block MMIO access on disabled memory") Signed-off-by: Mikhail Malyshev --- arch/x86/kvm/mmu/mmu.c | 16 +++++++++++++++- include/linux/kvm_host.h | 7 +++++++ virt/kvm/kvm_main.c | 8 +++++++- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 700926eb77dfa..e2e41f1bdb63c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4524,8 +4524,22 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (ret != RET_PF_CONTINUE) return ret; - if (unlikely(is_error_pfn(fault->pfn))) + if (unlikely(is_error_pfn(fault->pfn))) { + /* + * A passed-through PCI BAR is backed by a VM_IO/VM_PFNMAP + * mapping whose fault handler refuses to install a PTE while the + * device's memory space is disabled (e.g. the guest cleared + * PCI_COMMAND.MEM). The gup then fails even though the memslot + * is still valid. Treat such an access as MMIO and emulate it + * (the guest observes Unsupported Request semantics, matching + * real hardware) instead of killing the VM with -EFAULT. Other, + * non-pfnmap errors still take the fatal path. 
+ */ + if (fault->pfn == KVM_PFN_ERR_PFNMAP) + return kvm_handle_noslot_fault(vcpu, fault, access); + return kvm_handle_error_pfn(vcpu, fault); + } if (WARN_ON_ONCE(!fault->slot || is_noslot_pfn(fault->pfn))) return kvm_handle_noslot_fault(vcpu, fault, access); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2e836d44f7386..0e0b08a615372 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -97,6 +97,13 @@ #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) #define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3) +/* + * Faulting in a VM_IO/VM_PFNMAP mapping failed because its fault handler + * declined to install a PTE, e.g. a passed-through PCI BAR whose device memory + * is currently disabled (guest cleared PCI_COMMAND.MEM). The memslot is valid; + * the access should be treated as MMIO rather than a fatal -EFAULT. + */ +#define KVM_PFN_ERR_PFNMAP (KVM_PFN_ERR_MASK + 4) /* * error pfns indicate that the gfn is in slot but faild to diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aba4078ae2250..24a6166f917df 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2991,7 +2991,13 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, if (r == -EAGAIN) goto retry; if (r < 0) - pfn = KVM_PFN_ERR_FAULT; + /* + * Reached for any error other than -EAGAIN, e.g. a VMA + * fault handler declining to install a PTE for a BAR + * with device memory disabled. Flag it distinctly so + * KVM's page fault path can treat the access as MMIO. + */ + pfn = KVM_PFN_ERR_PFNMAP; } else { if (async && vma_is_valid(vma, write_fault)) *async = true;