From 9edc75e5e0e2ca3a3ec5e4aa5766b8214a109ab6 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:21 +0530 Subject: [PATCH 01/18] RISC-V: KVM: Check kvm_riscv_vcpu_alloc_vector_context() return value mainline inclusion from mainline-6.17-rc1 commit 4a50578 category: feature bugzilla: RVCK-Project#257 -------------------------------- The kvm_riscv_vcpu_alloc_vector_context() does return an error code upon failure so don't ignore this in kvm_arch_vcpu_create(). Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-2-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/kvm/vcpu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 262101c0679f..a0faa152d9c1 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -139,8 +139,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) cntx->hstatus |= HSTATUS_SPVP; cntx->hstatus |= HSTATUS_SPV; - if (kvm_riscv_vcpu_alloc_vector_context(vcpu, cntx)) - return -ENOMEM; + rc = kvm_riscv_vcpu_alloc_vector_context(vcpu, cntx); + if (rc) + return rc; /* By default, make CY, TM, and IR counters accessible in VU mode */ reset_csr->scounteren = 0x7; From a648c2a8a801e552fa17db1aef7fb1b7ea1565f7 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:22 +0530 Subject: [PATCH 02/18] RISC-V: KVM: Drop the return value of kvm_riscv_vcpu_aia_init() mainline inclusion from mainline-6.17-rc1 commit 7c67de2 category: feature bugzilla: RVCK-Project#257 -------------------------------- The kvm_riscv_vcpu_aia_init() does not return any failure so drop the return value which is always zero. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-3-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/include/asm/kvm_aia.h | 2 +- arch/riscv/kvm/aia_device.c | 6 ++---- arch/riscv/kvm/vcpu.c | 4 +--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h index 1f37b600ca47..2bcd089aa193 100644 --- a/arch/riscv/include/asm/kvm_aia.h +++ b/arch/riscv/include/asm/kvm_aia.h @@ -150,7 +150,7 @@ int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num, int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu); -int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu); int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index, diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c index 39cd26af5a69..888566d3ebf3 100644 --- a/arch/riscv/kvm/aia_device.c +++ b/arch/riscv/kvm/aia_device.c @@ -541,12 +541,12 @@ void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu) kvm_riscv_vcpu_aia_imsic_reset(vcpu); } -int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu) +void kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context; if (!kvm_riscv_aia_available()) - return 0; + return; /* * We don't do any memory allocations over here because these @@ -558,8 +558,6 @@ int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu) /* Initialize default values in AIA vcpu context */ vaia->imsic_addr = KVM_RISCV_AIA_UNDEF_ADDR; vaia->hart_index = vcpu->vcpu_idx; - - return 0; } void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index a0faa152d9c1..3665caa2d062 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -153,9 +153,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) kvm_riscv_vcpu_pmu_init(vcpu); /* Setup VCPU AIA */ - rc = kvm_riscv_vcpu_aia_init(vcpu); - if (rc) - return rc; + kvm_riscv_vcpu_aia_init(vcpu); /* * Setup SBI extensions From c2b71ba76396824ea2a84b0b5347b1cf6cc9bcdb Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:23 +0530 Subject: [PATCH 03/18] RISC-V: KVM: Rename and move kvm_riscv_local_tlb_sanitize() mainline inclusion from mainline-6.17-rc1 commit b79bf20 category: feature bugzilla: RVCK-Project#257 -------------------------------- The kvm_riscv_local_tlb_sanitize() deals with sanitizing current VMID related TLB mappings when a VCPU is moved from one host CPU to another. Let's move kvm_riscv_local_tlb_sanitize() to VMID management sources and rename it to kvm_riscv_gstage_vmid_sanitize(). Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-4-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/include/asm/kvm_host.h | 3 +-- arch/riscv/kvm/tlb.c | 23 ----------------------- arch/riscv/kvm/vcpu.c | 4 ++-- arch/riscv/kvm/vmid.c | 23 +++++++++++++++++++++++ 4 files changed, 26 insertions(+), 27 deletions(-) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 0b2387cd7846..798a37b303a6 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -330,8 +330,6 @@ void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, unsigned long order); void kvm_riscv_local_hfence_vvma_all(unsigned long vmid); -void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu); - void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu); void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu); void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu); @@ -379,6 +377,7 @@ unsigned long kvm_riscv_gstage_vmid_bits(void); int kvm_riscv_gstage_vmid_init(struct kvm *kvm); bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid); void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu); +void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu); int kvm_riscv_setup_default_irq_routing(struct kvm *kvm, u32 lines); diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index 2f91ea5f8493..b3461bfd9756 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -156,29 +156,6 @@ void kvm_riscv_local_hfence_vvma_all(unsigned long vmid) csr_write(CSR_HGATP, hgatp); } -void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu) -{ - unsigned long vmid; - - if (!kvm_riscv_gstage_vmid_bits() || - vcpu->arch.last_exit_cpu == vcpu->cpu) - return; - - /* - * On RISC-V platforms with hardware VMID support, we share same - * VMID for all VCPUs of a particular Guest/VM. This means we might - * have stale G-stage TLB entries on the current Host CPU due to - * some other VCPU of the same Guest which ran previously on the - * current Host CPU. - * - * To cleanup stale TLB entries, we simply flush all G-stage TLB - * entries by VMID whenever underlying Host CPU changes for a VCPU. - */ - - vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); - kvm_riscv_local_hfence_gvma_vmid_all(vmid); -} - void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu) { kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_RCVD); diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 3665caa2d062..f0c363d51004 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -951,12 +951,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) } /* - * Cleanup stale TLB enteries + * Sanitize VMID mappings cached (TLB) on current CPU * * Note: This should be done after G-stage VMID has been * updated using kvm_riscv_gstage_vmid_ver_changed() */ - kvm_riscv_local_tlb_sanitize(vcpu); + kvm_riscv_gstage_vmid_sanitize(vcpu); trace_kvm_entry(vcpu); diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index ddc98714ce8e..92c01255f86f 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -122,3 +122,26 @@ void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu) kvm_for_each_vcpu(i, v, vcpu->kvm) kvm_make_request(KVM_REQ_UPDATE_HGATP, v); } + +void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu) +{ + unsigned long vmid; + + if (!kvm_riscv_gstage_vmid_bits() || + vcpu->arch.last_exit_cpu == vcpu->cpu) + return; + + /* + * On RISC-V platforms with hardware VMID support, we share same + * VMID for all VCPUs of a particular Guest/VM. This means we might + * have stale G-stage TLB entries on the current Host CPU due to + * some other VCPU of the same Guest which ran previously on the + * current Host CPU. + * + * To cleanup stale TLB entries, we simply flush all G-stage TLB + * entries by VMID whenever underlying Host CPU changes for a VCPU. + */ + + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); + kvm_riscv_local_hfence_gvma_vmid_all(vmid); +} From 3151ba5b1d6d2906700dfc0b8d4742fec4875125 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:24 +0530 Subject: [PATCH 04/18] RISC-V: KVM: Replace KVM_REQ_HFENCE_GVMA_VMID_ALL with KVM_REQ_TLB_FLUSH mainline inclusion from mainline-6.17-rc1 commit 7584eb6 category: feature bugzilla: RVCK-Project#257 -------------------------------- The KVM_REQ_HFENCE_GVMA_VMID_ALL is same as KVM_REQ_TLB_FLUSH so to avoid confusion let's replace KVM_REQ_HFENCE_GVMA_VMID_ALL with KVM_REQ_TLB_FLUSH. Also, rename kvm_riscv_hfence_gvma_vmid_all_process() to kvm_riscv_tlb_flush_process(). Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-5-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/include/asm/kvm_host.h | 4 ++-- arch/riscv/kvm/tlb.c | 8 ++++---- arch/riscv/kvm/vcpu.c | 8 ++------ 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 798a37b303a6..46f208943ac6 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -37,7 +37,6 @@ #define KVM_REQ_UPDATE_HGATP KVM_ARCH_REQ(2) #define KVM_REQ_FENCE_I \ KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_HFENCE_GVMA_VMID_ALL KVM_REQ_TLB_FLUSH #define KVM_REQ_HFENCE_VVMA_ALL \ KVM_ARCH_REQ_FLAGS(4, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HFENCE \ @@ -330,8 +329,9 @@ void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, unsigned long order); void kvm_riscv_local_hfence_vvma_all(unsigned long vmid); +void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu); + void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu); -void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu); void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu); void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu); diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index b3461bfd9756..da98ca801d31 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -162,7 +162,7 @@ void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu) local_flush_icache_all(); } -void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu) +void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu) { struct kvm_vmid *v = &vcpu->kvm->arch.vmid; unsigned long vmid = READ_ONCE(v->vmid); @@ -342,14 +342,14 @@ void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, data.size = gpsz; data.order = order; make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, - KVM_REQ_HFENCE_GVMA_VMID_ALL, &data); + KVM_REQ_TLB_FLUSH, &data); } void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm, unsigned long hbase, unsigned long hmask) { - make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_GVMA_VMID_ALL, - KVM_REQ_HFENCE_GVMA_VMID_ALL, NULL); + make_xfence_request(kvm, hbase, hmask, KVM_REQ_TLB_FLUSH, + KVM_REQ_TLB_FLUSH, NULL); } void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index f0c363d51004..efdd93006e4d 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -709,12 +709,8 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_FENCE_I, vcpu)) kvm_riscv_fence_i_process(vcpu); - /* - * The generic KVM_REQ_TLB_FLUSH is same as - * KVM_REQ_HFENCE_GVMA_VMID_ALL - */ - if (kvm_check_request(KVM_REQ_HFENCE_GVMA_VMID_ALL, vcpu)) - kvm_riscv_hfence_gvma_vmid_all_process(vcpu); + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvm_riscv_tlb_flush_process(vcpu); if (kvm_check_request(KVM_REQ_HFENCE_VVMA_ALL, vcpu)) kvm_riscv_hfence_vvma_all_process(vcpu); From 45698a429cd0c789a98b57670178bcce9396e8a8 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:25 +0530 Subject: [PATCH 05/18] RISC-V: KVM: Don't flush TLB when PTE is unchanged mainline inclusion from mainline-6.17-rc1 commit eaa98ba category: feature bugzilla: RVCK-Project#257 -------------------------------- The gstage_set_pte() and gstage_op_pte() should flush TLB only when a leaf PTE changes so that unnecessary TLB flushes can be avoided. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-6-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/kvm/mmu.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 7656f4f46e7b..3272bba73594 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -167,9 +167,11 @@ static int gstage_set_pte(struct kvm *kvm, u32 level, ptep = &next_ptep[gstage_pte_index(addr, current_level)]; } - set_pte(ptep, *new_pte); - if (gstage_pte_leaf(ptep)) - gstage_remote_tlb_flush(kvm, current_level, addr); + if (pte_val(*ptep) != pte_val(*new_pte)) { + set_pte(ptep, *new_pte); + if (gstage_pte_leaf(ptep)) + gstage_remote_tlb_flush(kvm, current_level, addr); + } return 0; } @@ -229,7 +231,7 @@ static void gstage_op_pte(struct kvm *kvm, gpa_t addr, pte_t *ptep, u32 ptep_level, enum gstage_op op) { int i, ret; - pte_t *next_ptep; + pte_t old_pte, *next_ptep; u32 next_ptep_level; unsigned long next_page_size, page_size; @@ -258,11 +260,13 @@ static void gstage_op_pte(struct kvm *kvm, gpa_t addr, if (op == GSTAGE_OP_CLEAR) put_page(virt_to_page(next_ptep)); } else { + old_pte = *ptep; if (op == GSTAGE_OP_CLEAR) set_pte(ptep, __pte(0)); else if (op == GSTAGE_OP_WP) set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE)); - gstage_remote_tlb_flush(kvm, ptep_level, addr); + if (pte_val(*ptep) != pte_val(old_pte)) + gstage_remote_tlb_flush(kvm, ptep_level, addr); } } From 30fc9e1fb1f4f5c876fec1a5f7678af3b62650d8 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:26 +0530 Subject: [PATCH 06/18] RISC-V: KVM: Implement kvm_arch_flush_remote_tlbs_range() mainline inclusion from mainline-6.17-rc1 commit ca539ba category: feature bugzilla: RVCK-Project#257 -------------------------------- The kvm_arch_flush_remote_tlbs_range() expected by KVM core can be easily implemented for RISC-V using kvm_riscv_hfence_gvma_vmid_gpa() hence provide it. Also with kvm_arch_flush_remote_tlbs_range() available for RISC-V, the mmu_wp_memory_region() can happily use kvm_flush_remote_tlbs_memslot() instead of kvm_flush_remote_tlbs(). Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-7-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/include/asm/kvm_host.h | 2 ++ arch/riscv/kvm/mmu.c | 2 +- arch/riscv/kvm/tlb.c | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 46f208943ac6..d8b517a6398f 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -43,6 +43,8 @@ KVM_ARCH_REQ_FLAGS(5, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(6) +#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE + #define KVM_HEDELEG_DEFAULT (BIT(EXC_INST_MISALIGNED) | \ BIT(EXC_BREAKPOINT) | \ BIT(EXC_SYSCALL) | \ diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 3272bba73594..2d75ac4742ff 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -344,7 +344,7 @@ static void gstage_wp_memory_region(struct kvm *kvm, int slot) spin_lock(&kvm->mmu_lock); gstage_wp_range(kvm, start, end); spin_unlock(&kvm->mmu_lock); - kvm_flush_remote_tlbs(kvm); + kvm_flush_remote_tlbs_memslot(kvm, memslot); } int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index da98ca801d31..f46a27658c2e 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -403,3 +403,11 @@ void kvm_riscv_hfence_vvma_all(struct kvm *kvm, make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_VVMA_ALL, KVM_REQ_HFENCE_VVMA_ALL, NULL); } + +int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) +{ + kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, + gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT, + PAGE_SHIFT); + return 0; +} From c75ed7c9a75fa1d294d6462988273c99545737fd Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:27 +0530 Subject: [PATCH 07/18] RISC-V: KVM: Use ncsr_xyz() in kvm_riscv_vcpu_trap_redirect() mainline inclusion from mainline-6.17-rc1 commit 77ba646 category: feature bugzilla: RVCK-Project#257 -------------------------------- The H-extension CSRs accessed by kvm_riscv_vcpu_trap_redirect() will trap when KVM RISC-V is running as Guest/VM hence remove these traps by using ncsr_xyz() instead of csr_xyz(). Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-8-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/kvm/vcpu_exit.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index 6e0c18412795..85c43c83e3b9 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -9,6 +9,7 @@ #include #include #include +#include static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_trap *trap) @@ -135,7 +136,7 @@ unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu, void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu, struct kvm_cpu_trap *trap) { - unsigned long vsstatus = csr_read(CSR_VSSTATUS); + unsigned long vsstatus = ncsr_read(CSR_VSSTATUS); /* Change Guest SSTATUS.SPP bit */ vsstatus &= ~SR_SPP; @@ -151,15 +152,15 @@ void kvm_riscv_vcpu_trap_redirect(struct kvm_vcpu *vcpu, vsstatus &= ~SR_SIE; /* Update Guest SSTATUS */ - csr_write(CSR_VSSTATUS, vsstatus); + ncsr_write(CSR_VSSTATUS, vsstatus); /* Update Guest SCAUSE, STVAL, and SEPC */ - csr_write(CSR_VSCAUSE, trap->scause); - csr_write(CSR_VSTVAL, trap->stval); - csr_write(CSR_VSEPC, trap->sepc); + ncsr_write(CSR_VSCAUSE, trap->scause); + ncsr_write(CSR_VSTVAL, trap->stval); + ncsr_write(CSR_VSEPC, trap->sepc); /* Set Guest PC to Guest exception vector */ - vcpu->arch.guest_context.sepc = csr_read(CSR_VSTVEC); + vcpu->arch.guest_context.sepc = ncsr_read(CSR_VSTVEC); /* Set Guest privilege mode to supervisor */ vcpu->arch.guest_context.sstatus |= SR_SPP; From 49537baf41b1ef080c82d64ce06fb2ff8f4f69eb Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:28 +0530 Subject: [PATCH 08/18] RISC-V: KVM: Factor-out MMU related declarations into separate headers mainline inclusion from mainline-6.17-rc1 commit 4ecbd3e category: feature bugzilla: RVCK-Project#257 -------------------------------- The MMU, TLB, and VMID management for KVM RISC-V already exists as seprate sources so create separate headers along these lines. This further simplifies asm/kvm_host.h header. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-9-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/include/asm/kvm_host.h | 100 +----------------------------- arch/riscv/include/asm/kvm_mmu.h | 26 ++++++++ arch/riscv/include/asm/kvm_tlb.h | 78 +++++++++++++++++++++++ arch/riscv/include/asm/kvm_vmid.h | 27 ++++++++ arch/riscv/kvm/aia_imsic.c | 1 + arch/riscv/kvm/main.c | 1 + arch/riscv/kvm/mmu.c | 1 + arch/riscv/kvm/tlb.c | 2 + arch/riscv/kvm/vcpu.c | 1 + arch/riscv/kvm/vcpu_exit.c | 1 + arch/riscv/kvm/vm.c | 1 + arch/riscv/kvm/vmid.c | 2 + 12 files changed, 143 insertions(+), 98 deletions(-) create mode 100644 arch/riscv/include/asm/kvm_mmu.h create mode 100644 arch/riscv/include/asm/kvm_tlb.h create mode 100644 arch/riscv/include/asm/kvm_vmid.h diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index d8b517a6398f..539ec3fc7d0c 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include #include @@ -56,24 +58,6 @@ BIT(IRQ_VS_TIMER) | \ BIT(IRQ_VS_EXT)) -enum kvm_riscv_hfence_type { - KVM_RISCV_HFENCE_UNKNOWN = 0, - KVM_RISCV_HFENCE_GVMA_VMID_GPA, - KVM_RISCV_HFENCE_VVMA_ASID_GVA, - KVM_RISCV_HFENCE_VVMA_ASID_ALL, - KVM_RISCV_HFENCE_VVMA_GVA, -}; - -struct kvm_riscv_hfence { - enum kvm_riscv_hfence_type type; - unsigned long asid; - unsigned long order; - gpa_t addr; - gpa_t size; -}; - -#define KVM_RISCV_VCPU_MAX_HFENCE 64 - struct kvm_vm_stat { struct kvm_vm_stat_generic generic; }; @@ -99,15 +83,6 @@ struct kvm_vcpu_stat { struct kvm_arch_memory_slot { }; -struct kvm_vmid { - /* - * Writes to vmid_version and vmid happen with vmid_lock held - * whereas reads happen without any lock held. - */ - unsigned long vmid_version; - unsigned long vmid; -}; - struct kvm_arch { /* G-stage vmid */ struct kvm_vmid vmid; @@ -310,77 +285,6 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} #define KVM_ARCH_WANT_MMU_NOTIFIER -#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12 - -void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, - gpa_t gpa, gpa_t gpsz, - unsigned long order); -void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid); -void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz, - unsigned long order); -void kvm_riscv_local_hfence_gvma_all(void); -void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid, - unsigned long asid, - unsigned long gva, - unsigned long gvsz, - unsigned long order); -void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid, - unsigned long asid); -void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, - unsigned long gva, unsigned long gvsz, - unsigned long order); -void kvm_riscv_local_hfence_vvma_all(unsigned long vmid); - -void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu); - -void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu); -void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu); -void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu); - -void kvm_riscv_fence_i(struct kvm *kvm, - unsigned long hbase, unsigned long hmask); -void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, - unsigned long hbase, unsigned long hmask, - gpa_t gpa, gpa_t gpsz, - unsigned long order); -void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm, - unsigned long hbase, unsigned long hmask); -void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, - unsigned long hbase, unsigned long hmask, - unsigned long gva, unsigned long gvsz, - unsigned long order, unsigned long asid); -void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, - unsigned long hbase, unsigned long hmask, - unsigned long asid); -void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, - unsigned long hbase, unsigned long hmask, - unsigned long gva, unsigned long gvsz, - unsigned long order); -void kvm_riscv_hfence_vvma_all(struct kvm *kvm, - unsigned long hbase, unsigned long hmask); - -int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, - phys_addr_t hpa, unsigned long size, - bool writable, bool in_atomic); -void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, - unsigned long size); -int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, - struct kvm_memory_slot *memslot, - gpa_t gpa, unsigned long hva, bool is_write); -int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm); -void kvm_riscv_gstage_free_pgd(struct kvm *kvm); -void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu); -void __init kvm_riscv_gstage_mode_detect(void); -unsigned long __init kvm_riscv_gstage_mode(void); -int kvm_riscv_gstage_gpa_bits(void); - -void __init kvm_riscv_gstage_vmid_detect(void); -unsigned long kvm_riscv_gstage_vmid_bits(void); -int kvm_riscv_gstage_vmid_init(struct kvm *kvm); -bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid); -void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu); -void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu); - int kvm_riscv_setup_default_irq_routing(struct kvm *kvm, u32 lines); void __kvm_riscv_unpriv_trap(void); diff --git a/arch/riscv/include/asm/kvm_mmu.h b/arch/riscv/include/asm/kvm_mmu.h new file mode 100644 index 000000000000..4e1654282ee4 --- /dev/null +++ b/arch/riscv/include/asm/kvm_mmu.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 Ventana Micro Systems Inc. + */ + +#ifndef __RISCV_KVM_MMU_H_ +#define __RISCV_KVM_MMU_H_ + +#include + +int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, + phys_addr_t hpa, unsigned long size, + bool writable, bool in_atomic); +void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, + unsigned long size); +int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, + struct kvm_memory_slot *memslot, + gpa_t gpa, unsigned long hva, bool is_write); +int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm); +void kvm_riscv_gstage_free_pgd(struct kvm *kvm); +void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu); +void kvm_riscv_gstage_mode_detect(void); +unsigned long kvm_riscv_gstage_mode(void); +int kvm_riscv_gstage_gpa_bits(void); + +#endif diff --git a/arch/riscv/include/asm/kvm_tlb.h b/arch/riscv/include/asm/kvm_tlb.h new file mode 100644 index 000000000000..cd00c9a46cb1 --- /dev/null +++ b/arch/riscv/include/asm/kvm_tlb.h @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 Ventana Micro Systems Inc. + */ + +#ifndef __RISCV_KVM_TLB_H_ +#define __RISCV_KVM_TLB_H_ + +#include + +enum kvm_riscv_hfence_type { + KVM_RISCV_HFENCE_UNKNOWN = 0, + KVM_RISCV_HFENCE_GVMA_VMID_GPA, + KVM_RISCV_HFENCE_VVMA_ASID_GVA, + KVM_RISCV_HFENCE_VVMA_ASID_ALL, + KVM_RISCV_HFENCE_VVMA_GVA, +}; + +struct kvm_riscv_hfence { + enum kvm_riscv_hfence_type type; + unsigned long asid; + unsigned long order; + gpa_t addr; + gpa_t size; +}; + +#define KVM_RISCV_VCPU_MAX_HFENCE 64 + +#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12 + +void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, + gpa_t gpa, gpa_t gpsz, + unsigned long order); +void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid); +void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz, + unsigned long order); +void kvm_riscv_local_hfence_gvma_all(void); +void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid, + unsigned long asid, + unsigned long gva, + unsigned long gvsz, + unsigned long order); +void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid, + unsigned long asid); +void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, + unsigned long gva, unsigned long gvsz, + unsigned long order); +void kvm_riscv_local_hfence_vvma_all(unsigned long vmid); + +void kvm_riscv_tlb_flush_process(struct kvm_vcpu *vcpu); + +void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu); +void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu); +void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu); + +void kvm_riscv_fence_i(struct kvm *kvm, + unsigned long hbase, unsigned long hmask); +void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + gpa_t gpa, gpa_t gpsz, + unsigned long order); +void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask); +void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long gva, unsigned long gvsz, + unsigned long order, unsigned long asid); +void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long asid); +void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long gva, unsigned long gvsz, + unsigned long order); +void kvm_riscv_hfence_vvma_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask); + +#endif diff --git a/arch/riscv/include/asm/kvm_vmid.h b/arch/riscv/include/asm/kvm_vmid.h new file mode 100644 index 000000000000..ab98e1434fb7 --- /dev/null +++ b/arch/riscv/include/asm/kvm_vmid.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2025 Ventana Micro Systems Inc. + */ + +#ifndef __RISCV_KVM_VMID_H_ +#define __RISCV_KVM_VMID_H_ + +#include + +struct kvm_vmid { + /* + * Writes to vmid_version and vmid happen with vmid_lock held + * whereas reads happen without any lock held. + */ + unsigned long vmid_version; + unsigned long vmid; +}; + +void __init kvm_riscv_gstage_vmid_detect(void); +unsigned long kvm_riscv_gstage_vmid_bits(void); +int kvm_riscv_gstage_vmid_init(struct kvm *kvm); +bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid); +void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu); +void kvm_riscv_gstage_vmid_sanitize(struct kvm_vcpu *vcpu); + +#endif diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index a8085cd8215e..c02dbc4566b0 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -16,6 +16,7 @@ #include #include #include +#include #define IMSIC_MAX_EIX (IMSIC_MAX_ID / BITS_PER_TYPE(u64)) diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 0150457cfb99..0e3667089f36 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 2d75ac4742ff..8b74b6f6e015 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index f46a27658c2e..6fc4361c3d75 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -15,6 +15,8 @@ #include #include #include +#include +#include #define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL) diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index efdd93006e4d..4a3ba85cc3f5 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index 85c43c83e3b9..965df528de90 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -9,6 +9,7 @@ #include #include #include +#include #include static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 235b40ab82ea..1b014485f80e 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -11,6 +11,7 @@ #include #include #include +#include const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index 92c01255f86f..3b426c800480 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -14,6 +14,8 @@ #include #include #include +#include +#include static unsigned long vmid_version = 1; static unsigned long vmid_next; From 8c96aac053322b3100e9e4bcc174fe9dacf3b573 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:29 +0530 Subject: [PATCH 09/18] RISC-V: KVM: Introduce struct kvm_gstage_mapping mainline inclusion from mainline-6.17-rc1 commit f035b44 category: feature bugzilla: RVCK-Project#257 -------------------------------- Introduce struct kvm_gstage_mapping which represents a g-stage mapping at a particular g-stage page table level. Also, update the kvm_riscv_gstage_map() to return the g-stage mapping upon success. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-10-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/include/asm/kvm_mmu.h | 9 ++++- arch/riscv/kvm/mmu.c | 61 ++++++++++++++++++-------------- arch/riscv/kvm/vcpu_exit.c | 3 +- 3 files changed, 45 insertions(+), 28 deletions(-) diff --git a/arch/riscv/include/asm/kvm_mmu.h b/arch/riscv/include/asm/kvm_mmu.h index 4e1654282ee4..91c11e692dc7 100644 --- a/arch/riscv/include/asm/kvm_mmu.h +++ b/arch/riscv/include/asm/kvm_mmu.h @@ -8,6 +8,12 @@ #include +struct kvm_gstage_mapping { + gpa_t addr; + pte_t pte; + u32 level; +}; + int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, unsigned long size, bool writable, bool in_atomic); @@ -15,7 +21,8 @@ void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size); int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, - gpa_t gpa, unsigned long hva, bool is_write); + gpa_t gpa, unsigned long hva, bool is_write, + struct kvm_gstage_mapping *out_map); int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm); void kvm_riscv_gstage_free_pgd(struct kvm *kvm); void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu); diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 8b74b6f6e015..a6eb4999d697 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -135,18 +135,18 @@ static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order); } -static int gstage_set_pte(struct kvm *kvm, u32 level, - struct kvm_mmu_memory_cache *pcache, - gpa_t addr, const pte_t *new_pte) +static int gstage_set_pte(struct kvm *kvm, + struct kvm_mmu_memory_cache *pcache, + const struct kvm_gstage_mapping *map) { u32 current_level = gstage_pgd_levels - 1; pte_t *next_ptep = (pte_t *)kvm->arch.pgd; - pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)]; + pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; - if (current_level < level) + if (current_level < map->level) return -EINVAL; - while (current_level != level) { + while (current_level != map->level) { if (gstage_pte_leaf(ptep)) return -EEXIST; @@ -165,13 +165,13 @@ static int gstage_set_pte(struct kvm *kvm, u32 level, } current_level--; - ptep = &next_ptep[gstage_pte_index(addr, current_level)]; + ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; } - if (pte_val(*ptep) != pte_val(*new_pte)) { - set_pte(ptep, *new_pte); + if (pte_val(*ptep) != pte_val(map->pte)) { + set_pte(ptep, map->pte); if (gstage_pte_leaf(ptep)) - gstage_remote_tlb_flush(kvm, current_level, addr); + gstage_remote_tlb_flush(kvm, current_level, map->addr); } return 0; @@ -181,14 +181,16 @@ static int gstage_map_page(struct kvm *kvm, struct kvm_mmu_memory_cache *pcache, gpa_t gpa, phys_addr_t hpa, unsigned long page_size, - bool page_rdonly, bool page_exec) + bool page_rdonly, bool page_exec, + struct kvm_gstage_mapping *out_map) { - int ret; - u32 level = 0; - pte_t new_pte; pgprot_t prot; + int ret; - ret = gstage_page_size_to_level(page_size, &level); + out_map->addr = gpa; + out_map->level = 0; + + ret = gstage_page_size_to_level(page_size, &out_map->level); if (ret) return ret; @@ -216,10 +218,10 @@ static int gstage_map_page(struct kvm *kvm, else prot = PAGE_WRITE; } - new_pte = pfn_pte(PFN_DOWN(hpa), prot); - new_pte = pte_mkdirty(new_pte); + out_map->pte = pfn_pte(PFN_DOWN(hpa), prot); + out_map->pte = pte_mkdirty(out_map->pte); - return gstage_set_pte(kvm, level, pcache, gpa, &new_pte); + return gstage_set_pte(kvm, pcache, out_map); } enum gstage_op { @@ -352,7 +354,6 @@ int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, unsigned long size, bool writable, bool in_atomic) { - pte_t pte; int ret = 0; unsigned long pfn; phys_addr_t addr, end; @@ -360,22 +361,25 @@ int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, .gfp_custom = (in_atomic) ? GFP_ATOMIC | __GFP_ACCOUNT : 0, .gfp_zero = __GFP_ZERO, }; + struct kvm_gstage_mapping map; end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK; pfn = __phys_to_pfn(hpa); for (addr = gpa; addr < end; addr += PAGE_SIZE) { - pte = pfn_pte(pfn, PAGE_KERNEL_IO); + map.addr = addr; + map.pte = pfn_pte(pfn, PAGE_KERNEL_IO); + map.level = 0; if (!writable) - pte = pte_wrprotect(pte); + map.pte = pte_wrprotect(map.pte); ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels); if (ret) goto out; spin_lock(&kvm->mmu_lock); - ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte); + ret = gstage_set_pte(kvm, &pcache, &map); spin_unlock(&kvm->mmu_lock); if (ret) goto out; @@ -559,6 +563,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) { int ret; kvm_pfn_t pfn = pte_pfn(range->arg.pte); + struct kvm_gstage_mapping out_map = {0}; if (!kvm->arch.pgd) return false; @@ -566,7 +571,7 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(range->end - range->start != 1); ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT, - __pfn_to_phys(pfn), PAGE_SIZE, true, true); + __pfn_to_phys(pfn), PAGE_SIZE, true, true, &out_map); if (ret) { kvm_debug("Failed to map G-stage page (error %d)\n", ret); return true; @@ -613,7 +618,8 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, - gpa_t gpa, unsigned long hva, bool is_write) + gpa_t gpa, unsigned long hva, bool is_write, + struct kvm_gstage_mapping *out_map) { int ret; kvm_pfn_t hfn; @@ -627,6 +633,9 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, !(memslot->flags & KVM_MEM_READONLY)) ? true : false; unsigned long vma_pagesize, mmu_seq; + /* Setup initial state of output mapping */ + memset(out_map, 0, sizeof(*out_map)); + /* We need minimum second+third level pages */ ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels); if (ret) { @@ -697,10 +706,10 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, kvm_set_pfn_dirty(hfn); mark_page_dirty(kvm, gfn); ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, - vma_pagesize, false, true); + vma_pagesize, false, true, out_map); } else { ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, - vma_pagesize, true, true); + vma_pagesize, true, true, out_map); } if (ret) diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index 965df528de90..6b4694bc07ea 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -15,6 +15,7 @@ static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_trap *trap) { + struct kvm_gstage_mapping host_map; struct kvm_memory_slot *memslot; unsigned long hva, fault_addr; bool writable; @@ -43,7 +44,7 @@ static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, } ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva, - (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false); + (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false, &host_map); if (ret < 0) return ret; From 1e3f8539e409f2e7b917fa34f31cc00e5a6dff5f Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:30 +0530 Subject: [PATCH 10/18] RISC-V: KVM: Add vmid field to struct kvm_riscv_hfence mainline inclusion from mainline-6.17-rc1 commit 4c933f3 category: feature bugzilla: RVCK-Project#257 -------------------------------- Currently, the struct kvm_riscv_hfence does not have vmid field and various hfence processing functions always pick vmid assigned to the guest/VM. This prevents us from doing hfence operation on arbitrary vmid hence add vmid field to struct kvm_riscv_hfence and use it wherever applicable. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-11-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/include/asm/kvm_tlb.h | 1 + arch/riscv/kvm/tlb.c | 30 ++++++++++++++++-------------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/riscv/include/asm/kvm_tlb.h b/arch/riscv/include/asm/kvm_tlb.h index cd00c9a46cb1..f67e03edeaec 100644 --- a/arch/riscv/include/asm/kvm_tlb.h +++ b/arch/riscv/include/asm/kvm_tlb.h @@ -19,6 +19,7 @@ enum kvm_riscv_hfence_type { struct kvm_riscv_hfence { enum kvm_riscv_hfence_type type; unsigned long asid; + unsigned long vmid; unsigned long order; gpa_t addr; gpa_t size; diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index 6fc4361c3d75..349fcfc93f54 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -237,49 +237,43 @@ static bool vcpu_hfence_enqueue(struct kvm_vcpu *vcpu, void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu) { - unsigned long vmid; struct kvm_riscv_hfence d = { 0 }; - struct kvm_vmid *v = &vcpu->kvm->arch.vmid; while (vcpu_hfence_dequeue(vcpu, &d)) { switch (d.type) { case KVM_RISCV_HFENCE_UNKNOWN: break; case KVM_RISCV_HFENCE_GVMA_VMID_GPA: - vmid = READ_ONCE(v->vmid); if (kvm_riscv_nacl_available()) - nacl_hfence_gvma_vmid(nacl_shmem(), vmid, + nacl_hfence_gvma_vmid(nacl_shmem(), d.vmid, d.addr, d.size, d.order); else - kvm_riscv_local_hfence_gvma_vmid_gpa(vmid, d.addr, + kvm_riscv_local_hfence_gvma_vmid_gpa(d.vmid, d.addr, d.size, d.order); break; case KVM_RISCV_HFENCE_VVMA_ASID_GVA: kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD); - vmid = READ_ONCE(v->vmid); if (kvm_riscv_nacl_available()) - nacl_hfence_vvma_asid(nacl_shmem(), vmid, d.asid, + nacl_hfence_vvma_asid(nacl_shmem(), d.vmid, d.asid, d.addr, d.size, d.order); else - kvm_riscv_local_hfence_vvma_asid_gva(vmid, d.asid, d.addr, + kvm_riscv_local_hfence_vvma_asid_gva(d.vmid, d.asid, d.addr, d.size, d.order); break; case KVM_RISCV_HFENCE_VVMA_ASID_ALL: kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD); - vmid = READ_ONCE(v->vmid); if (kvm_riscv_nacl_available()) - nacl_hfence_vvma_asid_all(nacl_shmem(), vmid, d.asid); + nacl_hfence_vvma_asid_all(nacl_shmem(), d.vmid, d.asid); else - kvm_riscv_local_hfence_vvma_asid_all(vmid, d.asid); + kvm_riscv_local_hfence_vvma_asid_all(d.vmid, d.asid); break; case KVM_RISCV_HFENCE_VVMA_GVA: kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD); - vmid = READ_ONCE(v->vmid); if (kvm_riscv_nacl_available()) - nacl_hfence_vvma(nacl_shmem(), vmid, + nacl_hfence_vvma(nacl_shmem(), d.vmid, d.addr, d.size, d.order); else - kvm_riscv_local_hfence_vvma_gva(vmid, d.addr, + kvm_riscv_local_hfence_vvma_gva(d.vmid, d.addr, d.size, d.order); break; default: @@ -336,10 +330,12 @@ void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, gpa_t gpa, gpa_t gpsz, unsigned long order) { + struct kvm_vmid *v = &kvm->arch.vmid; struct kvm_riscv_hfence data; data.type = KVM_RISCV_HFENCE_GVMA_VMID_GPA; data.asid = 0; + data.vmid = READ_ONCE(v->vmid); data.addr = gpa; data.size = gpsz; data.order = order; @@ -359,10 +355,12 @@ void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, unsigned long gva, unsigned long gvsz, unsigned long order, unsigned long asid) { + struct kvm_vmid *v = &kvm->arch.vmid; struct kvm_riscv_hfence data; data.type = KVM_RISCV_HFENCE_VVMA_ASID_GVA; data.asid = asid; + data.vmid = READ_ONCE(v->vmid); data.addr = gva; data.size = gvsz; data.order = order; @@ -374,10 +372,12 @@ void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, unsigned long hbase, unsigned long hmask, unsigned long asid) { + struct kvm_vmid *v = &kvm->arch.vmid; struct kvm_riscv_hfence data; data.type = KVM_RISCV_HFENCE_VVMA_ASID_ALL; data.asid = asid; + data.vmid = READ_ONCE(v->vmid); data.addr = data.size = data.order = 0; make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, KVM_REQ_HFENCE_VVMA_ALL, &data); @@ -388,10 +388,12 @@ void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, unsigned long gva, unsigned long gvsz, unsigned long order) { + struct kvm_vmid *v = &kvm->arch.vmid; struct kvm_riscv_hfence data; data.type = KVM_RISCV_HFENCE_VVMA_GVA; data.asid = 0; + data.vmid = READ_ONCE(v->vmid); data.addr = gva; data.size = gvsz; data.order = order; From aba695a870b56b94dade225b26a4dcd8a7680957 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:31 +0530 Subject: [PATCH 11/18] RISC-V: KVM: Factor-out g-stage page table management mainline inclusion from mainline-6.17-rc1 commit dd82e35 category: feature bugzilla: RVCK-Project#257 -------------------------------- The upcoming nested virtualization can share g-stage page table management with the current host g-stage implementation hence factor-out g-stage page table management as separate sources and also use "kvm_riscv_mmu_" prefix for host g-stage functions. Signed-off-by: Anup Patel Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-12-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/include/asm/kvm_gstage.h | 72 ++++ arch/riscv/include/asm/kvm_mmu.h | 32 +- arch/riscv/kvm/Makefile | 1 + arch/riscv/kvm/aia_imsic.c | 11 +- arch/riscv/kvm/gstage.c | 337 +++++++++++++++++++ arch/riscv/kvm/main.c | 2 +- arch/riscv/kvm/mmu.c | 501 ++++++---------------------- arch/riscv/kvm/vcpu.c | 4 +- arch/riscv/kvm/vcpu_exit.c | 5 +- arch/riscv/kvm/vm.c | 6 +- 10 files changed, 538 insertions(+), 433 deletions(-) create mode 100644 arch/riscv/include/asm/kvm_gstage.h create mode 100644 arch/riscv/kvm/gstage.c diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h new file mode 100644 index 000000000000..595e2183173e --- /dev/null +++ b/arch/riscv/include/asm/kvm_gstage.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * Copyright (c) 2025 Ventana Micro Systems Inc. + */ + +#ifndef __RISCV_KVM_GSTAGE_H_ +#define __RISCV_KVM_GSTAGE_H_ + +#include + +struct kvm_gstage { + struct kvm *kvm; + unsigned long flags; +#define KVM_GSTAGE_FLAGS_LOCAL BIT(0) + unsigned long vmid; + pgd_t *pgd; +}; + +struct kvm_gstage_mapping { + gpa_t addr; + pte_t pte; + u32 level; +}; + +#ifdef CONFIG_64BIT +#define kvm_riscv_gstage_index_bits 9 +#else +#define kvm_riscv_gstage_index_bits 10 +#endif + +extern unsigned long kvm_riscv_gstage_mode; +extern unsigned long kvm_riscv_gstage_pgd_levels; + +#define kvm_riscv_gstage_pgd_xbits 2 +#define kvm_riscv_gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits)) +#define kvm_riscv_gstage_gpa_bits (HGATP_PAGE_SHIFT + \ + (kvm_riscv_gstage_pgd_levels * \ + kvm_riscv_gstage_index_bits) + \ + kvm_riscv_gstage_pgd_xbits) +#define kvm_riscv_gstage_gpa_size ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits)) + +bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, + pte_t **ptepp, u32 *ptep_level); + +int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, + struct kvm_mmu_memory_cache *pcache, + const struct kvm_gstage_mapping *map); + +int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, + struct kvm_mmu_memory_cache *pcache, + gpa_t gpa, phys_addr_t hpa, unsigned long page_size, + bool page_rdonly, bool page_exec, + struct kvm_gstage_mapping *out_map); + +enum kvm_riscv_gstage_op { + GSTAGE_OP_NOP = 0, /* Nothing */ + GSTAGE_OP_CLEAR, /* Clear/Unmap */ + GSTAGE_OP_WP, /* Write-protect */ +}; + +void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr, + pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op); + +void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage, + gpa_t start, gpa_t size, bool may_block); + +void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end); + +void kvm_riscv_gstage_mode_detect(void); + +#endif diff --git a/arch/riscv/include/asm/kvm_mmu.h b/arch/riscv/include/asm/kvm_mmu.h index 91c11e692dc7..5439e76f0a96 100644 --- a/arch/riscv/include/asm/kvm_mmu.h +++ b/arch/riscv/include/asm/kvm_mmu.h @@ -6,28 +6,16 @@ #ifndef __RISCV_KVM_MMU_H_ #define __RISCV_KVM_MMU_H_ -#include +#include -struct kvm_gstage_mapping { - gpa_t addr; - pte_t pte; - u32 level; -}; - -int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, - phys_addr_t hpa, unsigned long size, - bool writable, bool in_atomic); -void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, - unsigned long size); -int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, - struct kvm_memory_slot *memslot, - gpa_t gpa, unsigned long hva, bool is_write, - struct kvm_gstage_mapping *out_map); -int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm); -void kvm_riscv_gstage_free_pgd(struct kvm *kvm); -void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu); -void kvm_riscv_gstage_mode_detect(void); -unsigned long kvm_riscv_gstage_mode(void); -int kvm_riscv_gstage_gpa_bits(void); +int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, + unsigned long size, bool writable, bool in_atomic); +void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size); +int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, + gpa_t gpa, unsigned long hva, bool is_write, + struct kvm_gstage_mapping *out_map); +int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm); +void kvm_riscv_mmu_free_pgd(struct kvm *kvm); +void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu); #endif diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index f541a683daaf..eab00a07e846 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -14,6 +14,7 @@ kvm-y += aia.o kvm-y += aia_aplic.o kvm-y += aia_device.o kvm-y += aia_imsic.o +kvm-y += gstage.o kvm-y += main.o kvm-y += mmu.o kvm-y += nacl.o diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index c02dbc4566b0..7ce790e53451 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -704,9 +704,8 @@ void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu) */ /* Purge the G-stage mapping */ - kvm_riscv_gstage_iounmap(vcpu->kvm, - vcpu->arch.aia_context.imsic_addr, - IMSIC_MMIO_PAGE_SZ); + kvm_riscv_mmu_iounmap(vcpu->kvm, vcpu->arch.aia_context.imsic_addr, + IMSIC_MMIO_PAGE_SZ); /* TODO: Purge the IOMMU mapping ??? */ @@ -786,9 +785,9 @@ int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu) imsic_vsfile_local_clear(new_vsfile_hgei, imsic->nr_hw_eix); /* Update G-stage mapping for the new IMSIC VS-file */ - ret = kvm_riscv_gstage_ioremap(kvm, vcpu->arch.aia_context.imsic_addr, - new_vsfile_pa, IMSIC_MMIO_PAGE_SZ, - true, true); + ret = kvm_riscv_mmu_ioremap(kvm, vcpu->arch.aia_context.imsic_addr, + new_vsfile_pa, IMSIC_MMIO_PAGE_SZ, + true, true); if (ret) goto fail_free_vsfile_hgei; diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c new file mode 100644 index 000000000000..9c7c44f09b05 --- /dev/null +++ b/arch/riscv/kvm/gstage.c @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 Western Digital Corporation or its affiliates. + * Copyright (c) 2025 Ventana Micro Systems Inc. + */ + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_64BIT +unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4; +unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3; +#else +unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4; +unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2; +#endif + +#define gstage_pte_leaf(__ptep) \ + (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)) + +static inline unsigned long gstage_pte_index(gpa_t addr, u32 level) +{ + unsigned long mask; + unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level); + + if (level == (kvm_riscv_gstage_pgd_levels - 1)) + mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1; + else + mask = PTRS_PER_PTE - 1; + + return (addr >> shift) & mask; +} + +static inline unsigned long gstage_pte_page_vaddr(pte_t pte) +{ + return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte))); +} + +static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) +{ + u32 i; + unsigned long psz = 1UL << 12; + + for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) { + if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) { + *out_level = i; + return 0; + } + } + + return -EINVAL; +} + +static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder) +{ + if (kvm_riscv_gstage_pgd_levels < level) + return -EINVAL; + + *out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits); + return 0; +} + +static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize) +{ + int rc; + unsigned long page_order = PAGE_SHIFT; + + rc = gstage_level_to_page_order(level, &page_order); + if (rc) + return rc; + + *out_pgsize = BIT(page_order); + return 0; +} + +bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr, + pte_t **ptepp, u32 *ptep_level) +{ + pte_t *ptep; + u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + + *ptep_level = current_level; + ptep = (pte_t *)gstage->pgd; + ptep = &ptep[gstage_pte_index(addr, current_level)]; + while (ptep && pte_val(ptep_get(ptep))) { + if (gstage_pte_leaf(ptep)) { + *ptep_level = current_level; + *ptepp = ptep; + return true; + } + + if (current_level) { + current_level--; + *ptep_level = current_level; + ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); + ptep = &ptep[gstage_pte_index(addr, current_level)]; + } else { + ptep = NULL; + } + } + + return false; +} + +static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr) +{ + unsigned long order = PAGE_SHIFT; + + if (gstage_level_to_page_order(level, &order)) + return; + addr &= ~(BIT(order) - 1); + + if (gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) + kvm_riscv_local_hfence_gvma_vmid_gpa(gstage->vmid, addr, BIT(order), order); + else + kvm_riscv_hfence_gvma_vmid_gpa(gstage->kvm, -1UL, 0, addr, BIT(order), order); +} + +int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, + struct kvm_mmu_memory_cache *pcache, + const struct kvm_gstage_mapping *map) +{ + u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + pte_t *next_ptep = (pte_t *)gstage->pgd; + pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; + + if (current_level < map->level) + return -EINVAL; + + while (current_level != map->level) { + if (gstage_pte_leaf(ptep)) + return -EEXIST; + + if (!pte_val(ptep_get(ptep))) { + if (!pcache) + return -ENOMEM; + next_ptep = kvm_mmu_memory_cache_alloc(pcache); + if (!next_ptep) + return -ENOMEM; + set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)), + __pgprot(_PAGE_TABLE))); + } else { + if (gstage_pte_leaf(ptep)) + return -EEXIST; + next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); + } + + current_level--; + ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; + } + + if (pte_val(*ptep) != pte_val(map->pte)) { + set_pte(ptep, map->pte); + if (gstage_pte_leaf(ptep)) + gstage_tlb_flush(gstage, current_level, map->addr); + } + + return 0; +} + +int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, + struct kvm_mmu_memory_cache *pcache, + gpa_t gpa, phys_addr_t hpa, unsigned long page_size, + bool page_rdonly, bool page_exec, + struct kvm_gstage_mapping *out_map) +{ + pgprot_t prot; + int ret; + + out_map->addr = gpa; + out_map->level = 0; + + ret = gstage_page_size_to_level(page_size, &out_map->level); + if (ret) + return ret; + + /* + * A RISC-V implementation can choose to either: + * 1) Update 'A' and 'D' PTE bits in hardware + * 2) Generate page fault when 'A' and/or 'D' bits are not set + * PTE so that software can update these bits. + * + * We support both options mentioned above. To achieve this, we + * always set 'A' and 'D' PTE bits at time of creating G-stage + * mapping. To support KVM dirty page logging with both options + * mentioned above, we will write-protect G-stage PTEs to track + * dirty pages. + */ + + if (page_exec) { + if (page_rdonly) + prot = PAGE_READ_EXEC; + else + prot = PAGE_WRITE_EXEC; + } else { + if (page_rdonly) + prot = PAGE_READ; + else + prot = PAGE_WRITE; + } + out_map->pte = pfn_pte(PFN_DOWN(hpa), prot); + out_map->pte = pte_mkdirty(out_map->pte); + + return kvm_riscv_gstage_set_pte(gstage, pcache, out_map); +} + +void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr, + pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op) +{ + int i, ret; + pte_t old_pte, *next_ptep; + u32 next_ptep_level; + unsigned long next_page_size, page_size; + + ret = gstage_level_to_page_size(ptep_level, &page_size); + if (ret) + return; + + WARN_ON(addr & (page_size - 1)); + + if (!pte_val(ptep_get(ptep))) + return; + + if (ptep_level && !gstage_pte_leaf(ptep)) { + next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); + next_ptep_level = ptep_level - 1; + ret = gstage_level_to_page_size(next_ptep_level, &next_page_size); + if (ret) + return; + + if (op == GSTAGE_OP_CLEAR) + set_pte(ptep, __pte(0)); + for (i = 0; i < PTRS_PER_PTE; i++) + kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size, + &next_ptep[i], next_ptep_level, op); + if (op == GSTAGE_OP_CLEAR) + put_page(virt_to_page(next_ptep)); + } else { + old_pte = *ptep; + if (op == GSTAGE_OP_CLEAR) + set_pte(ptep, __pte(0)); + else if (op == GSTAGE_OP_WP) + set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE)); + if (pte_val(*ptep) != pte_val(old_pte)) + gstage_tlb_flush(gstage, ptep_level, addr); + } +} + +void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage, + gpa_t start, gpa_t size, bool may_block) +{ + int ret; + pte_t *ptep; + u32 ptep_level; + bool found_leaf; + unsigned long page_size; + gpa_t addr = start, end = start + size; + + while (addr < end) { + found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level); + ret = gstage_level_to_page_size(ptep_level, &page_size); + if (ret) + break; + + if (!found_leaf) + goto next; + + if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) + kvm_riscv_gstage_op_pte(gstage, addr, ptep, + ptep_level, GSTAGE_OP_CLEAR); + +next: + addr += page_size; + + /* + * If the range is too large, release the kvm->mmu_lock + * to prevent starvation and lockup detector warnings. + */ + if (!(gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) && may_block && addr < end) + cond_resched_lock(&gstage->kvm->mmu_lock); + } +} + +void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end) +{ + int ret; + pte_t *ptep; + u32 ptep_level; + bool found_leaf; + gpa_t addr = start; + unsigned long page_size; + + while (addr < end) { + found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level); + ret = gstage_level_to_page_size(ptep_level, &page_size); + if (ret) + break; + + if (!found_leaf) + goto next; + + if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) + kvm_riscv_gstage_op_pte(gstage, addr, ptep, + ptep_level, GSTAGE_OP_WP); + +next: + addr += page_size; + } +} + +void __init kvm_riscv_gstage_mode_detect(void) +{ +#ifdef CONFIG_64BIT + /* Try Sv57x4 G-stage mode */ + csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); + if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) { + kvm_riscv_gstage_mode = HGATP_MODE_SV57X4; + kvm_riscv_gstage_pgd_levels = 5; + goto skip_sv48x4_test; + } + + /* Try Sv48x4 G-stage mode */ + csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); + if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) { + kvm_riscv_gstage_mode = HGATP_MODE_SV48X4; + kvm_riscv_gstage_pgd_levels = 4; + } +skip_sv48x4_test: + + csr_write(CSR_HGATP, 0); + kvm_riscv_local_hfence_gvma_all(); +#endif +} diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 0e3667089f36..ffbd7ec89199 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -135,7 +135,7 @@ static int __init riscv_kvm_init(void) (rc) ? slist : "no features"); } - switch (kvm_riscv_gstage_mode()) { + switch (kvm_riscv_gstage_mode) { case HGATP_MODE_SV32X4: str = "Sv32x4"; break; diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index a6eb4999d697..02a0b78a96ce 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -6,9 +6,7 @@ * Anup Patel */ -#include #include -#include #include #include #include @@ -17,342 +15,28 @@ #include #include #include -#include -#include - -#ifdef CONFIG_64BIT -static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); -static unsigned long gstage_pgd_levels __ro_after_init = 3; -#define gstage_index_bits 9 -#else -static unsigned long gstage_mode __ro_after_init = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); -static unsigned long gstage_pgd_levels __ro_after_init = 2; -#define gstage_index_bits 10 -#endif - -#define gstage_pgd_xbits 2 -#define gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits)) -#define gstage_gpa_bits (HGATP_PAGE_SHIFT + \ - (gstage_pgd_levels * gstage_index_bits) + \ - gstage_pgd_xbits) -#define gstage_gpa_size ((gpa_t)(1ULL << gstage_gpa_bits)) - -#define gstage_pte_leaf(__ptep) \ - (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)) - -static inline unsigned long gstage_pte_index(gpa_t addr, u32 level) -{ - unsigned long mask; - unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level); - - if (level == (gstage_pgd_levels - 1)) - mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1; - else - mask = PTRS_PER_PTE - 1; - - return (addr >> shift) & mask; -} - -static inline unsigned long gstage_pte_page_vaddr(pte_t pte) -{ - return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte))); -} - -static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) -{ - u32 i; - unsigned long psz = 1UL << 12; - - for (i = 0; i < gstage_pgd_levels; i++) { - if (page_size == (psz << (i * gstage_index_bits))) { - *out_level = i; - return 0; - } - } - - return -EINVAL; -} - -static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder) -{ - if (gstage_pgd_levels < level) - return -EINVAL; - - *out_pgorder = 12 + (level * gstage_index_bits); - return 0; -} - -static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize) -{ - int rc; - unsigned long page_order = PAGE_SHIFT; - - rc = gstage_level_to_page_order(level, &page_order); - if (rc) - return rc; - - *out_pgsize = BIT(page_order); - return 0; -} - -static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr, - pte_t **ptepp, u32 *ptep_level) -{ - pte_t *ptep; - u32 current_level = gstage_pgd_levels - 1; - - *ptep_level = current_level; - ptep = (pte_t *)kvm->arch.pgd; - ptep = &ptep[gstage_pte_index(addr, current_level)]; - while (ptep && pte_val(ptep_get(ptep))) { - if (gstage_pte_leaf(ptep)) { - *ptep_level = current_level; - *ptepp = ptep; - return true; - } - - if (current_level) { - current_level--; - *ptep_level = current_level; - ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); - ptep = &ptep[gstage_pte_index(addr, current_level)]; - } else { - ptep = NULL; - } - } - - return false; -} - -static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) -{ - unsigned long order = PAGE_SHIFT; - - if (gstage_level_to_page_order(level, &order)) - return; - addr &= ~(BIT(order) - 1); - - kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order); -} -static int gstage_set_pte(struct kvm *kvm, - struct kvm_mmu_memory_cache *pcache, - const struct kvm_gstage_mapping *map) -{ - u32 current_level = gstage_pgd_levels - 1; - pte_t *next_ptep = (pte_t *)kvm->arch.pgd; - pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; - - if (current_level < map->level) - return -EINVAL; - - while (current_level != map->level) { - if (gstage_pte_leaf(ptep)) - return -EEXIST; - - if (!pte_val(ptep_get(ptep))) { - if (!pcache) - return -ENOMEM; - next_ptep = kvm_mmu_memory_cache_alloc(pcache); - if (!next_ptep) - return -ENOMEM; - set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)), - __pgprot(_PAGE_TABLE))); - } else { - if (gstage_pte_leaf(ptep)) - return -EEXIST; - next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); - } - - current_level--; - ptep = &next_ptep[gstage_pte_index(map->addr, current_level)]; - } - - if (pte_val(*ptep) != pte_val(map->pte)) { - set_pte(ptep, map->pte); - if (gstage_pte_leaf(ptep)) - gstage_remote_tlb_flush(kvm, current_level, map->addr); - } - - return 0; -} - -static int gstage_map_page(struct kvm *kvm, - struct kvm_mmu_memory_cache *pcache, - gpa_t gpa, phys_addr_t hpa, - unsigned long page_size, - bool page_rdonly, bool page_exec, - struct kvm_gstage_mapping *out_map) -{ - pgprot_t prot; - int ret; - - out_map->addr = gpa; - out_map->level = 0; - - ret = gstage_page_size_to_level(page_size, &out_map->level); - if (ret) - return ret; - - /* - * A RISC-V implementation can choose to either: - * 1) Update 'A' and 'D' PTE bits in hardware - * 2) Generate page fault when 'A' and/or 'D' bits are not set - * PTE so that software can update these bits. - * - * We support both options mentioned above. To achieve this, we - * always set 'A' and 'D' PTE bits at time of creating G-stage - * mapping. To support KVM dirty page logging with both options - * mentioned above, we will write-protect G-stage PTEs to track - * dirty pages. - */ - - if (page_exec) { - if (page_rdonly) - prot = PAGE_READ_EXEC; - else - prot = PAGE_WRITE_EXEC; - } else { - if (page_rdonly) - prot = PAGE_READ; - else - prot = PAGE_WRITE; - } - out_map->pte = pfn_pte(PFN_DOWN(hpa), prot); - out_map->pte = pte_mkdirty(out_map->pte); - - return gstage_set_pte(kvm, pcache, out_map); -} - -enum gstage_op { - GSTAGE_OP_NOP = 0, /* Nothing */ - GSTAGE_OP_CLEAR, /* Clear/Unmap */ - GSTAGE_OP_WP, /* Write-protect */ -}; - -static void gstage_op_pte(struct kvm *kvm, gpa_t addr, - pte_t *ptep, u32 ptep_level, enum gstage_op op) -{ - int i, ret; - pte_t old_pte, *next_ptep; - u32 next_ptep_level; - unsigned long next_page_size, page_size; - - ret = gstage_level_to_page_size(ptep_level, &page_size); - if (ret) - return; - - BUG_ON(addr & (page_size - 1)); - - if (!pte_val(ptep_get(ptep))) - return; - - if (ptep_level && !gstage_pte_leaf(ptep)) { - next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); - next_ptep_level = ptep_level - 1; - ret = gstage_level_to_page_size(next_ptep_level, - &next_page_size); - if (ret) - return; - - if (op == GSTAGE_OP_CLEAR) - set_pte(ptep, __pte(0)); - for (i = 0; i < PTRS_PER_PTE; i++) - gstage_op_pte(kvm, addr + i * next_page_size, - &next_ptep[i], next_ptep_level, op); - if (op == GSTAGE_OP_CLEAR) - put_page(virt_to_page(next_ptep)); - } else { - old_pte = *ptep; - if (op == GSTAGE_OP_CLEAR) - set_pte(ptep, __pte(0)); - else if (op == GSTAGE_OP_WP) - set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE)); - if (pte_val(*ptep) != pte_val(old_pte)) - gstage_remote_tlb_flush(kvm, ptep_level, addr); - } -} - -static void gstage_unmap_range(struct kvm *kvm, gpa_t start, - gpa_t size, bool may_block) -{ - int ret; - pte_t *ptep; - u32 ptep_level; - bool found_leaf; - unsigned long page_size; - gpa_t addr = start, end = start + size; - - while (addr < end) { - found_leaf = gstage_get_leaf_entry(kvm, addr, - &ptep, &ptep_level); - ret = gstage_level_to_page_size(ptep_level, &page_size); - if (ret) - break; - - if (!found_leaf) - goto next; - - if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) - gstage_op_pte(kvm, addr, ptep, - ptep_level, GSTAGE_OP_CLEAR); - -next: - addr += page_size; - - /* - * If the range is too large, release the kvm->mmu_lock - * to prevent starvation and lockup detector warnings. - */ - if (may_block && addr < end) - cond_resched_lock(&kvm->mmu_lock); - } -} - -static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) -{ - int ret; - pte_t *ptep; - u32 ptep_level; - bool found_leaf; - gpa_t addr = start; - unsigned long page_size; - - while (addr < end) { - found_leaf = gstage_get_leaf_entry(kvm, addr, - &ptep, &ptep_level); - ret = gstage_level_to_page_size(ptep_level, &page_size); - if (ret) - break; - - if (!found_leaf) - goto next; - - if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) - gstage_op_pte(kvm, addr, ptep, - ptep_level, GSTAGE_OP_WP); - -next: - addr += page_size; - } -} - -static void gstage_wp_memory_region(struct kvm *kvm, int slot) +static void mmu_wp_memory_region(struct kvm *kvm, int slot) { struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); phys_addr_t start = memslot->base_gfn << PAGE_SHIFT; phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + struct kvm_gstage gstage; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; spin_lock(&kvm->mmu_lock); - gstage_wp_range(kvm, start, end); + kvm_riscv_gstage_wp_range(&gstage, start, end); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs_memslot(kvm, memslot); } -int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, - phys_addr_t hpa, unsigned long size, - bool writable, bool in_atomic) +int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, + unsigned long size, bool writable, bool in_atomic) { int ret = 0; unsigned long pfn; @@ -362,6 +46,12 @@ int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, .gfp_zero = __GFP_ZERO, }; struct kvm_gstage_mapping map; + struct kvm_gstage gstage; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; end = (gpa + size + PAGE_SIZE - 1) & PAGE_MASK; pfn = __phys_to_pfn(hpa); @@ -374,12 +64,12 @@ int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, if (!writable) map.pte = pte_wrprotect(map.pte); - ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels); + ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels); if (ret) goto out; spin_lock(&kvm->mmu_lock); - ret = gstage_set_pte(kvm, &pcache, &map); + ret = kvm_riscv_gstage_set_pte(&gstage, &pcache, &map); spin_unlock(&kvm->mmu_lock); if (ret) goto out; @@ -392,10 +82,17 @@ int kvm_riscv_gstage_ioremap(struct kvm *kvm, gpa_t gpa, return ret; } -void kvm_riscv_gstage_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size) +void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size) { + struct kvm_gstage gstage; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + spin_lock(&kvm->mmu_lock); - gstage_unmap_range(kvm, gpa, size, false); + kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false); spin_unlock(&kvm->mmu_lock); } @@ -407,8 +104,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, phys_addr_t base_gfn = slot->base_gfn + gfn_offset; phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + struct kvm_gstage gstage; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; - gstage_wp_range(kvm, start, end); + kvm_riscv_gstage_wp_range(&gstage, start, end); } void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) @@ -425,7 +128,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_riscv_gstage_free_pgd(kvm); + kvm_riscv_mmu_free_pgd(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -433,9 +136,15 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, { gpa_t gpa = slot->base_gfn << PAGE_SHIFT; phys_addr_t size = slot->npages << PAGE_SHIFT; + struct kvm_gstage gstage; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; spin_lock(&kvm->mmu_lock); - gstage_unmap_range(kvm, gpa, size, false); + kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false); spin_unlock(&kvm->mmu_lock); } @@ -450,7 +159,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * the memory slot is write protected. */ if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) - gstage_wp_memory_region(kvm, new->id); + mmu_wp_memory_region(kvm, new->id); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -472,7 +181,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * space addressable by the KVM guest GPA space. */ if ((new->base_gfn + new->npages) >= - (gstage_gpa_size >> PAGE_SHIFT)) + (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT)) return -EFAULT; hva = new->userspace_addr; @@ -528,9 +237,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, goto out; } - ret = kvm_riscv_gstage_ioremap(kvm, gpa, pa, - vm_end - vm_start, - writable, false); + ret = kvm_riscv_mmu_ioremap(kvm, gpa, pa, vm_end - vm_start, + writable, false); if (ret) break; } @@ -541,7 +249,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, goto out; if (ret) - kvm_riscv_gstage_iounmap(kvm, base_gpa, size); + kvm_riscv_mmu_iounmap(kvm, base_gpa, size); out: mmap_read_unlock(current->mm); @@ -550,12 +258,18 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { + struct kvm_gstage gstage; + if (!kvm->arch.pgd) return false; - gstage_unmap_range(kvm, range->start << PAGE_SHIFT, - (range->end - range->start) << PAGE_SHIFT, - range->may_block); + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT, + (range->end - range->start) << PAGE_SHIFT, + range->may_block); return false; } @@ -564,14 +278,21 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) int ret; kvm_pfn_t pfn = pte_pfn(range->arg.pte); struct kvm_gstage_mapping out_map = {0}; + struct kvm_gstage gstage; if (!kvm->arch.pgd) return false; WARN_ON(range->end - range->start != 1); - ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT, + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + + ret = kvm_riscv_gstage_map_page(&gstage, NULL, range->start << PAGE_SHIFT, __pfn_to_phys(pfn), PAGE_SIZE, true, true, &out_map); + if (ret) { kvm_debug("Failed to map G-stage page (error %d)\n", ret); return true; @@ -585,14 +306,19 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) pte_t *ptep; u32 ptep_level = 0; u64 size = (range->end - range->start) << PAGE_SHIFT; + struct kvm_gstage gstage; if (!kvm->arch.pgd) return false; WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT, - &ptep, &ptep_level)) + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT, + &ptep, &ptep_level)) return false; return ptep_test_and_clear_young(NULL, 0, ptep); @@ -603,23 +329,27 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) pte_t *ptep; u32 ptep_level = 0; u64 size = (range->end - range->start) << PAGE_SHIFT; + struct kvm_gstage gstage; if (!kvm->arch.pgd) return false; WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE); - if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT, - &ptep, &ptep_level)) + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + if (!kvm_riscv_gstage_get_leaf(&gstage, range->start << PAGE_SHIFT, + &ptep, &ptep_level)) return false; return pte_young(ptep_get(ptep)); } -int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, - struct kvm_memory_slot *memslot, - gpa_t gpa, unsigned long hva, bool is_write, - struct kvm_gstage_mapping *out_map) +int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, + gpa_t gpa, unsigned long hva, bool is_write, + struct kvm_gstage_mapping *out_map) { int ret; kvm_pfn_t hfn; @@ -632,12 +362,18 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, bool logging = (memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY)) ? true : false; unsigned long vma_pagesize, mmu_seq; + struct kvm_gstage gstage; + + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; /* Setup initial state of output mapping */ memset(out_map, 0, sizeof(*out_map)); /* We need minimum second+third level pages */ - ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels); + ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels); if (ret) { kvm_err("Failed to topup G-stage cache\n"); return ret; @@ -705,11 +441,11 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, if (writable) { kvm_set_pfn_dirty(hfn); mark_page_dirty(kvm, gfn); - ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, - vma_pagesize, false, true, out_map); + ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT, + vma_pagesize, false, true, out_map); } else { - ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, - vma_pagesize, true, true, out_map); + ret = kvm_riscv_gstage_map_page(&gstage, pcache, gpa, hfn << PAGE_SHIFT, + vma_pagesize, true, true, out_map); } if (ret) @@ -722,7 +458,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, return ret; } -int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm) +int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm) { struct page *pgd_page; @@ -732,7 +468,7 @@ int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm) } pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, - get_order(gstage_pgd_size)); + get_order(kvm_riscv_gstage_pgd_size)); if (!pgd_page) return -ENOMEM; kvm->arch.pgd = page_to_virt(pgd_page); @@ -741,13 +477,18 @@ int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm) return 0; } -void kvm_riscv_gstage_free_pgd(struct kvm *kvm) +void kvm_riscv_mmu_free_pgd(struct kvm *kvm) { + struct kvm_gstage gstage; void *pgd = NULL; spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { - gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false); + gstage.kvm = kvm; + gstage.flags = 0; + gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); + gstage.pgd = kvm->arch.pgd; + kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; kvm->arch.pgd_phys = 0; @@ -755,12 +496,12 @@ void kvm_riscv_gstage_free_pgd(struct kvm *kvm) spin_unlock(&kvm->mmu_lock); if (pgd) - free_pages((unsigned long)pgd, get_order(gstage_pgd_size)); + free_pages((unsigned long)pgd, get_order(kvm_riscv_gstage_pgd_size)); } -void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu) +void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu) { - unsigned long hgatp = gstage_mode; + unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT; struct kvm_arch *k = &vcpu->kvm->arch; hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID; @@ -771,37 +512,3 @@ void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu) if (!kvm_riscv_gstage_vmid_bits()) kvm_riscv_local_hfence_gvma_all(); } - -void __init kvm_riscv_gstage_mode_detect(void) -{ -#ifdef CONFIG_64BIT - /* Try Sv57x4 G-stage mode */ - csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); - if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) { - gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); - gstage_pgd_levels = 5; - goto skip_sv48x4_test; - } - - /* Try Sv48x4 G-stage mode */ - csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); - if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) { - gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); - gstage_pgd_levels = 4; - } -skip_sv48x4_test: - - csr_write(CSR_HGATP, 0); - kvm_riscv_local_hfence_gvma_all(); -#endif -} - -unsigned long __init kvm_riscv_gstage_mode(void) -{ - return gstage_mode >> HGATP_MODE_SHIFT; -} - -int kvm_riscv_gstage_gpa_bits(void) -{ - return gstage_gpa_bits; -} diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 4a3ba85cc3f5..a9a13509d177 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -620,7 +620,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } } - kvm_riscv_gstage_update_hgatp(vcpu); + kvm_riscv_mmu_update_hgatp(vcpu); kvm_riscv_vcpu_timer_restore(vcpu); @@ -705,7 +705,7 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) kvm_riscv_reset_vcpu(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu)) - kvm_riscv_gstage_update_hgatp(vcpu); + kvm_riscv_mmu_update_hgatp(vcpu); if (kvm_check_request(KVM_REQ_FENCE_I, vcpu)) kvm_riscv_fence_i_process(vcpu); diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index 6b4694bc07ea..0bb0c51e3c89 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -43,8 +43,9 @@ static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, }; } - ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva, - (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false, &host_map); + ret = kvm_riscv_mmu_map(vcpu, memslot, fault_addr, hva, + (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false, + &host_map); if (ret < 0) return ret; diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 1b014485f80e..ef9512fc275e 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -32,13 +32,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int r; - r = kvm_riscv_gstage_alloc_pgd(kvm); + r = kvm_riscv_mmu_alloc_pgd(kvm); if (r) return r; r = kvm_riscv_gstage_vmid_init(kvm); if (r) { - kvm_riscv_gstage_free_pgd(kvm); + kvm_riscv_mmu_free_pgd(kvm); return r; } @@ -201,7 +201,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_USER_MEM_SLOTS; break; case KVM_CAP_VM_GPA_BITS: - r = kvm_riscv_gstage_gpa_bits(); + r = kvm_riscv_gstage_gpa_bits; break; default: r = 0; From e77eaa84f74266d02a99964db609aa121037512b Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Wed, 18 Jun 2025 17:05:32 +0530 Subject: [PATCH 12/18] RISC-V: KVM: Pass VMID as parameter to kvm_riscv_hfence_xyz() APIs mainline inclusion from mainline-6.17-rc1 commit 1f6d0ee category: feature bugzilla: RVCK-Project#257 -------------------------------- Currently, all kvm_riscv_hfence_xyz() APIs assume VMID to be the host VMID of the Guest/VM which resticts use of these APIs only for host TLB maintenance. Let's allow passing VMID as a parameter to all kvm_riscv_hfence_xyz() APIs so that they can be re-used for nested virtualization related TLB maintenance. Signed-off-by: Anup Patel Tested-by: Atish Patra Reviewed-by: Nutty Liu Link: https://lore.kernel.org/r/20250618113532.471448-13-apatel@ventanamicro.com Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/include/asm/kvm_tlb.h | 17 ++++++--- arch/riscv/kvm/gstage.c | 3 +- arch/riscv/kvm/tlb.c | 61 ++++++++++++++++++++----------- arch/riscv/kvm/vcpu_sbi_replace.c | 17 +++++---- arch/riscv/kvm/vcpu_sbi_v01.c | 25 ++++++------- 5 files changed, 73 insertions(+), 50 deletions(-) diff --git a/arch/riscv/include/asm/kvm_tlb.h b/arch/riscv/include/asm/kvm_tlb.h index f67e03edeaec..38a2f933ad3a 100644 --- a/arch/riscv/include/asm/kvm_tlb.h +++ b/arch/riscv/include/asm/kvm_tlb.h @@ -11,9 +11,11 @@ enum kvm_riscv_hfence_type { KVM_RISCV_HFENCE_UNKNOWN = 0, KVM_RISCV_HFENCE_GVMA_VMID_GPA, + KVM_RISCV_HFENCE_GVMA_VMID_ALL, KVM_RISCV_HFENCE_VVMA_ASID_GVA, KVM_RISCV_HFENCE_VVMA_ASID_ALL, KVM_RISCV_HFENCE_VVMA_GVA, + KVM_RISCV_HFENCE_VVMA_ALL }; struct kvm_riscv_hfence { @@ -59,21 +61,24 @@ void kvm_riscv_fence_i(struct kvm *kvm, void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, unsigned long hbase, unsigned long hmask, gpa_t gpa, gpa_t gpsz, - unsigned long order); + unsigned long order, unsigned long vmid); void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm, - unsigned long hbase, unsigned long hmask); + unsigned long hbase, unsigned long hmask, + unsigned long vmid); void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, unsigned long hbase, unsigned long hmask, unsigned long gva, unsigned long gvsz, - unsigned long order, unsigned long asid); + unsigned long order, unsigned long asid, + unsigned long vmid); void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, unsigned long hbase, unsigned long hmask, - unsigned long asid); + unsigned long asid, unsigned long vmid); void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, unsigned long hbase, unsigned long hmask, unsigned long gva, unsigned long gvsz, - unsigned long order); + unsigned long order, unsigned long vmid); void kvm_riscv_hfence_vvma_all(struct kvm *kvm, - unsigned long hbase, unsigned long hmask); + unsigned long hbase, unsigned long hmask, + unsigned long vmid); #endif diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index 9c7c44f09b05..24c270d6d0e2 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -117,7 +117,8 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr) if (gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) kvm_riscv_local_hfence_gvma_vmid_gpa(gstage->vmid, addr, BIT(order), order); else - kvm_riscv_hfence_gvma_vmid_gpa(gstage->kvm, -1UL, 0, addr, BIT(order), order); + kvm_riscv_hfence_gvma_vmid_gpa(gstage->kvm, -1UL, 0, addr, BIT(order), order, + gstage->vmid); } int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index 349fcfc93f54..3c5a70a2b927 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -251,6 +251,12 @@ void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu) kvm_riscv_local_hfence_gvma_vmid_gpa(d.vmid, d.addr, d.size, d.order); break; + case KVM_RISCV_HFENCE_GVMA_VMID_ALL: + if (kvm_riscv_nacl_available()) + nacl_hfence_gvma_vmid_all(nacl_shmem(), d.vmid); + else + kvm_riscv_local_hfence_gvma_vmid_all(d.vmid); + break; case KVM_RISCV_HFENCE_VVMA_ASID_GVA: kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_RCVD); if (kvm_riscv_nacl_available()) @@ -276,6 +282,13 @@ void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu) kvm_riscv_local_hfence_vvma_gva(d.vmid, d.addr, d.size, d.order); break; + case KVM_RISCV_HFENCE_VVMA_ALL: + kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_RCVD); + if (kvm_riscv_nacl_available()) + nacl_hfence_vvma_all(nacl_shmem(), d.vmid); + else + kvm_riscv_local_hfence_vvma_all(d.vmid); + break; default: break; } @@ -328,14 +341,13 @@ void kvm_riscv_fence_i(struct kvm *kvm, void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, unsigned long hbase, unsigned long hmask, gpa_t gpa, gpa_t gpsz, - unsigned long order) + unsigned long order, unsigned long vmid) { - struct kvm_vmid *v = &kvm->arch.vmid; struct kvm_riscv_hfence data; data.type = KVM_RISCV_HFENCE_GVMA_VMID_GPA; data.asid = 0; - data.vmid = READ_ONCE(v->vmid); + data.vmid = vmid; data.addr = gpa; data.size = gpsz; data.order = order; @@ -344,23 +356,28 @@ void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, } void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm, - unsigned long hbase, unsigned long hmask) + unsigned long hbase, unsigned long hmask, + unsigned long vmid) { - make_xfence_request(kvm, hbase, hmask, KVM_REQ_TLB_FLUSH, - KVM_REQ_TLB_FLUSH, NULL); + struct kvm_riscv_hfence data = {0}; + + data.type = KVM_RISCV_HFENCE_GVMA_VMID_ALL; + data.vmid = vmid; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_TLB_FLUSH, &data); } void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, unsigned long hbase, unsigned long hmask, unsigned long gva, unsigned long gvsz, - unsigned long order, unsigned long asid) + unsigned long order, unsigned long asid, + unsigned long vmid) { - struct kvm_vmid *v = &kvm->arch.vmid; struct kvm_riscv_hfence data; data.type = KVM_RISCV_HFENCE_VVMA_ASID_GVA; data.asid = asid; - data.vmid = READ_ONCE(v->vmid); + data.vmid = vmid; data.addr = gva; data.size = gvsz; data.order = order; @@ -370,15 +387,13 @@ void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, unsigned long hbase, unsigned long hmask, - unsigned long asid) + unsigned long asid, unsigned long vmid) { - struct kvm_vmid *v = &kvm->arch.vmid; - struct kvm_riscv_hfence data; + struct kvm_riscv_hfence data = {0}; data.type = KVM_RISCV_HFENCE_VVMA_ASID_ALL; data.asid = asid; - data.vmid = READ_ONCE(v->vmid); - data.addr = data.size = data.order = 0; + data.vmid = vmid; make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, KVM_REQ_HFENCE_VVMA_ALL, &data); } @@ -386,14 +401,13 @@ void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, unsigned long hbase, unsigned long hmask, unsigned long gva, unsigned long gvsz, - unsigned long order) + unsigned long order, unsigned long vmid) { - struct kvm_vmid *v = &kvm->arch.vmid; struct kvm_riscv_hfence data; data.type = KVM_RISCV_HFENCE_VVMA_GVA; data.asid = 0; - data.vmid = READ_ONCE(v->vmid); + data.vmid = vmid; data.addr = gva; data.size = gvsz; data.order = order; @@ -402,16 +416,21 @@ void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, } void kvm_riscv_hfence_vvma_all(struct kvm *kvm, - unsigned long hbase, unsigned long hmask) + unsigned long hbase, unsigned long hmask, + unsigned long vmid) { - make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_VVMA_ALL, - KVM_REQ_HFENCE_VVMA_ALL, NULL); + struct kvm_riscv_hfence data = {0}; + + data.type = KVM_RISCV_HFENCE_VVMA_ALL; + data.vmid = vmid; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_HFENCE_VVMA_ALL, &data); } int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, gfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT, - PAGE_SHIFT); + PAGE_SHIFT, READ_ONCE(kvm->arch.vmid.vmid)); return 0; } diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index b17fad091bab..b490ed1428a6 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -96,6 +96,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run unsigned long hmask = cp->a0; unsigned long hbase = cp->a1; unsigned long funcid = cp->a6; + unsigned long vmid; switch (funcid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: @@ -103,22 +104,22 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_FENCE_I_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL) - kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask); + kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask, vmid); else kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask, - cp->a2, cp->a3, PAGE_SHIFT); + cp->a2, cp->a3, PAGE_SHIFT, vmid); kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_SENT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); if ((cp->a2 == 0 && cp->a3 == 0) || cp->a3 == -1UL) - kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, - hbase, hmask, cp->a4); + kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, hbase, hmask, + cp->a4, vmid); else - kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, - hbase, hmask, - cp->a2, cp->a3, - PAGE_SHIFT, cp->a4); + kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, hbase, hmask, cp->a2, + cp->a3, PAGE_SHIFT, cp->a4, vmid); kvm_riscv_vcpu_pmu_incr_fw(vcpu, SBI_PMU_FW_HFENCE_VVMA_ASID_SENT); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c index 8f4c4fa16227..368dfddd23d9 100644 --- a/arch/riscv/kvm/vcpu_sbi_v01.c +++ b/arch/riscv/kvm/vcpu_sbi_v01.c @@ -23,6 +23,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm *kvm = vcpu->kvm; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; struct kvm_cpu_trap *utrap = retdata->utrap; + unsigned long vmid; switch (cp->a7) { case SBI_EXT_0_1_CONSOLE_GETCHAR: @@ -78,25 +79,21 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I) kvm_riscv_fence_i(vcpu->kvm, 0, hmask); else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) { + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); if (cp->a1 == 0 && cp->a2 == 0) - kvm_riscv_hfence_vvma_all(vcpu->kvm, - 0, hmask); + kvm_riscv_hfence_vvma_all(vcpu->kvm, 0, hmask, vmid); else - kvm_riscv_hfence_vvma_gva(vcpu->kvm, - 0, hmask, - cp->a1, cp->a2, - PAGE_SHIFT); + kvm_riscv_hfence_vvma_gva(vcpu->kvm, 0, hmask, cp->a1, + cp->a2, PAGE_SHIFT, vmid); } else { + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); if (cp->a1 == 0 && cp->a2 == 0) - kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, - 0, hmask, - cp->a3); + kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, 0, hmask, + cp->a3, vmid); else - kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, - 0, hmask, - cp->a1, cp->a2, - PAGE_SHIFT, - cp->a3); + kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, 0, hmask, + cp->a1, cp->a2, PAGE_SHIFT, + cp->a3, vmid); } break; default: From b1a96f6742531b68abffa82980036727be9c2457 Mon Sep 17 00:00:00 2001 From: Andrew Bresticker Date: Wed, 8 Jan 2025 05:57:00 -0800 Subject: [PATCH 13/18] riscv: Support huge pfnmaps mainline inclusion from mainline-6.15-rc1 commit 03dc00a category: feature bugzilla: RVCK-Project#257 -------------------------------- Use RSW0 as the special bit for pmds and puds, just like for ptes. Also define the {pte,pmd,pud}_pgprot helpers which were previously missing and are needed for the follow_pfnmap APIs. Signed-off-by: Andrew Bresticker Reviewed-by: Alexandre Ghiti Link: https://lore.kernel.org/r/20250108135700.2614848-1-abrestic@rivosinc.com Signed-off-by: Alexandre Ghiti Signed-off-by: Wang Yechao --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable.h | 49 ++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index aedd09e55b69..b22d5b390746 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -60,6 +60,7 @@ config RISCV select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_CFI_CLANG select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU + select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE select ARCH_SUPPORTS_HUGETLBFS if MMU select ARCH_SUPPORTS_PAGE_TABLE_CHECK if MMU select ARCH_SUPPORTS_PER_VMA_LOCK if MMU diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index bce95009561b..6edb47d8ee23 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -362,6 +362,14 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) #define mk_pte(page, prot) pfn_pte(page_to_pfn(page), prot) +#define pte_pgprot pte_pgprot +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + static inline int pte_present(pte_t pte) { return (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE)); @@ -686,6 +694,11 @@ static inline pmd_t pte_pmd(pte_t pte) return __pmd(pte_val(pte)); } +static inline pud_t pte_pud(pte_t pte) +{ + return __pud(pte_val(pte)); +} + static inline pmd_t pmd_mkhuge(pmd_t pmd) { return pmd; @@ -710,6 +723,18 @@ static inline unsigned long pud_pfn(pud_t pud) return ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT); } +#define pmd_pgprot pmd_pgprot +static inline pgprot_t pmd_pgprot(pmd_t pmd) +{ + return pte_pgprot(pmd_pte(pmd)); +} + +#define pud_pgprot pud_pgprot +static inline pgprot_t pud_pgprot(pud_t pud) +{ + return pte_pgprot(pud_pte(pud)); +} + static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) { return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); @@ -778,6 +803,30 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) return pte_pmd(pte_mkdevmap(pmd_pte(pmd))); } +#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP +static inline bool pmd_special(pmd_t pmd) +{ + return pte_special(pmd_pte(pmd)); +} + +static inline pmd_t pmd_mkspecial(pmd_t pmd) +{ + return pte_pmd(pte_mkspecial(pmd_pte(pmd))); +} +#endif + +#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP +static inline bool pud_special(pud_t pud) +{ + return pte_special(pud_pte(pud)); +} + +static inline pud_t pud_mkspecial(pud_t pud) +{ + return pte_pud(pte_mkspecial(pud_pte(pud))); +} +#endif + static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { From b4d0addb975bf04512af78f59b6e5b55514f20a4 Mon Sep 17 00:00:00 2001 From: Jessica Liu Date: Thu, 27 Nov 2025 16:51:37 +0800 Subject: [PATCH 14/18] RISC-V: KVM: Transparent huge page support mainline inclusion from mainline-7.0-rc1 commit ed7ae7a category: feature bugzilla: RVCK-Project#257 -------------------------------- Use block mapping if backed by a THP, as implemented in architectures like ARM and x86_64. Signed-off-by: Jessica Liu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20251127165137780QbUOVPKPAfWSGAFl5qtRy@zte.com.cn Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/kvm/mmu.c | 140 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 02a0b78a96ce..6d9f2bb78cb6 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -347,6 +347,142 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) return pte_young(ptep_get(ptep)); } +static bool fault_supports_gstage_huge_mapping(struct kvm_memory_slot *memslot, + unsigned long hva) +{ + hva_t uaddr_start, uaddr_end; + gpa_t gpa_start; + size_t size; + + size = memslot->npages * PAGE_SIZE; + uaddr_start = memslot->userspace_addr; + uaddr_end = uaddr_start + size; + + gpa_start = memslot->base_gfn << PAGE_SHIFT; + + /* + * Pages belonging to memslots that don't have the same alignment + * within a PMD for userspace and GPA cannot be mapped with g-stage + * PMD entries, because we'll end up mapping the wrong pages. + * + * Consider a layout like the following: + * + * memslot->userspace_addr: + * +-----+--------------------+--------------------+---+ + * |abcde|fgh vs-stage block | vs-stage block tv|xyz| + * +-----+--------------------+--------------------+---+ + * + * memslot->base_gfn << PAGE_SHIFT: + * +---+--------------------+--------------------+-----+ + * |abc|def g-stage block | g-stage block |tvxyz| + * +---+--------------------+--------------------+-----+ + * + * If we create those g-stage blocks, we'll end up with this incorrect + * mapping: + * d -> f + * e -> g + * f -> h + */ + if ((gpa_start & (PMD_SIZE - 1)) != (uaddr_start & (PMD_SIZE - 1))) + return false; + + /* + * Next, let's make sure we're not trying to map anything not covered + * by the memslot. This means we have to prohibit block size mappings + * for the beginning and end of a non-block aligned and non-block sized + * memory slot (illustrated by the head and tail parts of the + * userspace view above containing pages 'abcde' and 'xyz', + * respectively). + * + * Note that it doesn't matter if we do the check using the + * userspace_addr or the base_gfn, as both are equally aligned (per + * the check above) and equally sized. + */ + return (hva >= ALIGN(uaddr_start, PMD_SIZE)) && (hva < ALIGN_DOWN(uaddr_end, PMD_SIZE)); +} + +static int get_hva_mapping_size(struct kvm *kvm, + unsigned long hva) +{ + int size = PAGE_SIZE; + unsigned long flags; + pgd_t pgd; + p4d_t p4d; + pud_t pud; + pmd_t pmd; + + /* + * Disable IRQs to prevent concurrent tear down of host page tables, + * e.g. if the primary MMU promotes a P*D to a huge page and then frees + * the original page table. + */ + local_irq_save(flags); + + /* + * Read each entry once. As above, a non-leaf entry can be promoted to + * a huge page _during_ this walk. Re-reading the entry could send the + * walk into the weeks, e.g. p*d_leaf() returns false (sees the old + * value) and then p*d_offset() walks into the target huge page instead + * of the old page table (sees the new value). + */ + pgd = pgdp_get(pgd_offset(kvm->mm, hva)); + if (pgd_none(pgd)) + goto out; + + p4d = p4dp_get(p4d_offset(&pgd, hva)); + if (p4d_none(p4d) || !p4d_present(p4d)) + goto out; + + pud = pudp_get(pud_offset(&p4d, hva)); + if (pud_none(pud) || !pud_present(pud)) + goto out; + + if (pud_leaf(pud)) { + size = PUD_SIZE; + goto out; + } + + pmd = pmdp_get(pmd_offset(&pud, hva)); + if (pmd_none(pmd) || !pmd_present(pmd)) + goto out; + + if (pmd_leaf(pmd)) + size = PMD_SIZE; + +out: + local_irq_restore(flags); + return size; +} + +static unsigned long transparent_hugepage_adjust(struct kvm *kvm, + struct kvm_memory_slot *memslot, + unsigned long hva, + kvm_pfn_t *hfnp, gpa_t *gpa) +{ + kvm_pfn_t hfn = *hfnp; + + /* + * Make sure the adjustment is done only for THP pages. Also make + * sure that the HVA and GPA are sufficiently aligned and that the + * block map is contained within the memslot. + */ + if (fault_supports_gstage_huge_mapping(memslot, hva)) { + int sz; + + sz = get_hva_mapping_size(kvm, hva); + if (sz < PMD_SIZE) + return sz; + + *gpa &= PMD_MASK; + hfn &= ~(PTRS_PER_PMD - 1); + *hfnp = hfn; + + return PMD_SIZE; + } + + return PAGE_SIZE; +} + int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, gpa_t gpa, unsigned long hva, bool is_write, struct kvm_gstage_mapping *out_map) @@ -438,6 +574,10 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, if (mmu_invalidate_retry(kvm, mmu_seq)) goto out_unlock; + /* Check if we are backed by a THP and thus use block mapping if possible */ + if (vma_pagesize == PAGE_SIZE) + vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &hfn, &gpa); + if (writable) { kvm_set_pfn_dirty(hfn); mark_page_dirty(kvm, gfn); From d5a3929102d696582d06096f793c17223cf3288d Mon Sep 17 00:00:00 2001 From: Wang Yechao Date: Thu, 26 Feb 2026 19:12:31 +0800 Subject: [PATCH 15/18] RISC-V: KVM: Skip THP support check during dirty logging mainline inclusion from mainline-v7.0-rc4 commit b342166 category: feature bugzilla: RVCK-Project#257 -------------------------------- When dirty logging is enabled, guest stage mappings are forced to PAGE_SIZE granularity. Changing the mapping page size at this point is incorrect. Fixes: ed7ae7a34bea ("RISC-V: KVM: Transparent huge page support") Signed-off-by: Wang Yechao Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20260226191231140_X1Juus7s2kgVlc0ZyW_K@zte.com.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 6d9f2bb78cb6..978faf044603 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -575,7 +575,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, goto out_unlock; /* Check if we are backed by a THP and thus use block mapping if possible */ - if (vma_pagesize == PAGE_SIZE) + if (!logging && (vma_pagesize == PAGE_SIZE)) vma_pagesize = transparent_hugepage_adjust(kvm, memslot, hva, &hfn, &gpa); if (writable) { From db51164062b27a6f931e01649a356eb31ed4b83c Mon Sep 17 00:00:00 2001 From: Wang Yechao Date: Mon, 30 Mar 2026 16:10:52 +0800 Subject: [PATCH 16/18] RISC-V: KVM: Fix lost write protection on huge pages during dirty logging mainline inclusion from mainline-7.1-rc1 commit a216e24 category: feature bugzilla: RVCK-Project#257 -------------------------------- When enabling dirty log in small chunks (e.g., QEMU default chunk size of 256K), the chunk size is always smaller than the page size of huge pages (1G or 2M) used in the gstage page tables. This caused the write protection to be incorrectly skipped for huge PTEs because the condition `(end - addr) >= page_size` was not satisfied. Remove the size check in `kvm_riscv_gstage_wp_range()` to ensure huge PTEs are always write-protected regardless of the chunk size. Additionally, explicitly align the address down to the page size before invoking `kvm_riscv_gstage_op_pte()` to guarantee that the address passed to the operation function is page-aligned. This fixes the issue where dirty pages might not be tracked correctly when using huge pages. Fixes: 9d05c1fee837 ("RISC-V: KVM: Implement stage2 page table programming") Signed-off-by: Wang Yechao Reviewed-by: Nutty Liu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/202603301610527120YZ-pAJY6x9SBpSRo1Wg4@zte.com.cn Signed-off-by: Anup Patel --- arch/riscv/kvm/gstage.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index 24c270d6d0e2..ccd953f962ba 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -304,10 +304,9 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end if (!found_leaf) goto next; - if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) - kvm_riscv_gstage_op_pte(gstage, addr, ptep, - ptep_level, GSTAGE_OP_WP); - + addr = ALIGN_DOWN(addr, page_size); + kvm_riscv_gstage_op_pte(gstage, addr, ptep, + ptep_level, GSTAGE_OP_WP); next: addr += page_size; } From ced35919a0111702a68f43cbd231399308fbfbbd Mon Sep 17 00:00:00 2001 From: Wang Yechao Date: Mon, 30 Mar 2026 16:12:58 +0800 Subject: [PATCH 17/18] RISC-V: KVM: Split huge pages during fault handling for dirty logging mainline inclusion from mainline-7.1-rc1 commit 6ad36f3 category: feature bugzilla: RVCK-Project#257 -------------------------------- During dirty logging, all huge pages are write-protected. When the guest writes to a write-protected huge page, a page fault is triggered. Before recovering the write permission, the huge page must be split into smaller pages (e.g., 4K). After splitting, the normal mapping process proceeds, allowing write permission to be restored at the smaller page granularity. If dirty logging is disabled because migration failed or was cancelled, only recover the write permission at the 4K level, and skip recovering the huge page mapping at this time to avoid the overhead of freeing page tables. The huge page mapping can be recovered in the ioctl context, similar to x86, in a later patch. Signed-off-by: Wang Yechao Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/202603301612587174XZ6QMCrymBqv30S6BN50@zte.com.cn Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_gstage.h | 4 + arch/riscv/kvm/gstage.c | 126 ++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h index 595e2183173e..a89d1422cc84 100644 --- a/arch/riscv/include/asm/kvm_gstage.h +++ b/arch/riscv/include/asm/kvm_gstage.h @@ -53,6 +53,10 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, bool page_rdonly, bool page_exec, struct kvm_gstage_mapping *out_map); +int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage, + struct kvm_mmu_memory_cache *pcache, + gpa_t addr, u32 target_level, bool flush); + enum kvm_riscv_gstage_op { GSTAGE_OP_NOP = 0, /* Nothing */ GSTAGE_OP_CLEAR, /* Clear/Unmap */ diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c index ccd953f962ba..bf129946029f 100644 --- a/arch/riscv/kvm/gstage.c +++ b/arch/riscv/kvm/gstage.c @@ -163,13 +163,32 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage, return 0; } +static void kvm_riscv_gstage_update_pte_prot(struct kvm_gstage *gstage, u32 level, + gpa_t addr, pte_t *ptep, pgprot_t prot) +{ + pte_t new_pte; + + if (pgprot_val(pte_pgprot(ptep_get(ptep))) == pgprot_val(prot)) + return; + + new_pte = pfn_pte(pte_pfn(ptep_get(ptep)), prot); + new_pte = pte_mkdirty(new_pte); + + set_pte(ptep, new_pte); + + gstage_tlb_flush(gstage, level, addr); +} + int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, struct kvm_mmu_memory_cache *pcache, gpa_t gpa, phys_addr_t hpa, unsigned long page_size, bool page_rdonly, bool page_exec, struct kvm_gstage_mapping *out_map) { + bool found_leaf; + u32 ptep_level; pgprot_t prot; + pte_t *ptep; int ret; out_map->addr = gpa; @@ -203,12 +222,119 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage, else prot = PAGE_WRITE; } + + found_leaf = kvm_riscv_gstage_get_leaf(gstage, gpa, &ptep, &ptep_level); + if (found_leaf) { + /* + * ptep_level is the current gstage mapping level of addr, out_map->level + * is the required mapping level during fault handling. + * + * 1) ptep_level > out_map->level + * This happens when dirty logging is enabled and huge pages are used. + * KVM must track the pages at 4K level, and split the huge mapping + * into 4K mappings. + * + * 2) ptep_level < out_map->level + * This happens when dirty logging is disabled and huge pages are used. + * The gstage is split into 4K mappings, but the out_map level is now + * back to the huge page level. Ignore the out_map level this time, and + * just update the pte prot here. Otherwise, we would fall back to mapping + * the gstage at huge page level in `kvm_riscv_gstage_set_pte`, with the + * overhead of freeing the page tables(not support now), which would slow + * down the vCPUs' performance. + * + * It is better to recover the huge page mapping in the ioctl context when + * disabling dirty logging. + * + * 3) ptep_level == out_map->level + * We already have the ptep, just update the pte prot if the pfn not change. + * There is no need to invoke `kvm_riscv_gstage_set_pte` again. + */ + if (ptep_level > out_map->level) { + kvm_riscv_gstage_split_huge(gstage, pcache, gpa, + out_map->level, true); + } else if (ALIGN_DOWN(PFN_PHYS(pte_pfn(ptep_get(ptep))), page_size) == hpa) { + kvm_riscv_gstage_update_pte_prot(gstage, ptep_level, gpa, ptep, prot); + return 0; + } + } + out_map->pte = pfn_pte(PFN_DOWN(hpa), prot); out_map->pte = pte_mkdirty(out_map->pte); return kvm_riscv_gstage_set_pte(gstage, pcache, out_map); } +static inline unsigned long make_child_pte(unsigned long huge_pte, int index, + unsigned long child_page_size) +{ + unsigned long child_pte = huge_pte; + unsigned long child_pfn_offset; + + /* + * The child_pte already has the base address of the huge page being + * split. So we just have to OR in the offset to the page at the next + * lower level for the given index. + */ + child_pfn_offset = index * (child_page_size / PAGE_SIZE); + child_pte |= pte_val(pfn_pte(child_pfn_offset, __pgprot(0))); + + return child_pte; +} + +int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage, + struct kvm_mmu_memory_cache *pcache, + gpa_t addr, u32 target_level, bool flush) +{ + u32 current_level = kvm_riscv_gstage_pgd_levels - 1; + pte_t *next_ptep = (pte_t *)gstage->pgd; + unsigned long huge_pte, child_pte; + unsigned long child_page_size; + pte_t *ptep; + int i, ret; + + if (!pcache) + return -ENOMEM; + + while (current_level > target_level) { + ptep = (pte_t *)&next_ptep[gstage_pte_index(addr, current_level)]; + + if (!pte_val(ptep_get(ptep))) + break; + + if (!gstage_pte_leaf(ptep)) { + next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep)); + current_level--; + continue; + } + + huge_pte = pte_val(ptep_get(ptep)); + + ret = gstage_level_to_page_size(current_level - 1, &child_page_size); + if (ret) + return ret; + + next_ptep = kvm_mmu_memory_cache_alloc(pcache); + if (!next_ptep) + return -ENOMEM; + + for (i = 0; i < PTRS_PER_PTE; i++) { + child_pte = make_child_pte(huge_pte, i, child_page_size); + set_pte((pte_t *)&next_ptep[i], __pte(child_pte)); + } + + set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)), + __pgprot(_PAGE_TABLE))); + + if (flush) + gstage_tlb_flush(gstage, current_level, addr); + + current_level--; + } + + return 0; +} + void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr, pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op) { From 136df7a4e6d44564423bb01865f9bab3f644c411 Mon Sep 17 00:00:00 2001 From: Jiakai Xu Date: Mon, 2 Feb 2026 04:00:59 +0000 Subject: [PATCH 18/18] RISC-V: KVM: Fix use-after-free in kvm_riscv_gstage_get_leaf() mainline inclusion from mainline-7.0-rc4 commit dec9ed9 category: feature bugzilla: RVCK-Project#257 -------------------------------- While fuzzing KVM on RISC-V, a use-after-free was observed in kvm_riscv_gstage_get_leaf(), where ptep_get() dereferences a freed gstage page table page during gfn unmap. The crash manifests as: use-after-free in ptep_get include/linux/pgtable.h:340 [inline] use-after-free in kvm_riscv_gstage_get_leaf arch/riscv/kvm/gstage.c:89 Call Trace: ptep_get include/linux/pgtable.h:340 [inline] kvm_riscv_gstage_get_leaf+0x2ea/0x358 arch/riscv/kvm/gstage.c:89 kvm_riscv_gstage_unmap_range+0xf0/0x308 arch/riscv/kvm/gstage.c:265 kvm_unmap_gfn_range+0x168/0x1fc arch/riscv/kvm/mmu.c:256 kvm_mmu_unmap_gfn_range virt/kvm/kvm_main.c:724 [inline] page last free pid 808 tgid 808 stack trace: kvm_riscv_mmu_free_pgd+0x1b6/0x26a arch/riscv/kvm/mmu.c:457 kvm_arch_flush_shadow_all+0x1a/0x24 arch/riscv/kvm/mmu.c:134 kvm_flush_shadow_all virt/kvm/kvm_main.c:344 [inline] The UAF is caused by gstage page table walks running concurrently with gstage pgd teardown. In particular, kvm_unmap_gfn_range() can traverse gstage page tables while kvm_arch_flush_shadow_all() frees the pgd, leading to use-after-free of page table pages. Fix the issue by serializing gstage unmap and pgd teardown with kvm->mmu_lock. Holding mmu_lock ensures that gstage page tables remain valid for the duration of unmap operations and prevents concurrent frees. This matches existing RISC-V KVM usage of mmu_lock to protect gstage map/unmap operations, e.g. kvm_riscv_mmu_iounmap. Fixes: dd82e35638d67f ("RISC-V: KVM: Factor-out g-stage page table management") Signed-off-by: Jiakai Xu Signed-off-by: Jiakai Xu Reviewed-by: Anup Patel Link: https://lore.kernel.org/r/20260202040059.1801167-1-xujiakai2025@iscas.ac.cn Signed-off-by: Anup Patel Signed-off-by: Wang Yechao --- arch/riscv/kvm/mmu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 978faf044603..82738cbb6577 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -259,6 +259,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { struct kvm_gstage gstage; + bool mmu_locked; if (!kvm->arch.pgd) return false; @@ -267,9 +268,12 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) gstage.flags = 0; gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid); gstage.pgd = kvm->arch.pgd; + mmu_locked = spin_trylock(&kvm->mmu_lock); kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, range->may_block); + if (mmu_locked) + spin_unlock(&kvm->mmu_lock); return false; }