From 1b5a360feb2618b4367c5c37bc4e98a03b4600ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:43 -0700 Subject: [PATCH 01/54] KVM: Tweak kvm_hva_range and hva_handler_t to allow reusing for gfn ranges mainline inclusion from mainline-v6.8-rc1 commit e97b39c5c4362dc1cbc37a563ddac313b96c84f3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e97b39c5c4362dc1cbc37a563ddac313b96c84f3 ------------------------------------- Hygon-SIG: commit e97b39c5c436 KVM: Tweak kvm_hva_range and hva_handler_t to allow reusing for gfn ranges Backport the guest-memory feature. ------------------------------------- Rework and rename "struct kvm_hva_range" into "kvm_mmu_notifier_range" so that the structure can be used to handle notifications that operate on gfn context, i.e. that aren't tied to a host virtual address. Rename the handler typedef too (arguably it should always have been gfn_handler_t). Practically speaking, this is a nop for 64-bit kernels as the only meaningful change is to store start+end as u64s instead of unsigned longs. Reviewed-by: Paolo Bonzini Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-2-seanjc@google.com> Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- virt/kvm/kvm_main.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 029119a56a5a..cf1ed3146000 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -554,18 +554,22 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) return container_of(mn, struct kvm, mmu_notifier); } -typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); +typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, unsigned long end); typedef void (*on_unlock_fn_t)(struct kvm *kvm); -struct kvm_hva_range { - unsigned long start; - unsigned long end; +struct kvm_mmu_notifier_range { + /* + * 64-bit addresses, as KVM notifiers can operate on host virtual + * addresses (unsigned long) and guest physical addresses (64-bit). + */ + u64 start; + u64 end; union kvm_mmu_notifier_arg arg; - hva_handler_t handler; + gfn_handler_t handler; on_lock_fn_t on_lock; on_unlock_fn_t on_unlock; bool flush_on_ret; @@ -594,7 +598,7 @@ static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG; node = interval_tree_iter_next(node, start, last)) \ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, - const struct kvm_hva_range *range) + const struct kvm_mmu_notifier_range *range) { bool ret = false, locked = false; struct kvm_gfn_range gfn_range; @@ -621,9 +625,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, unsigned long hva_start, hva_end; slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]); - hva_start = max(range->start, slot->userspace_addr); - hva_end = min(range->end, slot->userspace_addr + - (slot->npages << PAGE_SHIFT)); + hva_start = max_t(unsigned long, range->start, slot->userspace_addr); + hva_end = min_t(unsigned long, range->end, + slot->userspace_addr + (slot->npages << PAGE_SHIFT)); /* * To optimize for the likely case where the address @@ -678,10 +682,10 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, unsigned long start, unsigned long end, union kvm_mmu_notifier_arg arg, - hva_handler_t handler) + gfn_handler_t handler) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - const struct kvm_hva_range range = { + const struct kvm_mmu_notifier_range range = { .start = start, .end = end, .arg = arg, @@ -698,10 +702,10 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, unsigned long start, unsigned long end, - hva_handler_t handler) + gfn_handler_t handler) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - const struct kvm_hva_range range = { + const struct kvm_mmu_notifier_range range = { .start = start, .end = end, .handler = handler, @@ -789,7 +793,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - const struct kvm_hva_range hva_range = { + const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, .handler = kvm_unmap_gfn_range, @@ -853,7 +857,7 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { struct kvm *kvm = mmu_notifier_to_kvm(mn); - const struct kvm_hva_range hva_range = { + const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, .handler = (void *)kvm_null_fn, -- Gitee From 3adda2cc02965ff9c35ec65ea6f6fd086b2c3b5d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:44 -0700 Subject: [PATCH 02/54] KVM: Assert that mmu_invalidate_in_progress *never* goes negative mainline inclusion from mainline-v6.8-rc1 commit c0db19232c1ed6bd7fcb825c28b014c52732c19e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c0db19232c1ed6bd7fcb825c28b014c52732c19e ------------------------------------- Hygon-SIG: commit c0db19232c1e KVM: Assert that mmu_invalidate_in_progress *never* goes negative Backport the guest-memory feature. ------------------------------------- Move the assertion on the in-progress invalidation count from the primary MMU's notifier path to KVM's common notification path, i.e. assert that the count doesn't go negative even when the invalidation is coming from KVM itself. Opportunistically convert the assertion to a KVM_BUG_ON(), i.e. kill only the affected VM, not the entire kernel. A corrupted count is fatal to the VM, e.g. the non-zero (negative) count will cause mmu_invalidate_retry() to block any and all attempts to install new mappings. But it's far from guaranteed that an end() without a start() is fatal or even problematic to anything other than the target VM, e.g. the underlying bug could simply be a duplicate call to end(). And it's much more likely that a missed invalidation, i.e. a potential use-after-free, would manifest as no notification whatsoever, not an end() without a start(). Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-3-seanjc@google.com> Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cf1ed3146000..89a5c397189b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -851,6 +851,7 @@ void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, * in conjunction with the smp_rmb in mmu_invalidate_retry(). */ kvm->mmu_invalidate_in_progress--; + KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm); } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, @@ -881,8 +882,6 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, */ if (wake) rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); - - BUG_ON(kvm->mmu_invalidate_in_progress < 0); } static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, -- Gitee From 8eeca1be0b4721409e70544a75d4c6b35f2bcf67 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:45 -0700 Subject: [PATCH 03/54] KVM: Use gfn instead of hva for mmu_notifier_retry mainline inclusion from mainline-v6.8-rc1 commit 8569992d64b8f750e34b7858eac5d7daaf0f80fd category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8569992d64b8f750e34b7858eac5d7daaf0f80fd ------------------------------------- Hygon-SIG: commit 8569992d64b8 KVM: Use gfn instead of hva for mmu_notifier_retry Backport the guest-memory feature. ------------------------------------- Currently in mmu_notifier invalidate path, hva range is recorded and then checked against by mmu_invalidate_retry_hva() in the page fault handling path. However, for the soon-to-be-introduced private memory, a page fault may not have a hva associated, checking gfn(gpa) makes more sense. For existing hva based shared memory, gfn is expected to also work. The only downside is when aliasing multiple gfns to a single hva, the current algorithm of checking multiple ranges could result in a much larger range being rejected. Such aliasing should be uncommon, so the impact is expected small. Suggested-by: Sean Christopherson Cc: Xu Yilun Signed-off-by: Chao Peng Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba [sean: convert vmx_set_apic_access_page_addr() to gfn-based API] Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Xu Yilun Message-Id: <20231027182217.3615211-4-seanjc@google.com> Reviewed-by: Kai Huang Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- arch/x86/kvm/mmu/mmu.c | 10 ++++++---- arch/x86/kvm/vmx/vmx.c | 11 +++++----- include/linux/kvm_host.h | 33 +++++++++++++++++++----------- virt/kvm/kvm_main.c | 43 +++++++++++++++++++++++++++++++--------- 4 files changed, 66 insertions(+), 31 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c54c8385b16d..7b07440a0656 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3056,7 +3056,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) * * There are several ways to safely use this helper: * - * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before + * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before * consuming it. In this case, mmu_lock doesn't need to be held during the * lookup, but it does need to be held while checking the MMU notifier. * @@ -4358,7 +4358,7 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, return true; return fault->slot && - mmu_invalidate_retry_hva(vcpu->kvm, fault->mmu_seq, fault->hva); + mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn); } static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) @@ -6256,7 +6256,9 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) write_lock(&kvm->mmu_lock); - kvm_mmu_invalidate_begin(kvm, 0, -1ul); + kvm_mmu_invalidate_begin(kvm); + + kvm_mmu_invalidate_range_add(kvm, gfn_start, gfn_end); flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end); @@ -6266,7 +6268,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (flush) kvm_flush_remote_tlbs_range(kvm, gfn_start, gfn_end - gfn_start); - kvm_mmu_invalidate_end(kvm, 0, -1ul); + kvm_mmu_invalidate_end(kvm); write_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d0b128dea437..7eb7a466b8df 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6790,10 +6790,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) return; /* - * Grab the memslot so that the hva lookup for the mmu_notifier retry - * is guaranteed to use the same memslot as the pfn lookup, i.e. rely - * on the pfn lookup's validation of the memslot to ensure a valid hva - * is used for the retry check. + * Explicitly grab the memslot using KVM's internal slot ID to ensure + * KVM doesn't unintentionally grab a userspace memslot. It _should_ + * be impossible for userspace to create a memslot for the APIC when + * APICv is enabled, but paranoia won't hurt in this case. */ slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); if (!slot || slot->flags & KVM_MEMSLOT_INVALID) @@ -6818,8 +6818,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) return; read_lock(&vcpu->kvm->mmu_lock); - if (mmu_invalidate_retry_hva(kvm, mmu_seq, - gfn_to_hva_memslot(slot, gfn))) { + if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) { kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); read_unlock(&vcpu->kvm->mmu_lock); goto out; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 070fda6c98ca..3d1d6bdf08d0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -837,8 +837,8 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; - unsigned long mmu_invalidate_range_start; - unsigned long mmu_invalidate_range_end; + gfn_t mmu_invalidate_range_start; + gfn_t mmu_invalidate_range_end; #endif struct list_head devices; u64 manual_dirty_log_protect; @@ -1476,10 +1476,9 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif -void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, - unsigned long end); -void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, - unsigned long end); +void kvm_mmu_invalidate_begin(struct kvm *kvm); +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end); +void kvm_mmu_invalidate_end(struct kvm *kvm); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -2117,9 +2116,9 @@ static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) return 0; } -static inline int mmu_invalidate_retry_hva(struct kvm *kvm, +static inline int mmu_invalidate_retry_gfn(struct kvm *kvm, unsigned long mmu_seq, - unsigned long hva) + gfn_t gfn) { lockdep_assert_held(&kvm->mmu_lock); /* @@ -2128,10 +2127,20 @@ static inline int mmu_invalidate_retry_hva(struct kvm *kvm, * that might be being invalidated. Note that it may include some false * positives, due to shortcuts when handing concurrent invalidations. */ - if (unlikely(kvm->mmu_invalidate_in_progress) && - hva >= kvm->mmu_invalidate_range_start && - hva < kvm->mmu_invalidate_range_end) - return 1; + if (unlikely(kvm->mmu_invalidate_in_progress)) { + /* + * Dropping mmu_lock after bumping mmu_invalidate_in_progress + * but before updating the range is a KVM bug. + */ + if (WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA || + kvm->mmu_invalidate_range_end == INVALID_GPA)) + return 1; + + if (gfn >= kvm->mmu_invalidate_range_start && + gfn < kvm->mmu_invalidate_range_end) + return 1; + } + if (kvm->mmu_invalidate_seq != mmu_seq) return 1; return 0; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 89a5c397189b..2ad7d697e7dd 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -556,9 +556,7 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); -typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, - unsigned long end); - +typedef void (*on_lock_fn_t)(struct kvm *kvm); typedef void (*on_unlock_fn_t)(struct kvm *kvm); struct kvm_mmu_notifier_range { @@ -655,7 +653,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, locked = true; KVM_MMU_LOCK(kvm); if (!IS_KVM_NULL_FN(range->on_lock)) - range->on_lock(kvm, range->start, range->end); + range->on_lock(kvm); + if (IS_KVM_NULL_FN(range->handler)) break; } @@ -760,16 +759,29 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, kvm_handle_hva_range(mn, address, address + 1, arg, kvm_change_spte_gfn); } -void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, - unsigned long end) +void kvm_mmu_invalidate_begin(struct kvm *kvm) { + lockdep_assert_held_write(&kvm->mmu_lock); /* * The count increase must become visible at unlock time as no * spte can be established without taking the mmu_lock and * count is also read inside the mmu_lock critical section. */ kvm->mmu_invalidate_in_progress++; + if (likely(kvm->mmu_invalidate_in_progress == 1)) { + kvm->mmu_invalidate_range_start = INVALID_GPA; + kvm->mmu_invalidate_range_end = INVALID_GPA; + } +} + +void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end) +{ + lockdep_assert_held_write(&kvm->mmu_lock); + + WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress); + + if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) { kvm->mmu_invalidate_range_start = start; kvm->mmu_invalidate_range_end = end; } else { @@ -789,6 +801,12 @@ void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start, } } +static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +{ + kvm_mmu_invalidate_range_add(kvm, range->start, range->end); + return kvm_unmap_gfn_range(kvm, range); +} + static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { @@ -796,7 +814,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, const struct kvm_mmu_notifier_range hva_range = { .start = range->start, .end = range->end, - .handler = kvm_unmap_gfn_range, + .handler = kvm_mmu_unmap_gfn_range, .on_lock = kvm_mmu_invalidate_begin, .on_unlock = kvm_arch_guest_memory_reclaimed, .flush_on_ret = true, @@ -835,9 +853,10 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, return 0; } -void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, - unsigned long end) +void kvm_mmu_invalidate_end(struct kvm *kvm) { + lockdep_assert_held_write(&kvm->mmu_lock); + /* * This sequence increase will notify the kvm page fault that * the page that is going to be mapped in the spte could have @@ -852,6 +871,12 @@ void kvm_mmu_invalidate_end(struct kvm *kvm, unsigned long start, */ kvm->mmu_invalidate_in_progress--; KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm); + + /* + * Assert that at least one range was added between start() and end(). + * Not adding a range isn't fatal, but it is a KVM bug. + */ + WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA); } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, -- Gitee From 49b0968ac8563e9a825c8d9f7e08d23b96f7f6d4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:46 -0700 Subject: [PATCH 04/54] KVM: WARN if there are dangling MMU invalidations at VM destruction mainline inclusion from mainline-v6.8-rc1 commit d497a0fab8b8457214fcc9b1a39530920ea7e95e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d497a0fab8b8457214fcc9b1a39530920ea7e95e ------------------------------------- Hygon-SIG: commit d497a0fab8b8 KVM: WARN if there are dangling MMU invalidations at VM destruction Backport the guest-memory feature. ------------------------------------- Add an assertion that there are no in-progress MMU invalidations when a VM is being destroyed, with the exception of the scenario where KVM unregisters its MMU notifier between an .invalidate_range_start() call and the corresponding .invalidate_range_end(). KVM can't detect unpaired calls from the mmu_notifier due to the above exception waiver, but the assertion can detect KVM bugs, e.g. such as the bug that *almost* escaped initial guest_memfd development. Link: https://lore.kernel.org/all/e397d30c-c6af-e68f-d18e-b4e3739c5389@linux.intel.com Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-5-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- virt/kvm/kvm_main.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2ad7d697e7dd..513011ff113f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1390,9 +1390,16 @@ static void kvm_destroy_vm(struct kvm *kvm) * No threads can be waiting in kvm_swap_active_memslots() as the * last reference on KVM has been dropped, but freeing * memslots would deadlock without this manual intervention. + * + * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU + * notifier between a start() and end(), then there shouldn't be any + * in-progress invalidations. */ WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); - kvm->mn_active_invalidate_count = 0; + if (kvm->mn_active_invalidate_count) + kvm->mn_active_invalidate_count = 0; + else + WARN_ON(kvm->mmu_invalidate_in_progress); #else kvm_flush_shadow_all(kvm); #endif -- Gitee From 16b85fc85c6025bbb09dd4cbfdc1eccf06637f3b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:47 -0700 Subject: [PATCH 05/54] KVM: PPC: Drop dead code related to KVM_ARCH_WANT_MMU_NOTIFIER mainline inclusion from mainline-v6.8-rc1 commit 1853d7502a19b34b2cfe4ea0698bee73323fec3a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1853d7502a19b34b2cfe4ea0698bee73323fec3a ------------------------------------- Hygon-SIG: commit 1853d7502a19 KVM: PPC: Drop dead code related to KVM_ARCH_WANT_MMU_NOTIFIER Backport the guest-memory feature. ------------------------------------- Assert that both KVM_ARCH_WANT_MMU_NOTIFIER and CONFIG_MMU_NOTIFIER are defined when KVM is enabled, and return '1' unconditionally for the CONFIG_KVM_BOOK3S_HV_POSSIBLE=n path. All flavors of PPC support for KVM select MMU_NOTIFIER, and KVM_ARCH_WANT_MMU_NOTIFIER is unconditionally defined by arch/powerpc/include/asm/kvm_host.h. Effectively dropping use of KVM_ARCH_WANT_MMU_NOTIFIER will simplify a future cleanup to turn KVM_ARCH_WANT_MMU_NOTIFIER into a Kconfig, i.e. will allow combining all of the #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) checks into a single #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER without having to worry about PPC's "bare" usage of KVM_ARCH_WANT_MMU_NOTIFIER. Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Message-Id: <20231027182217.3615211-6-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- arch/powerpc/kvm/powerpc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6cef200c2404..9e4507668de1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -632,12 +632,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_SYNC_MMU: +#if !defined(CONFIG_MMU_NOTIFIER) || !defined(KVM_ARCH_WANT_MMU_NOTIFIER) + BUILD_BUG(); +#endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE r = hv_enabled; -#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) - r = 1; #else - r = 0; + r = 1; #endif break; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE -- Gitee From 99a83c6419b8387b028d480ce45857f6f44ffd46 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:48 -0700 Subject: [PATCH 06/54] KVM: PPC: Return '1' unconditionally for KVM_CAP_SYNC_MMU mainline inclusion from mainline-v6.8-rc1 commit 4a2e993faad3880962540ab8e3b68f22e48b18e8 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4a2e993faad3880962540ab8e3b68f22e48b18e8 ------------------------------------- Hygon-SIG: commit 4a2e993faad3 KVM: PPC: Return '1' unconditionally for KVM_CAP_SYNC_MMU Backport the guest-memory feature. ------------------------------------- Advertise that KVM's MMU is synchronized with the primary MMU for all flavors of PPC KVM support, i.e. advertise that the MMU is synchronized when CONFIG_KVM_BOOK3S_HV_POSSIBLE=y but the VM is not using hypervisor mode (a.k.a. PR VMs). PR VMs, via kvm_unmap_gfn_range_pr(), do the right thing for mmu_notifier invalidation events, and more tellingly, KVM returns '1' for KVM_CAP_SYNC_MMU when CONFIG_KVM_BOOK3S_HV_POSSIBLE=n and CONFIG_KVM_BOOK3S_PR_POSSIBLE=y, i.e. KVM already advertises a synchronized MMU for PR VMs, just not when CONFIG_KVM_BOOK3S_HV_POSSIBLE=y. Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-7-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- arch/powerpc/kvm/powerpc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 9e4507668de1..cb47fd487a12 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -635,11 +635,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #if !defined(CONFIG_MMU_NOTIFIER) || !defined(KVM_ARCH_WANT_MMU_NOTIFIER) BUILD_BUG(); #endif -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - r = hv_enabled; -#else r = 1; -#endif break; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_HTAB_FD: -- Gitee From 3c83af9c4bb728218c48f90d43cc2c25bd1e3e53 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:49 -0700 Subject: [PATCH 07/54] KVM: Convert KVM_ARCH_WANT_MMU_NOTIFIER to CONFIG_KVM_GENERIC_MMU_NOTIFIER mainline inclusion from mainline-v6.8-rc1 commit f128cf8cfbecccf95e891ae90d9c917df5117c7a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f128cf8cfbecccf95e891ae90d9c917df5117c7a ------------------------------------- Hygon-SIG: commit f128cf8cfbec KVM: Convert KVM_ARCH_WANT_MMU_NOTIFIER to CONFIG_KVM_GENERIC_MMU_NOTIFIER Backport the guest-memory feature. ------------------------------------- Convert KVM_ARCH_WANT_MMU_NOTIFIER into a Kconfig and select it where appropriate to effectively maintain existing behavior. Using a proper Kconfig will simplify building more functionality on top of KVM's mmu_notifier infrastructure. Add a forward declaration of kvm_gfn_range to kvm_types.h so that including arch/powerpc/include/asm/kvm_ppc.h's with CONFIG_KVM=n doesn't generate warnings due to kvm_gfn_range being undeclared. PPC defines hooks for PR vs. HV without guarding them via #ifdeffery, e.g. bool (*unmap_gfn_range)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*test_age_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); bool (*set_spte_gfn)(struct kvm *kvm, struct kvm_gfn_range *range); Alternatively, PPC could forward declare kvm_gfn_range, but there's no good reason not to define it in common KVM. Acked-by: Anup Patel Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-8-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Li Zhiquan --- arch/arm64/include/asm/kvm_host.h | 2 -- arch/arm64/kvm/Kconfig | 2 +- arch/loongarch/include/asm/kvm_host.h | 1 - arch/loongarch/kvm/Kconfig | 2 +- arch/mips/include/asm/kvm_host.h | 2 -- arch/mips/kvm/Kconfig | 2 +- arch/powerpc/include/asm/kvm_host.h | 2 -- arch/powerpc/kvm/Kconfig | 8 ++++---- arch/powerpc/kvm/powerpc.c | 4 +--- arch/riscv/include/asm/kvm_host.h | 2 -- arch/riscv/kvm/Kconfig | 2 +- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/Kconfig | 2 +- include/linux/kvm_host.h | 6 +++--- include/linux/kvm_types.h | 1 + virt/kvm/Kconfig | 4 ++++ virt/kvm/kvm_main.c | 10 +++++----- 17 files changed, 23 insertions(+), 31 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 69a08a4f3d85..bedc22351ba0 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1040,8 +1040,6 @@ int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events); -#define KVM_ARCH_WANT_MMU_NOTIFIER - void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index d4740f693fdf..46cb2e91120d 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -23,7 +23,7 @@ menuconfig KVM bool "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select KVM_GENERIC_HARDWARE_ENABLING - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT select KVM_MMIO diff --git a/arch/loongarch/include/asm/kvm_host.h b/arch/loongarch/include/asm/kvm_host.h index 4da029a93c68..274217aba8a3 100644 --- a/arch/loongarch/include/asm/kvm_host.h +++ b/arch/loongarch/include/asm/kvm_host.h @@ -298,7 +298,6 @@ void kvm_flush_tlb_all(void); void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa); int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool write); -#define KVM_ARCH_WANT_MMU_NOTIFIER void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, bool blockable); int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index e899d96f4da6..283871e15e41 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -30,10 +30,10 @@ config KVM select HAVE_KVM_MSI select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING + select KVM_GENERIC_MMU_NOTIFIER select KVM_MMIO select KVM_XFER_TO_GUEST_WORK select SCHED_INFO - select MMU_NOTIFIER select PREEMPT_NOTIFIERS select KVM_VFIO help diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 54a85f1d4f2c..179f320cc231 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -810,8 +810,6 @@ int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn); pgd_t *kvm_pgd_alloc(void); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); -#define KVM_ARCH_WANT_MMU_NOTIFIER - /* Emulation */ enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause); int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out); diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index a8cdba75f98d..c04987d2ed2e 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -25,7 +25,7 @@ config KVM select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER select INTERVAL_TREE select KVM_GENERIC_HARDWARE_ENABLING help diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 14ee0dece853..4b5c3f2acf78 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -62,8 +62,6 @@ #include -#define KVM_ARCH_WANT_MMU_NOTIFIER - #define HPTEG_CACHE_NUM (1 << 15) #define HPTEG_HASH_BITS_PTE 13 #define HPTEG_HASH_BITS_PTE_LONG 12 diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 902611954200..b33358ee6424 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -42,7 +42,7 @@ config KVM_BOOK3S_64_HANDLER config KVM_BOOK3S_PR_POSSIBLE bool select KVM_MMIO - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER config KVM_BOOK3S_HV_POSSIBLE bool @@ -85,7 +85,7 @@ config KVM_BOOK3S_64_HV tristate "KVM for POWER7 and later using hypervisor mode in host" depends on KVM_BOOK3S_64 && PPC_POWERNV select KVM_BOOK3S_HV_POSSIBLE - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER select CMA help Support running unmodified book3s_64 guest kernels in @@ -194,7 +194,7 @@ config KVM_E500V2 depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER help Support running unmodified E500 guest kernels in virtual machines on E500v2 host processors. @@ -211,7 +211,7 @@ config KVM_E500MC select KVM select KVM_MMIO select KVM_BOOKE_HV - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER help Support running unmodified E500MC/E5500/E6500 guest kernels in virtual machines on E500MC/E5500/E6500 host processors. diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cb47fd487a12..d867b209cc95 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -632,9 +632,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; #endif case KVM_CAP_SYNC_MMU: -#if !defined(CONFIG_MMU_NOTIFIER) || !defined(KVM_ARCH_WANT_MMU_NOTIFIER) - BUILD_BUG(); -#endif + BUILD_BUG_ON(!IS_ENABLED(CONFIG_KVM_GENERIC_MMU_NOTIFIER)); r = 1; break; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index 459e61ad7d2b..efad04b22b6a 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -250,8 +250,6 @@ struct kvm_vcpu_arch { static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} -#define KVM_ARCH_WANT_MMU_NOTIFIER - #define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12 void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index dfc237d7875b..ae2e05f050ec 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -30,7 +30,7 @@ config KVM select KVM_GENERIC_HARDWARE_ENABLING select KVM_MMIO select KVM_XFER_TO_GUEST_WORK - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER select PREEMPT_NOTIFIERS help Support hosting virtualized guest machines. diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3f22221e5d6a..085fd69af9db 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2169,8 +2169,6 @@ enum { # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif -#define KVM_ARCH_WANT_MMU_NOTIFIER - int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 4e1135c986c8..21b101bf45e4 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -24,7 +24,7 @@ config KVM depends on HIGH_RES_TIMERS depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS - select MMU_NOTIFIER + select KVM_GENERIC_MMU_NOTIFIER select HAVE_KVM_IRQCHIP select HAVE_KVM_PFNCACHE select HAVE_KVM_IRQFD diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3d1d6bdf08d0..305a96fcb84a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -254,7 +254,7 @@ bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif -#ifdef KVM_ARCH_WANT_MMU_NOTIFIER +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { pte_t pte; }; @@ -833,7 +833,7 @@ struct kvm { struct hlist_head irq_ack_notifier_list; #endif -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; @@ -2093,7 +2093,7 @@ extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[]; -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq) { if (unlikely(kvm->mmu_invalidate_in_progress)) diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 6f4737d5046a..9d1f7835d8c1 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -6,6 +6,7 @@ struct kvm; struct kvm_async_pf; struct kvm_device_ops; +struct kvm_gfn_range; struct kvm_interrupt; struct kvm_irq_routing_table; struct kvm_memory_slot; diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 7e19e8ada121..4043cd3ce147 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -93,5 +93,9 @@ config HAVE_KVM_PM_NOTIFIER config KVM_GENERIC_HARDWARE_ENABLING bool +config KVM_GENERIC_MMU_NOTIFIER + select MMU_NOTIFIER + bool + config HAVE_KVM_PINNED_VMID bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 513011ff113f..ea3adb114375 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -548,7 +548,7 @@ void kvm_destroy_vcpus(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) { return container_of(mn, struct kvm, mmu_notifier); @@ -980,14 +980,14 @@ static int kvm_init_mmu_notifier(struct kvm *kvm) return mmu_notifier_register(&kvm->mmu_notifier, current->mm); } -#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ +#else /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */ static int kvm_init_mmu_notifier(struct kvm *kvm) { return 0; } -#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ +#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */ #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER static int kvm_pm_notifier_call(struct notifier_block *bl, @@ -1307,7 +1307,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) out_err_no_debugfs: kvm_coalesced_mmio_free(kvm); out_no_coalesced_mmio: -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER if (kvm->mmu_notifier.ops) mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); #endif @@ -1381,7 +1381,7 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm->buses[i] = NULL; } kvm_coalesced_mmio_free(kvm); -#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) +#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); /* * At this point, pending calls to invalidate_range_start() -- Gitee From fc94ff60014ddbaf14e6007b8335ce948131680a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:50 -0700 Subject: [PATCH 08/54] KVM: Introduce KVM_SET_USER_MEMORY_REGION2 mainline inclusion from mainline-v6.8-rc1 commit bb58b90b1a8f753b582055adaf448214a8e22c31 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bb58b90b1a8f753b582055adaf448214a8e22c31 ------------------------------------- Hygon-SIG: commit bb58b90b1a8f KVM: Introduce KVM_SET_USER_MEMORY_REGION2 Backport the guest-memory feature. ------------------------------------- Introduce a "version 2" of KVM_SET_USER_MEMORY_REGION so that additional information can be supplied without setting userspace up to fail. The padding in the new kvm_userspace_memory_region2 structure will be used to pass a file descriptor in addition to the userspace_addr, i.e. allow userspace to point at a file descriptor and map memory into a guest that is NOT mapped into host userspace. Alternatively, KVM could simply add "struct kvm_userspace_memory_region2" without a new ioctl(), but as Paolo pointed out, adding a new ioctl() makes detection of bad flags a bit more robust, e.g. if the new fd field is guarded only by a flag and not a new ioctl(), then a userspace bug (setting a "bad" flag) would generate out-of-bounds access instead of an -EINVAL error. Cc: Jarkko Sakkinen Reviewed-by: Paolo Bonzini Reviewed-by: Xiaoyao Li Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-9-seanjc@google.com> Acked-by: Kai Huang Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Li Zhiquan --- Documentation/virt/kvm/api.rst | 22 +++++++++++++ arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 4 +-- include/uapi/linux/kvm.h | 13 ++++++++ virt/kvm/kvm_main.c | 57 +++++++++++++++++++++++++++++----- 5 files changed, 87 insertions(+), 11 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 470b7da2ae41..ab35409027bf 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6200,6 +6200,28 @@ to know what fields can be changed for the system register described by ``op0, op1, crn, crm, op2``. KVM rejects ID register values that describe a superset of the features supported by the system. +4.140 KVM_SET_USER_MEMORY_REGION2 +--------------------------------- + +:Capability: KVM_CAP_USER_MEMORY2 +:Architectures: all +:Type: vm ioctl +:Parameters: struct kvm_userspace_memory_region2 (in) +:Returns: 0 on success, -1 on error + +:: + + struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; /* start of the userspace allocated memory */ + __u64 pad[16]; + }; + +See KVM_SET_USER_MEMORY_REGION. + 5. The kvm_run structure ======================== diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index de851f3dff7b..c23bbe72c101 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12682,7 +12682,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, } for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_userspace_memory_region m; + struct kvm_userspace_memory_region2 m; m.slot = id | (i << 16); m.flags = 0; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 305a96fcb84a..150ddb90b763 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1230,9 +1230,9 @@ enum kvm_mr_change { }; int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region2 *mem); void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen); int kvm_arch_prepare_memory_region(struct kvm *kvm, diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 0036cfaf5d69..cbc8f3244831 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -95,6 +95,16 @@ struct kvm_userspace_memory_region { __u64 userspace_addr; /* start of the userspace allocated memory */ }; +/* for KVM_SET_USER_MEMORY_REGION2 */ +struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 userspace_addr; + __u64 pad[16]; +}; + /* * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for * userspace, other bits are reserved for kvm internal use which are defined @@ -1219,6 +1229,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 +#define KVM_CAP_USER_MEMORY2 231 #define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 @@ -1549,6 +1560,8 @@ struct kvm_numa_info { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \ + struct kvm_userspace_memory_region2) #define KVM_LOAD_USER_DATA _IOW(KVMIO, 0x49, struct kvm_user_data) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ea3adb114375..fe228f204690 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1612,7 +1612,15 @@ static void kvm_replace_memslot(struct kvm *kvm, } } -static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) +/* + * Flags that do not access any of the extra space of struct + * kvm_userspace_memory_region2. KVM_SET_USER_MEMORY_REGION_V1_FLAGS + * only allows these. + */ +#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \ + (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) + +static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; @@ -2015,7 +2023,7 @@ static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, * Must be called holding kvm->slots_lock for write. */ int __kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region2 *mem) { struct kvm_memory_slot *old, *new; struct kvm_memslots *slots; @@ -2119,7 +2127,7 @@ int __kvm_set_memory_region(struct kvm *kvm, EXPORT_SYMBOL_GPL(__kvm_set_memory_region); int kvm_set_memory_region(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region2 *mem) { int r; @@ -2131,7 +2139,7 @@ int kvm_set_memory_region(struct kvm *kvm, EXPORT_SYMBOL_GPL(kvm_set_memory_region); static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region2 *mem) { if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; @@ -4642,6 +4650,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) { switch (arg) { case KVM_CAP_USER_MEMORY: + case KVM_CAP_USER_MEMORY2: case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: case KVM_CAP_INTERNAL_ERROR_DATA: @@ -4897,6 +4906,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) return fd; } +#define SANITY_CHECK_MEM_REGION_FIELD(field) \ +do { \ + BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) != \ + offsetof(struct kvm_userspace_memory_region2, field)); \ + BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) != \ + sizeof_field(struct kvm_userspace_memory_region2, field)); \ +} while (0) + static long kvm_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4919,15 +4936,39 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); break; } + case KVM_SET_USER_MEMORY_REGION2: case KVM_SET_USER_MEMORY_REGION: { - struct kvm_userspace_memory_region kvm_userspace_mem; + struct kvm_userspace_memory_region2 mem; + unsigned long size; + + if (ioctl == KVM_SET_USER_MEMORY_REGION) { + /* + * Fields beyond struct kvm_userspace_memory_region shouldn't be + * accessed, but avoid leaking kernel memory in case of a bug. + */ + memset(&mem, 0, sizeof(mem)); + size = sizeof(struct kvm_userspace_memory_region); + } else { + size = sizeof(struct kvm_userspace_memory_region2); + } + + /* Ensure the common parts of the two structs are identical. */ + SANITY_CHECK_MEM_REGION_FIELD(slot); + SANITY_CHECK_MEM_REGION_FIELD(flags); + SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr); + SANITY_CHECK_MEM_REGION_FIELD(memory_size); + SANITY_CHECK_MEM_REGION_FIELD(userspace_addr); r = -EFAULT; - if (copy_from_user(&kvm_userspace_mem, argp, - sizeof(kvm_userspace_mem))) + if (copy_from_user(&mem, argp, size)) + goto out; + + r = -EINVAL; + if (ioctl == KVM_SET_USER_MEMORY_REGION && + (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS)) goto out; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); + r = kvm_vm_ioctl_set_memory_region(kvm, &mem); break; } case KVM_GET_DIRTY_LOG: { -- Gitee From d0fde7d08b10ccdaf71ea8ce4df5c4c6056b52e6 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:51 -0700 Subject: [PATCH 09/54] KVM: Add KVM_EXIT_MEMORY_FAULT exit to report faults to userspace mainline inclusion from mainline-v6.8-rc1 commit 16f95f3b95caded251a0440051e44a2fbe9e5f55 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=16f95f3b95caded251a0440051e44a2fbe9e5f55 ------------------------------------- Hygon-SIG: commit 16f95f3b95ca KVM: Add KVM_EXIT_MEMORY_FAULT exit to report faults to userspace Backport the guest-memory feature. ------------------------------------- Add a new KVM exit type to allow userspace to handle memory faults that KVM cannot resolve, but that userspace *may* be able to handle (without terminating the guest). KVM will initially use KVM_EXIT_MEMORY_FAULT to report implicit conversions between private and shared memory. With guest private memory, there will be two kind of memory conversions: - explicit conversion: happens when the guest explicitly calls into KVM to map a range (as private or shared) - implicit conversion: happens when the guest attempts to access a gfn that is configured in the "wrong" state (private vs. shared) On x86 (first architecture to support guest private memory), explicit conversions will be reported via KVM_EXIT_HYPERCALL+KVM_HC_MAP_GPA_RANGE, but reporting KVM_EXIT_HYPERCALL for implicit conversions is undesriable as there is (obviously) no hypercall, and there is no guarantee that the guest actually intends to convert between private and shared, i.e. what KVM thinks is an implicit conversion "request" could actually be the result of a guest code bug. KVM_EXIT_MEMORY_FAULT will be used to report memory faults that appear to be implicit conversions. Note! To allow for future possibilities where KVM reports KVM_EXIT_MEMORY_FAULT and fills run->memory_fault on _any_ unresolved fault, KVM returns "-EFAULT" (-1 with errno == EFAULT from userspace's perspective), not '0'! Due to historical baggage within KVM, exiting to userspace with '0' from deep callstacks, e.g. in emulation paths, is infeasible as doing so would require a near-complete overhaul of KVM, whereas KVM already propagates -errno return codes to userspace even when the -errno originated in a low level helper. Report the gpa+size instead of a single gfn even though the initial usage is expected to always report single pages. It's entirely possible, likely even, that KVM will someday support sub-page granularity faults, e.g. Intel's sub-page protection feature allows for additional protections at 128-byte granularity. Link: https://lore.kernel.org/all/20230908222905.1321305-5-amoorthy@google.com Link: https://lore.kernel.org/all/ZQ3AmLO2SYv3DszH@google.com Cc: Anish Moorthy Cc: David Matlack Suggested-by: Sean Christopherson Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Message-Id: <20231027182217.3615211-10-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Li Zhiquan --- Documentation/virt/kvm/api.rst | 41 ++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + include/linux/kvm_host.h | 11 +++++++++ include/uapi/linux/kvm.h | 8 +++++++ 4 files changed, 61 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index ab35409027bf..64a569231927 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6878,6 +6878,26 @@ array field represents return values. The userspace should update the return values of SBI call before resuming the VCPU. For more details on RISC-V SBI spec refer, https://github.com/riscv/riscv-sbi-doc. +:: + + /* KVM_EXIT_MEMORY_FAULT */ + struct { + __u64 flags; + __u64 gpa; + __u64 size; + } memory_fault; + +KVM_EXIT_MEMORY_FAULT indicates the vCPU has encountered a memory fault that +could not be resolved by KVM. The 'gpa' and 'size' (in bytes) describe the +guest physical address range [gpa, gpa + size) of the fault. The 'flags' field +describes properties of the faulting access that are likely pertinent. +Currently, no flags are defined. + +Note! KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that it +accompanies a return code of '-1', not '0'! errno will always be set to EFAULT +or EHWPOISON when KVM exits with KVM_EXIT_MEMORY_FAULT, userspace should assume +kvm_run.exit_reason is stale/undefined for all other error numbers. + :: /* KVM_EXIT_NOTIFY */ @@ -7912,6 +7932,27 @@ This capability is aimed to mitigate the threat that malicious VMs can cause CPU stuck (due to event windows don't open up) and make the CPU unavailable to host or other VMs. +7.34 KVM_CAP_MEMORY_FAULT_INFO +------------------------------ + +:Architectures: x86 +:Returns: Informational only, -EINVAL on direct KVM_ENABLE_CAP. + +The presence of this capability indicates that KVM_RUN will fill +kvm_run.memory_fault if KVM cannot resolve a guest page fault VM-Exit, e.g. if +there is a valid memslot but no backing VMA for the corresponding host virtual +address. + +The information in kvm_run.memory_fault is valid if and only if KVM_RUN returns +an error with errno=EFAULT or errno=EHWPOISON *and* kvm_run.exit_reason is set +to KVM_EXIT_MEMORY_FAULT. + +Note: Userspaces which attempt to resolve memory faults so that they can retry +KVM_RUN are encouraged to guard against repeatedly receiving the same +error/annotated fault. + +See KVM_EXIT_MEMORY_FAULT for more information. + 7.37 KVM_CAP_ARM_WRITABLE_IMP_ID_REGS ------------------------------------- diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c23bbe72c101..34f793860608 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4613,6 +4613,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ENABLE_CAP: case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: case KVM_CAP_IRQFD_RESAMPLE: + case KVM_CAP_MEMORY_FAULT_INFO: r = 1; break; case KVM_CAP_EXIT_HYPERCALL: diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 150ddb90b763..ac647e5bd069 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2497,4 +2497,15 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) /* Max number of entries allowed for each kvm dirty ring */ #define KVM_DIRTY_RING_MAX_ENTRIES 65536 +static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + gpa_t gpa, gpa_t size) +{ + vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; + vcpu->run->memory_fault.gpa = gpa; + vcpu->run->memory_fault.size = size; + + /* Flags are not (yet) defined or communicated to userspace. */ + vcpu->run->memory_fault.flags = 0; +} + #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cbc8f3244831..a106942debcb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -276,6 +276,7 @@ struct kvm_xen_exit { #define KVM_EXIT_RISCV_CSR 36 #define KVM_EXIT_NOTIFY 37 #define KVM_EXIT_LOONGARCH_IOCSR 38 +#define KVM_EXIT_MEMORY_FAULT 39 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -529,6 +530,12 @@ struct kvm_run { #define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) __u32 flags; } notify; + /* KVM_EXIT_MEMORY_FAULT */ + struct { + __u64 flags; + __u64 gpa; + __u64 size; + } memory_fault; /* Fix the size of the union. */ char padding[256]; }; @@ -1230,6 +1237,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 #define KVM_CAP_USER_MEMORY2 231 +#define KVM_CAP_MEMORY_FAULT_INFO 232 #define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 -- Gitee From 8d9b17a1500ff7f1b6866c53f225cbf33c2b6827 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:52 -0700 Subject: [PATCH 10/54] KVM: Add a dedicated mmu_notifier flag for reclaiming freed memory mainline inclusion from mainline-v6.8-rc1 commit cec29eef0a815386d520d61c2cbe16d537931639 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cec29eef0a815386d520d61c2cbe16d537931639 ------------------------------------- Hygon-SIG: commit cec29eef0a81 KVM: Add a dedicated mmu_notifier flag for reclaiming freed memory Backport the guest-memory feature. ------------------------------------- Handle AMD SEV's kvm_arch_guest_memory_reclaimed() hook by having __kvm_handle_hva_range() return whether or not an overlapping memslot was found, i.e. mmu_lock was acquired. Using the .on_unlock() hook works, but kvm_arch_guest_memory_reclaimed() needs to run after dropping mmu_lock, which makes .on_lock() and .on_unlock() asymmetrical. Use a small struct to return the tuple of the notifier-specific return, plus whether or not overlap was found. Because the iteration helpers are __always_inlined, practically speaking, the struct will never actually be returned from a function call (not to mention the size of the struct will be two bytes in practice). Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-11-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- virt/kvm/kvm_main.c | 53 +++++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fe228f204690..2d77ceb6d7fb 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -574,6 +574,19 @@ struct kvm_mmu_notifier_range { bool may_block; }; +/* + * The inner-most helper returns a tuple containing the return value from the + * arch- and action-specific handler, plus a flag indicating whether or not at + * least one memslot was found, i.e. if the handler found guest memory. + * + * Note, most notifiers are averse to booleans, so even though KVM tracks the + * return from arch code as a bool, outer helpers will cast it to an int. :-( + */ +typedef struct kvm_mmu_notifier_return { + bool ret; + bool found_memslot; +} kvm_mn_ret_t; + /* * Use a dedicated stub instead of NULL to indicate that there is no callback * function/handler. The compiler technically can't guarantee that a real @@ -595,22 +608,25 @@ static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG; node; \ node = interval_tree_iter_next(node, start, last)) \ -static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, - const struct kvm_mmu_notifier_range *range) +static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, + const struct kvm_mmu_notifier_range *range) { - bool ret = false, locked = false; + struct kvm_mmu_notifier_return r = { + .ret = false, + .found_memslot = false, + }; struct kvm_gfn_range gfn_range; struct kvm_memory_slot *slot; struct kvm_memslots *slots; int i, idx; if (WARN_ON_ONCE(range->end <= range->start)) - return 0; + return r; /* A null handler is allowed if and only if on_lock() is provided. */ if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && IS_KVM_NULL_FN(range->handler))) - return 0; + return r; idx = srcu_read_lock(&kvm->srcu); @@ -649,8 +665,8 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot); gfn_range.slot = slot; - if (!locked) { - locked = true; + if (!r.found_memslot) { + r.found_memslot = true; KVM_MMU_LOCK(kvm); if (!IS_KVM_NULL_FN(range->on_lock)) range->on_lock(kvm); @@ -658,14 +674,14 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (IS_KVM_NULL_FN(range->handler)) break; } - ret |= range->handler(kvm, &gfn_range); + r.ret |= range->handler(kvm, &gfn_range); } } - if (range->flush_on_ret && ret) + if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); - if (locked) { + if (r.found_memslot) { KVM_MMU_UNLOCK(kvm); if (!IS_KVM_NULL_FN(range->on_unlock)) range->on_unlock(kvm); @@ -673,8 +689,7 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, srcu_read_unlock(&kvm->srcu, idx); - /* The notifiers are averse to booleans. :-( */ - return (int)ret; + return r; } static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, @@ -695,7 +710,7 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .may_block = false, }; - return __kvm_handle_hva_range(kvm, &range); + return __kvm_handle_hva_range(kvm, &range).ret; } static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn, @@ -714,7 +729,7 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn .may_block = false, }; - return __kvm_handle_hva_range(kvm, &range); + return __kvm_handle_hva_range(kvm, &range).ret; } static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) @@ -816,7 +831,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, .end = range->end, .handler = kvm_mmu_unmap_gfn_range, .on_lock = kvm_mmu_invalidate_begin, - .on_unlock = kvm_arch_guest_memory_reclaimed, + .on_unlock = (void *)kvm_null_fn, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; @@ -848,7 +863,13 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end, hva_range.may_block); - __kvm_handle_hva_range(kvm, &hva_range); + /* + * If one or more memslots were found and thus zapped, notify arch code + * that guest memory has been reclaimed. This needs to be done *after* + * dropping mmu_lock, as x86's reclaim path is slooooow. + */ + if (__kvm_handle_hva_range(kvm, &hva_range).found_memslot) + kvm_arch_guest_memory_reclaimed(kvm); return 0; } -- Gitee From 3e8046d0af100478049188dd418076f54ad90029 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:53 -0700 Subject: [PATCH 11/54] KVM: Drop .on_unlock() mmu_notifier hook mainline inclusion from mainline-v6.8-rc1 commit 193bbfaacc84f9ee9c281ec0a8dd2ec8e4821e57 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=193bbfaacc84f9ee9c281ec0a8dd2ec8e4821e57 ------------------------------------- Hygon-SIG: commit 193bbfaacc84 KVM: Drop .on_unlock() mmu_notifier hook Backport the guest-memory feature. ------------------------------------- Drop the .on_unlock() mmu_notifer hook now that it's no longer used for notifying arch code that memory has been reclaimed. Adding .on_unlock() and invoking it *after* dropping mmu_lock was a terrible idea, as doing so resulted in .on_lock() and .on_unlock() having divergent and asymmetric behavior, and set future developers up for failure, i.e. all but asked for bugs where KVM relied on using .on_unlock() to try to run a callback while holding mmu_lock. Opportunistically add a lockdep assertion in kvm_mmu_invalidate_end() to guard against future bugs of this nature. Reported-by: Isaku Yamahata Link: https://lore.kernel.org/all/20230802203119.GB2021422@ls.amr.corp.intel.com Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-12-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- virt/kvm/kvm_main.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d77ceb6d7fb..66789189e5f7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -557,7 +557,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm); -typedef void (*on_unlock_fn_t)(struct kvm *kvm); struct kvm_mmu_notifier_range { /* @@ -569,7 +568,6 @@ struct kvm_mmu_notifier_range { union kvm_mmu_notifier_arg arg; gfn_handler_t handler; on_lock_fn_t on_lock; - on_unlock_fn_t on_unlock; bool flush_on_ret; bool may_block; }; @@ -681,11 +679,8 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && r.ret) kvm_flush_remote_tlbs(kvm); - if (r.found_memslot) { + if (r.found_memslot) KVM_MMU_UNLOCK(kvm); - if (!IS_KVM_NULL_FN(range->on_unlock)) - range->on_unlock(kvm); - } srcu_read_unlock(&kvm->srcu, idx); @@ -705,7 +700,6 @@ static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn, .arg = arg, .handler = handler, .on_lock = (void *)kvm_null_fn, - .on_unlock = (void *)kvm_null_fn, .flush_on_ret = true, .may_block = false, }; @@ -724,7 +718,6 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn .end = end, .handler = handler, .on_lock = (void *)kvm_null_fn, - .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = false, }; @@ -831,7 +824,6 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, .end = range->end, .handler = kvm_mmu_unmap_gfn_range, .on_lock = kvm_mmu_invalidate_begin, - .on_unlock = (void *)kvm_null_fn, .flush_on_ret = true, .may_block = mmu_notifier_range_blockable(range), }; @@ -909,7 +901,6 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .end = range->end, .handler = (void *)kvm_null_fn, .on_lock = kvm_mmu_invalidate_end, - .on_unlock = (void *)kvm_null_fn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; -- Gitee From 4a89433a189917617368b9fa3199c56b28ddf490 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:21:55 -0700 Subject: [PATCH 12/54] KVM: Introduce per-page memory attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.8-rc1 commit 5a475554db1e476a14216e742ea2bdb77362d5d5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5a475554db1e476a14216e742ea2bdb77362d5d5 ------------------------------------- Hygon-SIG: commit 5a475554db1e KVM: Introduce per-page memory attributes Backport the guest-memory feature. ------------------------------------- In confidential computing usages, whether a page is private or shared is necessary information for KVM to perform operations like page fault handling, page zapping etc. There are other potential use cases for per-page memory attributes, e.g. to make memory read-only (or no-exec, or exec-only, etc.) without having to modify memslots. Introduce the KVM_SET_MEMORY_ATTRIBUTES ioctl, advertised by KVM_CAP_MEMORY_ATTRIBUTES, to allow userspace to set the per-page memory attributes to a guest memory range. Use an xarray to store the per-page attributes internally, with a naive, not fully optimized implementation, i.e. prioritize correctness over performance for the initial implementation. Use bit 3 for the PRIVATE attribute so that KVM can use bits 0-2 for RWX attributes/protections in the future, e.g. to give userspace fine-grained control over read, write, and execute protections for guest memory. Provide arch hooks for handling attribute changes before and after common code sets the new attributes, e.g. x86 will use the "pre" hook to zap all relevant mappings, and the "post" hook to track whether or not hugepages can be used to map the range. To simplify the implementation wrap the entire sequence with kvm_mmu_invalidate_{begin,end}() even though the operation isn't strictly guaranteed to be an invalidation. For the initial use case, x86 *will* always invalidate memory, and preventing arch code from creating new mappings while the attributes are in flux makes it much easier to reason about the correctness of consuming attributes. It's possible that future usages may not require an invalidation, e.g. if KVM ends up supporting RWX protections and userspace grants _more_ protections, but again opt for simplicity and punt optimizations to if/when they are needed. Suggested-by: Sean Christopherson Link: https://lore.kernel.org/all/Y2WB48kD0J4VGynX@google.com Cc: Fuad Tabba Cc: Xu Yilun Cc: Mickaël Salaün Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-14-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Li Zhiquan --- Documentation/virt/kvm/api.rst | 36 ++++++ include/linux/kvm_host.h | 19 +++ include/uapi/linux/kvm.h | 13 ++ virt/kvm/Kconfig | 4 + virt/kvm/kvm_main.c | 216 +++++++++++++++++++++++++++++++++ 5 files changed, 288 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 64a569231927..95551c317a49 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6222,6 +6222,42 @@ superset of the features supported by the system. See KVM_SET_USER_MEMORY_REGION. +4.141 KVM_SET_MEMORY_ATTRIBUTES +------------------------------- + +:Capability: KVM_CAP_MEMORY_ATTRIBUTES +:Architectures: x86 +:Type: vm ioctl +:Parameters: struct kvm_memory_attributes (in) +:Returns: 0 on success, <0 on error + +KVM_SET_MEMORY_ATTRIBUTES allows userspace to set memory attributes for a range +of guest physical memory. + +:: + + struct kvm_memory_attributes { + __u64 address; + __u64 size; + __u64 attributes; + __u64 flags; + }; + + #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) + +The address and size must be page aligned. The supported attributes can be +retrieved via ioctl(KVM_CHECK_EXTENSION) on KVM_CAP_MEMORY_ATTRIBUTES. If +executed on a VM, KVM_CAP_MEMORY_ATTRIBUTES precisely returns the attributes +supported by that VM. If executed at system scope, KVM_CAP_MEMORY_ATTRIBUTES +returns all attributes supported by KVM. The only attribute defined at this +time is KVM_MEMORY_ATTRIBUTE_PRIVATE, which marks the associated gfn as being +guest private memory. + +Note, there is no "get" API. Userspace is responsible for explicitly tracking +the state of a gfn/page as needed. + +The "flags" field is reserved for future extensions and must be '0'. + 5. The kvm_run structure ======================== diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ac647e5bd069..7478098dfeb4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -257,6 +257,7 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER union kvm_mmu_notifier_arg { pte_t pte; + unsigned long attributes; }; enum kvm_gfn_range_filter { @@ -856,6 +857,10 @@ struct kvm { #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; +#endif +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + /* Protected by slots_locks (for writes) and RCU (for reads) */ + struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; #ifdef CONFIG_ARM64_HDBSS @@ -2508,4 +2513,18 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, vcpu->run->memory_fault.flags = 0; } +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn) +{ + return xa_to_value(xa_load(&kvm->mem_attr_array, gfn)); +} + +bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, + unsigned long attrs); +bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range); +bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range); +#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ + #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a106942debcb..58ba6df9c43d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1238,6 +1238,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 #define KVM_CAP_USER_MEMORY2 231 #define KVM_CAP_MEMORY_FAULT_INFO 232 +#define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 @@ -2384,6 +2385,18 @@ struct kvm_s390_zpci_op { /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) +/* Available with KVM_CAP_MEMORY_ATTRIBUTES */ +#define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes) + +struct kvm_memory_attributes { + __u64 address; + __u64 size; + __u64 attributes; + __u64 flags; +}; + +#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) + enum csv_cmd_id { /* HYGON CSV batch command */ KVM_CSV_COMMAND_BATCH = 0x18, diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 4043cd3ce147..7376591d0ef8 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -97,5 +97,9 @@ config KVM_GENERIC_MMU_NOTIFIER select MMU_NOTIFIER bool +config KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_GENERIC_MMU_NOTIFIER + bool + config HAVE_KVM_PINNED_VMID bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 66789189e5f7..9f0e2379201d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1229,6 +1229,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) spin_lock_init(&kvm->mn_invalidate_lock); rcuwait_init(&kvm->mn_memslots_update_rcuwait); xa_init(&kvm->vcpu_array); +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + xa_init(&kvm->mem_attr_array); +#endif INIT_LIST_HEAD(&kvm->gpc_list); spin_lock_init(&kvm->gpc_lock); @@ -1423,6 +1426,9 @@ static void kvm_destroy_vm(struct kvm *kvm) } cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + xa_destroy(&kvm->mem_attr_array); +#endif kvm_arch_free_vm(kvm); preempt_notifier_dec(); hardware_disable_all(); @@ -2430,6 +2436,200 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, } #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +/* + * Returns true if _all_ gfns in the range [@start, @end) have attributes + * matching @attrs. + */ +bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, + unsigned long attrs) +{ + XA_STATE(xas, &kvm->mem_attr_array, start); + unsigned long index; + bool has_attrs; + void *entry; + + rcu_read_lock(); + + if (!attrs) { + has_attrs = !xas_find(&xas, end - 1); + goto out; + } + + has_attrs = true; + for (index = start; index < end; index++) { + do { + entry = xas_next(&xas); + } while (xas_retry(&xas, entry)); + + if (xas.xa_index != index || xa_to_value(entry) != attrs) { + has_attrs = false; + break; + } + } + +out: + rcu_read_unlock(); + return has_attrs; +} + +static u64 kvm_supported_mem_attributes(struct kvm *kvm) +{ + if (!kvm) + return KVM_MEMORY_ATTRIBUTE_PRIVATE; + + return 0; +} + +static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, + struct kvm_mmu_notifier_range *range) +{ + struct kvm_gfn_range gfn_range; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + struct kvm_memslot_iter iter; + bool found_memslot = false; + bool ret = false; + int i; + + gfn_range.arg = range->arg; + gfn_range.may_block = range->may_block; + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + + kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) { + slot = iter.slot; + gfn_range.slot = slot; + + gfn_range.start = max(range->start, slot->base_gfn); + gfn_range.end = min(range->end, slot->base_gfn + slot->npages); + if (gfn_range.start >= gfn_range.end) + continue; + + if (!found_memslot) { + found_memslot = true; + KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm); + } + + ret |= range->handler(kvm, &gfn_range); + } + } + + if (range->flush_on_ret && ret) + kvm_flush_remote_tlbs(kvm); + + if (found_memslot) + KVM_MMU_UNLOCK(kvm); +} + +static bool kvm_pre_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + /* + * Unconditionally add the range to the invalidation set, regardless of + * whether or not the arch callback actually needs to zap SPTEs. E.g. + * if KVM supports RWX attributes in the future and the attributes are + * going from R=>RW, zapping isn't strictly necessary. Unconditionally + * adding the range allows KVM to require that MMU invalidations add at + * least one range between begin() and end(), e.g. allows KVM to detect + * bugs where the add() is missed. Relaxing the rule *might* be safe, + * but it's not obvious that allowing new mappings while the attributes + * are in flux is desirable or worth the complexity. + */ + kvm_mmu_invalidate_range_add(kvm, range->start, range->end); + + return kvm_arch_pre_set_memory_attributes(kvm, range); +} + +/* Set @attributes for the gfn range [@start, @end). */ +static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, + unsigned long attributes) +{ + struct kvm_mmu_notifier_range pre_set_range = { + .start = start, + .end = end, + .handler = kvm_pre_set_memory_attributes, + .on_lock = kvm_mmu_invalidate_begin, + .flush_on_ret = true, + .may_block = true, + }; + struct kvm_mmu_notifier_range post_set_range = { + .start = start, + .end = end, + .arg.attributes = attributes, + .handler = kvm_arch_post_set_memory_attributes, + .on_lock = kvm_mmu_invalidate_end, + .may_block = true, + }; + unsigned long i; + void *entry; + int r = 0; + + entry = attributes ? xa_mk_value(attributes) : NULL; + + mutex_lock(&kvm->slots_lock); + + /* Nothing to do if the entire range as the desired attributes. */ + if (kvm_range_has_memory_attributes(kvm, start, end, attributes)) + goto out_unlock; + + /* + * Reserve memory ahead of time to avoid having to deal with failures + * partway through setting the new attributes. + */ + for (i = start; i < end; i++) { + r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT); + if (r) + goto out_unlock; + } + + kvm_handle_gfn_range(kvm, &pre_set_range); + + for (i = start; i < end; i++) { + r = xa_err(xa_store(&kvm->mem_attr_array, i, entry, + GFP_KERNEL_ACCOUNT)); + KVM_BUG_ON(r, kvm); + } + + kvm_handle_gfn_range(kvm, &post_set_range); + +out_unlock: + mutex_unlock(&kvm->slots_lock); + + return r; +} +static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm, + struct kvm_memory_attributes *attrs) +{ + gfn_t start, end; + + /* flags is currently not used. */ + if (attrs->flags) + return -EINVAL; + if (attrs->attributes & ~kvm_supported_mem_attributes(kvm)) + return -EINVAL; + if (attrs->size == 0 || attrs->address + attrs->size < attrs->address) + return -EINVAL; + if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size)) + return -EINVAL; + + start = attrs->address >> PAGE_SHIFT; + end = (attrs->address + attrs->size) >> PAGE_SHIFT; + + /* + * xarray tracks data using "unsigned long", and as a result so does + * KVM. For simplicity, supports generic attributes only on 64-bit + * architectures. + */ + BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long)); + + return kvm_vm_set_mem_attributes(kvm, start, end, attrs->attributes); +} +#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ + struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) { return __gfn_to_memslot(kvm_memslots(kvm), gfn); @@ -4715,6 +4915,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_BINARY_STATS_FD: case KVM_CAP_SYSTEM_EVENT_DATA: return 1; +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + case KVM_CAP_MEMORY_ATTRIBUTES: + return kvm_supported_mem_attributes(kvm); +#endif default: break; } @@ -5108,6 +5312,18 @@ static long kvm_vm_ioctl(struct file *filp, break; } #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + case KVM_SET_MEMORY_ATTRIBUTES: { + struct kvm_memory_attributes attrs; + + r = -EFAULT; + if (copy_from_user(&attrs, argp, sizeof(attrs))) + goto out; + + r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs); + break; + } +#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ case KVM_CREATE_DEVICE: { struct kvm_create_device cd; -- Gitee From be44c477761ab023632aa776a8b09e406c06225f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:21:56 -0700 Subject: [PATCH 13/54] mm: Add AS_UNMOVABLE to mark mapping as completely unmovable mainline inclusion from mainline-v6.8-rc1 commit 0003e2a414687fff6a75250d381e4abf345d663f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=0003e2a414687fff6a75250d381e4abf345d663f ------------------------------------- Hygon-SIG: commit 0003e2a41468 mm: Add AS_UNMOVABLE to mark mapping as completely unmovable Backport the guest-memory feature. ------------------------------------- Add an "unmovable" flag for mappings that cannot be migrated under any circumstance. KVM will use the flag for its upcoming GUEST_MEMFD support, which will not support compaction/migration, at least not in the foreseeable future. Test AS_UNMOVABLE under folio lock as already done for the async compaction/dirty folio case, as the mapping can be removed by truncation while compaction is running. To avoid having to lock every folio with a mapping, assume/require that unmovable mappings are also unevictable, and have mapping_set_unmovable() also set AS_UNEVICTABLE. Cc: Matthew Wilcox Co-developed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-15-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log and resolve the conflict. When backport commit 84429b675bcf ("fs: Allow fine-grained control of folio sizes") by commit e6ebbdef6b5d, below dependencies commits were skipped: commit 0003e2a41468 ("mm: Add AS_UNMOVABLE to mark mapping as completely unmovable") <-- this commit commit c72ceafbd12c ("mm: Introduce AS_INACCESSIBLE for encrypted/confidential memory") commit 27e6a24a4cf3 ("mm, virt: merge AS_UNMOVABLE and AS_INACCESSIBLE") So directly using the final flag - AS_INACCESSIBLE instead of old AS_UNMOVABLE. ] Signed-off-by: Li Zhiquan --- include/linux/pagemap.h | 16 +++++++++++++++ mm/compaction.c | 43 +++++++++++++++++++++++++++++------------ mm/migrate.c | 2 ++ 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 624712273947..c4364038c7b7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -301,6 +301,22 @@ static inline void mapping_clear_release_always(struct address_space *mapping) clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); } +static inline void mapping_set_unmovable(struct address_space *mapping) +{ + /* + * It's expected unmovable mappings are also unevictable. Compaction + * migrate scanner (isolate_migratepages_block()) relies on this to + * reduce page locking. + */ + set_bit(AS_UNEVICTABLE, &mapping->flags); + set_bit(AS_INACCESSIBLE, &mapping->flags); +} + +static inline bool mapping_unmovable(struct address_space *mapping) +{ + return test_bit(AS_INACCESSIBLE, &mapping->flags); +} + static inline bool mapping_stable_writes(const struct address_space *mapping) { return test_bit(AS_STABLE_WRITES, &mapping->flags); diff --git a/mm/compaction.c b/mm/compaction.c index 06fa615dec32..b65b96e9006b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -907,6 +907,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { + bool is_dirty, is_unevictable; if (skip_on_failure && low_pfn >= next_skip_pfn) { /* @@ -1122,8 +1123,10 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!folio_test_lru(folio)) goto isolate_fail_put; + is_unevictable = folio_test_unevictable(folio); + /* Compaction might skip unevictable pages but CMA takes them */ - if (!(mode & ISOLATE_UNEVICTABLE) && folio_test_unevictable(folio)) + if (!(mode & ISOLATE_UNEVICTABLE) && is_unevictable) goto isolate_fail_put; /* @@ -1135,26 +1138,42 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_writeback(folio)) goto isolate_fail_put; - if ((mode & ISOLATE_ASYNC_MIGRATE) && folio_test_dirty(folio)) { - bool migrate_dirty; + is_dirty = folio_test_dirty(folio); + + if (((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) || + (mapping && is_unevictable)) { + bool migrate_dirty = true; + bool is_unmovable; /* * Only folios without mappings or that have - * a ->migrate_folio callback are possible to - * migrate without blocking. However, we may - * be racing with truncation, which can free - * the mapping. Truncation holds the folio lock - * until after the folio is removed from the page - * cache so holding it ourselves is sufficient. + * a ->migrate_folio callback are possible to migrate + * without blocking. + * + * Folios from unmovable mappings are not migratable. + * + * However, we can be racing with truncation, which can + * free the mapping that we need to check. Truncation + * holds the folio lock until after the folio is removed + * from the page so holding it ourselves is sufficient. + * + * To avoid locking the folio just to check unmovable, + * assume every unmovable folio is also unevictable, + * which is a cheaper test. If our assumption goes + * wrong, it's not a correctness bug, just potentially + * wasted cycles. */ if (!folio_trylock(folio)) goto isolate_fail_put; mapping = folio_mapping(folio); - migrate_dirty = !mapping || - mapping->a_ops->migrate_folio; + if ((mode & ISOLATE_ASYNC_MIGRATE) && is_dirty) { + migrate_dirty = !mapping || + mapping->a_ops->migrate_folio; + } + is_unmovable = mapping && mapping_unmovable(mapping); folio_unlock(folio); - if (!migrate_dirty) + if (!migrate_dirty || is_unmovable) goto isolate_fail_put; } diff --git a/mm/migrate.c b/mm/migrate.c index 4edd29d9a041..30277ae64533 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1013,6 +1013,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, if (!mapping) rc = migrate_folio(mapping, dst, src, mode); + else if (mapping_unmovable(mapping)) + rc = -EOPNOTSUPP; else if (mapping->a_ops->migrate_folio) /* * Most folios have a mapping and most filesystems -- Gitee From f2daa71e414d42ebdf0e971e31cc9b5589995a48 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 3 Nov 2023 06:47:51 -0400 Subject: [PATCH 14/54] fs: Rename anon_inode_getfile_secure() and anon_inode_getfd_secure() mainline inclusion from mainline-v6.8-rc1 commit 4f0b9194bc119a9850a99e5e824808e2f468c348 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4f0b9194bc119a9850a99e5e824808e2f468c348 ------------------------------------- Hygon-SIG: commit 4f0b9194bc11 fs: Rename anon_inode_getfile_secure() and anon_inode_getfd_secure() Backport the guest-memory feature. ------------------------------------- The call to the inode_init_security_anon() LSM hook is not the sole reason to use anon_inode_getfile_secure() or anon_inode_getfd_secure(). For example, the functions also allow one to create a file with non-zero size, without needing a full-blown filesystem. In this case, you don't need a "secure" version, just unique inodes; the current name of the functions is confusing and does not explain well the difference with the more "standard" anon_inode_getfile() and anon_inode_getfd(). Of course, there is another side of the coin; neither io_uring nor userfaultfd strictly speaking need distinct inodes, and it is not that clear anymore that anon_inode_create_get{file,fd}() allow the LSM to intercept and block the inode's creation. If one was so inclined, anon_inode_getfile_secure() and anon_inode_getfd_secure() could be kept, using the shared inode or a new one depending on CONFIG_SECURITY. However, this is probably overkill, and potentially a cause of bugs in different configurations. Therefore, just add a comment to io_uring and userfaultfd explaining the choice of the function. While at it, remove the export for what is now anon_inode_create_getfd(). There is no in-tree module that uses it, and the old name is gone anyway. If anybody actually needs the symbol, they can ask or they can just use anon_inode_create_getfile(), which will be exported very soon for use in KVM. Suggested-by: Christian Brauner Reviewed-by: Christian Brauner Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Li Zhiquan --- fs/anon_inodes.c | 50 ++++++++++++++++++++++++------------- fs/userfaultfd.c | 5 ++-- include/linux/anon_inodes.h | 4 +-- io_uring/io_uring.c | 3 ++- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 24192a7667ed..42b02dc36474 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -79,7 +79,7 @@ static struct file *__anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, - bool secure) + bool make_inode) { struct inode *inode; struct file *file; @@ -87,7 +87,7 @@ static struct file *__anon_inode_getfile(const char *name, if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); - if (secure) { + if (make_inode) { inode = anon_inode_make_secure_inode(name, context_inode); if (IS_ERR(inode)) { file = ERR_CAST(inode); @@ -149,13 +149,10 @@ struct file *anon_inode_getfile(const char *name, EXPORT_SYMBOL_GPL(anon_inode_getfile); /** - * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new + * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new * !S_PRIVATE anon inode rather than reuse the * singleton anon inode and calls the - * inode_init_security_anon() LSM hook. This - * allows for both the inode to have its own - * security context and for the LSM to enforce - * policy on the inode's creation. + * inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -164,11 +161,21 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile); * @context_inode: * [in] the logical relationship with the new inode (optional) * + * Create a new anonymous inode and file pair. This can be done for two + * reasons: + * + * - for the inode to have its own security context, so that LSMs can enforce + * policy on the inode's creation; + * + * - if the caller needs a unique inode, for example in order to customize + * the size returned by fstat() + * * The LSM may use @context_inode in inode_init_security_anon(), but a - * reference to it is not held. Returns the newly created file* or an error - * pointer. See the anon_inode_getfile() documentation for more information. + * reference to it is not held. + * + * Returns the newly created file* or an error pointer. */ -struct file *anon_inode_getfile_secure(const char *name, +struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) @@ -181,7 +188,7 @@ static int __anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode, - bool secure) + bool make_inode) { int error, fd; struct file *file; @@ -192,7 +199,7 @@ static int __anon_inode_getfd(const char *name, fd = error; file = __anon_inode_getfile(name, fops, priv, flags, context_inode, - secure); + make_inode); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_put_unused_fd; @@ -231,10 +238,9 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, EXPORT_SYMBOL_GPL(anon_inode_getfd); /** - * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new + * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls - * the inode_init_security_anon() LSM hook. This allows the inode to have its - * own security context and for a LSM to reject creation of the inode. + * the inode_init_security_anon() LSM hook. * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -243,16 +249,26 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd); * @context_inode: * [in] the logical relationship with the new inode (optional) * + * Create a new anonymous inode and file pair. This can be done for two + * reasons: + * + * - for the inode to have its own security context, so that LSMs can enforce + * policy on the inode's creation; + * + * - if the caller needs a unique inode, for example in order to customize + * the size returned by fstat() + * * The LSM may use @context_inode in inode_init_security_anon(), but a * reference to it is not held. + * + * Returns a newly created file descriptor or an error code. */ -int anon_inode_getfd_secure(const char *name, const struct file_operations *fops, +int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode) { return __anon_inode_getfd(name, fops, priv, flags, context_inode, true); } -EXPORT_SYMBOL_GPL(anon_inode_getfd_secure); static int __init anon_inode_init(void) { diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index d616b7777eef..eb46803cd512 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1083,7 +1083,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new, { int fd; - fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new, + fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new, O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; @@ -2280,7 +2280,8 @@ static int new_userfaultfd(int flags) /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx, + /* Create a new inode so that the LSM can block the creation. */ + fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx, O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); if (fd < 0) { mmdrop(ctx->mm); diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h index 5deaddbd7927..93a5f16d03f3 100644 --- a/include/linux/anon_inodes.h +++ b/include/linux/anon_inodes.h @@ -15,13 +15,13 @@ struct inode; struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags); -struct file *anon_inode_getfile_secure(const char *name, +struct file *anon_inode_create_getfile(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode); int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags); -int anon_inode_getfd_secure(const char *name, +int anon_inode_create_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, const struct inode *context_inode); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8b15e9dc340f..83e9998955cd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3961,7 +3961,8 @@ static int io_uring_install_fd(struct file *file) */ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { - return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, + /* Create a new inode so that the LSM can block the creation. */ + return anon_inode_create_getfile("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC, NULL); } -- Gitee From 48a1e3d282bb1adc16e07beca469f0d1f05d3678 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 13 Nov 2023 05:42:34 -0500 Subject: [PATCH 15/54] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.8-rc1 commit a7800aa80ea4d5356b8474c2302812e9d4926fa6 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a7800aa80ea4d5356b8474c2302812e9d4926fa6 ------------------------------------- Hygon-SIG: commit a7800aa80ea4 KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory Backport the guest-memory feature. ------------------------------------- Introduce an ioctl(), KVM_CREATE_GUEST_MEMFD, to allow creating file-based memory that is tied to a specific KVM virtual machine and whose primary purpose is to serve guest memory. A guest-first memory subsystem allows for optimizations and enhancements that are kludgy or outright infeasible to implement/support in a generic memory subsystem. With guest_memfd, guest protections and mapping sizes are fully decoupled from host userspace mappings. E.g. KVM currently doesn't support mapping memory as writable in the guest without it also being writable in host userspace, as KVM's ABI uses VMA protections to define the allow guest protection. Userspace can fudge this by establishing two mappings, a writable mapping for the guest and readable one for itself, but that’s suboptimal on multiple fronts. Similarly, KVM currently requires the guest mapping size to be a strict subset of the host userspace mapping size, e.g. KVM doesn’t support creating a 1GiB guest mapping unless userspace also has a 1GiB guest mapping. Decoupling the mappings sizes would allow userspace to precisely map only what is needed without impacting guest performance, e.g. to harden against unintentional accesses to guest memory. Decoupling guest and userspace mappings may also allow for a cleaner alternative to high-granularity mappings for HugeTLB, which has reached a bit of an impasse and is unlikely to ever be merged. A guest-first memory subsystem also provides clearer line of sight to things like a dedicated memory pool (for slice-of-hardware VMs) and elimination of "struct page" (for offload setups where userspace _never_ needs to mmap() guest memory). More immediately, being able to map memory into KVM guests without mapping said memory into the host is critical for Confidential VMs (CoCo VMs), the initial use case for guest_memfd. While AMD's SEV and Intel's TDX prevent untrusted software from reading guest private data by encrypting guest memory with a key that isn't usable by the untrusted host, projects such as Protected KVM (pKVM) provide confidentiality and integrity *without* relying on memory encryption. And with SEV-SNP and TDX, accessing guest private memory can be fatal to the host, i.e. KVM must be prevent host userspace from accessing guest memory irrespective of hardware behavior. Attempt #1 to support CoCo VMs was to add a VMA flag to mark memory as being mappable only by KVM (or a similarly enlightened kernel subsystem). That approach was abandoned largely due to it needing to play games with PROT_NONE to prevent userspace from accessing guest memory. Attempt #2 to was to usurp PG_hwpoison to prevent the host from mapping guest private memory into userspace, but that approach failed to meet several requirements for software-based CoCo VMs, e.g. pKVM, as the kernel wouldn't easily be able to enforce a 1:1 page:guest association, let alone a 1:1 pfn:gfn mapping. And using PG_hwpoison does not work for memory that isn't backed by 'struct page', e.g. if devices gain support for exposing encrypted memory regions to guests. Attempt #3 was to extend the memfd() syscall and wrap shmem to provide dedicated file-based guest memory. That approach made it as far as v10 before feedback from Hugh Dickins and Christian Brauner (and others) led to it demise. Hugh's objection was that piggybacking shmem made no sense for KVM's use case as KVM didn't actually *want* the features provided by shmem. I.e. KVM was using memfd() and shmem to avoid having to manage memory directly, not because memfd() and shmem were the optimal solution, e.g. things like read/write/mmap in shmem were dead weight. Christian pointed out flaws with implementing a partial overlay (wrapping only _some_ of shmem), e.g. poking at inode_operations or super_operations would show shmem stuff, but address_space_operations and file_operations would show KVM's overlay. Paraphrashing heavily, Christian suggested KVM stop being lazy and create a proper API. Link: https://lore.kernel.org/all/20201020061859.18385-1-kirill.shutemov@linux.intel.com Link: https://lore.kernel.org/all/20210416154106.23721-1-kirill.shutemov@linux.intel.com Link: https://lore.kernel.org/all/20210824005248.200037-1-seanjc@google.com Link: https://lore.kernel.org/all/20211111141352.26311-1-chao.p.peng@linux.intel.com Link: https://lore.kernel.org/all/20221202061347.1070246-1-chao.p.peng@linux.intel.com Link: https://lore.kernel.org/all/ff5c5b97-acdf-9745-ebe5-c6609dd6322e@google.com Link: https://lore.kernel.org/all/20230418-anfallen-irdisch-6993a61be10b@brauner Link: https://lore.kernel.org/all/ZEM5Zq8oo+xnApW9@google.com Link: https://lore.kernel.org/linux-mm/20230306191944.GA15773@monkey Link: https://lore.kernel.org/linux-mm/ZII1p8ZHlHaQ3dDl@casper.infradead.org Cc: Fuad Tabba Cc: Vishal Annapurve Cc: Ackerley Tng Cc: Jarkko Sakkinen Cc: Maciej Szmigiero Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Quentin Perret Cc: Michael Roth Cc: Wang Cc: Liam Merwick Cc: Isaku Yamahata Co-developed-by: Kirill A. Shutemov Signed-off-by: Kirill A. Shutemov Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Co-developed-by: Chao Peng Signed-off-by: Chao Peng Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Co-developed-by: Isaku Yamahata Signed-off-by: Isaku Yamahata Co-developed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Co-developed-by: Michael Roth Signed-off-by: Michael Roth Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-17-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Li Zhiquan --- Documentation/virt/kvm/api.rst | 70 ++++- fs/anon_inodes.c | 1 + include/linux/kvm_host.h | 48 +++ include/uapi/linux/kvm.h | 15 +- virt/kvm/Kconfig | 4 + virt/kvm/Makefile.kvm | 1 + virt/kvm/guest_memfd.c | 538 +++++++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 59 +++- virt/kvm/kvm_mm.h | 26 ++ 9 files changed, 754 insertions(+), 8 deletions(-) create mode 100644 virt/kvm/guest_memfd.c diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 95551c317a49..eb0c49016768 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6209,6 +6209,15 @@ superset of the features supported by the system. :Parameters: struct kvm_userspace_memory_region2 (in) :Returns: 0 on success, -1 on error +KVM_SET_USER_MEMORY_REGION2 is an extension to KVM_SET_USER_MEMORY_REGION that +allows mapping guest_memfd memory into a guest. All fields shared with +KVM_SET_USER_MEMORY_REGION identically. Userspace can set KVM_MEM_GUEST_MEMFD +in flags to have KVM bind the memory region to a given guest_memfd range of +[guest_memfd_offset, guest_memfd_offset + memory_size]. The target guest_memfd +must point at a file created via KVM_CREATE_GUEST_MEMFD on the current VM, and +the target range must not be bound to any other memory region. All standard +bounds checks apply (use common sense). + :: struct kvm_userspace_memory_region2 { @@ -6217,10 +6226,24 @@ superset of the features supported by the system. __u64 guest_phys_addr; __u64 memory_size; /* bytes */ __u64 userspace_addr; /* start of the userspace allocated memory */ - __u64 pad[16]; + __u64 guest_memfd_offset; + __u32 guest_memfd; + __u32 pad1; + __u64 pad2[14]; }; -See KVM_SET_USER_MEMORY_REGION. +A KVM_MEM_GUEST_MEMFD region _must_ have a valid guest_memfd (private memory) and +userspace_addr (shared memory). However, "valid" for userspace_addr simply +means that the address itself must be a legal userspace address. The backing +mapping for userspace_addr is not required to be valid/populated at the time of +KVM_SET_USER_MEMORY_REGION2, e.g. shared memory can be lazily mapped/allocated +on-demand. + +When mapping a gfn into the guest, KVM selects shared vs. private, i.e consumes +userspace_addr vs. guest_memfd, based on the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE +state. At VM creation time, all memory is shared, i.e. the PRIVATE attribute +is '0' for all gfns. Userspace can control whether memory is shared/private by +toggling KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_SET_MEMORY_ATTRIBUTES as needed. 4.141 KVM_SET_MEMORY_ATTRIBUTES ------------------------------- @@ -6258,6 +6281,49 @@ the state of a gfn/page as needed. The "flags" field is reserved for future extensions and must be '0'. +4.142 KVM_CREATE_GUEST_MEMFD +---------------------------- + +:Capability: KVM_CAP_GUEST_MEMFD +:Architectures: none +:Type: vm ioctl +:Parameters: struct kvm_create_guest_memfd(in) +:Returns: 0 on success, <0 on error + +KVM_CREATE_GUEST_MEMFD creates an anonymous file and returns a file descriptor +that refers to it. guest_memfd files are roughly analogous to files created +via memfd_create(), e.g. guest_memfd files live in RAM, have volatile storage, +and are automatically released when the last reference is dropped. Unlike +"regular" memfd_create() files, guest_memfd files are bound to their owning +virtual machine (see below), cannot be mapped, read, or written by userspace, +and cannot be resized (guest_memfd files do however support PUNCH_HOLE). + +:: + + struct kvm_create_guest_memfd { + __u64 size; + __u64 flags; + __u64 reserved[6]; + }; + +Conceptually, the inode backing a guest_memfd file represents physical memory, +i.e. is coupled to the virtual machine as a thing, not to a "struct kvm". The +file itself, which is bound to a "struct kvm", is that instance's view of the +underlying memory, e.g. effectively provides the translation of guest addresses +to host memory. This allows for use cases where multiple KVM structures are +used to manage a single virtual machine, e.g. when performing intrahost +migration of a virtual machine. + +KVM currently only supports mapping guest_memfd via KVM_SET_USER_MEMORY_REGION2, +and more specifically via the guest_memfd and guest_memfd_offset fields in +"struct kvm_userspace_memory_region2", where guest_memfd_offset is the offset +into the guest_memfd instance. For a given guest_memfd file, there can be at +most one mapping per page, i.e. binding multiple memory regions to a single +guest_memfd range is not allowed (any number of memory regions can be bound to +a single guest_memfd file, but the bound ranges must not overlap). + +See KVM_SET_USER_MEMORY_REGION2 for additional details. + 5. The kvm_run structure ======================== diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 42b02dc36474..8dd436ee985b 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -183,6 +183,7 @@ struct file *anon_inode_create_getfile(const char *name, return __anon_inode_getfile(name, fops, priv, flags, context_inode, true); } +EXPORT_SYMBOL_GPL(anon_inode_create_getfile); static int __anon_inode_getfd(const char *name, const struct file_operations *fops, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7478098dfeb4..9ff3272e34c0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -610,8 +610,20 @@ struct kvm_memory_slot { u32 flags; short id; u16 as_id; + +#ifdef CONFIG_KVM_PRIVATE_MEM + struct { + struct file __rcu *file; + pgoff_t pgoff; + } gmem; +#endif }; +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot) +{ + return slot && (slot->flags & KVM_MEM_GUEST_MEMFD); +} + static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_LOG_DIRTY_PAGES; @@ -717,6 +729,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) } #endif +/* + * Arch code must define kvm_arch_has_private_mem if support for private memory + * is enabled. + */ +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) +static inline bool kvm_arch_has_private_mem(struct kvm *kvm) +{ + return false; +} +#endif + struct kvm_memslots { u64 generation; atomic_long_t last_used_slot; @@ -1484,6 +1507,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); void kvm_mmu_invalidate_begin(struct kvm *kvm); void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end); void kvm_mmu_invalidate_end(struct kvm *kvm); +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -2525,6 +2549,30 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range); + +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) +{ + return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) && + kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE; +} +#else +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) +{ + return false; +} #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */ +#ifdef CONFIG_KVM_PRIVATE_MEM +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order); +#else +static inline int kvm_gmem_get_pfn(struct kvm *kvm, + struct kvm_memory_slot *slot, gfn_t gfn, + kvm_pfn_t *pfn, int *max_order) +{ + KVM_BUG_ON(1, kvm); + return -EIO; +} +#endif /* CONFIG_KVM_PRIVATE_MEM */ + #endif diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 58ba6df9c43d..b75168459321 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -102,7 +102,10 @@ struct kvm_userspace_memory_region2 { __u64 guest_phys_addr; __u64 memory_size; __u64 userspace_addr; - __u64 pad[16]; + __u64 guest_memfd_offset; + __u32 guest_memfd; + __u32 pad1; + __u64 pad2[14]; }; /* @@ -112,6 +115,7 @@ struct kvm_userspace_memory_region2 { */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) +#define KVM_MEM_GUEST_MEMFD (1UL << 2) #define KVM_MEM_HUGE_POD (1UL << 9) /* for KVM_IRQ_LINE */ @@ -1239,6 +1243,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_USER_MEMORY2 231 #define KVM_CAP_MEMORY_FAULT_INFO 232 #define KVM_CAP_MEMORY_ATTRIBUTES 233 +#define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 @@ -2397,6 +2402,14 @@ struct kvm_memory_attributes { #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) +#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) + +struct kvm_create_guest_memfd { + __u64 size; + __u64 flags; + __u64 reserved[6]; +}; + enum csv_cmd_id { /* HYGON CSV batch command */ KVM_CSV_COMMAND_BATCH = 0x18, diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 7376591d0ef8..4c1edf4efb57 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -101,5 +101,9 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES select KVM_GENERIC_MMU_NOTIFIER bool +config KVM_PRIVATE_MEM + select XARRAY_MULTI + bool + config HAVE_KVM_PINNED_VMID bool diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm index 2c27d5d0c367..724c89af78af 100644 --- a/virt/kvm/Makefile.kvm +++ b/virt/kvm/Makefile.kvm @@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o +kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_memfd.o diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c new file mode 100644 index 000000000000..e65f4170425c --- /dev/null +++ b/virt/kvm/guest_memfd.c @@ -0,0 +1,538 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include "kvm_mm.h" + +struct kvm_gmem { + struct kvm *kvm; + struct xarray bindings; + struct list_head entry; +}; + +static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) +{ + struct folio *folio; + + /* TODO: Support huge pages. */ + folio = filemap_grab_folio(inode->i_mapping, index); + if (IS_ERR_OR_NULL(folio)) + return NULL; + + /* + * Use the up-to-date flag to track whether or not the memory has been + * zeroed before being handed off to the guest. There is no backing + * storage for the memory, so the folio will remain up-to-date until + * it's removed. + * + * TODO: Skip clearing pages when trusted firmware will do it when + * assigning memory to the guest. + */ + if (!folio_test_uptodate(folio)) { + unsigned long nr_pages = folio_nr_pages(folio); + unsigned long i; + + for (i = 0; i < nr_pages; i++) + clear_highpage(folio_page(folio, i)); + + folio_mark_uptodate(folio); + } + + /* + * Ignore accessed, referenced, and dirty flags. The memory is + * unevictable and there is no storage to write back to. + */ + return folio; +} + +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, + pgoff_t end) +{ + bool flush = false, found_memslot = false; + struct kvm_memory_slot *slot; + struct kvm *kvm = gmem->kvm; + unsigned long index; + + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { + pgoff_t pgoff = slot->gmem.pgoff; + + struct kvm_gfn_range gfn_range = { + .start = slot->base_gfn + max(pgoff, start) - pgoff, + .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff, + .slot = slot, + .may_block = true, + }; + + if (!found_memslot) { + found_memslot = true; + + KVM_MMU_LOCK(kvm); + kvm_mmu_invalidate_begin(kvm); + } + + flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range); + } + + if (flush) + kvm_flush_remote_tlbs(kvm); + + if (found_memslot) + KVM_MMU_UNLOCK(kvm); +} + +static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, + pgoff_t end) +{ + struct kvm *kvm = gmem->kvm; + + if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { + KVM_MMU_LOCK(kvm); + kvm_mmu_invalidate_end(kvm); + KVM_MMU_UNLOCK(kvm); + } +} + +static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) +{ + struct list_head *gmem_list = &inode->i_mapping->private_list; + pgoff_t start = offset >> PAGE_SHIFT; + pgoff_t end = (offset + len) >> PAGE_SHIFT; + struct kvm_gmem *gmem; + + /* + * Bindings must be stable across invalidation to ensure the start+end + * are balanced. + */ + filemap_invalidate_lock(inode->i_mapping); + + list_for_each_entry(gmem, gmem_list, entry) + kvm_gmem_invalidate_begin(gmem, start, end); + + truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1); + + list_for_each_entry(gmem, gmem_list, entry) + kvm_gmem_invalidate_end(gmem, start, end); + + filemap_invalidate_unlock(inode->i_mapping); + + return 0; +} + +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t start, index, end; + int r; + + /* Dedicated guest is immutable by default. */ + if (offset + len > i_size_read(inode)) + return -EINVAL; + + filemap_invalidate_lock_shared(mapping); + + start = offset >> PAGE_SHIFT; + end = (offset + len) >> PAGE_SHIFT; + + r = 0; + for (index = start; index < end; ) { + struct folio *folio; + + if (signal_pending(current)) { + r = -EINTR; + break; + } + + folio = kvm_gmem_get_folio(inode, index); + if (!folio) { + r = -ENOMEM; + break; + } + + index = folio_next_index(folio); + + folio_unlock(folio); + folio_put(folio); + + /* 64-bit only, wrapping the index should be impossible. */ + if (WARN_ON_ONCE(!index)) + break; + + cond_resched(); + } + + filemap_invalidate_unlock_shared(mapping); + + return r; +} + +static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, + loff_t len) +{ + int ret; + + if (!(mode & FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len)) + return -EINVAL; + + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = kvm_gmem_punch_hole(file_inode(file), offset, len); + else + ret = kvm_gmem_allocate(file_inode(file), offset, len); + + if (!ret) + file_modified(file); + return ret; +} + +static int kvm_gmem_release(struct inode *inode, struct file *file) +{ + struct kvm_gmem *gmem = file->private_data; + struct kvm_memory_slot *slot; + struct kvm *kvm = gmem->kvm; + unsigned long index; + + /* + * Prevent concurrent attempts to *unbind* a memslot. This is the last + * reference to the file and thus no new bindings can be created, but + * dereferencing the slot for existing bindings needs to be protected + * against memslot updates, specifically so that unbind doesn't race + * and free the memslot (kvm_gmem_get_file() will return NULL). + */ + mutex_lock(&kvm->slots_lock); + + filemap_invalidate_lock(inode->i_mapping); + + xa_for_each(&gmem->bindings, index, slot) + rcu_assign_pointer(slot->gmem.file, NULL); + + synchronize_rcu(); + + /* + * All in-flight operations are gone and new bindings can be created. + * Zap all SPTEs pointed at by this file. Do not free the backing + * memory, as its lifetime is associated with the inode, not the file. + */ + kvm_gmem_invalidate_begin(gmem, 0, -1ul); + kvm_gmem_invalidate_end(gmem, 0, -1ul); + + list_del(&gmem->entry); + + filemap_invalidate_unlock(inode->i_mapping); + + mutex_unlock(&kvm->slots_lock); + + xa_destroy(&gmem->bindings); + kfree(gmem); + + kvm_put_kvm(kvm); + + return 0; +} + +static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) +{ + struct file *file; + + rcu_read_lock(); + + file = rcu_dereference(slot->gmem.file); + if (file && !get_file_rcu(file)) + file = NULL; + + rcu_read_unlock(); + + return file; +} + +static struct file_operations kvm_gmem_fops = { + .open = generic_file_open, + .release = kvm_gmem_release, + .fallocate = kvm_gmem_fallocate, +}; + +void kvm_gmem_init(struct module *module) +{ + kvm_gmem_fops.owner = module; +} + +static int kvm_gmem_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + WARN_ON_ONCE(1); + return -EINVAL; +} + +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page) +{ + struct list_head *gmem_list = &mapping->private_list; + struct kvm_gmem *gmem; + pgoff_t start, end; + + filemap_invalidate_lock_shared(mapping); + + start = page->index; + end = start + thp_nr_pages(page); + + list_for_each_entry(gmem, gmem_list, entry) + kvm_gmem_invalidate_begin(gmem, start, end); + + /* + * Do not truncate the range, what action is taken in response to the + * error is userspace's decision (assuming the architecture supports + * gracefully handling memory errors). If/when the guest attempts to + * access a poisoned page, kvm_gmem_get_pfn() will return -EHWPOISON, + * at which point KVM can either terminate the VM or propagate the + * error to userspace. + */ + + list_for_each_entry(gmem, gmem_list, entry) + kvm_gmem_invalidate_end(gmem, start, end); + + filemap_invalidate_unlock_shared(mapping); + + return MF_DELAYED; +} + +static const struct address_space_operations kvm_gmem_aops = { + .dirty_folio = noop_dirty_folio, +#ifdef CONFIG_MIGRATION + .migrate_folio = kvm_gmem_migrate_folio, +#endif + .error_remove_page = kvm_gmem_error_page, +}; + +static int kvm_gmem_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *inode = path->dentry->d_inode; + + generic_fillattr(idmap, request_mask, inode, stat); + return 0; +} + +static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + return -EINVAL; +} +static const struct inode_operations kvm_gmem_iops = { + .getattr = kvm_gmem_getattr, + .setattr = kvm_gmem_setattr, +}; + +static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) +{ + const char *anon_name = "[kvm-gmem]"; + struct kvm_gmem *gmem; + struct inode *inode; + struct file *file; + int fd, err; + + fd = get_unused_fd_flags(0); + if (fd < 0) + return fd; + + gmem = kzalloc(sizeof(*gmem), GFP_KERNEL); + if (!gmem) { + err = -ENOMEM; + goto err_fd; + } + + file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem, + O_RDWR, NULL); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_gmem; + } + + file->f_flags |= O_LARGEFILE; + + inode = file->f_inode; + WARN_ON(file->f_mapping != inode->i_mapping); + + inode->i_private = (void *)(unsigned long)flags; + inode->i_op = &kvm_gmem_iops; + inode->i_mapping->a_ops = &kvm_gmem_aops; + inode->i_mode |= S_IFREG; + inode->i_size = size; + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unmovable(inode->i_mapping); + /* Unmovable mappings are supposed to be marked unevictable as well. */ + WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + + kvm_get_kvm(kvm); + gmem->kvm = kvm; + xa_init(&gmem->bindings); + list_add(&gmem->entry, &inode->i_mapping->private_list); + + fd_install(fd, file); + return fd; + +err_gmem: + kfree(gmem); +err_fd: + put_unused_fd(fd); + return err; +} + +int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args) +{ + loff_t size = args->size; + u64 flags = args->flags; + u64 valid_flags = 0; + + if (flags & ~valid_flags) + return -EINVAL; + + if (size <= 0 || !PAGE_ALIGNED(size)) + return -EINVAL; + + return __kvm_gmem_create(kvm, size, flags); +} + +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned int fd, loff_t offset) +{ + loff_t size = slot->npages << PAGE_SHIFT; + unsigned long start, end; + struct kvm_gmem *gmem; + struct inode *inode; + struct file *file; + int r = -EINVAL; + + BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff)); + + file = fget(fd); + if (!file) + return -EBADF; + + if (file->f_op != &kvm_gmem_fops) + goto err; + + gmem = file->private_data; + if (gmem->kvm != kvm) + goto err; + + inode = file_inode(file); + + if (offset < 0 || !PAGE_ALIGNED(offset) || + offset + size > i_size_read(inode)) + goto err; + + filemap_invalidate_lock(inode->i_mapping); + + start = offset >> PAGE_SHIFT; + end = start + slot->npages; + + if (!xa_empty(&gmem->bindings) && + xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { + filemap_invalidate_unlock(inode->i_mapping); + goto err; + } + + /* + * No synchronize_rcu() needed, any in-flight readers are guaranteed to + * be see either a NULL file or this new file, no need for them to go + * away. + */ + rcu_assign_pointer(slot->gmem.file, file); + slot->gmem.pgoff = start; + + xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); + filemap_invalidate_unlock(inode->i_mapping); + + /* + * Drop the reference to the file, even on success. The file pins KVM, + * not the other way 'round. Active bindings are invalidated if the + * file is closed before memslots are destroyed. + */ + r = 0; +err: + fput(file); + return r; +} + +void kvm_gmem_unbind(struct kvm_memory_slot *slot) +{ + unsigned long start = slot->gmem.pgoff; + unsigned long end = start + slot->npages; + struct kvm_gmem *gmem; + struct file *file; + + /* + * Nothing to do if the underlying file was already closed (or is being + * closed right now), kvm_gmem_release() invalidates all bindings. + */ + file = kvm_gmem_get_file(slot); + if (!file) + return; + + gmem = file->private_data; + + filemap_invalidate_lock(file->f_mapping); + xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); + rcu_assign_pointer(slot->gmem.file, NULL); + synchronize_rcu(); + filemap_invalidate_unlock(file->f_mapping); + + fput(file); +} + +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) +{ + pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; + struct kvm_gmem *gmem; + struct folio *folio; + struct page *page; + struct file *file; + int r; + + file = kvm_gmem_get_file(slot); + if (!file) + return -EFAULT; + + gmem = file->private_data; + + if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) { + r = -EIO; + goto out_fput; + } + + folio = kvm_gmem_get_folio(file_inode(file), index); + if (!folio) { + r = -ENOMEM; + goto out_fput; + } + + if (folio_test_hwpoison(folio)) { + r = -EHWPOISON; + goto out_unlock; + } + + page = folio_file_page(folio, index); + + *pfn = page_to_pfn(page); + if (max_order) + *max_order = 0; + + r = 0; + +out_unlock: + folio_unlock(folio); +out_fput: + fput(file); + + return r; +} +EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9f0e2379201d..77d23e0fa891 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -809,7 +809,7 @@ void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end) } } -static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) { kvm_mmu_invalidate_range_add(kvm, range->start, range->end); return kvm_unmap_gfn_range(kvm, range); @@ -1045,6 +1045,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) /* This does not remove the slot from struct kvm_memslots data structures */ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { + if (slot->flags & KVM_MEM_GUEST_MEMFD) + kvm_gmem_unbind(slot); + kvm_destroy_dirty_bitmap(slot); kvm_arch_free_memslot(kvm, slot); @@ -1638,10 +1641,18 @@ static void kvm_replace_memslot(struct kvm *kvm, #define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \ (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY) -static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem) +static int check_memory_region_flags(struct kvm *kvm, + const struct kvm_userspace_memory_region2 *mem) { u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; + if (kvm_arch_has_private_mem(kvm)) + valid_flags |= KVM_MEM_GUEST_MEMFD; + + /* Dirty logging private memory is not currently supported. */ + if (mem->flags & KVM_MEM_GUEST_MEMFD) + valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; + #ifdef __KVM_HAVE_READONLY_MEM valid_flags |= KVM_MEM_READONLY; #endif @@ -2051,7 +2062,7 @@ int __kvm_set_memory_region(struct kvm *kvm, int as_id, id; int r; - r = check_memory_region_flags(mem); + r = check_memory_region_flags(kvm, mem); if (r) return r; @@ -2070,6 +2081,10 @@ int __kvm_set_memory_region(struct kvm *kvm, !access_ok((void __user *)(unsigned long)mem->userspace_addr, mem->memory_size)) return -EINVAL; + if (mem->flags & KVM_MEM_GUEST_MEMFD && + (mem->guest_memfd_offset & (PAGE_SIZE - 1) || + mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset)) + return -EINVAL; if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) @@ -2108,6 +2123,9 @@ int __kvm_set_memory_region(struct kvm *kvm, if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) return -EINVAL; } else { /* Modify an existing slot. */ + /* Private memslots are immutable, they can only be deleted. */ + if (mem->flags & KVM_MEM_GUEST_MEMFD) + return -EINVAL; if ((mem->userspace_addr != old->userspace_addr) || (npages != old->npages) || ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) @@ -2136,10 +2154,23 @@ int __kvm_set_memory_region(struct kvm *kvm, new->npages = npages; new->flags = mem->flags; new->userspace_addr = mem->userspace_addr; + if (mem->flags & KVM_MEM_GUEST_MEMFD) { + r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset); + if (r) + goto out; + } r = kvm_set_memslot(kvm, old, new, change); if (r) - kfree(new); + goto out_unbind; + + return 0; + +out_unbind: + if (mem->flags & KVM_MEM_GUEST_MEMFD) + kvm_gmem_unbind(new); +out: + kfree(new); return r; } EXPORT_SYMBOL_GPL(__kvm_set_memory_region); @@ -2475,7 +2506,7 @@ bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end, static u64 kvm_supported_mem_attributes(struct kvm *kvm) { - if (!kvm) + if (!kvm || kvm_arch_has_private_mem(kvm)) return KVM_MEMORY_ATTRIBUTE_PRIVATE; return 0; @@ -4918,6 +4949,10 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES case KVM_CAP_MEMORY_ATTRIBUTES: return kvm_supported_mem_attributes(kvm); +#endif +#ifdef CONFIG_KVM_PRIVATE_MEM + case KVM_CAP_GUEST_MEMFD: + return !kvm || kvm_arch_has_private_mem(kvm); #endif default: break; @@ -5351,6 +5386,18 @@ static long kvm_vm_ioctl(struct file *filp, case KVM_GET_STATS_FD: r = kvm_vm_ioctl_get_stats_fd(kvm); break; +#ifdef CONFIG_KVM_PRIVATE_MEM + case KVM_CREATE_GUEST_MEMFD: { + struct kvm_create_guest_memfd guest_memfd; + + r = -EFAULT; + if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd))) + goto out; + + r = kvm_gmem_create(kvm, &guest_memfd); + break; + } +#endif default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); } @@ -6748,6 +6795,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) if (WARN_ON_ONCE(r)) goto err_vfio; + kvm_gmem_init(module); + /* * Registration _must_ be the very last thing done, as this exposes * /dev/kvm to userspace, i.e. all infrastructure must be setup! diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 180f1a09e6ba..ecefc7ec51af 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -37,4 +37,30 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, } #endif /* HAVE_KVM_PFNCACHE */ +#ifdef CONFIG_KVM_PRIVATE_MEM +void kvm_gmem_init(struct module *module); +int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned int fd, loff_t offset); +void kvm_gmem_unbind(struct kvm_memory_slot *slot); +#else +static inline void kvm_gmem_init(struct module *module) +{ + +} + +static inline int kvm_gmem_bind(struct kvm *kvm, + struct kvm_memory_slot *slot, + unsigned int fd, loff_t offset) +{ + WARN_ON_ONCE(1); + return -EIO; +} + +static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot) +{ + WARN_ON_ONCE(1); +} +#endif /* CONFIG_KVM_PRIVATE_MEM */ + #endif /* __KVM_MM_H__ */ -- Gitee From a37f8428965d1f7f69eeeed5f068b60f55b13b1e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:00 -0700 Subject: [PATCH 16/54] KVM: x86: "Reset" vcpu->run->exit_reason early in KVM_RUN mainline inclusion from mainline-v6.8-rc1 commit ee605e31563348f44d2ff0b6b74d33df69dd535c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ee605e31563348f44d2ff0b6b74d33df69dd535c ------------------------------------- Hygon-SIG: commit ee605e315633 KVM: x86: "Reset" vcpu->run->exit_reason early in KVM_RUN Backport the guest-memory feature. ------------------------------------- Initialize run->exit_reason to KVM_EXIT_UNKNOWN early in KVM_RUN to reduce the probability of exiting to userspace with a stale run->exit_reason that *appears* to be valid. To support fd-based guest memory (guest memory without a corresponding userspace virtual address), KVM will exit to userspace for various memory related errors, which userspace *may* be able to resolve, instead of using e.g. BUS_MCEERR_AR. And in the more distant future, KVM will also likely utilize the same functionality to let userspace "intercept" and handle memory faults when the userspace mapping is missing, i.e. when fast gup() fails. Because many of KVM's internal APIs related to guest memory use '0' to indicate "success, continue on" and not "exit to userspace", reporting memory faults/errors to userspace will set run->exit_reason and corresponding fields in the run structure fields in conjunction with a a non-zero, negative return code, e.g. -EFAULT or -EHWPOISON. And because KVM already returns -EFAULT in many paths, there's a relatively high probability that KVM could return -EFAULT without setting run->exit_reason, in which case reporting KVM_EXIT_UNKNOWN is much better than reporting whatever exit reason happened to be in the run structure. Note, KVM must wait until after run->immediate_exit is serviced to sanitize run->exit_reason as KVM's ABI is that run->exit_reason is preserved across KVM_RUN when run->immediate_exit is true. Link: https://lore.kernel.org/all/20230908222905.1321305-1-amoorthy@google.com Link: https://lore.kernel.org/all/ZFFbwOXZ5uI%2Fgdaf@google.com Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-19-seanjc@google.com> Reviewed-by: Xiaoyao Li Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 34f793860608..58a36d375c0f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11174,6 +11174,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) { int r; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; vcpu->arch.l1tf_flush_l1d = true; for (;;) { -- Gitee From 50c1f3b3f2725e2a247354eabc9f5055efb88037 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:01 -0700 Subject: [PATCH 17/54] KVM: x86: Disallow hugepages when memory attributes are mixed mainline inclusion from mainline-v6.8-rc1 commit 90b4fe17981e155432c4dbc490606d0c2e9c2199 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=90b4fe17981e155432c4dbc490606d0c2e9c2199 ------------------------------------- Hygon-SIG: commit 90b4fe17981e KVM: x86: Disallow hugepages when memory attributes are mixed Backport the guest-memory feature. ------------------------------------- Disallow creating hugepages with mixed memory attributes, e.g. shared versus private, as mapping a hugepage in this case would allow the guest to access memory with the wrong attributes, e.g. overlaying private memory with a shared hugepage. Tracking whether or not attributes are mixed via the existing disallow_lpage field, but use the most significant bit in 'disallow_lpage' to indicate a hugepage has mixed attributes instead using the normal refcounting. Whether or not attributes are mixed is binary; either they are or they aren't. Attempting to squeeze that info into the refcount is unnecessarily complex as it would require knowing the previous state of the mixed count when updating attributes. Using a flag means KVM just needs to ensure the current status is reflected in the memslots. Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-20-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/kvm/mmu/mmu.c | 154 +++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 4 + 3 files changed, 159 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 085fd69af9db..02d338f81e34 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1876,6 +1876,9 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot); + void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7b07440a0656..c860969f260b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -795,16 +795,26 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } +/* + * The most significant bit in disallow_lpage tracks whether or not memory + * attributes are mixed, i.e. not identical for all gfns at the current level. + * The lower order bits are used to refcount other cases where a hugepage is + * disallowed, e.g. if KVM has shadow a page table at the gfn. + */ +#define KVM_LPAGE_MIXED_FLAG BIT(31) + static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; - int i; + int old, i; for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); + + old = linfo->disallow_lpage; linfo->disallow_lpage += count; - WARN_ON_ONCE(linfo->disallow_lpage < 0); + WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG); } } @@ -7174,3 +7184,143 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) if (kvm->arch.nx_huge_page_recovery_thread) kthread_stop(kvm->arch.nx_huge_page_recovery_thread); } + +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + +static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, int level, unsigned long attrs) +{ + const unsigned long start = gfn; + const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); + + if (level == PG_LEVEL_2M) + return kvm_range_has_memory_attributes(kvm, start, end, attrs); + + for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { + if (hugepage_test_mixed(slot, gfn, level - 1) || + attrs != kvm_get_memory_attributes(kvm, gfn)) + return false; + } + return true; +} + +bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + unsigned long attrs = range->arg.attributes; + struct kvm_memory_slot *slot = range->slot; + int level; + + lockdep_assert_held_write(&kvm->mmu_lock); + lockdep_assert_held(&kvm->slots_lock); + + /* + * Calculate which ranges can be mapped with hugepages even if the slot + * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over + * a range that has PRIVATE GFNs, and conversely converting a range to + * SHARED may now allow hugepages. + */ + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + return false; + + /* + * The sequence matters here: upper levels consume the result of lower + * level's scanning. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + gfn_t gfn = gfn_round_for_level(range->start, level); + + /* Process the head page if it straddles the range. */ + if (gfn != range->start || gfn + nr_pages > range->end) { + /* + * Skip mixed tracking if the aligned gfn isn't covered + * by the memslot, KVM can't use a hugepage due to the + * misaligned address regardless of memory attributes. + */ + if (gfn >= slot->base_gfn) { + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + gfn += nr_pages; + } + + /* + * Pages entirely covered by the range are guaranteed to have + * only the attributes which were just set. + */ + for ( ; gfn + nr_pages <= range->end; gfn += nr_pages) + hugepage_clear_mixed(slot, gfn, level); + + /* + * Process the last tail page if it straddles the range and is + * contained by the memslot. Like the head page, KVM can't + * create a hugepage if the slot size is misaligned. + */ + if (gfn < range->end && + (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) { + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + } + return false; +} + +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + int level; + + if (!kvm_arch_has_private_mem(kvm)) + return; + + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + /* + * Don't bother tracking mixed attributes for pages that can't + * be huge due to alignment, i.e. process only pages that are + * entirely contained by the memslot. + */ + gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level); + gfn_t start = gfn_round_for_level(slot->base_gfn, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + gfn_t gfn; + + if (start < slot->base_gfn) + start += nr_pages; + + /* + * Unlike setting attributes, every potential hugepage needs to + * be manually checked as the attributes may already be mixed. + */ + for (gfn = start; gfn < end; gfn += nr_pages) { + unsigned long attrs = kvm_get_memory_attributes(kvm, gfn); + + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + } +} +#endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 58a36d375c0f..41f35743bc39 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12834,6 +12834,10 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, } } +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + kvm_mmu_init_memslot_memory_attributes(kvm, slot); +#endif + if (kvm_page_track_create_memslot(kvm, slot, npages)) goto out_free; -- Gitee From 8b65dc553e44a0c8505aa80d87d9a9e4c47024d8 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:02 -0700 Subject: [PATCH 18/54] KVM: x86/mmu: Handle page fault for private memory mainline inclusion from mainline-v6.8-rc1 commit 8dd2eee9d526c30fccfe75da7ec5365c6476e510 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8dd2eee9d526c30fccfe75da7ec5365c6476e510 ------------------------------------- Hygon-SIG: commit 8dd2eee9d526 KVM: x86/mmu: Handle page fault for private memory Backport the guest-memory feature. ------------------------------------- Add support for resolving page faults on guest private memory for VMs that differentiate between "shared" and "private" memory. For such VMs, KVM_MEM_GUEST_MEMFD memslots can include both fd-based private memory and hva-based shared memory, and KVM needs to map in the "correct" variant, i.e. KVM needs to map the gfn shared/private as appropriate based on the current state of the gfn's KVM_MEMORY_ATTRIBUTE_PRIVATE flag. For AMD's SEV-SNP and Intel's TDX, the guest effectively gets to request shared vs. private via a bit in the guest page tables, i.e. what the guest wants may conflict with the current memory attributes. To support such "implicit" conversion requests, exit to user with KVM_EXIT_MEMORY_FAULT to forward the request to userspace. Add a new flag for memory faults, KVM_MEMORY_EXIT_FLAG_PRIVATE, to communicate whether the guest wants to map memory as shared vs. private. Like KVM_MEMORY_ATTRIBUTE_PRIVATE, use bit 3 for flagging private memory so that KVM can use bits 0-2 for capturing RWX behavior if/when userspace needs such information, e.g. a likely user of KVM_EXIT_MEMORY_FAULT is to exit on missing mappings when handling guest page fault VM-Exits. In that case, userspace will want to know RWX information in order to correctly/precisely resolve the fault. Note, private memory *must* be backed by guest_memfd, i.e. shared mappings always come from the host userspace page tables, and private mappings always come from a guest_memfd instance. Co-developed-by: Yu Zhang Signed-off-by: Yu Zhang Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-21-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- Documentation/virt/kvm/api.rst | 8 ++- arch/x86/kvm/mmu/mmu.c | 101 ++++++++++++++++++++++++++++++-- arch/x86/kvm/mmu/mmu_internal.h | 1 + include/linux/kvm_host.h | 8 ++- include/uapi/linux/kvm.h | 1 + 5 files changed, 110 insertions(+), 9 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index eb0c49016768..fc835672a18d 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6984,6 +6984,7 @@ spec refer, https://github.com/riscv/riscv-sbi-doc. /* KVM_EXIT_MEMORY_FAULT */ struct { + #define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) __u64 flags; __u64 gpa; __u64 size; @@ -6992,8 +6993,11 @@ spec refer, https://github.com/riscv/riscv-sbi-doc. KVM_EXIT_MEMORY_FAULT indicates the vCPU has encountered a memory fault that could not be resolved by KVM. The 'gpa' and 'size' (in bytes) describe the guest physical address range [gpa, gpa + size) of the fault. The 'flags' field -describes properties of the faulting access that are likely pertinent. -Currently, no flags are defined. +describes properties of the faulting access that are likely pertinent: + + - KVM_MEMORY_EXIT_FLAG_PRIVATE - When set, indicates the memory fault occurred + on a private memory access. When clear, indicates the fault occurred on a + shared access. Note! KVM_EXIT_MEMORY_FAULT is unique among all KVM exit reasons in that it accompanies a return code of '-1', not '0'! errno will always be set to EFAULT diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c860969f260b..765249f60a93 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3147,9 +3147,9 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, return level; } -int kvm_mmu_max_mapping_level(struct kvm *kvm, - const struct kvm_memory_slot *slot, gfn_t gfn, - int max_level) +static int __kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, + gfn_t gfn, int max_level, bool is_private) { struct kvm_lpage_info *linfo; int host_level; @@ -3161,6 +3161,9 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, break; } + if (is_private) + return max_level; + if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; @@ -3168,6 +3171,16 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, return min(host_level, max_level); } +int kvm_mmu_max_mapping_level(struct kvm *kvm, + const struct kvm_memory_slot *slot, gfn_t gfn, + int max_level) +{ + bool is_private = kvm_slot_can_be_private(slot) && + kvm_mem_is_private(kvm, gfn); + + return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private); +} + void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; @@ -3188,8 +3201,9 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, - fault->gfn, fault->max_level); + fault->req_level = __kvm_mmu_max_mapping_level(vcpu->kvm, slot, + fault->gfn, fault->max_level, + fault->is_private); if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed) return; @@ -4261,6 +4275,55 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true, NULL); } +static inline u8 kvm_max_level_for_order(int order) +{ + BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G); + + KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) && + order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K)); + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G)) + return PG_LEVEL_1G; + + if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M)) + return PG_LEVEL_2M; + + return PG_LEVEL_4K; +} + +static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT, + PAGE_SIZE, fault->write, fault->exec, + fault->is_private); +} + +static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) +{ + int max_order, r; + + if (!kvm_slot_can_be_private(fault->slot)) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + + r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, + &max_order); + if (r) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return r; + } + + fault->max_level = min(kvm_max_level_for_order(max_order), + fault->max_level); + fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY); + + return RET_PF_CONTINUE; +} + static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct kvm_memory_slot *slot = fault->slot; @@ -4293,6 +4356,14 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return RET_PF_EMULATE; } + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + + if (fault->is_private) + return kvm_faultin_pfn_private(vcpu, fault); + async = false; fault->pfn = __gfn_to_pfn_memslot(slot, fault->gfn, false, false, &async, fault->write, &fault->map_writable, @@ -7186,6 +7257,26 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + /* + * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only + * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM + * can simply ignore such slots. But if userspace is making memory + * PRIVATE, then KVM must prevent the guest from accessing the memory + * as shared. And if userspace is making memory SHARED and this point + * is reached, then at least one page within the range was previously + * PRIVATE, i.e. the slot's possible hugepage ranges are changing. + * Zapping SPTEs in this case ensures KVM will reassess whether or not + * a hugepage can be used for affected ranges. + */ + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + return false; + + return kvm_unmap_gfn_range(kvm, range); +} + static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, int level) { diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 68f8564d85a9..46317b757885 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -202,6 +202,7 @@ struct kvm_page_fault { /* Derived from mmu and global state. */ const bool is_tdp; + const bool is_private; const bool nx_huge_page_workaround_enabled; /* diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9ff3272e34c0..29b34dc95fc9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2527,14 +2527,18 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr) #define KVM_DIRTY_RING_MAX_ENTRIES 65536 static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, - gpa_t gpa, gpa_t size) + gpa_t gpa, gpa_t size, + bool is_write, bool is_exec, + bool is_private) { vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; vcpu->run->memory_fault.gpa = gpa; vcpu->run->memory_fault.size = size; - /* Flags are not (yet) defined or communicated to userspace. */ + /* RWX flags are not (yet) defined or communicated to userspace. */ vcpu->run->memory_fault.flags = 0; + if (is_private) + vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index b75168459321..e4e22cc24409 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -536,6 +536,7 @@ struct kvm_run { } notify; /* KVM_EXIT_MEMORY_FAULT */ struct { +#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) __u64 flags; __u64 gpa; __u64 size; -- Gitee From 49dd1650e30692290730035c5839f57fbda07a76 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:03 -0700 Subject: [PATCH 19/54] KVM: Drop superfluous __KVM_VCPU_MULTIPLE_ADDRESS_SPACE macro mainline inclusion from mainline-v6.8-rc1 commit 2333afa17af0f4b6651214ee17cfd5ae5f47787a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2333afa17af0f4b6651214ee17cfd5ae5f47787a ------------------------------------- Hygon-SIG: commit 2333afa17af0 KVM: Drop superfluous __KVM_VCPU_MULTIPLE_ADDRESS_SPACE macro Backport the guest-memory feature. ------------------------------------- Drop __KVM_VCPU_MULTIPLE_ADDRESS_SPACE and instead check the value of KVM_ADDRESS_SPACE_NUM. No functional change intended. Reviewed-by: Paolo Bonzini Signed-off-by: Sean Christopherson Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-22-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- arch/x86/include/asm/kvm_host.h | 1 - include/linux/kvm_host.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 02d338f81e34..af189e089d5f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2164,7 +2164,6 @@ enum { #define HF_SMM_MASK (1 << 1) #define HF_SMM_INSIDE_NMI_MASK (1 << 2) -# define __KVM_VCPU_MULTIPLE_ADDRESS_SPACE # define KVM_ADDRESS_SPACE_NUM 2 # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 29b34dc95fc9..8516a8797989 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -722,7 +722,7 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #define KVM_MEM_SLOTS_NUM SHRT_MAX #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) -#ifndef __KVM_VCPU_MULTIPLE_ADDRESS_SPACE +#if KVM_ADDRESS_SPACE_NUM == 1 static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; -- Gitee From 367a695293ea0c355c428a1495e6816ff9cb65b7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:04 -0700 Subject: [PATCH 20/54] KVM: Allow arch code to track number of memslot address spaces per VM mainline inclusion from mainline-v6.8-rc1 commit eed52e434bc33603ddb0af62b6c4ef818948489d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=eed52e434bc33603ddb0af62b6c4ef818948489d ------------------------------------- Hygon-SIG: commit eed52e434bc3 KVM: Allow arch code to track number of memslot address spaces per VM Backport the guest-memory feature. ------------------------------------- Let x86 track the number of address spaces on a per-VM basis so that KVM can disallow SMM memslots for confidential VMs. Confidentials VMs are fundamentally incompatible with emulating SMM, which as the name suggests requires being able to read and write guest memory and register state. Disallowing SMM will simplify support for guest private memory, as KVM will not need to worry about tracking memory attributes for multiple address spaces (SMM is the only "non-default" address space across all architectures). Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Message-Id: <20231027182217.3615211-23-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log and resolve the conflict. This commit replaces the macro KVM_ADDRESS_SPACE_NUM with the inline function kvm_arch_nr_memslot_as_ids(), so the macro used by openEuler commit 9e8e34cf0397 ("kvm: supports hugepages POD when migrate vm") should also be substituted. ] Signed-off-by: Li Zhiquan --- arch/arm64/kvm/mmu.c | 2 +- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/x86/include/asm/kvm_host.h | 8 +++++++- arch/x86/kvm/debugfs.c | 2 +- arch/x86/kvm/mmu/mmu.c | 6 +++--- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 17 +++++++++++------ virt/kvm/dirty_ring.c | 2 +- virt/kvm/kvm_main.c | 26 ++++++++++++++------------ 9 files changed, 40 insertions(+), 27 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index cd7589d05064..f6ca0535ed7a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -2378,7 +2378,7 @@ int kvm_mmu_mark_touched_log(struct kvm *kvm) while (has_next) { has_next = false; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, bkt, slots) { gfn_t start, end; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 924689fa5efa..724cfbebb39e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -6106,7 +6106,7 @@ static int kvmhv_svm_off(struct kvm *kvm) } srcu_idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct kvm_memory_slot *memslot; struct kvm_memslots *slots = __kvm_memslots(kvm, i); int bkt; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af189e089d5f..0676b02504aa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -2164,9 +2164,15 @@ enum { #define HF_SMM_MASK (1 << 1) #define HF_SMM_INSIDE_NMI_MASK (1 << 2) -# define KVM_ADDRESS_SPACE_NUM 2 +# define KVM_MAX_NR_ADDRESS_SPACES 2 # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) + +static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm) +{ + return KVM_MAX_NR_ADDRESS_SPACES; +} + #else # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index ee8c4c3496ed..42026b3f3ff3 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -111,7 +111,7 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) mutex_lock(&kvm->slots_lock); write_lock(&kvm->mmu_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { int bkt; slots = __kvm_memslots(kvm, i); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 765249f60a93..a9402215b0a9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3755,7 +3755,7 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm) kvm_page_track_write_tracking_enabled(kvm)) goto out_success; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, bkt, slots) { /* @@ -6305,7 +6305,7 @@ static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_e if (!kvm_memslots_have_rmaps(kvm)) return flush; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) { @@ -6802,7 +6802,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen) * modifier prior to checking for a wrap of the MMIO generation so * that a wrap in any address space is detected. */ - gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1); + gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - 1); /* * The very rare case: if the MMIO generation number has wrapped, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 41f35743bc39..5c2a61cf28e8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12683,7 +12683,7 @@ void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, hva = slot->userspace_addr; } - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct kvm_userspace_memory_region2 m; m.slot = id | (i << 16); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8516a8797989..2565a7951bcd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -80,8 +80,8 @@ /* Two fragments for cross MMIO pages. */ #define KVM_MAX_MMIO_FRAGMENTS 2 -#ifndef KVM_ADDRESS_SPACE_NUM -#define KVM_ADDRESS_SPACE_NUM 1 +#ifndef KVM_MAX_NR_ADDRESS_SPACES +#define KVM_MAX_NR_ADDRESS_SPACES 1 #endif /* @@ -722,7 +722,12 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm); #define KVM_MEM_SLOTS_NUM SHRT_MAX #define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS) -#if KVM_ADDRESS_SPACE_NUM == 1 +#if KVM_MAX_NR_ADDRESS_SPACES == 1 +static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm) +{ + return KVM_MAX_NR_ADDRESS_SPACES; +} + static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu) { return 0; @@ -795,9 +800,9 @@ struct kvm { struct mm_struct *mm; /* userspace tied to this vm */ unsigned long nr_memslot_pages; /* The two memslot sets - active and inactive (per address space) */ - struct kvm_memslots __memslots[KVM_ADDRESS_SPACE_NUM][2]; + struct kvm_memslots __memslots[KVM_MAX_NR_ADDRESS_SPACES][2]; /* The current active memslot set for each address space */ - struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; + struct kvm_memslots __rcu *memslots[KVM_MAX_NR_ADDRESS_SPACES]; struct xarray vcpu_array; /* * Protected by slots_lock, but can be read outside if an @@ -1101,7 +1106,7 @@ void kvm_put_kvm_no_destroy(struct kvm *kvm); static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id) { - as_id = array_index_nospec(as_id, KVM_ADDRESS_SPACE_NUM); + as_id = array_index_nospec(as_id, KVM_MAX_NR_ADDRESS_SPACES); return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu, lockdep_is_held(&kvm->slots_lock) || !refcount_read(&kvm->users_count)); diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 27e50190d419..7bc74969a819 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -61,7 +61,7 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) as_id = slot >> 16; id = (u16)slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return; memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 77d23e0fa891..e21d545df75b 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -628,7 +628,7 @@ static __always_inline kvm_mn_ret_t __kvm_handle_hva_range(struct kvm *kvm, idx = srcu_read_lock(&kvm->srcu); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { struct interval_tree_node *node; slots = __kvm_memslots(kvm, i); @@ -1259,7 +1259,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname) goto out_err_no_irq_srcu; refcount_set(&kvm->users_count, 1); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { for (j = 0; j < 2; j++) { slots = &kvm->__memslots[i][j]; @@ -1423,7 +1423,7 @@ static void kvm_destroy_vm(struct kvm *kvm) #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { kvm_free_memslots(kvm, &kvm->__memslots[i][0]); kvm_free_memslots(kvm, &kvm->__memslots[i][1]); } @@ -1715,7 +1715,7 @@ static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) * space 0 will use generations 0, 2, 4, ... while address space 1 will * use generations 1, 3, 5, ... */ - gen += KVM_ADDRESS_SPACE_NUM; + gen += kvm_arch_nr_memslot_as_ids(kvm); kvm_arch_memslots_updated(kvm, gen); @@ -2085,7 +2085,7 @@ int __kvm_set_memory_region(struct kvm *kvm, (mem->guest_memfd_offset & (PAGE_SIZE - 1) || mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset)) return -EINVAL; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM) return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) return -EINVAL; @@ -2221,7 +2221,7 @@ int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); @@ -2283,7 +2283,7 @@ static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; slots = __kvm_memslots(kvm, as_id); @@ -2395,7 +2395,7 @@ static int kvm_clear_dirty_log_protect(struct kvm *kvm, as_id = log->slot >> 16; id = (u16)log->slot; - if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) + if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return -EINVAL; if (log->first_page & 63) @@ -2526,7 +2526,7 @@ static __always_inline void kvm_handle_gfn_range(struct kvm *kvm, gfn_range.arg = range->arg; gfn_range.may_block = range->may_block; - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) { @@ -4922,9 +4922,11 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; #endif -#if KVM_ADDRESS_SPACE_NUM > 1 +#if KVM_MAX_NR_ADDRESS_SPACES > 1 case KVM_CAP_MULTI_ADDRESS_SPACE: - return KVM_ADDRESS_SPACE_NUM; + if (kvm) + return kvm_arch_nr_memslot_as_ids(kvm); + return KVM_MAX_NR_ADDRESS_SPACES; #endif case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; @@ -5032,7 +5034,7 @@ bool kvm_are_all_memslots_empty(struct kvm *kvm) lockdep_assert_held(&kvm->slots_lock); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { if (!kvm_memslots_empty(__kvm_memslots(kvm, i))) return false; } -- Gitee From 72e6b6afae60cdcae88abe8c5e5392e25f9cbb8f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:05 -0700 Subject: [PATCH 21/54] KVM: x86: Add support for "protected VMs" that can utilize private memory mainline inclusion from mainline-v6.8-rc1 commit 89ea60c2c7b5838bf192c50062d5720cd6ab8662 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=89ea60c2c7b5838bf192c50062d5720cd6ab8662 ------------------------------------- Hygon-SIG: commit 89ea60c2c7b5 KVM: x86: Add support for "protected VMs" that can utilize private memory Backport the guest-memory feature. ------------------------------------- Add a new x86 VM type, KVM_X86_SW_PROTECTED_VM, to serve as a development and testing vehicle for Confidential (CoCo) VMs, and potentially to even become a "real" product in the distant future, e.g. a la pKVM. The private memory support in KVM x86 is aimed at AMD's SEV-SNP and Intel's TDX, but those technologies are extremely complex (understatement), difficult to debug, don't support running as nested guests, and require hardware that's isn't universally accessible. I.e. relying SEV-SNP or TDX for maintaining guest private memory isn't a realistic option. At the very least, KVM_X86_SW_PROTECTED_VM will enable a variety of selftests for guest_memfd and private memory support without requiring unique hardware. Signed-off-by: Sean Christopherson Reviewed-by: Paolo Bonzini Message-Id: <20231027182217.3615211-24-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Li Zhiquan --- Documentation/virt/kvm/api.rst | 32 ++++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 15 +++++++++------ arch/x86/include/uapi/asm/kvm.h | 3 +++ arch/x86/kvm/Kconfig | 12 ++++++++++++ arch/x86/kvm/mmu/mmu_internal.h | 1 + arch/x86/kvm/x86.c | 16 +++++++++++++++- include/uapi/linux/kvm.h | 1 + virt/kvm/Kconfig | 5 +++++ 8 files changed, 78 insertions(+), 7 deletions(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index fc835672a18d..923e61011def 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -147,10 +147,29 @@ described as 'basic' will be available. The new VM has no virtual cpus and no memory. You probably want to use 0 as machine type. +X86: +^^^^ + +Supported X86 VM types can be queried via KVM_CAP_VM_TYPES. + +S390: +^^^^^ + In order to create user controlled virtual machines on S390, check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as privileged user (CAP_SYS_ADMIN). +MIPS: +^^^^^ + +To use hardware assisted virtualization on MIPS (VZ ASE) rather than +the default trap & emulate implementation (which changes the virtual +memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the +flag KVM_VM_MIPS_VZ. + +ARM64: +^^^^^^ + On arm64, the machine type identifier is used to encode a type and the physical address size for the VM. The lower byte (bits[7-0]) encode the address size and the upper bits[11-8] encode a machine type. The machine @@ -8858,6 +8877,19 @@ block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a 64-bit bitmap (each bit describing a block size). The default value is 0, to disable the eager page splitting. +8.41 KVM_CAP_VM_TYPES +--------------------- + +:Capability: KVM_CAP_MEMORY_ATTRIBUTES +:Architectures: x86 +:Type: system ioctl + +This capability returns a bitmap of support VM types. The 1-setting of bit @n +means the VM type with value @n is supported. Possible values of @n are:: + + #define KVM_X86_DEFAULT_VM 0 + #define KVM_X86_SW_PROTECTED_VM 1 + 9. Known KVM API problems ========================= diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0676b02504aa..6790db1fadf4 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1250,6 +1250,7 @@ enum kvm_apicv_inhibit { }; struct kvm_arch { + unsigned long vm_type; unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; @@ -2117,6 +2118,12 @@ void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); +#ifdef CONFIG_KVM_PRIVATE_MEM +#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM) +#else +#define kvm_arch_has_private_mem(kvm) false +#endif + static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -2165,14 +2172,10 @@ enum { #define HF_SMM_INSIDE_NMI_MASK (1 << 2) # define KVM_MAX_NR_ADDRESS_SPACES 2 +/* SMM is currently unsupported for guests with private memory. */ +# define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2) # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) - -static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm) -{ - return KVM_MAX_NR_ADDRESS_SPACES; -} - #else # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 1a6a1f987949..a448d0964fc0 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -562,4 +562,7 @@ struct kvm_pmu_event_filter { /* x86-specific KVM_EXIT_HYPERCALL flags. */ #define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) +#define KVM_X86_DEFAULT_VM 0 +#define KVM_X86_SW_PROTECTED_VM 1 + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 21b101bf45e4..dbaa397a09cb 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -77,6 +77,18 @@ config KVM_WERROR If in doubt, say "N". +config KVM_SW_PROTECTED_VM + bool "Enable support for KVM software-protected VMs" + depends on EXPERT + depends on X86_64 + select KVM_GENERIC_PRIVATE_MEM + help + Enable support for KVM software-protected VMs. Currently "protected" + means the VM can be backed with memory provided by + KVM_CREATE_GUEST_MEMFD. + + If unsure, say "N". + config KVM_INTEL tristate "KVM for Intel (and compatible) processors support" depends on KVM && IA32_FEAT_CTL diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 46317b757885..0669a8a668ca 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -298,6 +298,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, .max_level = KVM_MAX_HUGEPAGE_LEVEL, .req_level = PG_LEVEL_4K, .goal_level = PG_LEVEL_4K, + .is_private = kvm_mem_is_private(vcpu->kvm, cr2_or_gpa >> PAGE_SHIFT), }; int r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5c2a61cf28e8..09b9f79d9283 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4534,6 +4534,13 @@ static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, } #endif +static bool kvm_is_vm_type_supported(unsigned long type) +{ + return type == KVM_X86_DEFAULT_VM || + (type == KVM_X86_SW_PROTECTED_VM && + IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_enabled); +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r = 0; @@ -4729,6 +4736,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X86_NOTIFY_VMEXIT: r = kvm_caps.has_notify_vmexit; break; + case KVM_CAP_VM_TYPES: + r = BIT(KVM_X86_DEFAULT_VM); + if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM)) + r |= BIT(KVM_X86_SW_PROTECTED_VM); + break; case KVM_CAP_SEV_ES_GHCB: r = 0; @@ -12541,9 +12553,11 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) int ret; unsigned long flags; - if (type) + if (!kvm_is_vm_type_supported(type)) return -EINVAL; + kvm->arch.vm_type = type; + ret = kvm_page_track_init(kvm); if (ret) goto out; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index e4e22cc24409..5991f8c3a549 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1245,6 +1245,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_MEMORY_FAULT_INFO 232 #define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 +#define KVM_CAP_VM_TYPES 235 #define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 4c1edf4efb57..988bccc5aaf0 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -105,5 +105,10 @@ config KVM_PRIVATE_MEM select XARRAY_MULTI bool +config KVM_GENERIC_PRIVATE_MEM + select KVM_GENERIC_MEMORY_ATTRIBUTES + select KVM_PRIVATE_MEM + bool + config HAVE_KVM_PINNED_VMID bool -- Gitee From 651a50e2e44397b709ef38be7e48bc4690df1756 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:06 -0700 Subject: [PATCH 22/54] KVM: selftests: Drop unused kvm_userspace_memory_region_find() helper mainline inclusion from mainline-v6.8-rc1 commit 335869c3f2b881bda6eeeb2df342841a1db3310b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=335869c3f2b881bda6eeeb2df342841a1db3310b ------------------------------------- Hygon-SIG: commit 335869c3f2b8 KVM: selftests: Drop unused kvm_userspace_memory_region_find() helper Backport the guest-memory feature. ------------------------------------- Drop kvm_userspace_memory_region_find(), it's unused and a terrible API (probably why it's unused). If anything outside of kvm_util.c needs to get at the memslot, userspace_mem_region_find() can be exposed to give others full access to all memory region/slot information. Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-25-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- .../selftests/kvm/include/kvm_util_base.h | 4 --- tools/testing/selftests/kvm/lib/kvm_util.c | 29 ------------------- 2 files changed, 33 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index a18db6a7b3cf..967eaaeacd75 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -776,10 +776,6 @@ vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages) return n; } -struct kvm_userspace_memory_region * -kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, - uint64_t end); - #define sync_global_to_guest(vm, g) ({ \ typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \ memcpy(_p, &(g), sizeof(g)); \ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 7a8af1821f5d..f09295d56c23 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -590,35 +590,6 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) return NULL; } -/* - * KVM Userspace Memory Region Find - * - * Input Args: - * vm - Virtual Machine - * start - Starting VM physical address - * end - Ending VM physical address, inclusive. - * - * Output Args: None - * - * Return: - * Pointer to overlapping region, NULL if no such region. - * - * Public interface to userspace_mem_region_find. Allows tests to look up - * the memslot datastructure for a given range of guest physical memory. - */ -struct kvm_userspace_memory_region * -kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start, - uint64_t end) -{ - struct userspace_mem_region *region; - - region = userspace_mem_region_find(vm, start, end); - if (!region) - return NULL; - - return ®ion->region; -} - __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) { -- Gitee From 2e6d15e13f865ae715876da605fc33ff5675e132 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:07 -0700 Subject: [PATCH 23/54] KVM: selftests: Convert lib's mem regions to KVM_SET_USER_MEMORY_REGION2 mainline inclusion from mainline-v6.8-rc1 commit 8d99e347c097ab3f9fb93d0f88dddf20051d7c88 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8d99e347c097ab3f9fb93d0f88dddf20051d7c88 ------------------------------------- Hygon-SIG: commit 8d99e347c097 KVM: selftests: Convert lib's mem regions to KVM_SET_USER_MEMORY_REGION2 Backport the guest-memory feature. ------------------------------------- Use KVM_SET_USER_MEMORY_REGION2 throughout KVM's selftests library so that support for guest private memory can be added without needing an entirely separate set of helpers. Note, this obviously makes selftests backwards-incompatible with older KVM versions from this point forward. Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-26-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- .../selftests/kvm/include/kvm_util_base.h | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 19 ++++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 967eaaeacd75..9f144841c2ee 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -44,7 +44,7 @@ typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */ typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */ struct userspace_mem_region { - struct kvm_userspace_memory_region region; + struct kvm_userspace_memory_region2 region; struct sparsebit *unused_phy_pages; int fd; off_t offset; diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index f09295d56c23..3676b37bea38 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -453,8 +453,9 @@ void kvm_vm_restart(struct kvm_vm *vmp) vm_create_irqchip(vmp); hash_for_each(vmp->regions.slot_hash, ctr, region, slot_node) { - int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION2, ®ion->region); + + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" " guest_phys_addr: 0x%llx size: 0x%llx", @@ -657,7 +658,7 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, } region->region.memory_size = 0; - vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, ®ion->region); + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); sparsebit_free(®ion->unused_phy_pages); ret = munmap(region->mmap_start, region->mmap_size); @@ -1014,8 +1015,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, region->region.guest_phys_addr = guest_paddr; region->region.memory_size = npages * vm->page_size; region->region.userspace_addr = (uintptr_t) region->host_mem; - ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" " guest_phys_addr: 0x%lx size: 0x%lx", @@ -1097,9 +1098,9 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) region->region.flags = flags; - ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, ®ion->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); - TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n" + TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i slot: %u flags: 0x%x", ret, errno, slot, flags); } @@ -1127,9 +1128,9 @@ void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa) region->region.guest_phys_addr = new_gpa; - ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION, ®ion->region); + ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); - TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n" + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed\n" "ret: %i errno: %i slot: %u new_gpa: 0x%lx", ret, errno, slot, new_gpa); } -- Gitee From 2044bd64df79c6457bb0d87b4c351dcb005f681c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:08 -0700 Subject: [PATCH 24/54] KVM: selftests: Add support for creating private memslots mainline inclusion from mainline-v6.8-rc1 commit bb2968ad6c33c0902dce48ea57d58c5bb4f3c617 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=bb2968ad6c33c0902dce48ea57d58c5bb4f3c617 ------------------------------------- Hygon-SIG: commit bb2968ad6c33 KVM: selftests: Add support for creating private memslots Backport the guest-memory feature. ------------------------------------- Add support for creating "private" memslots via KVM_CREATE_GUEST_MEMFD and KVM_SET_USER_MEMORY_REGION2. Make vm_userspace_mem_region_add() a wrapper to its effective replacement, vm_mem_add(), so that private memslots are fully opt-in, i.e. don't require update all tests that add memory regions. Pivot on the KVM_MEM_PRIVATE flag instead of the validity of the "gmem" file descriptor so that simple tests can let vm_mem_add() do the heavy lifting of creating the guest memfd, but also allow the caller to pass in an explicit fd+offset so that fancier tests can do things like back multiple memslots with a single file. If the caller passes in a fd, dup() the fd so that (a) __vm_mem_region_delete() can close the fd associated with the memory region without needing yet another flag, and (b) so that the caller can safely close its copy of the fd without having to first destroy memslots. Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-27-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- .../selftests/kvm/include/kvm_util_base.h | 23 ++++++ .../testing/selftests/kvm/include/test_util.h | 5 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 76 +++++++++++-------- 3 files changed, 73 insertions(+), 31 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 9f144841c2ee..9f861182c02a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -431,6 +431,26 @@ static inline uint64_t vm_get_stat(struct kvm_vm *vm, const char *stat_name) void vm_create_irqchip(struct kvm_vm *vm); +static inline int __vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + struct kvm_create_guest_memfd guest_memfd = { + .size = size, + .flags = flags, + }; + + return __vm_ioctl(vm, KVM_CREATE_GUEST_MEMFD, &guest_memfd); +} + +static inline int vm_create_guest_memfd(struct kvm_vm *vm, uint64_t size, + uint64_t flags) +{ + int fd = __vm_create_guest_memfd(vm, size, flags); + + TEST_ASSERT(fd >= 0, KVM_IOCTL_ERROR(KVM_CREATE_GUEST_MEMFD, fd)); + return fd; +} + void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, uint64_t gpa, uint64_t size, void *hva); int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, @@ -439,6 +459,9 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, uint64_t guest_paddr, uint32_t slot, uint64_t npages, uint32_t flags); +void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h index 7e614adc6cf4..7257f2243ab9 100644 --- a/tools/testing/selftests/kvm/include/test_util.h +++ b/tools/testing/selftests/kvm/include/test_util.h @@ -142,6 +142,11 @@ static inline bool backing_src_is_shared(enum vm_mem_backing_src_type t) return vm_mem_backing_src_alias(t)->flag & MAP_SHARED; } +static inline bool backing_src_can_be_huge(enum vm_mem_backing_src_type t) +{ + return t != VM_MEM_SRC_ANONYMOUS && t != VM_MEM_SRC_SHMEM; +} + /* Aligns x up to the next multiple of size. Size must be a power of 2. */ static inline uint64_t align_up(uint64_t x, uint64_t size) { diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 3676b37bea38..b63500fca627 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -669,6 +669,8 @@ static void __vm_mem_region_delete(struct kvm_vm *vm, TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); close(region->fd); } + if (region->region.guest_memfd >= 0) + close(region->region.guest_memfd); free(region); } @@ -870,36 +872,15 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, errno, strerror(errno)); } -/* - * VM Userspace Memory Region Add - * - * Input Args: - * vm - Virtual Machine - * src_type - Storage source for this region. - * NULL to use anonymous memory. - * guest_paddr - Starting guest physical address - * slot - KVM region slot - * npages - Number of physical pages - * flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES) - * - * Output Args: None - * - * Return: None - * - * Allocates a memory area of the number of pages specified by npages - * and maps it to the VM specified by vm, at a starting physical address - * given by guest_paddr. The region is created with a KVM region slot - * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM. The - * region is created with the flags given by flags. - */ -void vm_userspace_mem_region_add(struct kvm_vm *vm, - enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags) +/* FIXME: This thing needs to be ripped apart and rewritten. */ +void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, uint64_t npages, + uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset) { int ret; struct userspace_mem_region *region; size_t backing_src_pagesz = get_backing_src_pagesz(src_type); + size_t mem_size = npages * vm->page_size; size_t alignment; TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages, @@ -952,7 +933,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, /* Allocate and initialize new mem region structure. */ region = calloc(1, sizeof(*region)); TEST_ASSERT(region != NULL, "Insufficient Memory"); - region->mmap_size = npages * vm->page_size; + region->mmap_size = mem_size; #ifdef __s390x__ /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */ @@ -999,14 +980,38 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, /* As needed perform madvise */ if ((src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) && thp_configured()) { - ret = madvise(region->host_mem, npages * vm->page_size, + ret = madvise(region->host_mem, mem_size, src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE); TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %s", - region->host_mem, npages * vm->page_size, + region->host_mem, mem_size, vm_mem_backing_src_alias(src_type)->name); } region->backing_src_type = src_type; + + if (flags & KVM_MEM_GUEST_MEMFD) { + if (guest_memfd < 0) { + uint32_t guest_memfd_flags = 0; + TEST_ASSERT(!guest_memfd_offset, + "Offset must be zero when creating new guest_memfd"); + guest_memfd = vm_create_guest_memfd(vm, mem_size, guest_memfd_flags); + } else { + /* + * Install a unique fd for each memslot so that the fd + * can be closed when the region is deleted without + * needing to track if the fd is owned by the framework + * or by the caller. + */ + guest_memfd = dup(guest_memfd); + TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd)); + } + + region->region.guest_memfd = guest_memfd; + region->region.guest_memfd_offset = guest_memfd_offset; + } else { + region->region.guest_memfd = -1; + } + region->unused_phy_pages = sparsebit_alloc(); sparsebit_set_num(region->unused_phy_pages, guest_paddr >> vm->page_shift, npages); @@ -1019,9 +1024,10 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" - " guest_phys_addr: 0x%lx size: 0x%lx", + " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d\n", ret, errno, slot, flags, - guest_paddr, (uint64_t) region->region.memory_size); + guest_paddr, (uint64_t) region->region.memory_size, + region->region.guest_memfd); /* Add to quick lookup data structures */ vm_userspace_mem_region_gpa_insert(&vm->regions.gpa_tree, region); @@ -1042,6 +1048,14 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, } } +void vm_userspace_mem_region_add(struct kvm_vm *vm, + enum vm_mem_backing_src_type src_type, + uint64_t guest_paddr, uint32_t slot, + uint64_t npages, uint32_t flags) +{ + vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0); +} + /* * Memslot to region * -- Gitee From 26b1d3079e044dbf0fa0e730a9576e8119e08aac Mon Sep 17 00:00:00 2001 From: Vishal Annapurve Date: Fri, 27 Oct 2023 11:22:09 -0700 Subject: [PATCH 25/54] KVM: selftests: Add helpers to convert guest memory b/w private and shared mainline inclusion from mainline-v6.8-rc1 commit f7fa67495d118f734f98b406fd46888616b4a3c3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f7fa67495d118f734f98b406fd46888616b4a3c3 ------------------------------------- Hygon-SIG: commit f7fa67495d11 KVM: selftests: Add helpers to convert guest memory b/w private and shared Backport the guest-memory feature. ------------------------------------- Add helpers to convert memory between private and shared via KVM's memory attributes, as well as helpers to free/allocate guest_memfd memory via fallocate(). Userspace, i.e. tests, is NOT required to do fallocate() when converting memory, as the attributes are the single source of truth. Provide allocate() helpers so that tests can mimic a userspace that frees private memory on conversion, e.g. to prioritize memory usage over performance. Signed-off-by: Vishal Annapurve Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-28-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- .../selftests/kvm/include/kvm_util_base.h | 48 +++++++++++++++++++ tools/testing/selftests/kvm/lib/kvm_util.c | 28 +++++++++++ 2 files changed, 76 insertions(+) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 9f861182c02a..1441fca6c273 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -333,6 +333,54 @@ static inline void vm_enable_cap(struct kvm_vm *vm, uint32_t cap, uint64_t arg0) vm_ioctl(vm, KVM_ENABLE_CAP, &enable_cap); } +static inline void vm_set_memory_attributes(struct kvm_vm *vm, uint64_t gpa, + uint64_t size, uint64_t attributes) +{ + struct kvm_memory_attributes attr = { + .attributes = attributes, + .address = gpa, + .size = size, + .flags = 0, + }; + + /* + * KVM_SET_MEMORY_ATTRIBUTES overwrites _all_ attributes. These flows + * need significant enhancements to support multiple attributes. + */ + TEST_ASSERT(!attributes || attributes == KVM_MEMORY_ATTRIBUTE_PRIVATE, + "Update me to support multiple attributes!"); + + vm_ioctl(vm, KVM_SET_MEMORY_ATTRIBUTES, &attr); +} + + +static inline void vm_mem_set_private(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, KVM_MEMORY_ATTRIBUTE_PRIVATE); +} + +static inline void vm_mem_set_shared(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_set_memory_attributes(vm, gpa, size, 0); +} + +void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t gpa, uint64_t size, + bool punch_hole); + +static inline void vm_guest_mem_punch_hole(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, true); +} + +static inline void vm_guest_mem_allocate(struct kvm_vm *vm, uint64_t gpa, + uint64_t size) +{ + vm_guest_mem_fallocate(vm, gpa, size, false); +} + void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size); const char *vm_guest_mode_string(uint32_t i); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index b63500fca627..cc6007abc4b1 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1167,6 +1167,34 @@ void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot) __vm_mem_region_delete(vm, memslot2region(vm, slot), true); } +void vm_guest_mem_fallocate(struct kvm_vm *vm, uint64_t base, uint64_t size, + bool punch_hole) +{ + const int mode = FALLOC_FL_KEEP_SIZE | (punch_hole ? FALLOC_FL_PUNCH_HOLE : 0); + struct userspace_mem_region *region; + uint64_t end = base + size; + uint64_t gpa, len; + off_t fd_offset; + int ret; + + for (gpa = base; gpa < end; gpa += len) { + uint64_t offset; + + region = userspace_mem_region_find(vm, gpa, gpa); + TEST_ASSERT(region && region->region.flags & KVM_MEM_GUEST_MEMFD, + "Private memory region not found for GPA 0x%lx", gpa); + + offset = gpa - region->region.guest_phys_addr; + fd_offset = region->region.guest_memfd_offset + offset; + len = min_t(uint64_t, end - gpa, region->region.memory_size - offset); + + ret = fallocate(region->region.guest_memfd, mode, fd_offset, len); + TEST_ASSERT(!ret, "fallocate() failed to %s at %lx (len = %lu), fd = %d, mode = %x, offset = %lx\n", + punch_hole ? "punch hole" : "allocate", gpa, len, + region->region.guest_memfd, mode, fd_offset); + } +} + /* Returns the size of a vCPU's kvm_run structure. */ static int vcpu_mmap_sz(void) { -- Gitee From 592bf862567d5e2f9e284f350bcebb340ef40514 Mon Sep 17 00:00:00 2001 From: Vishal Annapurve Date: Fri, 27 Oct 2023 11:22:10 -0700 Subject: [PATCH 26/54] KVM: selftests: Add helpers to do KVM_HC_MAP_GPA_RANGE hypercalls (x86) mainline inclusion from mainline-v6.8-rc1 commit 01244fce2fa22176e46609c051f6fe15cf81e188 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=01244fce2fa22176e46609c051f6fe15cf81e188 ------------------------------------- Hygon-SIG: commit 01244fce2fa2 KVM: selftests: Add helpers to do KVM_HC_MAP_GPA_RANGE hypercalls (x86) Backport the guest-memory feature. ------------------------------------- Add helpers for x86 guests to invoke the KVM_HC_MAP_GPA_RANGE hypercall, which KVM will forward to userspace and thus can be used by tests to coordinate private<=>shared conversions between host userspace code and guest code. Signed-off-by: Vishal Annapurve [sean: drop shared/private helpers (let tests specify flags)] Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-29-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- .../selftests/kvm/include/x86_64/processor.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 490a0a7efb3c..3a33f1106ba9 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -15,6 +15,7 @@ #include #include +#include #include #include "../kvm_util.h" @@ -1195,6 +1196,20 @@ uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2, uint64_t __xen_hypercall(uint64_t nr, uint64_t a0, void *a1); void xen_hypercall(uint64_t nr, uint64_t a0, void *a1); +static inline uint64_t __kvm_hypercall_map_gpa_range(uint64_t gpa, + uint64_t size, uint64_t flags) +{ + return kvm_hypercall(KVM_HC_MAP_GPA_RANGE, gpa, size >> PAGE_SHIFT, flags, 0); +} + +static inline void kvm_hypercall_map_gpa_range(uint64_t gpa, uint64_t size, + uint64_t flags) +{ + uint64_t ret = __kvm_hypercall_map_gpa_range(gpa, size, flags); + + GUEST_ASSERT(!ret); +} + void __vm_xsave_require_permission(uint64_t xfeature, const char *name); #define vm_xsave_require_permission(xfeature) \ -- Gitee From 89b5513a652febed9d959e9a7b1a9baaf8664840 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:11 -0700 Subject: [PATCH 27/54] KVM: selftests: Introduce VM "shape" to allow tests to specify the VM type mainline inclusion from mainline-v6.8-rc1 commit 672eaa351015d8165c41fff644ee7d2b369cea12 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=672eaa351015d8165c41fff644ee7d2b369cea12 ------------------------------------- Hygon-SIG: commit 672eaa351015 KVM: selftests: Introduce VM "shape" to allow tests to specify the VM type Backport the guest-memory feature. ------------------------------------- Add a "vm_shape" structure to encapsulate the selftests-defined "mode", along with the KVM-defined "type" for use when creating a new VM. "mode" tracks physical and virtual address properties, as well as the preferred backing memory type, while "type" corresponds to the VM type. Taking the VM type will allow adding tests for KVM_CREATE_GUEST_MEMFD without needing an entirely separate set of helpers. At this time, guest_memfd is effectively usable only by confidential VM types in the form of guest private memory, and it's expected that x86 will double down and require unique VM types for TDX and SNP guests. Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-30-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- .../selftests/kvm/aarch64/page_fault_test.c | 2 +- tools/testing/selftests/kvm/dirty_log_test.c | 2 +- .../selftests/kvm/include/kvm_util_base.h | 54 +++++++++++++++---- .../selftests/kvm/kvm_page_table_test.c | 2 +- tools/testing/selftests/kvm/lib/kvm_util.c | 43 +++++++-------- tools/testing/selftests/kvm/lib/memstress.c | 3 +- .../kvm/x86_64/ucna_injection_test.c | 2 +- 7 files changed, 73 insertions(+), 35 deletions(-) diff --git a/tools/testing/selftests/kvm/aarch64/page_fault_test.c b/tools/testing/selftests/kvm/aarch64/page_fault_test.c index 47bb914ab2fa..b1e2fe5a30ea 100644 --- a/tools/testing/selftests/kvm/aarch64/page_fault_test.c +++ b/tools/testing/selftests/kvm/aarch64/page_fault_test.c @@ -705,7 +705,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) print_test_banner(mode, p); - vm = ____vm_create(mode); + vm = ____vm_create(VM_SHAPE(mode)); setup_memslots(vm, p); kvm_vm_elf_load(vm, program_invocation_name); setup_ucall(vm); diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c index e96fababd3f0..4ca747c33846 100644 --- a/tools/testing/selftests/kvm/dirty_log_test.c +++ b/tools/testing/selftests/kvm/dirty_log_test.c @@ -686,7 +686,7 @@ static struct kvm_vm *create_vm(enum vm_guest_mode mode, struct kvm_vcpu **vcpu, pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); - vm = __vm_create(mode, 1, extra_mem_pages); + vm = __vm_create(VM_SHAPE(mode), 1, extra_mem_pages); log_mode_create_vm_done(vm); *vcpu = vm_vcpu_add(vm, 0, guest_code); diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 1441fca6c273..157508c071f3 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -188,6 +188,23 @@ enum vm_guest_mode { NUM_VM_MODES, }; +struct vm_shape { + enum vm_guest_mode mode; + unsigned int type; +}; + +#define VM_TYPE_DEFAULT 0 + +#define VM_SHAPE(__mode) \ +({ \ + struct vm_shape shape = { \ + .mode = (__mode), \ + .type = VM_TYPE_DEFAULT \ + }; \ + \ + shape; \ +}) + #if defined(__aarch64__) extern enum vm_guest_mode vm_mode_default; @@ -220,6 +237,8 @@ extern enum vm_guest_mode vm_mode_default; #endif +#define VM_SHAPE_DEFAULT VM_SHAPE(VM_MODE_DEFAULT) + #define MIN_PAGE_SIZE (1U << MIN_PAGE_SHIFT) #define PTES_PER_MIN_PAGE ptes_per_page(MIN_PAGE_SIZE) @@ -784,21 +803,21 @@ vm_paddr_t vm_alloc_page_table(struct kvm_vm *vm); * __vm_create() does NOT create vCPUs, @nr_runnable_vcpus is used purely to * calculate the amount of memory needed for per-vCPU data, e.g. stacks. */ -struct kvm_vm *____vm_create(enum vm_guest_mode mode); -struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, +struct kvm_vm *____vm_create(struct vm_shape shape); +struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, uint64_t nr_extra_pages); static inline struct kvm_vm *vm_create_barebones(void) { - return ____vm_create(VM_MODE_DEFAULT); + return ____vm_create(VM_SHAPE_DEFAULT); } static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) { - return __vm_create(VM_MODE_DEFAULT, nr_runnable_vcpus, 0); + return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); } -struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, +struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, uint64_t extra_mem_pages, void *guest_code, struct kvm_vcpu *vcpus[]); @@ -806,17 +825,27 @@ static inline struct kvm_vm *vm_create_with_vcpus(uint32_t nr_vcpus, void *guest_code, struct kvm_vcpu *vcpus[]) { - return __vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, 0, + return __vm_create_with_vcpus(VM_SHAPE_DEFAULT, nr_vcpus, 0, guest_code, vcpus); } + +struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code); + /* * Create a VM with a single vCPU with reasonable defaults and @extra_mem_pages * additional pages of guest memory. Returns the VM and vCPU (via out param). */ -struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - uint64_t extra_mem_pages, - void *guest_code); +static inline struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(VM_SHAPE_DEFAULT, vcpu, + extra_mem_pages, guest_code); +} static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, void *guest_code) @@ -824,6 +853,13 @@ static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, return __vm_create_with_one_vcpu(vcpu, 0, guest_code); } +static inline struct kvm_vm *vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + void *guest_code) +{ + return __vm_create_shape_with_one_vcpu(shape, vcpu, 0, guest_code); +} + struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); void kvm_pin_this_task_to_pcpu(uint32_t pcpu); diff --git a/tools/testing/selftests/kvm/kvm_page_table_test.c b/tools/testing/selftests/kvm/kvm_page_table_test.c index 69f26d80c821..e37dc9c21888 100644 --- a/tools/testing/selftests/kvm/kvm_page_table_test.c +++ b/tools/testing/selftests/kvm/kvm_page_table_test.c @@ -254,7 +254,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg) /* Create a VM with enough guest pages */ guest_num_pages = test_mem_size / guest_page_size; - vm = __vm_create_with_vcpus(mode, nr_vcpus, guest_num_pages, + vm = __vm_create_with_vcpus(VM_SHAPE(mode), nr_vcpus, guest_num_pages, guest_code, test_args.vcpus); /* Align down GPA of the testing memslot */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index cc6007abc4b1..bf15635eda11 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -209,7 +209,7 @@ __weak void vm_vaddr_populate_bitmap(struct kvm_vm *vm) (1ULL << (vm->va_bits - 1)) >> vm->page_shift); } -struct kvm_vm *____vm_create(enum vm_guest_mode mode) +struct kvm_vm *____vm_create(struct vm_shape shape) { struct kvm_vm *vm; @@ -221,13 +221,13 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode) vm->regions.hva_tree = RB_ROOT; hash_init(vm->regions.slot_hash); - vm->mode = mode; - vm->type = 0; + vm->mode = shape.mode; + vm->type = shape.type; - vm->pa_bits = vm_guest_mode_params[mode].pa_bits; - vm->va_bits = vm_guest_mode_params[mode].va_bits; - vm->page_size = vm_guest_mode_params[mode].page_size; - vm->page_shift = vm_guest_mode_params[mode].page_shift; + vm->pa_bits = vm_guest_mode_params[vm->mode].pa_bits; + vm->va_bits = vm_guest_mode_params[vm->mode].va_bits; + vm->page_size = vm_guest_mode_params[vm->mode].page_size; + vm->page_shift = vm_guest_mode_params[vm->mode].page_shift; /* Setup mode specific traits. */ switch (vm->mode) { @@ -265,7 +265,7 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode) /* * Ignore KVM support for 5-level paging (vm->va_bits == 57), * it doesn't take effect unless a CR4.LA57 is set, which it - * isn't for this VM_MODE. + * isn't for this mode (48-bit virtual address space). */ TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, "Linear address width (%d bits) not supported", @@ -285,10 +285,11 @@ struct kvm_vm *____vm_create(enum vm_guest_mode mode) vm->pgtable_levels = 5; break; default: - TEST_FAIL("Unknown guest mode, mode: 0x%x", mode); + TEST_FAIL("Unknown guest mode: 0x%x", vm->mode); } #ifdef __aarch64__ + TEST_ASSERT(!vm->type, "ARM doesn't support test-provided types"); if (vm->pa_bits != 40) vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits); #endif @@ -347,19 +348,19 @@ static uint64_t vm_nr_pages_required(enum vm_guest_mode mode, return vm_adjust_num_guest_pages(mode, nr_pages); } -struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, +struct kvm_vm *__vm_create(struct vm_shape shape, uint32_t nr_runnable_vcpus, uint64_t nr_extra_pages) { - uint64_t nr_pages = vm_nr_pages_required(mode, nr_runnable_vcpus, + uint64_t nr_pages = vm_nr_pages_required(shape.mode, nr_runnable_vcpus, nr_extra_pages); struct userspace_mem_region *slot0; struct kvm_vm *vm; int i; - pr_debug("%s: mode='%s' pages='%ld'\n", __func__, - vm_guest_mode_string(mode), nr_pages); + pr_debug("%s: mode='%s' type='%d', pages='%ld'\n", __func__, + vm_guest_mode_string(shape.mode), shape.type, nr_pages); - vm = ____vm_create(mode); + vm = ____vm_create(shape); vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, nr_pages, 0); for (i = 0; i < NR_MEM_REGIONS; i++) @@ -400,7 +401,7 @@ struct kvm_vm *__vm_create(enum vm_guest_mode mode, uint32_t nr_runnable_vcpus, * extra_mem_pages is only used to calculate the maximum page table size, * no real memory allocation for non-slot0 memory in this function. */ -struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus, +struct kvm_vm *__vm_create_with_vcpus(struct vm_shape shape, uint32_t nr_vcpus, uint64_t extra_mem_pages, void *guest_code, struct kvm_vcpu *vcpus[]) { @@ -409,7 +410,7 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus TEST_ASSERT(!nr_vcpus || vcpus, "Must provide vCPU array"); - vm = __vm_create(mode, nr_vcpus, extra_mem_pages); + vm = __vm_create(shape, nr_vcpus, extra_mem_pages); for (i = 0; i < nr_vcpus; ++i) vcpus[i] = vm_vcpu_add(vm, i, guest_code); @@ -417,15 +418,15 @@ struct kvm_vm *__vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus return vm; } -struct kvm_vm *__vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, - uint64_t extra_mem_pages, - void *guest_code) +struct kvm_vm *__vm_create_shape_with_one_vcpu(struct vm_shape shape, + struct kvm_vcpu **vcpu, + uint64_t extra_mem_pages, + void *guest_code) { struct kvm_vcpu *vcpus[1]; struct kvm_vm *vm; - vm = __vm_create_with_vcpus(VM_MODE_DEFAULT, 1, extra_mem_pages, - guest_code, vcpus); + vm = __vm_create_with_vcpus(shape, 1, extra_mem_pages, guest_code, vcpus); *vcpu = vcpus[0]; return vm; diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index df457452d146..d05487e5a371 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -168,7 +168,8 @@ struct kvm_vm *memstress_create_vm(enum vm_guest_mode mode, int nr_vcpus, * The memory is also added to memslot 0, but that's a benign side * effect as KVM allows aliasing HVAs in meslots. */ - vm = __vm_create_with_vcpus(mode, nr_vcpus, slot0_pages + guest_num_pages, + vm = __vm_create_with_vcpus(VM_SHAPE(mode), nr_vcpus, + slot0_pages + guest_num_pages, memstress_guest_code, vcpus); args->vm = vm; diff --git a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c index 85f34ca7e49e..0ed32ec903d0 100644 --- a/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c +++ b/tools/testing/selftests/kvm/x86_64/ucna_injection_test.c @@ -271,7 +271,7 @@ int main(int argc, char *argv[]) kvm_check_cap(KVM_CAP_MCE); - vm = __vm_create(VM_MODE_DEFAULT, 3, 0); + vm = __vm_create(VM_SHAPE_DEFAULT, 3, 0); kvm_ioctl(vm->kvm_fd, KVM_X86_GET_MCE_CAP_SUPPORTED, &supported_mcg_caps); -- Gitee From 4d53b25b36219898398029b9c0e1c7669cf18e49 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 27 Oct 2023 11:22:12 -0700 Subject: [PATCH 28/54] KVM: selftests: Add GUEST_SYNC[1-6] macros for synchronizing more data mainline inclusion from mainline-v6.8-rc1 commit 242331dfc4959f44e706c9b09b768f312aa5e117 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=242331dfc4959f44e706c9b09b768f312aa5e117 ------------------------------------- Hygon-SIG: commit 242331dfc495 KVM: selftests: Add GUEST_SYNC[1-6] macros for synchronizing more data Backport the guest-memory feature. ------------------------------------- Add GUEST_SYNC[1-6]() so that tests can pass the maximum amount of information supported via ucall(), without needing to resort to shared memory. Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-31-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- tools/testing/selftests/kvm/include/ucall_common.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/kvm/include/ucall_common.h b/tools/testing/selftests/kvm/include/ucall_common.h index ce33d306c2cb..0fb472a5a058 100644 --- a/tools/testing/selftests/kvm/include/ucall_common.h +++ b/tools/testing/selftests/kvm/include/ucall_common.h @@ -52,6 +52,17 @@ int ucall_nr_pages_required(uint64_t page_size); #define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4) #define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage) +#define GUEST_SYNC1(arg0) ucall(UCALL_SYNC, 1, arg0) +#define GUEST_SYNC2(arg0, arg1) ucall(UCALL_SYNC, 2, arg0, arg1) +#define GUEST_SYNC3(arg0, arg1, arg2) \ + ucall(UCALL_SYNC, 3, arg0, arg1, arg2) +#define GUEST_SYNC4(arg0, arg1, arg2, arg3) \ + ucall(UCALL_SYNC, 4, arg0, arg1, arg2, arg3) +#define GUEST_SYNC5(arg0, arg1, arg2, arg3, arg4) \ + ucall(UCALL_SYNC, 5, arg0, arg1, arg2, arg3, arg4) +#define GUEST_SYNC6(arg0, arg1, arg2, arg3, arg4, arg5) \ + ucall(UCALL_SYNC, 6, arg0, arg1, arg2, arg3, arg4, arg5) + #define GUEST_PRINTF(_fmt, _args...) ucall_fmt(UCALL_PRINTF, _fmt, ##_args) #define GUEST_DONE() ucall(UCALL_DONE, 0) -- Gitee From a705c83253e63860e30393907649096f352929e6 Mon Sep 17 00:00:00 2001 From: Vishal Annapurve Date: Fri, 27 Oct 2023 11:22:13 -0700 Subject: [PATCH 29/54] KVM: selftests: Add x86-only selftest for private memory conversions mainline inclusion from mainline-v6.8-rc1 commit 43f623f350ce1c46c53b6b77f4dbe741af8c44f3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=43f623f350ce1c46c53b6b77f4dbe741af8c44f3 ------------------------------------- Hygon-SIG: commit 43f623f350ce KVM: selftests: Add x86-only selftest for private memory conversions Backport the guest-memory feature. ------------------------------------- Add a selftest to exercise implicit/explicit conversion functionality within KVM and verify: - Shared memory is visible to host userspace - Private memory is not visible to host userspace - Host userspace and guest can communicate over shared memory - Data in shared backing is preserved across conversions (test's host userspace doesn't free the data) - Private memory is bound to the lifetime of the VM Ideally, KVM's selftests infrastructure would be reworked to allow backing a single region of guest memory with multiple memslots for _all_ backing types and shapes, i.e. ideally the code for using a single backing fd across multiple memslots would work for "regular" memory as well. But sadly, support for KVM_CREATE_GUEST_MEMFD has languished for far too long, and overhauling selftests' memslots infrastructure would likely open a can of worms, i.e. delay things even further. In addition to the more obvious tests, verify that PUNCH_HOLE actually frees memory. Directly verifying that KVM frees memory is impractical, if it's even possible, so instead indirectly verify memory is freed by asserting that the guest reads zeroes after a PUNCH_HOLE. E.g. if KVM zaps SPTEs but doesn't actually punch a hole in the inode, the subsequent read will still see the previous value. And obviously punching a hole shouldn't cause explosions. Let the user specify the number of memslots in the private mem conversion test, i.e. don't require the number of memslots to be '1' or "nr_vcpus". Creating more memslots than vCPUs is particularly interesting, e.g. it can result in a single KVM_SET_MEMORY_ATTRIBUTES spanning multiple memslots. To keep the math reasonable, align each vCPU's chunk to at least 2MiB (the size is 2MiB+4KiB), and require the total size to be cleanly divisible by the number of memslots. The goal is to be able to validate that KVM plays nice with multiple memslots, being able to create a truly arbitrary number of memslots doesn't add meaningful value, i.e. isn't worth the cost. Intentionally don't take a requirement on KVM_CAP_GUEST_MEMFD, KVM_CAP_MEMORY_FAULT_INFO, KVM_MEMORY_ATTRIBUTE_PRIVATE, etc., as it's a KVM bug to advertise KVM_X86_SW_PROTECTED_VM without its prerequisites. Signed-off-by: Vishal Annapurve Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-32-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/private_mem_conversions_test.c | 482 ++++++++++++++++++ 2 files changed, 483 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index a3bb36fb3cfc..b709a52d5cdb 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -81,6 +81,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/monitor_mwait_test TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test +TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c new file mode 100644 index 000000000000..4d6a37a5d896 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022, Google LLC. + */ +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define BASE_DATA_SLOT 10 +#define BASE_DATA_GPA ((uint64_t)(1ull << 32)) +#define PER_CPU_DATA_SIZE ((uint64_t)(SZ_2M + PAGE_SIZE)) + +/* Horrific macro so that the line info is captured accurately :-( */ +#define memcmp_g(gpa, pattern, size) \ +do { \ + uint8_t *mem = (uint8_t *)gpa; \ + size_t i; \ + \ + for (i = 0; i < size; i++) \ + __GUEST_ASSERT(mem[i] == pattern, \ + "Guest expected 0x%x at offset %lu (gpa 0x%llx), got 0x%x", \ + pattern, i, gpa + i, mem[i]); \ +} while (0) + +static void memcmp_h(uint8_t *mem, uint64_t gpa, uint8_t pattern, size_t size) +{ + size_t i; + + for (i = 0; i < size; i++) + TEST_ASSERT(mem[i] == pattern, + "Host expected 0x%x at gpa 0x%lx, got 0x%x", + pattern, gpa + i, mem[i]); +} + +/* + * Run memory conversion tests with explicit conversion: + * Execute KVM hypercall to map/unmap gpa range which will cause userspace exit + * to back/unback private memory. Subsequent accesses by guest to the gpa range + * will not cause exit to userspace. + * + * Test memory conversion scenarios with following steps: + * 1) Access private memory using private access and verify that memory contents + * are not visible to userspace. + * 2) Convert memory to shared using explicit conversions and ensure that + * userspace is able to access the shared regions. + * 3) Convert memory back to private using explicit conversions and ensure that + * userspace is again not able to access converted private regions. + */ + +#define GUEST_STAGE(o, s) { .offset = o, .size = s } + +enum ucall_syncs { + SYNC_SHARED, + SYNC_PRIVATE, +}; + +static void guest_sync_shared(uint64_t gpa, uint64_t size, + uint8_t current_pattern, uint8_t new_pattern) +{ + GUEST_SYNC5(SYNC_SHARED, gpa, size, current_pattern, new_pattern); +} + +static void guest_sync_private(uint64_t gpa, uint64_t size, uint8_t pattern) +{ + GUEST_SYNC4(SYNC_PRIVATE, gpa, size, pattern); +} + +/* Arbitrary values, KVM doesn't care about the attribute flags. */ +#define MAP_GPA_SET_ATTRIBUTES BIT(0) +#define MAP_GPA_SHARED BIT(1) +#define MAP_GPA_DO_FALLOCATE BIT(2) + +static void guest_map_mem(uint64_t gpa, uint64_t size, bool map_shared, + bool do_fallocate) +{ + uint64_t flags = MAP_GPA_SET_ATTRIBUTES; + + if (map_shared) + flags |= MAP_GPA_SHARED; + if (do_fallocate) + flags |= MAP_GPA_DO_FALLOCATE; + kvm_hypercall_map_gpa_range(gpa, size, flags); +} + +static void guest_map_shared(uint64_t gpa, uint64_t size, bool do_fallocate) +{ + guest_map_mem(gpa, size, true, do_fallocate); +} + +static void guest_map_private(uint64_t gpa, uint64_t size, bool do_fallocate) +{ + guest_map_mem(gpa, size, false, do_fallocate); +} + +struct { + uint64_t offset; + uint64_t size; +} static const test_ranges[] = { + GUEST_STAGE(0, PAGE_SIZE), + GUEST_STAGE(0, SZ_2M), + GUEST_STAGE(PAGE_SIZE, PAGE_SIZE), + GUEST_STAGE(PAGE_SIZE, SZ_2M), + GUEST_STAGE(SZ_2M, PAGE_SIZE), +}; + +static void guest_test_explicit_conversion(uint64_t base_gpa, bool do_fallocate) +{ + const uint8_t def_p = 0xaa; + const uint8_t init_p = 0xcc; + uint64_t j; + int i; + + /* Memory should be shared by default. */ + memset((void *)base_gpa, def_p, PER_CPU_DATA_SIZE); + memcmp_g(base_gpa, def_p, PER_CPU_DATA_SIZE); + guest_sync_shared(base_gpa, PER_CPU_DATA_SIZE, def_p, init_p); + + memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE); + + for (i = 0; i < ARRAY_SIZE(test_ranges); i++) { + uint64_t gpa = base_gpa + test_ranges[i].offset; + uint64_t size = test_ranges[i].size; + uint8_t p1 = 0x11; + uint8_t p2 = 0x22; + uint8_t p3 = 0x33; + uint8_t p4 = 0x44; + + /* + * Set the test region to pattern one to differentiate it from + * the data range as a whole (contains the initial pattern). + */ + memset((void *)gpa, p1, size); + + /* + * Convert to private, set and verify the private data, and + * then verify that the rest of the data (map shared) still + * holds the initial pattern, and that the host always sees the + * shared memory (initial pattern). Unlike shared memory, + * punching a hole in private memory is destructive, i.e. + * previous values aren't guaranteed to be preserved. + */ + guest_map_private(gpa, size, do_fallocate); + + if (size > PAGE_SIZE) { + memset((void *)gpa, p2, PAGE_SIZE); + goto skip; + } + + memset((void *)gpa, p2, size); + guest_sync_private(gpa, size, p1); + + /* + * Verify that the private memory was set to pattern two, and + * that shared memory still holds the initial pattern. + */ + memcmp_g(gpa, p2, size); + if (gpa > base_gpa) + memcmp_g(base_gpa, init_p, gpa - base_gpa); + if (gpa + size < base_gpa + PER_CPU_DATA_SIZE) + memcmp_g(gpa + size, init_p, + (base_gpa + PER_CPU_DATA_SIZE) - (gpa + size)); + + /* + * Convert odd-number page frames back to shared to verify KVM + * also correctly handles holes in private ranges. + */ + for (j = 0; j < size; j += PAGE_SIZE) { + if ((j >> PAGE_SHIFT) & 1) { + guest_map_shared(gpa + j, PAGE_SIZE, do_fallocate); + guest_sync_shared(gpa + j, PAGE_SIZE, p1, p3); + + memcmp_g(gpa + j, p3, PAGE_SIZE); + } else { + guest_sync_private(gpa + j, PAGE_SIZE, p1); + } + } + +skip: + /* + * Convert the entire region back to shared, explicitly write + * pattern three to fill in the even-number frames before + * asking the host to verify (and write pattern four). + */ + guest_map_shared(gpa, size, do_fallocate); + memset((void *)gpa, p3, size); + guest_sync_shared(gpa, size, p3, p4); + memcmp_g(gpa, p4, size); + + /* Reset the shared memory back to the initial pattern. */ + memset((void *)gpa, init_p, size); + + /* + * Free (via PUNCH_HOLE) *all* private memory so that the next + * iteration starts from a clean slate, e.g. with respect to + * whether or not there are pages/folios in guest_mem. + */ + guest_map_shared(base_gpa, PER_CPU_DATA_SIZE, true); + } +} + +static void guest_punch_hole(uint64_t gpa, uint64_t size) +{ + /* "Mapping" memory shared via fallocate() is done via PUNCH_HOLE. */ + uint64_t flags = MAP_GPA_SHARED | MAP_GPA_DO_FALLOCATE; + + kvm_hypercall_map_gpa_range(gpa, size, flags); +} + +/* + * Test that PUNCH_HOLE actually frees memory by punching holes without doing a + * proper conversion. Freeing (PUNCH_HOLE) should zap SPTEs, and reallocating + * (subsequent fault) should zero memory. + */ +static void guest_test_punch_hole(uint64_t base_gpa, bool precise) +{ + const uint8_t init_p = 0xcc; + int i; + + /* + * Convert the entire range to private, this testcase is all about + * punching holes in guest_memfd, i.e. shared mappings aren't needed. + */ + guest_map_private(base_gpa, PER_CPU_DATA_SIZE, false); + + for (i = 0; i < ARRAY_SIZE(test_ranges); i++) { + uint64_t gpa = base_gpa + test_ranges[i].offset; + uint64_t size = test_ranges[i].size; + + /* + * Free all memory before each iteration, even for the !precise + * case where the memory will be faulted back in. Freeing and + * reallocating should obviously work, and freeing all memory + * minimizes the probability of cross-testcase influence. + */ + guest_punch_hole(base_gpa, PER_CPU_DATA_SIZE); + + /* Fault-in and initialize memory, and verify the pattern. */ + if (precise) { + memset((void *)gpa, init_p, size); + memcmp_g(gpa, init_p, size); + } else { + memset((void *)base_gpa, init_p, PER_CPU_DATA_SIZE); + memcmp_g(base_gpa, init_p, PER_CPU_DATA_SIZE); + } + + /* + * Punch a hole at the target range and verify that reads from + * the guest succeed and return zeroes. + */ + guest_punch_hole(gpa, size); + memcmp_g(gpa, 0, size); + } +} + +static void guest_code(uint64_t base_gpa) +{ + /* + * Run the conversion test twice, with and without doing fallocate() on + * the guest_memfd backing when converting between shared and private. + */ + guest_test_explicit_conversion(base_gpa, false); + guest_test_explicit_conversion(base_gpa, true); + + /* + * Run the PUNCH_HOLE test twice too, once with the entire guest_memfd + * faulted in, once with only the target range faulted in. + */ + guest_test_punch_hole(base_gpa, false); + guest_test_punch_hole(base_gpa, true); + GUEST_DONE(); +} + +static void handle_exit_hypercall(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + uint64_t gpa = run->hypercall.args[0]; + uint64_t size = run->hypercall.args[1] * PAGE_SIZE; + bool set_attributes = run->hypercall.args[2] & MAP_GPA_SET_ATTRIBUTES; + bool map_shared = run->hypercall.args[2] & MAP_GPA_SHARED; + bool do_fallocate = run->hypercall.args[2] & MAP_GPA_DO_FALLOCATE; + struct kvm_vm *vm = vcpu->vm; + + TEST_ASSERT(run->hypercall.nr == KVM_HC_MAP_GPA_RANGE, + "Wanted MAP_GPA_RANGE (%u), got '%llu'", + KVM_HC_MAP_GPA_RANGE, run->hypercall.nr); + + if (do_fallocate) + vm_guest_mem_fallocate(vm, gpa, size, map_shared); + + if (set_attributes) + vm_set_memory_attributes(vm, gpa, size, + map_shared ? 0 : KVM_MEMORY_ATTRIBUTE_PRIVATE); + run->hypercall.ret = 0; +} + +static bool run_vcpus; + +static void *__test_mem_conversions(void *__vcpu) +{ + struct kvm_vcpu *vcpu = __vcpu; + struct kvm_run *run = vcpu->run; + struct kvm_vm *vm = vcpu->vm; + struct ucall uc; + + while (!READ_ONCE(run_vcpus)) + ; + + for ( ;; ) { + vcpu_run(vcpu); + + if (run->exit_reason == KVM_EXIT_HYPERCALL) { + handle_exit_hypercall(vcpu); + continue; + } + + TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, + "Wanted KVM_EXIT_IO, got exit reason: %u (%s)", + run->exit_reason, exit_reason_str(run->exit_reason)); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: { + uint64_t gpa = uc.args[1]; + size_t size = uc.args[2]; + size_t i; + + TEST_ASSERT(uc.args[0] == SYNC_SHARED || + uc.args[0] == SYNC_PRIVATE, + "Unknown sync command '%ld'", uc.args[0]); + + for (i = 0; i < size; i += vm->page_size) { + size_t nr_bytes = min_t(size_t, vm->page_size, size - i); + uint8_t *hva = addr_gpa2hva(vm, gpa + i); + + /* In all cases, the host should observe the shared data. */ + memcmp_h(hva, gpa + i, uc.args[3], nr_bytes); + + /* For shared, write the new pattern to guest memory. */ + if (uc.args[0] == SYNC_SHARED) + memset(hva, uc.args[4], nr_bytes); + } + break; + } + case UCALL_DONE: + return NULL; + default: + TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd); + } + } +} + +static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t nr_vcpus, + uint32_t nr_memslots) +{ + /* + * Allocate enough memory so that each vCPU's chunk of memory can be + * naturally aligned with respect to the size of the backing store. + */ + const size_t alignment = max_t(size_t, SZ_2M, get_backing_src_pagesz(src_type)); + const size_t per_cpu_size = align_up(PER_CPU_DATA_SIZE, alignment); + const size_t memfd_size = per_cpu_size * nr_vcpus; + const size_t slot_size = memfd_size / nr_memslots; + struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + pthread_t threads[KVM_MAX_VCPUS]; + struct kvm_vm *vm; + int memfd, i, r; + + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = KVM_X86_SW_PROTECTED_VM, + }; + + TEST_ASSERT(slot_size * nr_memslots == memfd_size, + "The memfd size (0x%lx) needs to be cleanly divisible by the number of memslots (%u)", + memfd_size, nr_memslots); + vm = __vm_create_with_vcpus(shape, nr_vcpus, 0, guest_code, vcpus); + + vm_enable_cap(vm, KVM_CAP_EXIT_HYPERCALL, (1 << KVM_HC_MAP_GPA_RANGE)); + + memfd = vm_create_guest_memfd(vm, memfd_size, 0); + + for (i = 0; i < nr_memslots; i++) + vm_mem_add(vm, src_type, BASE_DATA_GPA + slot_size * i, + BASE_DATA_SLOT + i, slot_size / vm->page_size, + KVM_MEM_GUEST_MEMFD, memfd, slot_size * i); + + for (i = 0; i < nr_vcpus; i++) { + uint64_t gpa = BASE_DATA_GPA + i * per_cpu_size; + + vcpu_args_set(vcpus[i], 1, gpa); + + /* + * Map only what is needed so that an out-of-bounds access + * results #PF => SHUTDOWN instead of data corruption. + */ + virt_map(vm, gpa, gpa, PER_CPU_DATA_SIZE / vm->page_size); + + pthread_create(&threads[i], NULL, __test_mem_conversions, vcpus[i]); + } + + WRITE_ONCE(run_vcpus, true); + + for (i = 0; i < nr_vcpus; i++) + pthread_join(threads[i], NULL); + + kvm_vm_free(vm); + + /* + * Allocate and free memory from the guest_memfd after closing the VM + * fd. The guest_memfd is gifted a reference to its owning VM, i.e. + * should prevent the VM from being fully destroyed until the last + * reference to the guest_memfd is also put. + */ + r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + + r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); +} + +static void usage(const char *cmd) +{ + puts(""); + printf("usage: %s [-h] [-m nr_memslots] [-s mem_type] [-n nr_vcpus]\n", cmd); + puts(""); + backing_src_help("-s"); + puts(""); + puts(" -n: specify the number of vcpus (default: 1)"); + puts(""); + puts(" -m: specify the number of memslots (default: 1)"); + puts(""); +} + +int main(int argc, char *argv[]) +{ + enum vm_mem_backing_src_type src_type = DEFAULT_VM_MEM_SRC; + uint32_t nr_memslots = 1; + uint32_t nr_vcpus = 1; + int opt; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)); + + while ((opt = getopt(argc, argv, "hm:s:n:")) != -1) { + switch (opt) { + case 's': + src_type = parse_backing_src_type(optarg); + break; + case 'n': + nr_vcpus = atoi_positive("nr_vcpus", optarg); + break; + case 'm': + nr_memslots = atoi_positive("nr_memslots", optarg); + break; + case 'h': + default: + usage(argv[0]); + exit(0); + } + } + + test_mem_conversions(src_type, nr_vcpus, nr_memslots); + + return 0; +} -- Gitee From b4811f05bf848d0e6ed243e166d0780e6b515393 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:14 -0700 Subject: [PATCH 30/54] KVM: selftests: Add KVM_SET_USER_MEMORY_REGION2 helper mainline inclusion from mainline-v6.8-rc1 commit e6f4f345b259cad178bad7e40a9d4b65cf195067 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e6f4f345b259cad178bad7e40a9d4b65cf195067 ------------------------------------- Hygon-SIG: commit e6f4f345b259 KVM: selftests: Add KVM_SET_USER_MEMORY_REGION2 helper Backport the guest-memory feature. ------------------------------------- Add helpers to invoke KVM_SET_USER_MEMORY_REGION2 directly so that tests can validate of features that are unique to "version 2" of "set user memory region", e.g. do negative testing on gmem_fd and gmem_offset. Provide a raw version as well as an assert-success version to reduce the amount of boilerplate code need for basic usage. Signed-off-by: Chao Peng Signed-off-by: Ackerley Tng Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-33-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- .../selftests/kvm/include/kvm_util_base.h | 7 +++++ tools/testing/selftests/kvm/lib/kvm_util.c | 29 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 157508c071f3..8ec122f5fcc8 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -522,6 +522,13 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, uint64_t gpa, uint64_t size, void *hva); int __vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, uint64_t gpa, uint64_t size, void *hva); +void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); +int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset); + void vm_userspace_mem_region_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, uint64_t guest_paddr, uint32_t slot, uint64_t npages, diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index bf15635eda11..9b29cbf49476 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -873,6 +873,35 @@ void vm_set_user_memory_region(struct kvm_vm *vm, uint32_t slot, uint32_t flags, errno, strerror(errno)); } +int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset) +{ + struct kvm_userspace_memory_region2 region = { + .slot = slot, + .flags = flags, + .guest_phys_addr = gpa, + .memory_size = size, + .userspace_addr = (uintptr_t)hva, + .guest_memfd = guest_memfd, + .guest_memfd_offset = guest_memfd_offset, + }; + + return ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION2, ®ion); +} + +void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags, + uint64_t gpa, uint64_t size, void *hva, + uint32_t guest_memfd, uint64_t guest_memfd_offset) +{ + int ret = __vm_set_user_memory_region2(vm, slot, flags, gpa, size, hva, + guest_memfd, guest_memfd_offset); + + TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION2 failed, errno = %d (%s)", + errno, strerror(errno)); +} + + /* FIXME: This thing needs to be ripped apart and rewritten. */ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, uint64_t guest_paddr, uint32_t slot, uint64_t npages, -- Gitee From 881facaea5b5b8c67050c380dcedfffbe8207a98 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:15 -0700 Subject: [PATCH 31/54] KVM: selftests: Expand set_memory_region_test to validate guest_memfd() mainline inclusion from mainline-v6.8-rc1 commit 2feabb855df8f0889146c2e951307ba477d1f37b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2feabb855df8f0889146c2e951307ba477d1f37b ------------------------------------- Hygon-SIG: commit 2feabb855df8 KVM: selftests: Expand set_memory_region_test to validate guest_memfd() Backport the guest-memory feature. ------------------------------------- Expand set_memory_region_test to exercise various positive and negative testcases for private memory. - Non-guest_memfd() file descriptor for private memory - guest_memfd() from different VM - Overlapping bindings - Unaligned bindings Signed-off-by: Chao Peng Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng [sean: trim the testcases to remove duplicate coverage] Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-34-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- .../selftests/kvm/include/kvm_util_base.h | 12 ++ .../selftests/kvm/set_memory_region_test.c | 104 +++++++++++++++++- 2 files changed, 114 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index 8ec122f5fcc8..1b58f943562f 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -819,6 +819,18 @@ static inline struct kvm_vm *vm_create_barebones(void) return ____vm_create(VM_SHAPE_DEFAULT); } +#ifdef __x86_64__ +static inline struct kvm_vm *vm_create_barebones_protected_vm(void) +{ + const struct vm_shape shape = { + .mode = VM_MODE_DEFAULT, + .type = KVM_X86_SW_PROTECTED_VM, + }; + + return ____vm_create(shape); +} +#endif + static inline struct kvm_vm *vm_create(uint32_t nr_runnable_vcpus) { return __vm_create(VM_SHAPE_DEFAULT, nr_runnable_vcpus, 0); diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index b32960189f5f..a78394faee7c 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -385,13 +385,105 @@ static void test_add_max_memory_regions(void) kvm_vm_free(vm); } + +#ifdef __x86_64__ +static void test_invalid_guest_memfd(struct kvm_vm *vm, int memfd, + size_t offset, const char *msg) +{ + int r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, + 0, memfd, offset); + TEST_ASSERT(r == -1 && errno == EINVAL, "%s", msg); +} + +static void test_add_private_memory_region(void) +{ + struct kvm_vm *vm, *vm2; + int memfd, i; + + pr_info("Testing ADD of KVM_MEM_GUEST_MEMFD memory regions\n"); + + vm = vm_create_barebones_protected_vm(); + + test_invalid_guest_memfd(vm, vm->kvm_fd, 0, "KVM fd should fail"); + test_invalid_guest_memfd(vm, vm->fd, 0, "VM's fd should fail"); + + memfd = kvm_memfd_alloc(MEM_REGION_SIZE, false); + test_invalid_guest_memfd(vm, memfd, 0, "Regular memfd() should fail"); + close(memfd); + + vm2 = vm_create_barebones_protected_vm(); + memfd = vm_create_guest_memfd(vm2, MEM_REGION_SIZE, 0); + test_invalid_guest_memfd(vm, memfd, 0, "Other VM's guest_memfd() should fail"); + + vm_set_user_memory_region2(vm2, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); + close(memfd); + kvm_vm_free(vm2); + + memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0); + for (i = 1; i < PAGE_SIZE; i++) + test_invalid_guest_memfd(vm, memfd, i, "Unaligned offset should fail"); + + vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, 0, memfd, 0); + close(memfd); + + kvm_vm_free(vm); +} + +static void test_add_overlapping_private_memory_regions(void) +{ + struct kvm_vm *vm; + int memfd; + int r; + + pr_info("Testing ADD of overlapping KVM_MEM_GUEST_MEMFD memory regions\n"); + + vm = vm_create_barebones_protected_vm(); + + memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE * 4, 0); + + vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE * 2, 0, memfd, 0); + + vm_set_user_memory_region2(vm, MEM_REGION_SLOT + 1, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA * 2, MEM_REGION_SIZE * 2, + 0, memfd, MEM_REGION_SIZE * 2); + + /* + * Delete the first memslot, and then attempt to recreate it except + * with a "bad" offset that results in overlap in the guest_memfd(). + */ + vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, 0, NULL, -1, 0); + + /* Overlap the front half of the other slot. */ + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA * 2 - MEM_REGION_SIZE, + MEM_REGION_SIZE * 2, + 0, memfd, 0); + TEST_ASSERT(r == -1 && errno == EEXIST, "%s", + "Overlapping guest_memfd() bindings should fail with EEXIST"); + + /* And now the back half of the other slot. */ + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA * 2 + MEM_REGION_SIZE, + MEM_REGION_SIZE * 2, + 0, memfd, 0); + TEST_ASSERT(r == -1 && errno == EEXIST, "%s", + "Overlapping guest_memfd() bindings should fail with EEXIST"); + + close(memfd); + kvm_vm_free(vm); +} +#endif + int main(int argc, char *argv[]) { #ifdef __x86_64__ int i, loops; -#endif -#ifdef __x86_64__ /* * FIXME: the zero-memslot test fails on aarch64 and s390x because * KVM_RUN fails with ENOEXEC or EFAULT. @@ -402,6 +494,14 @@ int main(int argc, char *argv[]) test_add_max_memory_regions(); #ifdef __x86_64__ + if (kvm_has_cap(KVM_CAP_GUEST_MEMFD) && + (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM))) { + test_add_private_memory_region(); + test_add_overlapping_private_memory_regions(); + } else { + pr_info("Skipping tests for KVM_MEM_GUEST_MEMFD memory regions\n"); + } + if (argc > 1) loops = atoi_positive("Number of iterations", argv[1]); else -- Gitee From 07264563de2bdaaf56850ebae69355d7d85ed015 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:16 -0700 Subject: [PATCH 32/54] KVM: selftests: Add basic selftest for guest_memfd() mainline inclusion from mainline-v6.8-rc1 commit 8a89efd43423cb3005c5e641e846184e292c1465 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=8a89efd43423cb3005c5e641e846184e292c1465 ------------------------------------- Hygon-SIG: commit 8a89efd43423 KVM: selftests: Add basic selftest for guest_memfd() Backport the guest-memory feature. ------------------------------------- Add a selftest to verify the basic functionality of guest_memfd(): + file descriptor created with the guest_memfd() ioctl does not allow read/write/mmap operations + file size and block size as returned from fstat are as expected + fallocate on the fd checks that offset/length on fallocate(FALLOC_FL_PUNCH_HOLE) should be page aligned + invalid inputs (misaligned size, invalid flags) are rejected + file size and inode are unique (the innocuous-sounding anon_inode_getfile() backs all files with a single inode...) Signed-off-by: Chao Peng Co-developed-by: Ackerley Tng Signed-off-by: Ackerley Tng Co-developed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-35-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- tools/testing/selftests/kvm/Makefile | 1 + .../testing/selftests/kvm/guest_memfd_test.c | 207 ++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 tools/testing/selftests/kvm/guest_memfd_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index b709a52d5cdb..2b1ef809d73a 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -124,6 +124,7 @@ TEST_GEN_PROGS_x86_64 += access_tracking_perf_test TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test +TEST_GEN_PROGS_x86_64 += guest_memfd_test TEST_GEN_PROGS_x86_64 += guest_print_test TEST_GEN_PROGS_x86_64 += hardware_disable_test TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c new file mode 100644 index 000000000000..fd389663c49b --- /dev/null +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright Intel Corporation, 2023 + * + * Author: Chao Peng + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util_base.h" + +static void test_file_read_write(int fd) +{ + char buf[64]; + + TEST_ASSERT(read(fd, buf, sizeof(buf)) < 0, + "read on a guest_mem fd should fail"); + TEST_ASSERT(write(fd, buf, sizeof(buf)) < 0, + "write on a guest_mem fd should fail"); + TEST_ASSERT(pread(fd, buf, sizeof(buf), 0) < 0, + "pread on a guest_mem fd should fail"); + TEST_ASSERT(pwrite(fd, buf, sizeof(buf), 0) < 0, + "pwrite on a guest_mem fd should fail"); +} + +static void test_mmap(int fd, size_t page_size) +{ + char *mem; + + mem = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + TEST_ASSERT_EQ(mem, MAP_FAILED); +} + +static void test_file_size(int fd, size_t page_size, size_t total_size) +{ + struct stat sb; + int ret; + + ret = fstat(fd, &sb); + TEST_ASSERT(!ret, "fstat should succeed"); + TEST_ASSERT_EQ(sb.st_size, total_size); + TEST_ASSERT_EQ(sb.st_blksize, page_size); +} + +static void test_fallocate(int fd, size_t page_size, size_t total_size) +{ + int ret; + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, total_size); + TEST_ASSERT(!ret, "fallocate with aligned offset and size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + page_size - 1, page_size); + TEST_ASSERT(ret, "fallocate with unaligned offset should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, total_size, page_size); + TEST_ASSERT(ret, "fallocate beginning at total_size should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, total_size + page_size, page_size); + TEST_ASSERT(ret, "fallocate beginning after total_size should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + total_size, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) at total_size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + total_size + page_size, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) after total_size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + page_size, page_size - 1); + TEST_ASSERT(ret, "fallocate with unaligned size should fail"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + page_size, page_size); + TEST_ASSERT(!ret, "fallocate(PUNCH_HOLE) with aligned offset and size should succeed"); + + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE, page_size, page_size); + TEST_ASSERT(!ret, "fallocate to restore punched hole should succeed"); +} + +static void test_invalid_punch_hole(int fd, size_t page_size, size_t total_size) +{ + struct { + off_t offset; + off_t len; + } testcases[] = { + {0, 1}, + {0, page_size - 1}, + {0, page_size + 1}, + + {1, 1}, + {1, page_size - 1}, + {1, page_size}, + {1, page_size + 1}, + + {page_size, 1}, + {page_size, page_size - 1}, + {page_size, page_size + 1}, + }; + int ret, i; + + for (i = 0; i < ARRAY_SIZE(testcases); i++) { + ret = fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + testcases[i].offset, testcases[i].len); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "PUNCH_HOLE with !PAGE_SIZE offset (%lx) and/or length (%lx) should fail", + testcases[i].offset, testcases[i].len); + } +} + +static void test_create_guest_memfd_invalid(struct kvm_vm *vm) +{ + size_t page_size = getpagesize(); + uint64_t flag; + size_t size; + int fd; + + for (size = 1; size < page_size; size++) { + fd = __vm_create_guest_memfd(vm, size, 0); + TEST_ASSERT(fd == -1 && errno == EINVAL, + "guest_memfd() with non-page-aligned page size '0x%lx' should fail with EINVAL", + size); + } + + for (flag = 1; flag; flag <<= 1) { + uint64_t bit; + + fd = __vm_create_guest_memfd(vm, page_size, flag); + TEST_ASSERT(fd == -1 && errno == EINVAL, + "guest_memfd() with flag '0x%lx' should fail with EINVAL", + flag); + + for_each_set_bit(bit, &valid_flags, 64) { + fd = __vm_create_guest_memfd(vm, page_size, flag | BIT_ULL(bit)); + TEST_ASSERT(fd == -1 && errno == EINVAL, + "guest_memfd() with flags '0x%llx' should fail with EINVAL", + flag | BIT_ULL(bit)); + } + } +} + +static void test_create_guest_memfd_multiple(struct kvm_vm *vm) +{ + int fd1, fd2, ret; + struct stat st1, st2; + + fd1 = __vm_create_guest_memfd(vm, 4096, 0); + TEST_ASSERT(fd1 != -1, "memfd creation should succeed"); + + ret = fstat(fd1, &st1); + TEST_ASSERT(ret != -1, "memfd fstat should succeed"); + TEST_ASSERT(st1.st_size == 4096, "memfd st_size should match requested size"); + + fd2 = __vm_create_guest_memfd(vm, 8192, 0); + TEST_ASSERT(fd2 != -1, "memfd creation should succeed"); + + ret = fstat(fd2, &st2); + TEST_ASSERT(ret != -1, "memfd fstat should succeed"); + TEST_ASSERT(st2.st_size == 8192, "second memfd st_size should match requested size"); + + ret = fstat(fd1, &st1); + TEST_ASSERT(ret != -1, "memfd fstat should succeed"); + TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size"); + TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers"); +} + +int main(int argc, char *argv[]) +{ + size_t page_size; + size_t total_size; + int fd; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_GUEST_MEMFD)); + + page_size = getpagesize(); + total_size = page_size * 4; + + vm = vm_create_barebones(); + + test_create_guest_memfd_invalid(vm); + test_create_guest_memfd_multiple(vm); + + fd = vm_create_guest_memfd(vm, total_size, 0); + + test_file_read_write(fd); + test_mmap(fd, page_size); + test_file_size(fd, page_size, total_size); + test_fallocate(fd, page_size, total_size); + test_invalid_punch_hole(fd, page_size, total_size); + + close(fd); +} -- Gitee From ad037b70de71d3949ae651aac1f543744ac94797 Mon Sep 17 00:00:00 2001 From: Ackerley Tng Date: Fri, 27 Oct 2023 11:22:17 -0700 Subject: [PATCH 33/54] KVM: selftests: Test KVM exit behavior for private memory/access mainline inclusion from mainline-v6.8-rc1 commit e3577788de64139202d89224fe31613c0f02b790 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e3577788de64139202d89224fe31613c0f02b790 ------------------------------------- Hygon-SIG: commit e3577788de64 KVM: selftests: Test KVM exit behavior for private memory/access Backport the guest-memory feature. ------------------------------------- "Testing private access when memslot gets deleted" tests the behavior of KVM when a private memslot gets deleted while the VM is using the private memslot. When KVM looks up the deleted (slot = NULL) memslot, KVM should exit to userspace with KVM_EXIT_MEMORY_FAULT. In the second test, upon a private access to non-private memslot, KVM should also exit to userspace with KVM_EXIT_MEMORY_FAULT. Intentionally don't take a requirement on KVM_CAP_GUEST_MEMFD, KVM_CAP_MEMORY_FAULT_INFO, KVM_MEMORY_ATTRIBUTE_PRIVATE, etc., as it's a KVM bug to advertise KVM_X86_SW_PROTECTED_VM without its prerequisites. Signed-off-by: Ackerley Tng [sean: call out the similarities with set_memory_region_test] Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-36-seanjc@google.com> Reviewed-by: Fuad Tabba Tested-by: Fuad Tabba Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/private_mem_kvm_exits_test.c | 120 ++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 2b1ef809d73a..f7fdd8244547 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -82,6 +82,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/nested_exceptions_test TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/pmu_event_filter_test TEST_GEN_PROGS_x86_64 += x86_64/private_mem_conversions_test +TEST_GEN_PROGS_x86_64 += x86_64/private_mem_kvm_exits_test TEST_GEN_PROGS_x86_64 += x86_64/set_boot_cpu_id TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test TEST_GEN_PROGS_x86_64 += x86_64/smaller_maxphyaddr_emulation_test diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c new file mode 100644 index 000000000000..13e72fcec8dd --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023, Google LLC. + */ +#include +#include +#include + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +/* Arbitrarily selected to avoid overlaps with anything else */ +#define EXITS_TEST_GVA 0xc0000000 +#define EXITS_TEST_GPA EXITS_TEST_GVA +#define EXITS_TEST_NPAGES 1 +#define EXITS_TEST_SIZE (EXITS_TEST_NPAGES * PAGE_SIZE) +#define EXITS_TEST_SLOT 10 + +static uint64_t guest_repeatedly_read(void) +{ + volatile uint64_t value; + + while (true) + value = *((uint64_t *) EXITS_TEST_GVA); + + return value; +} + +static uint32_t run_vcpu_get_exit_reason(struct kvm_vcpu *vcpu) +{ + int r; + + r = _vcpu_run(vcpu); + if (r) { + TEST_ASSERT(errno == EFAULT, KVM_IOCTL_ERROR(KVM_RUN, r)); + TEST_ASSERT_EQ(vcpu->run->exit_reason, KVM_EXIT_MEMORY_FAULT); + } + return vcpu->run->exit_reason; +} + +const struct vm_shape protected_vm_shape = { + .mode = VM_MODE_DEFAULT, + .type = KVM_X86_SW_PROTECTED_VM, +}; + +static void test_private_access_memslot_deleted(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + pthread_t vm_thread; + void *thread_return; + uint32_t exit_reason; + + vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu, + guest_repeatedly_read); + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + EXITS_TEST_GPA, EXITS_TEST_SLOT, + EXITS_TEST_NPAGES, + KVM_MEM_GUEST_MEMFD); + + virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES); + + /* Request to access page privately */ + vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE); + + pthread_create(&vm_thread, NULL, + (void *(*)(void *))run_vcpu_get_exit_reason, + (void *)vcpu); + + vm_mem_region_delete(vm, EXITS_TEST_SLOT); + + pthread_join(vm_thread, &thread_return); + exit_reason = (uint32_t)(uint64_t)thread_return; + + TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); + TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); + TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); + TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); + + kvm_vm_free(vm); +} + +static void test_private_access_memslot_not_private(void) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + uint32_t exit_reason; + + vm = vm_create_shape_with_one_vcpu(protected_vm_shape, &vcpu, + guest_repeatedly_read); + + /* Add a non-private memslot (flags = 0) */ + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, + EXITS_TEST_GPA, EXITS_TEST_SLOT, + EXITS_TEST_NPAGES, 0); + + virt_map(vm, EXITS_TEST_GVA, EXITS_TEST_GPA, EXITS_TEST_NPAGES); + + /* Request to access page privately */ + vm_mem_set_private(vm, EXITS_TEST_GPA, EXITS_TEST_SIZE); + + exit_reason = run_vcpu_get_exit_reason(vcpu); + + TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); + TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); + TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); + TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + TEST_REQUIRE(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)); + + test_private_access_memslot_deleted(); + test_private_access_memslot_not_private(); +} -- Gitee From 99619c0c459dde6868aabf32307f4718c4655a96 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 30 Oct 2023 17:20:49 -0700 Subject: [PATCH 34/54] KVM: selftests: Add a memory region subtest to validate invalid flags mainline inclusion from mainline-v6.8-rc1 commit 5d74316466f4aabdd2ee1e33b45e4933c9bc3ea1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5d74316466f4aabdd2ee1e33b45e4933c9bc3ea1 ------------------------------------- Hygon-SIG: commit 5d74316466f4 KVM: selftests: Add a memory region subtest to validate invalid flags Backport the guest-memory feature. ------------------------------------- Add a subtest to set_memory_region_test to verify that KVM rejects invalid flags and combinations with -EINVAL. KVM might or might not fail with EINVAL anyways, but we can at least try. Signed-off-by: Sean Christopherson Message-Id: <20231031002049.3915752-1-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- .../testing/selftests/kvm/guest_memfd_test.c | 9 +--- .../selftests/kvm/set_memory_region_test.c | 49 +++++++++++++++++++ 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index fd389663c49b..318ba6ba8bd3 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -136,20 +136,13 @@ static void test_create_guest_memfd_invalid(struct kvm_vm *vm) size); } - for (flag = 1; flag; flag <<= 1) { + for (flag = 0; flag; flag <<= 1) { uint64_t bit; fd = __vm_create_guest_memfd(vm, page_size, flag); TEST_ASSERT(fd == -1 && errno == EINVAL, "guest_memfd() with flag '0x%lx' should fail with EINVAL", flag); - - for_each_set_bit(bit, &valid_flags, 64) { - fd = __vm_create_guest_memfd(vm, page_size, flag | BIT_ULL(bit)); - TEST_ASSERT(fd == -1 && errno == EINVAL, - "guest_memfd() with flags '0x%llx' should fail with EINVAL", - flag | BIT_ULL(bit)); - } } } diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index a78394faee7c..1efee1cfcff0 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -326,6 +326,53 @@ static void test_zero_memory_regions(void) } #endif /* __x86_64__ */ +static void test_invalid_memory_region_flags(void) +{ + uint32_t supported_flags = KVM_MEM_LOG_DIRTY_PAGES; + const uint32_t v2_only_flags = KVM_MEM_GUEST_MEMFD; + struct kvm_vm *vm; + int r, i; + +#ifdef __x86_64__ + supported_flags |= KVM_MEM_READONLY; + + if (kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(KVM_X86_SW_PROTECTED_VM)) + vm = vm_create_barebones_protected_vm(); + else +#endif + vm = vm_create_barebones(); + + if (kvm_check_cap(KVM_CAP_MEMORY_ATTRIBUTES) & KVM_MEMORY_ATTRIBUTE_PRIVATE) + supported_flags |= KVM_MEM_GUEST_MEMFD; + + for (i = 0; i < 32; i++) { + if ((supported_flags & BIT(i)) && !(v2_only_flags & BIT(i))) + continue; + + r = __vm_set_user_memory_region(vm, MEM_REGION_SLOT, BIT(i), + MEM_REGION_GPA, MEM_REGION_SIZE, NULL); + + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION should have failed on v2 only flag 0x%lx", BIT(i)); + + if (supported_flags & BIT(i)) + continue; + + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, BIT(i), + MEM_REGION_GPA, MEM_REGION_SIZE, NULL, 0, 0); + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION2 should have failed on unsupported flag 0x%lx", BIT(i)); + } + + if (supported_flags & KVM_MEM_GUEST_MEMFD) { + r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, + KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_GUEST_MEMFD, + MEM_REGION_GPA, MEM_REGION_SIZE, NULL, 0, 0); + TEST_ASSERT(r && errno == EINVAL, + "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported"); + } +} + /* * Test it can be added memory slots up to KVM_CAP_NR_MEMSLOTS, then any * tentative to add further slots should fail. @@ -491,6 +538,8 @@ int main(int argc, char *argv[]) test_zero_memory_regions(); #endif + test_invalid_memory_region_flags(); + test_add_max_memory_regions(); #ifdef __x86_64__ -- Gitee From e3dfa15aa9a1e895901ed97440e0b04214b22885 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 21 Nov 2023 11:24:08 -0500 Subject: [PATCH 35/54] selftests/kvm: fix compilation on non-x86_64 platforms mainline inclusion from mainline-v6.8-rc1 commit e9e60c82fe391d04db55a91c733df4a017c28b2f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e9e60c82fe391d04db55a91c733df4a017c28b2f ------------------------------------- Hygon-SIG: commit e9e60c82fe39 upstream selftests/kvm: fix compilation on non-x86_64 platforms Backport the guest-memory feature. ------------------------------------- MEM_REGION_SLOT and MEM_REGION_GPA are not really needed in test_invalid_memory_region_flags; the VM never runs and there are no other slots, so it is okay to use slot 0 and place it at address zero. This fixes compilation on architectures that do not define them. Fixes: 5d74316466f4 ("KVM: selftests: Add a memory region subtest to validate invalid flags") Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- tools/testing/selftests/kvm/guest_memfd_test.c | 2 -- tools/testing/selftests/kvm/set_memory_region_test.c | 12 ++++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 318ba6ba8bd3..c78a98c1a915 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -137,8 +137,6 @@ static void test_create_guest_memfd_invalid(struct kvm_vm *vm) } for (flag = 0; flag; flag <<= 1) { - uint64_t bit; - fd = __vm_create_guest_memfd(vm, page_size, flag); TEST_ASSERT(fd == -1 && errno == EINVAL, "guest_memfd() with flag '0x%lx' should fail with EINVAL", diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 1efee1cfcff0..6637a0845acf 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -349,8 +349,8 @@ static void test_invalid_memory_region_flags(void) if ((supported_flags & BIT(i)) && !(v2_only_flags & BIT(i))) continue; - r = __vm_set_user_memory_region(vm, MEM_REGION_SLOT, BIT(i), - MEM_REGION_GPA, MEM_REGION_SIZE, NULL); + r = __vm_set_user_memory_region(vm, 0, BIT(i), + 0, MEM_REGION_SIZE, NULL); TEST_ASSERT(r && errno == EINVAL, "KVM_SET_USER_MEMORY_REGION should have failed on v2 only flag 0x%lx", BIT(i)); @@ -358,16 +358,16 @@ static void test_invalid_memory_region_flags(void) if (supported_flags & BIT(i)) continue; - r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, BIT(i), - MEM_REGION_GPA, MEM_REGION_SIZE, NULL, 0, 0); + r = __vm_set_user_memory_region2(vm, 0, BIT(i), + 0, MEM_REGION_SIZE, NULL, 0, 0); TEST_ASSERT(r && errno == EINVAL, "KVM_SET_USER_MEMORY_REGION2 should have failed on unsupported flag 0x%lx", BIT(i)); } if (supported_flags & KVM_MEM_GUEST_MEMFD) { - r = __vm_set_user_memory_region2(vm, MEM_REGION_SLOT, + r = __vm_set_user_memory_region2(vm, 0, KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_GUEST_MEMFD, - MEM_REGION_GPA, MEM_REGION_SIZE, NULL, 0, 0); + 0, MEM_REGION_SIZE, NULL, 0, 0); TEST_ASSERT(r && errno == EINVAL, "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported"); } -- Gitee From c03cf7ebce3c3fb2d70c84f993f0aaf17ce58562 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 8 Dec 2023 13:46:48 -0500 Subject: [PATCH 36/54] KVM: guest-memfd: fix unused-function warning mainline inclusion from mainline-v6.8-rc1 commit 80583d0cfd8ff44d60a5fa76a6bf3c08eb67c328 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=80583d0cfd8ff44d60a5fa76a6bf3c08eb67c328 ------------------------------------- Hygon-SIG: commit 80583d0cfd8f upstream KVM: guest-memfd: fix unused-function warning Backport the guest-memory feature. ------------------------------------- With migration disabled, one function becomes unused: virt/kvm/guest_memfd.c:262:12: error: 'kvm_gmem_migrate_folio' defined but not used [-Werror=unused-function] 262 | static int kvm_gmem_migrate_folio(struct address_space *mapping, | ^~~~~~~~~~~~~~~~~~~~~~ Remove the #ifdef around the reference so that fallback_migrate_folio() is never used. The gmem implementation of the hook is trivial; since the gmem mapping is unmovable, the pages should not be migrated anyway. Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Reported-by: Arnd Bergmann Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- virt/kvm/guest_memfd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index e65f4170425c..7f9a146cf168 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -304,9 +304,7 @@ static int kvm_gmem_error_page(struct address_space *mapping, struct page *page) static const struct address_space_operations kvm_gmem_aops = { .dirty_folio = noop_dirty_folio, -#ifdef CONFIG_MIGRATION .migrate_folio = kvm_gmem_migrate_folio, -#endif .error_remove_page = kvm_gmem_error_page, }; -- Gitee From 87271db4738379d5e9bcf29da0354c3aa04514ec Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 11 Jun 2024 04:22:18 -0400 Subject: [PATCH 37/54] virt: guest_memfd: fix reference leak on hwpoisoned page mainline inclusion from mainline-v6.10-rc5 commit c31745d2c508796a0996c88bf2e55f552d513f65 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c31745d2c508796a0996c88bf2e55f552d513f65 ------------------------------------- Hygon-SIG: commit c31745d2c508 upstream virt: guest_memfd: fix reference leak on hwpoisoned page Backport the guest-memory feature. ------------------------------------- If kvm_gmem_get_pfn() detects an hwpoisoned page, it returns -EHWPOISON but it does not put back the reference that kvm_gmem_get_folio() had grabbed. Add the forgotten folio_put(). Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Cc: stable@vger.kernel.org Reviewed-by: Liam Merwick Reviewed-by: Isaku Yamahata Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- virt/kvm/guest_memfd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 7f9a146cf168..5047b0f2de5a 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -514,8 +514,10 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, } if (folio_test_hwpoison(folio)) { + folio_unlock(folio); + folio_put(folio); r = -EHWPOISON; - goto out_unlock; + goto out_fput; } page = folio_file_page(folio, index); @@ -526,7 +528,6 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, r = 0; -out_unlock: folio_unlock(folio); out_fput: fput(file); -- Gitee From 9f99af1ffde6333454e6a212daa1116e0187a21e Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 26 Jan 2024 16:22:07 +0800 Subject: [PATCH 38/54] LoongArch: KVM: Fix build due to API changes mainline inclusion from mainline-v6.8-rc2 commit 614f362918c782d1cfa4ee50f96072a95eac264e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=614f362918c782d1cfa4ee50f96072a95eac264e ------------------------------------- Hygon-SIG: commit 614f362918c7 upstream LoongArch: KVM: Fix build due to API changes Backport the guest-memory feature. ------------------------------------- Commit 8569992d64b8f750e34b7858eac ("KVM: Use gfn instead of hva for mmu_notifier_retry") replaces mmu_invalidate_retry_hva() usage with mmu_invalidate_retry_gfn() for X86, LoongArch also need similar changes to fix build. Signed-off-by: Huacai Chen [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- arch/loongarch/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index d54b2388dfee..7fa477a28ee1 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -698,7 +698,7 @@ static bool fault_supports_huge_mapping(struct kvm_memory_slot *memslot, * * There are several ways to safely use this helper: * - * - Check mmu_invalidate_retry_hva() after grabbing the mapping level, before + * - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before * consuming it. In this case, mmu_lock doesn't need to be held during the * lookup, but it does need to be held while checking the MMU notifier. * @@ -879,7 +879,7 @@ static int kvm_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) /* Check if an invalidation has taken place since we got pfn */ spin_lock(&kvm->mmu_lock); - if (mmu_invalidate_retry_hva(kvm, mmu_seq, hva)) { + if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) { /* * This can happen when mappings are changed asynchronously, but * also synchronously if a COW is triggered by -- Gitee From 5bebf87fc5c5ea9a5b661857d3085e8840562c63 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sat, 27 Jan 2024 14:52:23 -0300 Subject: [PATCH 39/54] tools headers UAPI: Sync kvm headers with the kernel sources mainline inclusion from mainline-v6.8-rc3 commit e30dca91e5667568a6be54886020c43f1f6f95d3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e30dca91e5667568a6be54886020c43f1f6f95d3 ------------------------------------- Hygon-SIG: commit e30dca91e566 upstream tools headers UAPI: Sync kvm headers with the kernel sources Backport the guest-memory feature. ------------------------------------- To pick the changes in: a5d3df8ae13fada7 ("KVM: remove deprecated UAPIs") 6d72283526090850 ("KVM x86/xen: add an override for PVCLOCK_TSC_STABLE_BIT") 89ea60c2c7b5838b ("KVM: x86: Add support for "protected VMs" that can utilize private memory") 8dd2eee9d526c30f ("KVM: x86/mmu: Handle page fault for private memory") a7800aa80ea4d535 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") 5a475554db1e476a ("KVM: Introduce per-page memory attributes") 16f95f3b95caded2 ("KVM: Add KVM_EXIT_MEMORY_FAULT exit to report faults to userspace") bb58b90b1a8f753b ("KVM: Introduce KVM_SET_USER_MEMORY_REGION2") 3f9cd0ca848413fd ("KVM: arm64: Allow userspace to get the writable masks for feature ID registers") That automatically adds support for some new ioctls and remove a bunch of deprecated ones. This ends up making the new binary to forget about the deprecated one, so when used in an older system it will not be able to resolve those codes to strings. $ tools/perf/trace/beauty/kvm_ioctl.sh > before $ cp include/uapi/linux/kvm.h tools/include/uapi/linux/kvm.h $ tools/perf/trace/beauty/kvm_ioctl.sh > after $ diff -u before after --- before 2024-01-27 14:48:16.523014020 -0300 +++ after 2024-01-27 14:48:24.183932866 -0300 @@ -14,6 +14,7 @@ [0x46] = "SET_USER_MEMORY_REGION", [0x47] = "SET_TSS_ADDR", [0x48] = "SET_IDENTITY_MAP_ADDR", + [0x49] = "SET_USER_MEMORY_REGION2", [0x60] = "CREATE_IRQCHIP", [0x61] = "IRQ_LINE", [0x62] = "GET_IRQCHIP", @@ -22,14 +23,8 @@ [0x65] = "GET_PIT", [0x66] = "SET_PIT", [0x67] = "IRQ_LINE_STATUS", - [0x69] = "ASSIGN_PCI_DEVICE", [0x6a] = "SET_GSI_ROUTING", - [0x70] = "ASSIGN_DEV_IRQ", [0x71] = "REINJECT_CONTROL", - [0x72] = "DEASSIGN_PCI_DEVICE", - [0x73] = "ASSIGN_SET_MSIX_NR", - [0x74] = "ASSIGN_SET_MSIX_ENTRY", - [0x75] = "DEASSIGN_DEV_IRQ", [0x76] = "IRQFD", [0x77] = "CREATE_PIT2", [0x78] = "SET_BOOT_CPU_ID", @@ -66,7 +61,6 @@ [0x9f] = "GET_VCPU_EVENTS", [0xa0] = "SET_VCPU_EVENTS", [0xa3] = "ENABLE_CAP", - [0xa4] = "ASSIGN_SET_INTX_MASK", [0xa5] = "SIGNAL_MSI", [0xa6] = "GET_XCRS", [0xa7] = "SET_XCRS", @@ -97,6 +91,8 @@ [0xcd] = "SET_SREGS2", [0xce] = "GET_STATS_FD", [0xd0] = "XEN_HVM_EVTCHN_SEND", + [0xd2] = "SET_MEMORY_ATTRIBUTES", + [0xd4] = "CREATE_GUEST_MEMFD", [0xe0] = "CREATE_DEVICE", [0xe1] = "SET_DEVICE_ATTR", [0xe2] = "GET_DEVICE_ATTR", $ This silences these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h diff -u tools/arch/x86/include/uapi/asm/kvm.h arch/x86/include/uapi/asm/kvm.h Cc: Adrian Hunter Cc: Chao Peng Cc: Ian Rogers Cc: Jing Zhang Cc: Jiri Olsa Cc: Namhyung Kim Cc: Oliver Upton Cc: Paolo Bonzini Cc: Paul Durrant Cc: Sean Christopherson Link: https://lore.kernel.org/lkml/ZbVLbkngp4oq13qN@kernel.org Signed-off-by: Arnaldo Carvalho de Melo [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Li Zhiquan --- tools/arch/x86/include/uapi/asm/kvm.h | 3 + tools/include/uapi/linux/kvm.h | 140 +++++++++----------------- 2 files changed, 53 insertions(+), 90 deletions(-) diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h index 1a6a1f987949..a448d0964fc0 100644 --- a/tools/arch/x86/include/uapi/asm/kvm.h +++ b/tools/arch/x86/include/uapi/asm/kvm.h @@ -562,4 +562,7 @@ struct kvm_pmu_event_filter { /* x86-specific KVM_EXIT_HYPERCALL flags. */ #define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) +#define KVM_X86_DEFAULT_VM 0 +#define KVM_X86_SW_PROTECTED_VM 1 + #endif /* _ASM_X86_KVM_H */ diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index bd1a496b5448..be935dde7da0 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -16,76 +16,6 @@ #define KVM_API_VERSION 12 -/* *** Deprecated interfaces *** */ - -#define KVM_TRC_SHIFT 16 - -#define KVM_TRC_ENTRYEXIT (1 << KVM_TRC_SHIFT) -#define KVM_TRC_HANDLER (1 << (KVM_TRC_SHIFT + 1)) - -#define KVM_TRC_VMENTRY (KVM_TRC_ENTRYEXIT + 0x01) -#define KVM_TRC_VMEXIT (KVM_TRC_ENTRYEXIT + 0x02) -#define KVM_TRC_PAGE_FAULT (KVM_TRC_HANDLER + 0x01) - -#define KVM_TRC_HEAD_SIZE 12 -#define KVM_TRC_CYCLE_SIZE 8 -#define KVM_TRC_EXTRA_MAX 7 - -#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) -#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) -#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) -#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05) -#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06) -#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07) -#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08) -#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09) -#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A) -#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B) -#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C) -#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D) -#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E) -#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F) -#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10) -#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11) -#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12) -#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13) -#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14) -#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15) -#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16) -#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17) -#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18) -#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19) - -struct kvm_user_trace_setup { - __u32 buf_size; - __u32 buf_nr; -}; - -#define __KVM_DEPRECATED_MAIN_W_0x06 \ - _IOW(KVMIO, 0x06, struct kvm_user_trace_setup) -#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07) -#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08) - -#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq) - -struct kvm_breakpoint { - __u32 enabled; - __u32 padding; - __u64 address; -}; - -struct kvm_debug_guest { - __u32 enabled; - __u32 pad; - struct kvm_breakpoint breakpoints[4]; - __u32 singlestep; -}; - -#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest) - -/* *** End of deprecated interfaces *** */ - - /* for KVM_SET_USER_MEMORY_REGION */ struct kvm_userspace_memory_region { __u32 slot; @@ -95,6 +25,19 @@ struct kvm_userspace_memory_region { __u64 userspace_addr; /* start of the userspace allocated memory */ }; +/* for KVM_SET_USER_MEMORY_REGION2 */ +struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 userspace_addr; + __u64 guest_memfd_offset; + __u32 guest_memfd; + __u32 pad1; + __u64 pad2[14]; +}; + /* * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for * userspace, other bits are reserved for kvm internal use which are defined @@ -102,6 +45,7 @@ struct kvm_userspace_memory_region { */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) +#define KVM_MEM_GUEST_MEMFD (1UL << 2) /* for KVM_IRQ_LINE */ struct kvm_irq_level { @@ -264,6 +208,7 @@ struct kvm_xen_exit { #define KVM_EXIT_RISCV_SBI 35 #define KVM_EXIT_RISCV_CSR 36 #define KVM_EXIT_NOTIFY 37 +#define KVM_EXIT_MEMORY_FAULT 39 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -510,6 +455,13 @@ struct kvm_run { #define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) __u32 flags; } notify; + /* KVM_EXIT_MEMORY_FAULT */ + struct { +#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) + __u64 flags; + __u64 gpa; + __u64 size; + } memory_fault; /* Fix the size of the union. */ char padding[256]; }; @@ -937,9 +889,6 @@ struct kvm_ppc_resize_hpt { */ #define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ #define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2) -#define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06 -#define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07 -#define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08 #define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2) #define KVM_GET_MSR_FEATURE_INDEX_LIST _IOWR(KVMIO, 0x0a, struct kvm_msr_list) @@ -1192,6 +1141,11 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_COUNTER_OFFSET 227 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 +#define KVM_CAP_USER_MEMORY2 231 +#define KVM_CAP_MEMORY_FAULT_INFO 232 +#define KVM_CAP_MEMORY_ATTRIBUTES 233 +#define KVM_CAP_GUEST_MEMFD 234 +#define KVM_CAP_VM_TYPES 235 #define KVM_CAP_ARM_HW_DIRTY_STATE_TRACK 502 @@ -1284,6 +1238,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) +#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) struct kvm_xen_hvm_config { __u32 flags; @@ -1474,6 +1429,8 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \ + struct kvm_userspace_memory_region2) /* enable ucontrol for s390 */ struct kvm_s390_ucas_mapping { @@ -1498,20 +1455,8 @@ struct kvm_s390_ucas_mapping { _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) #define KVM_UNREGISTER_COALESCED_MMIO \ _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) -#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ - struct kvm_assigned_pci_dev) #define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) -/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */ -#define KVM_ASSIGN_IRQ __KVM_DEPRECATED_VM_R_0x70 -#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq) #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) -#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ - struct kvm_assigned_pci_dev) -#define KVM_ASSIGN_SET_MSIX_NR _IOW(KVMIO, 0x73, \ - struct kvm_assigned_msix_nr) -#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO, 0x74, \ - struct kvm_assigned_msix_entry) -#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) #define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) #define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) #define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) @@ -1528,9 +1473,6 @@ struct kvm_s390_ucas_mapping { * KVM_CAP_VM_TSC_CONTROL to set defaults for a VM */ #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) -/* Available with KVM_CAP_PCI_2_3 */ -#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \ - struct kvm_assigned_pci_dev) /* Available with KVM_CAP_SIGNAL_MSI */ #define KVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi) /* Available with KVM_CAP_PPC_GET_SMMU_INFO */ @@ -1582,8 +1524,6 @@ struct kvm_s390_ucas_mapping { #define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) #define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) #define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) -/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */ -#define KVM_DEBUG_GUEST __KVM_DEPRECATED_VCPU_W_0x87 #define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) #define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) #define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) @@ -2258,4 +2198,24 @@ struct kvm_s390_zpci_op { /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) +/* Available with KVM_CAP_MEMORY_ATTRIBUTES */ +#define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes) + +struct kvm_memory_attributes { + __u64 address; + __u64 size; + __u64 attributes; + __u64 flags; +}; + +#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) + +#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) + +struct kvm_create_guest_memfd { + __u64 size; + __u64 flags; + __u64 reserved[6]; +}; + #endif /* __LINUX_KVM_H */ -- Gitee From 1a86b8cf5859619c4879a7318fa712703189a8f8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Feb 2024 11:06:08 -0800 Subject: [PATCH 40/54] KVM: Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY mainline inclusion from mainline-v6.9-rc1 commit e563592224e02f87048edee3ce3f0da16cceee88 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e563592224e02f87048edee3ce3f0da16cceee88 ------------------------------------- Hygon-SIG: commit e563592224e0 upstream KVM: Make KVM_MEM_GUEST_MEMFD mutually exclusive with KVM_MEM_READONLY Backport the guest-memory feature. ------------------------------------- Disallow creating read-only memslots that support GUEST_MEMFD, as GUEST_MEMFD is fundamentally incompatible with KVM's semantics for read-only memslots. Read-only memslots allow the userspace VMM to emulate option ROMs by filling the backing memory with readable, executable code and data, while triggering emulated MMIO on writes. GUEST_MEMFD doesn't currently support writes from userspace and KVM doesn't support emulated MMIO on private accesses, i.e. the guest can only ever read zeros, and writes will always be treated as errors. Cc: Fuad Tabba Cc: Michael Roth Cc: Isaku Yamahata Cc: Yu Zhang Cc: Chao Peng Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Link: https://lore.kernel.org/r/20240222190612.2942589-2-seanjc@google.com Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- virt/kvm/kvm_main.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e21d545df75b..c9b9abe77e44 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1654,7 +1654,13 @@ static int check_memory_region_flags(struct kvm *kvm, valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES; #ifdef __KVM_HAVE_READONLY_MEM - valid_flags |= KVM_MEM_READONLY; + /* + * GUEST_MEMFD is incompatible with read-only memslots, as writes to + * read-only memslots have emulated MMIO, not page fault, semantics, + * and KVM doesn't allow emulated MMIO for private memory. + */ + if (!(mem->flags & KVM_MEM_GUEST_MEMFD)) + valid_flags |= KVM_MEM_READONLY; #endif valid_flags |= KVM_MEM_HUGE_POD; -- Gitee From 90217bfc3a40566f065eb61bac980d53d535cf58 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 22 Feb 2024 11:06:11 -0800 Subject: [PATCH 41/54] KVM: selftests: Create GUEST_MEMFD for relevant invalid flags testcases mainline inclusion from mainline-v6.9-rc1 commit 63e5c5a10559077bb5f32edf783084e7164af9c3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=63e5c5a10559077bb5f32edf783084e7164af9c3 ------------------------------------- Hygon-SIG: commit 63e5c5a10559 upstream KVM: selftests: Create GUEST_MEMFD for relevant invalid flags testcases Backport the guest-memory feature. ------------------------------------- Actually create a GUEST_MEMFD instance and pass it to KVM when doing negative tests for KVM_SET_USER_MEMORY_REGION2 + KVM_MEM_GUEST_MEMFD. Without a valid GUEST_MEMFD file descriptor, KVM_SET_USER_MEMORY_REGION2 will always fail with -EINVAL, resulting in false passes for any and all tests of illegal combinations of KVM_MEM_GUEST_MEMFD and other flags. Fixes: 5d74316466f4 ("KVM: selftests: Add a memory region subtest to validate invalid flags") Link: https://lore.kernel.org/r/20240222190612.2942589-5-seanjc@google.com Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- tools/testing/selftests/kvm/set_memory_region_test.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c index 6637a0845acf..0c7b4d2754da 100644 --- a/tools/testing/selftests/kvm/set_memory_region_test.c +++ b/tools/testing/selftests/kvm/set_memory_region_test.c @@ -365,11 +365,15 @@ static void test_invalid_memory_region_flags(void) } if (supported_flags & KVM_MEM_GUEST_MEMFD) { + int guest_memfd = vm_create_guest_memfd(vm, MEM_REGION_SIZE, 0); + r = __vm_set_user_memory_region2(vm, 0, KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_GUEST_MEMFD, - 0, MEM_REGION_SIZE, NULL, 0, 0); + 0, MEM_REGION_SIZE, NULL, guest_memfd, 0); TEST_ASSERT(r && errno == EINVAL, "KVM_SET_USER_MEMORY_REGION2 should have failed, dirty logging private memory is unsupported"); + + close(guest_memfd); } } -- Gitee From 2b87f84a2b1213e7cb51569c2917a8cece3252e9 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 26 Feb 2024 17:57:16 -0800 Subject: [PATCH 42/54] KVM: selftests: Explicitly close guest_memfd files in some gmem tests mainline inclusion from mainline-v6.9-rc1 commit e9da6f08edb0bd4c621165496778d77a222e1174 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e9da6f08edb0bd4c621165496778d77a222e1174 ------------------------------------- Hygon-SIG: commit e9da6f08edb0 upstream KVM: selftests: Explicitly close guest_memfd files in some gmem tests Backport the guest-memory feature. ------------------------------------- Explicitly close() guest_memfd files in various guest_memfd and private_mem_conversions tests, there's no reason to keep the files open until the test exits. Fixes: 8a89efd43423 ("KVM: selftests: Add basic selftest for guest_memfd()") Fixes: 43f623f350ce ("KVM: selftests: Add x86-only selftest for private memory conversions") Signed-off-by: Dongli Zhang Link: https://lore.kernel.org/r/20240227015716.27284-1-dongli.zhang@oracle.com [sean: massage changelog] Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- tools/testing/selftests/kvm/guest_memfd_test.c | 3 +++ .../selftests/kvm/x86_64/private_mem_conversions_test.c | 2 ++ 2 files changed, 5 insertions(+) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index c78a98c1a915..92eae206baa6 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -167,6 +167,9 @@ static void test_create_guest_memfd_multiple(struct kvm_vm *vm) TEST_ASSERT(ret != -1, "memfd fstat should succeed"); TEST_ASSERT(st1.st_size == 4096, "first memfd st_size should still match requested size"); TEST_ASSERT(st1.st_ino != st2.st_ino, "different memfd should have different inode numbers"); + + close(fd2); + close(fd1); } int main(int argc, char *argv[]) diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c index 4d6a37a5d896..39c994bff1a7 100644 --- a/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c +++ b/tools/testing/selftests/kvm/x86_64/private_mem_conversions_test.c @@ -434,6 +434,8 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + + close(memfd); } static void usage(const char *cmd) -- Gitee From 4acfb20abaa53bbb5d275859761a6a0dc0695d00 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 21 Feb 2024 17:26:40 -0800 Subject: [PATCH 43/54] KVM: x86/mmu: Retry fault before acquiring mmu_lock if mapping is changing mainline inclusion from mainline-v6.9-rc1 commit d02c357e5bfa7dfd618b7b3015624beb71f58f1f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d02c357e5bfa7dfd618b7b3015624beb71f58f1f ------------------------------------- Hygon-SIG: commit d02c357e5bfa upstream KVM: x86/mmu: Retry fault before acquiring mmu_lock if mapping is changing Backport the guest-memory feature. ------------------------------------- Retry page faults without acquiring mmu_lock, and without even faulting the page into the primary MMU, if the resolved gfn is covered by an active invalidation. Contending for mmu_lock is especially problematic on preemptible kernels as the mmu_notifier invalidation task will yield mmu_lock (see rwlock_needbreak()), delay the in-progress invalidation, and ultimately increase the latency of resolving the page fault. And in the worst case scenario, yielding will be accompanied by a remote TLB flush, e.g. if the invalidation covers a large range of memory and vCPUs are accessing addresses that were already zapped. Faulting the page into the primary MMU is similarly problematic, as doing so may acquire locks that need to be taken for the invalidation to complete (the primary MMU has finer grained locks than KVM's MMU), and/or may cause unnecessary churn (getting/putting pages, marking them accessed, etc). Alternatively, the yielding issue could be mitigated by teaching KVM's MMU iterators to perform more work before yielding, but that wouldn't solve the lock contention and would negatively affect scenarios where a vCPU is trying to fault in an address that is NOT covered by the in-progress invalidation. Add a dedicated lockess version of the range-based retry check to avoid false positives on the sanity check on start+end WARN, and so that it's super obvious that checking for a racing invalidation without holding mmu_lock is unsafe (though obviously useful). Wrap mmu_invalidate_in_progress in READ_ONCE() to ensure that pre-checking invalidation in a loop won't put KVM into an infinite loop, e.g. due to caching the in-progress flag and never seeing it go to '0'. Force a load of mmu_invalidate_seq as well, even though it isn't strictly necessary to avoid an infinite loop, as doing so improves the probability that KVM will detect an invalidation that already completed before acquiring mmu_lock and bailing anyways. Do the pre-check even for non-preemptible kernels, as waiting to detect the invalidation until mmu_lock is held guarantees the vCPU will observe the worst case latency in terms of handling the fault, and can generate even more mmu_lock contention. E.g. the vCPU will acquire mmu_lock, detect retry, drop mmu_lock, re-enter the guest, retake the fault, and eventually re-acquire mmu_lock. This behavior is also why there are no new starvation issues due to losing the fairness guarantees provided by rwlocks: if the vCPU needs to retry, it _must_ drop mmu_lock, i.e. waiting on mmu_lock doesn't guarantee forward progress in the face of _another_ mmu_notifier invalidation event. Note, adding READ_ONCE() isn't entirely free, e.g. on x86, the READ_ONCE() may generate a load into a register instead of doing a direct comparison (MOV+TEST+Jcc instead of CMP+Jcc), but practically speaking the added cost is a few bytes of code and maaaaybe a cycle or three. Reported-by: Yan Zhao Closes: https://lore.kernel.org/all/ZNnPF4W26ZbAyGto@yzhao56-desk.sh.intel.com Reported-by: Friedrich Weber Cc: Kai Huang Cc: Yan Zhao Cc: Yuan Yao Cc: Xu Yilun Acked-by: Kai Huang Reviewed-by: Yan Zhao Link: https://lore.kernel.org/r/20240222012640.2820927-1-seanjc@google.com Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- arch/x86/kvm/mmu/mmu.c | 42 ++++++++++++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 26 +++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index a9402215b0a9..97af09560be2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4401,6 +4401,31 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); + /* + * Check for a relevant mmu_notifier invalidation event before getting + * the pfn from the primary MMU, and before acquiring mmu_lock. + * + * For mmu_lock, if there is an in-progress invalidation and the kernel + * allows preemption, the invalidation task may drop mmu_lock and yield + * in response to mmu_lock being contended, which is *very* counter- + * productive as this vCPU can't actually make forward progress until + * the invalidation completes. + * + * Retrying now can also avoid unnessary lock contention in the primary + * MMU, as the primary MMU doesn't necessarily hold a single lock for + * the duration of the invalidation, i.e. faulting in a conflicting pfn + * can cause the invalidation to take longer by holding locks that are + * needed to complete the invalidation. + * + * Do the pre-check even for non-preemtible kernels, i.e. even if KVM + * will never yield mmu_lock in response to contention, as this vCPU is + * *guaranteed* to need to retry, i.e. waiting until mmu_lock is held + * to detect retry guarantees the worst case latency for the vCPU. + */ + if (fault->slot && + mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) + return RET_PF_RETRY; + ret = __kvm_faultin_pfn(vcpu, fault); if (ret != RET_PF_CONTINUE) return ret; @@ -4411,6 +4436,18 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (unlikely(!fault->slot)) return kvm_handle_noslot_fault(vcpu, fault, access); + /* + * Check again for a relevant mmu_notifier invalidation event purely to + * avoid contending mmu_lock. Most invalidations will be detected by + * the previous check, but checking is extremely cheap relative to the + * overall cost of failing to detect the invalidation until after + * mmu_lock is acquired. + */ + if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) { + kvm_release_pfn_clean(fault->pfn); + return RET_PF_RETRY; + } + return RET_PF_CONTINUE; } @@ -4438,6 +4475,11 @@ static bool is_page_fault_stale(struct kvm_vcpu *vcpu, if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) return true; + /* + * Check for a relevant mmu_notifier invalidation event one last time + * now that mmu_lock is held, as the "unsafe" checks performed without + * holding mmu_lock can get false negatives. + */ return fault->slot && mmu_invalidate_retry_gfn(vcpu->kvm, fault->mmu_seq, fault->gfn); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2565a7951bcd..a68424463aae 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2179,6 +2179,32 @@ static inline int mmu_invalidate_retry_gfn(struct kvm *kvm, return 1; return 0; } + +/* + * This lockless version of the range-based retry check *must* be paired with a + * call to the locked version after acquiring mmu_lock, i.e. this is safe to + * use only as a pre-check to avoid contending mmu_lock. This version *will* + * get false negatives and false positives. + */ +static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm, + unsigned long mmu_seq, + gfn_t gfn) +{ + /* + * Use READ_ONCE() to ensure the in-progress flag and sequence counter + * are always read from memory, e.g. so that checking for retry in a + * loop won't result in an infinite retry loop. Don't force loads for + * start+end, as the key to avoiding infinite retry loops is observing + * the 1=>0 transition of in-progress, i.e. getting false negatives + * due to stale start+end values is acceptable. + */ + if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress)) && + gfn >= kvm->mmu_invalidate_range_start && + gfn < kvm->mmu_invalidate_range_end) + return true; + + return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq; +} #endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING -- Gitee From 598bae90892d67bea6d322edd7a35d443f1deaa4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 27 Feb 2024 18:41:40 -0800 Subject: [PATCH 44/54] KVM: x86/mmu: Move private vs. shared check above slot validity checks mainline inclusion from mainline-v6.10-rc1 commit 44f42ef37deb49682abf0108bf9ede88d4478a20 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=44f42ef37deb49682abf0108bf9ede88d4478a20 ------------------------------------- Hygon-SIG: commit 44f42ef37deb upstream KVM: x86/mmu: Move private vs. shared check above slot validity checks Backport the guest-memory feature. ------------------------------------- Prioritize private vs. shared gfn attribute checks above slot validity checks to ensure a consistent userspace ABI. E.g. as is, KVM will exit to userspace if there is no memslot, but emulate accesses to the APIC access page even if the attributes mismatch. Fixes: 8dd2eee9d526 ("KVM: x86/mmu: Handle page fault for private memory") Cc: Yu Zhang Cc: Chao Peng Cc: Fuad Tabba Cc: Michael Roth Cc: Isaku Yamahata Signed-off-by: Sean Christopherson Reviewed-by: Kai Huang Message-ID: <20240228024147.41573-10-seanjc@google.com> Signed-off-by: Paolo Bonzini [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- arch/x86/kvm/mmu/mmu.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 97af09560be2..88ee6bb1f5a1 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4356,11 +4356,6 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return RET_PF_EMULATE; } - if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { - kvm_mmu_prepare_memory_fault_exit(vcpu, fault); - return -EFAULT; - } - if (fault->is_private) return kvm_faultin_pfn_private(vcpu, fault); @@ -4398,9 +4393,24 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, { int ret; + /* + * Note that the mmu_invalidate_seq also serves to detect a concurrent + * change in attributes. is_page_fault_stale() will detect an + * invalidation relate to fault->fn and resume the guest without + * installing a mapping in the page tables. + */ fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq; smp_rmb(); + /* + * Now that we have a snapshot of mmu_invalidate_seq we can check for a + * private vs. shared mismatch. + */ + if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn)) { + kvm_mmu_prepare_memory_fault_exit(vcpu, fault); + return -EFAULT; + } + /* * Check for a relevant mmu_notifier invalidation event before getting * the pfn from the primary MMU, and before acquiring mmu_lock. -- Gitee From 27390df99f2503f1b2eb919ab73cd9027453fc0f Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 14 Mar 2024 14:29:02 -0700 Subject: [PATCH 45/54] KVM: x86/mmu: x86: Don't overflow lpage_info when checking attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.9-rc5 commit 992b54bd083c5bee24ff7cc35991388ab08598c4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=992b54bd083c5bee24ff7cc35991388ab08598c4 ------------------------------------- Hygon-SIG: commit 992b54bd083c upstream KVM: x86/mmu: x86: Don't overflow lpage_info when checking attributes Backport the guest-memory feature. ------------------------------------- Fix KVM_SET_MEMORY_ATTRIBUTES to not overflow lpage_info array and trigger KASAN splat, as seen in the private_mem_conversions_test selftest. When memory attributes are set on a GFN range, that range will have specific properties applied to the TDP. A huge page cannot be used when the attributes are inconsistent, so they are disabled for those the specific huge pages. For internal KVM reasons, huge pages are also not allowed to span adjacent memslots regardless of whether the backing memory could be mapped as huge. What GFNs support which huge page sizes is tracked by an array of arrays 'lpage_info' on the memslot, of ‘kvm_lpage_info’ structs. Each index of lpage_info contains a vmalloc allocated array of these for a specific supported page size. The kvm_lpage_info denotes whether a specific huge page (GFN and page size) on the memslot is supported. These arrays include indices for unaligned head and tail huge pages. Preventing huge pages from spanning adjacent memslot is covered by incrementing the count in head and tail kvm_lpage_info when the memslot is allocated, but disallowing huge pages for memory that has mixed attributes has to be done in a more complicated way. During the KVM_SET_MEMORY_ATTRIBUTES ioctl KVM updates lpage_info for each memslot in the range that has mismatched attributes. KVM does this a memslot at a time, and marks a special bit, KVM_LPAGE_MIXED_FLAG, in the kvm_lpage_info for any huge page. This bit is essentially a permanently elevated count. So huge pages will not be mapped for the GFN at that page size if the count is elevated in either case: a huge head or tail page unaligned to the memslot or if KVM_LPAGE_MIXED_FLAG is set because it has mixed attributes. To determine whether a huge page has consistent attributes, the KVM_SET_MEMORY_ATTRIBUTES operation checks an xarray to make sure it consistently has the incoming attribute. Since level - 1 huge pages are aligned to level huge pages, it employs an optimization. As long as the level - 1 huge pages are checked first, it can just check these and assume that if each level - 1 huge page contained within the level sized huge page is not mixed, then the level size huge page is not mixed. This optimization happens in the helper hugepage_has_attrs(). Unfortunately, although the kvm_lpage_info array representing page size 'level' will contain an entry for an unaligned tail page of size level, the array for level - 1 will not contain an entry for each GFN at page size level. The level - 1 array will only contain an index for any unaligned region covered by level - 1 huge page size, which can be a smaller region. So this causes the optimization to overflow the level - 1 kvm_lpage_info and perform a vmalloc out of bounds read. In some cases of head and tail pages where an overflow could happen, callers skip the operation completely as KVM_LPAGE_MIXED_FLAG is not required to prevent huge pages as discussed earlier. But for memslots that are smaller than the 1GB page size, it does call hugepage_has_attrs(). In this case the huge page is both the head and tail page. The issue can be observed simply by compiling the kernel with CONFIG_KASAN_VMALLOC and running the selftest “private_mem_conversions_test”, which produces the output like the following: BUG: KASAN: vmalloc-out-of-bounds in hugepage_has_attrs+0x7e/0x110 Read of size 4 at addr ffffc900000a3008 by task private_mem_con/169 Call Trace: dump_stack_lvl print_report ? __virt_addr_valid ? hugepage_has_attrs ? hugepage_has_attrs kasan_report ? hugepage_has_attrs hugepage_has_attrs kvm_arch_post_set_memory_attributes kvm_vm_ioctl It is a little ambiguous whether the unaligned head page (in the bug case also the tail page) should be expected to have KVM_LPAGE_MIXED_FLAG set. It is not functionally required, as the unaligned head/tail pages will already have their kvm_lpage_info count incremented. The comments imply not setting it on unaligned head pages is intentional, so fix the callers to skip trying to set KVM_LPAGE_MIXED_FLAG in this case, and in doing so not call hugepage_has_attrs(). Cc: stable@vger.kernel.org Fixes: 90b4fe17981e ("KVM: x86: Disallow hugepages when memory attributes are mixed") Signed-off-by: Rick Edgecombe Reviewed-by: Kai Huang Reviewed-by: Chao Peng Link: https://lore.kernel.org/r/20240314212902.2762507-1-rick.p.edgecombe@intel.com Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 88ee6bb1f5a1..87762e32dc5d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7398,7 +7398,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes. */ - if (gfn >= slot->base_gfn) { + if (gfn >= slot->base_gfn && + gfn + nr_pages <= slot->base_gfn + slot->npages) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else -- Gitee From 583bf8d1bc84936ea1f6dce291beb15f08ee5e88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20L=C3=B3pez?= Date: Wed, 24 Apr 2024 12:33:16 +0200 Subject: [PATCH 46/54] KVM: fix documentation for KVM_CREATE_GUEST_MEMFD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.10-rc1 commit 2098acaf24455698c149b27f0347eb4ddc6d2058 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2098acaf24455698c149b27f0347eb4ddc6d2058 ------------------------------------- Hygon-SIG: commit 2098acaf2445 upstream KVM: fix documentation for KVM_CREATE_GUEST_MEMFD Backport the guest-memory feature. ------------------------------------- The KVM_CREATE_GUEST_MEMFD ioctl returns a file descriptor, and is documented as such in the description. However, the "Returns" field in the documentation states that the ioctl returns 0 on success. Update this to match the description. Signed-off-by: Carlos López Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Link: https://lore.kernel.org/r/20240424103317.28522-1-clopez@suse.de Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- Documentation/virt/kvm/api.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 923e61011def..bd1cdd0e1890 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6307,7 +6307,7 @@ The "flags" field is reserved for future extensions and must be '0'. :Architectures: none :Type: vm ioctl :Parameters: struct kvm_create_guest_memfd(in) -:Returns: 0 on success, <0 on error +:Returns: A file descriptor on success, <0 on error KVM_CREATE_GUEST_MEMFD creates an anonymous file and returns a file descriptor that refers to it. guest_memfd files are roughly analogous to files created -- Gitee From 2fcc25093e52d6aa3489539bac3a260686f44d26 Mon Sep 17 00:00:00 2001 From: Liam Merwick Date: Mon, 9 Jun 2025 09:11:19 +0000 Subject: [PATCH 47/54] KVM: Allow CPU to reschedule while setting per-page memory attributes mainline inclusion from mainline-v6.16-rc6 commit 47bb584237cc285e3a860b70c01f7bda9dcfb05b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=47bb584237cc285e3a860b70c01f7bda9dcfb05b ------------------------------------- Hygon-SIG: commit 47bb584237cc upstream KVM: Allow CPU to reschedule while setting per-page memory attributes Backport the guest-memory feature. ------------------------------------- When running an SEV-SNP guest with a sufficiently large amount of memory (1TB+), the host can experience CPU soft lockups when running an operation in kvm_vm_set_mem_attributes() to set memory attributes on the whole range of guest memory. watchdog: BUG: soft lockup - CPU#8 stuck for 26s! [qemu-kvm:6372] CPU: 8 UID: 0 PID: 6372 Comm: qemu-kvm Kdump: loaded Not tainted 6.15.0-rc7.20250520.el9uek.rc1.x86_64 #1 PREEMPT(voluntary) Hardware name: Oracle Corporation ORACLE SERVER E4-2c/Asm,MB Tray,2U,E4-2c, BIOS 78016600 11/13/2024 RIP: 0010:xas_create+0x78/0x1f0 Code: 00 00 00 41 80 fc 01 0f 84 82 00 00 00 ba 06 00 00 00 bd 06 00 00 00 49 8b 45 08 4d 8d 65 08 41 39 d6 73 20 83 ed 06 48 85 c0 <74> 67 48 89 c2 83 e2 03 48 83 fa 02 75 0c 48 3d 00 10 00 00 0f 87 RSP: 0018:ffffad890a34b940 EFLAGS: 00000286 RAX: ffff96f30b261daa RBX: ffffad890a34b9c8 RCX: 0000000000000000 RDX: 000000000000001e RSI: 0000000000000000 RDI: 0000000000000000 RBP: 0000000000000018 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffffad890a356868 R13: ffffad890a356860 R14: 0000000000000000 R15: ffffad890a356868 FS: 00007f5578a2a400(0000) GS:ffff97ed317e1000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f015c70fb18 CR3: 00000001109fd006 CR4: 0000000000f70ef0 PKRU: 55555554 Call Trace: xas_store+0x58/0x630 __xa_store+0xa5/0x130 xa_store+0x2c/0x50 kvm_vm_set_mem_attributes+0x343/0x710 [kvm] kvm_vm_ioctl+0x796/0xab0 [kvm] __x64_sys_ioctl+0xa3/0xd0 do_syscall_64+0x8c/0x7a0 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f5578d031bb Code: ff ff ff 85 c0 79 9b 49 c7 c4 ff ff ff ff 5b 5d 4c 89 e0 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 2d 4c 0f 00 f7 d8 64 89 01 48 RSP: 002b:00007ffe0a742b88 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 000000004020aed2 RCX: 00007f5578d031bb RDX: 00007ffe0a742c80 RSI: 000000004020aed2 RDI: 000000000000000b RBP: 0000010000000000 R08: 0000010000000000 R09: 0000017680000000 R10: 0000000000000080 R11: 0000000000000246 R12: 00005575e5f95120 R13: 00007ffe0a742c80 R14: 0000000000000008 R15: 00005575e5f961e0 While looping through the range of memory setting the attributes, call cond_resched() to give the scheduler a chance to run a higher priority task on the runqueue if necessary and avoid staying in kernel mode long enough to trigger the lockup. Fixes: 5a475554db1e ("KVM: Introduce per-page memory attributes") Cc: stable@vger.kernel.org # 6.12.x Suggested-by: Sean Christopherson Signed-off-by: Liam Merwick Reviewed-by: Pankaj Gupta Link: https://lore.kernel.org/r/20250609091121.2497429-2-liam.merwick@oracle.com Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- virt/kvm/kvm_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c9b9abe77e44..c8bf139e3a7f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2621,6 +2621,8 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT); if (r) goto out_unlock; + + cond_resched(); } kvm_handle_gfn_range(kvm, &pre_set_range); @@ -2629,6 +2631,7 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end, r = xa_err(xa_store(&kvm->mem_attr_array, i, entry, GFP_KERNEL_ACCOUNT)); KVM_BUG_ON(r, kvm); + cond_resched(); } kvm_handle_gfn_range(kvm, &post_set_range); -- Gitee From 2cc9ce2efce2035c8a2de307019040369ad844e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 30 Apr 2025 15:09:54 -0700 Subject: [PATCH 48/54] KVM: x86/mmu: Prevent installing hugepages when mem attributes are changing mainline inclusion from mainline-v6.15-rc6 commit 9129633d568edd36aa22bf703b12835153cec985 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9129633d568edd36aa22bf703b12835153cec985 ------------------------------------- Hygon-SIG: commit 9129633d568e upstream KVM: x86/mmu: Prevent installing hugepages when mem attributes are changing Backport the guest-memory feature. ------------------------------------- When changing memory attributes on a subset of a potential hugepage, add the hugepage to the invalidation range tracking to prevent installing a hugepage until the attributes are fully updated. Like the actual hugepage tracking updates in kvm_arch_post_set_memory_attributes(), process only the head and tail pages, as any potential hugepages that are entirely covered by the range will already be tracked. Note, only hugepage chunks whose current attributes are NOT mixed need to be added to the invalidation set, as mixed attributes already prevent installing a hugepage, and it's perfectly safe to install a smaller mapping for a gfn whose attributes aren't changing. Fixes: 8dd2eee9d526 ("KVM: x86/mmu: Handle page fault for private memory") Cc: stable@vger.kernel.org Reported-by: Michael Roth Tested-by: Michael Roth Link: https://lore.kernel.org/r/20250430220954.522672-1-seanjc@google.com Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Li Zhiquan --- arch/x86/kvm/mmu/mmu.c | 69 ++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 87762e32dc5d..bb4618d4b74e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7309,9 +7309,30 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, struct kvm_gfn_range *range) { + struct kvm_memory_slot *slot = range->slot; + int level; + /* * Zap SPTEs even if the slot can't be mapped PRIVATE. KVM x86 only * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it *seems* like KVM @@ -7326,27 +7347,43 @@ bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm, if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) return false; - return kvm_unmap_gfn_range(kvm, range); -} + if (WARN_ON_ONCE(range->end <= range->start)) + return false; -static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; -} + /* + * If the head and tail pages of the range currently allow a hugepage, + * i.e. reside fully in the slot and don't have mixed attributes, then + * add each corresponding hugepage range to the ongoing invalidation, + * e.g. to prevent KVM from creating a hugepage in response to a fault + * for a gfn whose attributes aren't changing. Note, only the range + * of gfns whose attributes are being modified needs to be explicitly + * unmapped, as that will unmap any existing hugepages. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t start = gfn_round_for_level(range->start, level); + gfn_t end = gfn_round_for_level(range->end - 1, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); -static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; -} + if ((start != range->start || start + nr_pages > range->end) && + start >= slot->base_gfn && + start + nr_pages <= slot->base_gfn + slot->npages && + !hugepage_test_mixed(slot, start, level)) + kvm_mmu_invalidate_range_add(kvm, start, start + nr_pages); -static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, - int level) -{ - lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; + if (end == start) + continue; + + if ((end + nr_pages) > range->end && + (end + nr_pages) <= (slot->base_gfn + slot->npages) && + !hugepage_test_mixed(slot, end, level)) + kvm_mmu_invalidate_range_add(kvm, end, end + nr_pages); + } + + return kvm_unmap_gfn_range(kvm, range); } + + static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int level, unsigned long attrs) { -- Gitee From a81c031a26b9ea6e52ebe1ee61be9d6e30136945 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 3 Oct 2025 16:25:57 -0700 Subject: [PATCH 49/54] KVM: Explicitly mark KVM_GUEST_MEMFD as depending on KVM_GENERIC_MMU_NOTIFIER mainline inclusion from mainline-v6.18-rc2 commit 9aef71c892a55e004419923ba7129abe3e58d9f1 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=9aef71c892a55e004419923ba7129abe3e58d9f1 ------------------------------------- Hygon-SIG: commit 9aef71c892a5 upstream KVM: Explicitly mark KVM_GUEST_MEMFD as depending on KVM_GENERIC_MMU_NOTIFIER Backport the guest-memory feature. ------------------------------------- Add KVM_GENERIC_MMU_NOTIFIER as a dependency for selecting KVM_GUEST_MEMFD, as guest_memfd relies on kvm_mmu_invalidate_{begin,end}(), which are defined if and only if the generic mmu_notifier implementation is enabled. The missing dependency is currently benign as s390 is the only KVM arch that doesn't utilize the generic mmu_notifier infrastructure, and s390 doesn't currently support guest_memfd. Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Reviewed-by: David Hildenbrand Link: https://lore.kernel.org/r/20251003232606.4070510-5-seanjc@google.com Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log and resolve the conflict. The commit 19a9a1ab5c3d ("KVM: Rename CONFIG_KVM_PRIVATE_MEM to CONFIG_KVM_GUEST_MEMFD") will introduce unnecessary dependencies, so don't backport it now. ] Signed-off-by: Li Zhiquan --- virt/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 988bccc5aaf0..baa479484dc7 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -102,6 +102,7 @@ config KVM_GENERIC_MEMORY_ATTRIBUTES bool config KVM_PRIVATE_MEM + depends on KVM_GENERIC_MMU_NOTIFIER select XARRAY_MULTI bool -- Gitee From 0d8266fd64636a2e5f8be0effc7cff3fcd8bca63 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 24 Oct 2024 10:59:53 +0100 Subject: [PATCH 50/54] KVM: selftests: fix unintentional noop test in guest_memfd_test.c mainline inclusion from mainline-v6.13-rc1 commit 945bdae20be5a13f1fcdcb14ec356dcbeee35839 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=945bdae20be5a13f1fcdcb14ec356dcbeee35839 ------------------------------------- Hygon-SIG: commit 945bdae20be5 upstream KVM: selftests: fix unintentional noop test in guest_memfd_test.c Backport the guest-memory feature. ------------------------------------- The loop in test_create_guest_memfd_invalid() that is supposed to test that nothing is accepted as a valid flag to KVM_CREATE_GUEST_MEMFD was initializing `flag` as 0 instead of BIT(0). This caused the loop to immediately exit instead of iterating over BIT(0), BIT(1), ... . Fixes: 8a89efd43423 ("KVM: selftests: Add basic selftest for guest_memfd()") Signed-off-by: Patrick Roy Reviewed-by: James Gowans Reviewed-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20241024095956.3668818-1-roypat@amazon.co.uk Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- tools/testing/selftests/kvm/guest_memfd_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index 92eae206baa6..2f3379b7e67d 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -136,7 +136,7 @@ static void test_create_guest_memfd_invalid(struct kvm_vm *vm) size); } - for (flag = 0; flag; flag <<= 1) { + for (flag = BIT(0); flag; flag <<= 1) { fd = __vm_create_guest_memfd(vm, page_size, flag); TEST_ASSERT(fd == -1 && errno == EINVAL, "guest_memfd() with flag '0x%lx' should fail with EINVAL", -- Gitee From 797fac289265a24414f314b53e56a66fb30c23f6 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 12 Aug 2024 14:12:20 -0400 Subject: [PATCH 51/54] mm/mprotect: push mmu notifier to PUDs mainline inclusion from mainline-v6.12-rc1 commit 7f06e3aa2e836c11a70d161ae19dc4414eecd80d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7f06e3aa2e836c11a70d161ae19dc4414eecd80d ------------------------------------- Hygon-SIG: commit 7f06e3aa2e83 upstream mm/mprotect: push mmu notifier to PUDs Backport the guest-memory feature. ------------------------------------- mprotect() does mmu notifiers in PMD levels. It's there since 2014 of commit a5338093bfb4 ("mm: move mmu notifier call from change_protection to change_pmd_range"). At that time, the issue was that NUMA balancing can be applied on a huge range of VM memory, even if nothing was populated. The notification can be avoided in this case if no valid pmd detected, which includes either THP or a PTE pgtable page. Now to pave way for PUD handling, this isn't enough. We need to generate mmu notifications even on PUD entries properly. mprotect() is currently broken on PUD (e.g., one can easily trigger kernel error with dax 1G mappings already), this is the start to fix it. To fix that, this patch proposes to push such notifications to the PUD layers. There is risk on regressing the problem Rik wanted to resolve before, but I think it shouldn't really happen, and I still chose this solution because of a few reasons: 1) Consider a large VM that should definitely contain more than GBs of memory, it's highly likely that PUDs are also none. In this case there will have no regression. 2) KVM has evolved a lot over the years to get rid of rmap walks, which might be the major cause of the previous soft-lockup. At least TDP MMU already got rid of rmap as long as not nested (which should be the major use case, IIUC), then the TDP MMU pgtable walker will simply see empty VM pgtable (e.g. EPT on x86), the invalidation of a full empty region in most cases could be pretty fast now, comparing to 2014. 3) KVM has explicit code paths now to even give way for mmu notifiers just like this one, e.g. in commit d02c357e5bfa ("KVM: x86/mmu: Retry fault before acquiring mmu_lock if mapping is changing"). It'll also avoid contentions that may also contribute to a soft-lockup. 4) Stick with PMD layer simply don't work when PUD is there... We need one way or another to fix PUD mappings on mprotect(). Pushing it to PUD should be the safest approach as of now, e.g. there's yet no sign of huge P4D coming on any known archs. Link: https://lkml.kernel.org/r/20240812181225.1360970-3-peterx@redhat.com Signed-off-by: Peter Xu Cc: Sean Christopherson Cc: Paolo Bonzini Cc: David Rientjes Cc: Rik van Riel Cc: Aneesh Kumar K.V Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dan Williams Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: "Edgecombe, Rick P" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Thomas Gleixner Cc: Vlastimil Babka Signed-off-by: Andrew Morton [ Zhiquan Li: amend commit log. ] Signed-off-by: Li Zhiquan --- mm/mprotect.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 86f98c453fe4..5d14d104e67e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -361,9 +361,6 @@ static inline long change_pmd_range(struct mmu_gather *tlb, unsigned long next; long pages = 0; unsigned long nr_huge_updates = 0; - struct mmu_notifier_range range; - - range.start = 0; pmd = pmd_offset(pud, addr); do { @@ -381,14 +378,6 @@ static inline long change_pmd_range(struct mmu_gather *tlb, if (pmd_none(*pmd)) goto next; - /* invoke the mmu notifier if the pmd is populated */ - if (!range.start) { - mmu_notifier_range_init(&range, - MMU_NOTIFY_PROTECTION_VMA, 0, - vma->vm_mm, addr, end); - mmu_notifier_invalidate_range_start(&range); - } - _pmd = pmdp_get_lockless(pmd); if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) { if ((next - addr != HPAGE_PMD_SIZE) || @@ -429,9 +418,6 @@ static inline long change_pmd_range(struct mmu_gather *tlb, cond_resched(); } while (pmd++, addr = next, addr != end); - if (range.start) - mmu_notifier_invalidate_range_end(&range); - if (nr_huge_updates) count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; @@ -441,22 +427,36 @@ static inline long change_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, pgprot_t newprot, unsigned long cp_flags) { + struct mmu_notifier_range range; pud_t *pud; unsigned long next; long pages = 0, ret; + range.start = 0; + pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); ret = change_prepare(vma, pud, pmd, addr, cp_flags); - if (ret) - return ret; + if (ret) { + pages = ret; + break; + } if (pud_none_or_clear_bad(pud)) continue; + if (!range.start) { + mmu_notifier_range_init(&range, + MMU_NOTIFY_PROTECTION_VMA, 0, + vma->vm_mm, addr, end); + mmu_notifier_invalidate_range_start(&range); + } pages += change_pmd_range(tlb, vma, pud, addr, next, newprot, cp_flags); } while (pud++, addr = next, addr != end); + if (range.start) + mmu_notifier_invalidate_range_end(&range); + return pages; } -- Gitee From ceb0a82d6ce17dbbb6d8e191b3a180bc00801813 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 3 Nov 2025 17:12:05 -0800 Subject: [PATCH 52/54] KVM: guest_memfd: Remove bindings on memslot deletion when gmem is dying mainline inclusion from mainline-v6.18-rc6 commit ae431059e75d36170a5ae6b44cc4d06d43613215 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ae431059e75d36170a5ae6b44cc4d06d43613215 ------------------------------------- Hygon-SIG: commit ae431059e75d upstream KVM: guest_memfd: Remove bindings on memslot deletion when gmem is dying Backport the guest-memory feature. ------------------------------------- When unbinding a memslot from a guest_memfd instance, remove the bindings even if the guest_memfd file is dying, i.e. even if its file refcount has gone to zero. If the memslot is freed before the file is fully released, nullifying the memslot side of the binding in kvm_gmem_release() will write to freed memory, as detected by syzbot+KASAN: ================================================================== BUG: KASAN: slab-use-after-free in kvm_gmem_release+0x176/0x440 virt/kvm/guest_memfd.c:353 Write of size 8 at addr ffff88807befa508 by task syz.0.17/6022 CPU: 0 UID: 0 PID: 6022 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/02/2025 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xca/0x240 mm/kasan/report.c:482 kasan_report+0x118/0x150 mm/kasan/report.c:595 kvm_gmem_release+0x176/0x440 virt/kvm/guest_memfd.c:353 __fput+0x44c/0xa70 fs/file_table.c:468 task_work_run+0x1d4/0x260 kernel/task_work.c:227 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline] exit_to_user_mode_loop+0xe9/0x130 kernel/entry/common.c:43 exit_to_user_mode_prepare include/linux/irq-entry-common.h:225 [inline] syscall_exit_to_user_mode_work include/linux/entry-common.h:175 [inline] syscall_exit_to_user_mode include/linux/entry-common.h:210 [inline] do_syscall_64+0x2bd/0xfa0 arch/x86/entry/syscall_64.c:100 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fbeeff8efc9 Allocated by task 6023: kasan_save_stack mm/kasan/common.c:56 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:77 poison_kmalloc_redzone mm/kasan/common.c:397 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:414 kasan_kmalloc include/linux/kasan.h:262 [inline] __kmalloc_cache_noprof+0x3e2/0x700 mm/slub.c:5758 kmalloc_noprof include/linux/slab.h:957 [inline] kzalloc_noprof include/linux/slab.h:1094 [inline] kvm_set_memory_region+0x747/0xb90 virt/kvm/kvm_main.c:2104 kvm_vm_ioctl_set_memory_region+0x6f/0xd0 virt/kvm/kvm_main.c:2154 kvm_vm_ioctl+0x957/0xc60 virt/kvm/kvm_main.c:5201 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:597 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:583 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 6023: kasan_save_stack mm/kasan/common.c:56 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:77 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:584 poison_slab_object mm/kasan/common.c:252 [inline] __kasan_slab_free+0x5c/0x80 mm/kasan/common.c:284 kasan_slab_free include/linux/kasan.h:234 [inline] slab_free_hook mm/slub.c:2533 [inline] slab_free mm/slub.c:6622 [inline] kfree+0x19a/0x6d0 mm/slub.c:6829 kvm_set_memory_region+0x9c4/0xb90 virt/kvm/kvm_main.c:2130 kvm_vm_ioctl_set_memory_region+0x6f/0xd0 virt/kvm/kvm_main.c:2154 kvm_vm_ioctl+0x957/0xc60 virt/kvm/kvm_main.c:5201 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:597 [inline] __se_sys_ioctl+0xfc/0x170 fs/ioctl.c:583 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0xfa0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Deliberately don't acquire filemap invalid lock when the file is dying as the lifecycle of f_mapping is outside the purview of KVM. Dereferencing the mapping is *probably* fine, but there's no need to invalidate anything as memslot deletion is responsible for zapping SPTEs, and the only code that can access the dying file is kvm_gmem_release(), whose core code is mutually exclusive with unbinding. Note, the mutual exclusivity is also what makes it safe to access the bindings on a dying gmem instance. Unbinding either runs with slots_lock held, or after the last reference to the owning "struct kvm" is put, and kvm_gmem_release() nullifies the slot pointer under slots_lock, and puts its reference to the VM after that is done. Reported-by: syzbot+2479e53d0db9b32ae2aa@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/68fa7a22.a70a0220.3bf6c6.008b.GAE@google.com Tested-by: syzbot+2479e53d0db9b32ae2aa@syzkaller.appspotmail.com Fixes: a7800aa80ea4 ("KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory") Cc: stable@vger.kernel.org Cc: Hillf Danton Reviewed-By: Vishal Annapurve Link: https://patch.msgid.link/20251104011205.3853541-1-seanjc@google.com Signed-off-by: Sean Christopherson [ Zhiquan Li: amend commit log and resolve the conflict. ] Signed-off-by: Li Zhiquan --- virt/kvm/guest_memfd.c | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 5047b0f2de5a..9ee87ed199a4 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -460,27 +460,46 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, return r; } -void kvm_gmem_unbind(struct kvm_memory_slot *slot) +static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct kvm_gmem *gmem) { unsigned long start = slot->gmem.pgoff; unsigned long end = start + slot->npages; - struct kvm_gmem *gmem; + + xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); + rcu_assign_pointer(slot->gmem.file, NULL); + synchronize_rcu(); +} + +void kvm_gmem_unbind(struct kvm_memory_slot *slot) +{ struct file *file; /* - * Nothing to do if the underlying file was already closed (or is being - * closed right now), kvm_gmem_release() invalidates all bindings. + * Nothing to do if the underlying file was _already_ closed, as + * kvm_gmem_release() invalidates and nullifies all bindings. */ - file = kvm_gmem_get_file(slot); - if (!file) + if (!slot->gmem.file) return; - gmem = file->private_data; + file = kvm_gmem_get_file(slot); + + /* + * However, if the file is _being_ closed, then the bindings need to be + * removed as kvm_gmem_release() might not run until after the memslot + * is freed. Note, modifying the bindings is safe even though the file + * is dying as kvm_gmem_release() nullifies slot->gmem.file under + * slots_lock, and only puts its reference to KVM after destroying all + * bindings. I.e. reaching this point means kvm_gmem_release() hasn't + * yet destroyed the bindings or freed the gmem_file, and can't do so + * until the caller drops slots_lock. + */ + if (!file) { + __kvm_gmem_unbind(slot, slot->gmem.file->private_data); + return; + } filemap_invalidate_lock(file->f_mapping); - xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); - rcu_assign_pointer(slot->gmem.file, NULL); - synchronize_rcu(); + __kvm_gmem_unbind(slot, file->private_data); filemap_invalidate_unlock(file->f_mapping); fput(file); -- Gitee From 28617bc0800381b07e428770c517c1d52fe8414f Mon Sep 17 00:00:00 2001 From: Li Zhiquan Date: Sat, 1 Nov 2025 22:40:36 +0800 Subject: [PATCH 53/54] KVM: fix KVM kABI breakage for the changes in KVM common structures hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A ------------------------------------- Hygon-SIG: commit none KVM: fix KVM kABI breakage for the changes in KVM common structs Backport the guest-memory feature. ------------------------------------- Fix kABI breakage due to below changes: - struct kvm_memory_slot: using KABI_EXTEND for the new field "gmem". - struct kvm_arch: using KABI_REPLACE for the type changes of mmu_invalidate_range_start and mmu_invalidate_range_end. - struct kvm_run: using macros instead of the new member "memory_fault" in union. Because including kabi.h in UAPI header will cause build errors, some other macro in kabi.h like IS_ENABLED() and IS_BUILTIN() are defined in kconfig.h, UAPI headers should not include kconfig.h. In order to limit the blast radius, the most reasonable way should be repurposing the exist field in struct kvm_run and adapt the references, here "osi.gprs[0-3]" are repurposed for the "memory_fault_flags/gpa/size" since their sizes are totally match. Signed-off-by: Li Zhiquan --- include/linux/kvm_host.h | 26 ++++++++++--------- include/uapi/linux/kvm.h | 12 ++++----- tools/include/uapi/linux/kvm.h | 12 ++++----- .../kvm/x86_64/private_mem_kvm_exits_test.c | 12 ++++----- 4 files changed, 30 insertions(+), 32 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a68424463aae..3f64a21390c9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -612,10 +612,10 @@ struct kvm_memory_slot { u16 as_id; #ifdef CONFIG_KVM_PRIVATE_MEM - struct { + KABI_EXTEND(struct { struct file __rcu *file; pgoff_t pgoff; - } gmem; + } gmem) #endif }; @@ -866,8 +866,10 @@ struct kvm { struct mmu_notifier mmu_notifier; unsigned long mmu_invalidate_seq; long mmu_invalidate_in_progress; - gfn_t mmu_invalidate_range_start; - gfn_t mmu_invalidate_range_end; + KABI_REPLACE(unsigned long mmu_invalidate_range_start, + gfn_t mmu_invalidate_range_start) + KABI_REPLACE(unsigned long mmu_invalidate_range_end, + gfn_t mmu_invalidate_range_end) #endif struct list_head devices; u64 manual_dirty_log_protect; @@ -885,15 +887,15 @@ struct kvm { #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; -#endif -#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES - /* Protected by slots_locks (for writes) and RCU (for reads) */ - struct xarray mem_attr_array; #endif char stats_id[KVM_STATS_NAME_SIZE]; #ifdef CONFIG_ARM64_HDBSS KABI_EXTEND(bool enable_hdbss) #endif +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + /* Protected by slots_locks (for writes) and RCU (for reads) */ + KABI_EXTEND(struct xarray mem_attr_array) +#endif }; #define kvm_err(fmt, ...) \ @@ -2563,13 +2565,13 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu, bool is_private) { vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT; - vcpu->run->memory_fault.gpa = gpa; - vcpu->run->memory_fault.size = size; + vcpu->run->memory_fault_gpa = gpa; + vcpu->run->memory_fault_size = size; /* RWX flags are not (yet) defined or communicated to userspace. */ - vcpu->run->memory_fault.flags = 0; + vcpu->run->memory_fault_flags = 0; if (is_private) - vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; + vcpu->run->memory_fault_flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE; } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 5991f8c3a549..1ca9c5219a95 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -443,6 +443,11 @@ struct kvm_run { struct { __u64 gprs[32]; } osi; + /* KVM_EXIT_MEMORY_FAULT */ +#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) +#define memory_fault_flags osi.gprs[0] +#define memory_fault_gpa osi.gprs[1] +#define memory_fault_size osi.gprs[2] /* KVM_EXIT_PAPR_HCALL */ struct { __u64 nr; @@ -534,13 +539,6 @@ struct kvm_run { #define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) __u32 flags; } notify; - /* KVM_EXIT_MEMORY_FAULT */ - struct { -#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) - __u64 flags; - __u64 gpa; - __u64 size; - } memory_fault; /* Fix the size of the union. */ char padding[256]; }; diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index be935dde7da0..79bbab083af8 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -364,6 +364,11 @@ struct kvm_run { struct { __u64 gprs[32]; } osi; + /* KVM_EXIT_MEMORY_FAULT */ +#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) +#define memory_fault_flags osi.gprs[0] +#define memory_fault_gpa osi.gprs[1] +#define memory_fault_size osi.gprs[2] /* KVM_EXIT_PAPR_HCALL */ struct { __u64 nr; @@ -455,13 +460,6 @@ struct kvm_run { #define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) __u32 flags; } notify; - /* KVM_EXIT_MEMORY_FAULT */ - struct { -#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) - __u64 flags; - __u64 gpa; - __u64 size; - } memory_fault; /* Fix the size of the union. */ char padding[256]; }; diff --git a/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c index 13e72fcec8dd..f602c14d47b3 100644 --- a/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c +++ b/tools/testing/selftests/kvm/x86_64/private_mem_kvm_exits_test.c @@ -75,9 +75,9 @@ static void test_private_access_memslot_deleted(void) exit_reason = (uint32_t)(uint64_t)thread_return; TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); - TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); - TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); - TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); + TEST_ASSERT_EQ(vcpu->run->memory_fault_flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); + TEST_ASSERT_EQ(vcpu->run->memory_fault_gpa, EXITS_TEST_GPA); + TEST_ASSERT_EQ(vcpu->run->memory_fault_size, EXITS_TEST_SIZE); kvm_vm_free(vm); } @@ -104,9 +104,9 @@ static void test_private_access_memslot_not_private(void) exit_reason = run_vcpu_get_exit_reason(vcpu); TEST_ASSERT_EQ(exit_reason, KVM_EXIT_MEMORY_FAULT); - TEST_ASSERT_EQ(vcpu->run->memory_fault.flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); - TEST_ASSERT_EQ(vcpu->run->memory_fault.gpa, EXITS_TEST_GPA); - TEST_ASSERT_EQ(vcpu->run->memory_fault.size, EXITS_TEST_SIZE); + TEST_ASSERT_EQ(vcpu->run->memory_fault_flags, KVM_MEMORY_EXIT_FLAG_PRIVATE); + TEST_ASSERT_EQ(vcpu->run->memory_fault_gpa, EXITS_TEST_GPA); + TEST_ASSERT_EQ(vcpu->run->memory_fault_size, EXITS_TEST_SIZE); kvm_vm_free(vm); } -- Gitee From 7a43845acf856659003c8dc43d653bbb827ff6d4 Mon Sep 17 00:00:00 2001 From: Li Zhiquan Date: Sun, 2 Nov 2025 18:09:32 +0800 Subject: [PATCH 54/54] KVM: x86: fix KVM kABI breakage due to adding the vm_type field into struct kvm_arch hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID4I2Z CVE: N/A ------------------------------------- Hygon-SIG: commit none KVM: fix KVM kABI breakage due to adding the vm_type field into struct kvm_arch Backport the guest-memory feature. ------------------------------------- The kABI is broken due to commit 89ea60c2c7b5 ("KVM: x86: Add support for "protected VMs" that can utilize private memory") adding the new field 'vm_type' into struct kvm_arch, since the 'struct kvm' contains the field 'struct kvm_arch arch', the offset of the other fields that following 'arch' in 'struct kvm' is shifted, it will break kABI. To fix the kABI breakage, move the 'vm_type' field up to 'struct kvm', and all the references have to be changed. Signed-off-by: Li Zhiquan --- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/x86.c | 2 +- include/linux/kvm_host.h | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6790db1fadf4..a0db76b68fd3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1250,7 +1250,6 @@ enum kvm_apicv_inhibit { }; struct kvm_arch { - unsigned long vm_type; unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; @@ -2119,7 +2118,7 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); #ifdef CONFIG_KVM_PRIVATE_MEM -#define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM) +#define kvm_arch_has_private_mem(kvm) ((kvm)->vm_type != KVM_X86_DEFAULT_VM) #else #define kvm_arch_has_private_mem(kvm) false #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 09b9f79d9283..b70e82e7925c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12556,7 +12556,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) if (!kvm_is_vm_type_supported(type)) return -EINVAL; - kvm->arch.vm_type = type; + kvm->vm_type = type; ret = kvm_page_track_init(kvm); if (ret) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3f64a21390c9..32b727041fbb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -896,6 +896,7 @@ struct kvm { /* Protected by slots_locks (for writes) and RCU (for reads) */ KABI_EXTEND(struct xarray mem_attr_array) #endif + KABI_EXTEND(unsigned long vm_type) }; #define kvm_err(fmt, ...) \ -- Gitee