
Commit a6ad541

Merge branch 'guest-memfd-mmap' into HEAD
Add support for host userspace mapping of guest_memfd-backed memory for VM
types that do NOT support KVM_MEMORY_ATTRIBUTE_PRIVATE (which isn't precisely
the same thing as CoCo VMs, since x86's SEV-MEM and SEV-ES have no way to
detect private vs. shared).

mmap() support paves the way for several evolving KVM use cases:

* Allows VMMs like Firecracker to run guests entirely backed by
  guest_memfd [1]. This provides a unified memory management model for both
  confidential and non-confidential guests, simplifying VMM design.

* Enhanced security via direct map removal: when combined with Patrick's
  series for direct map removal [2], this provides additional hardening
  against Spectre-like transient execution attacks by eliminating the need
  for host kernel direct maps of guest memory.

* Lays the groundwork for *restricted* mmap() support for guest_memfd-backed
  memory on CoCo platforms [3] that permit in-place sharing of guest memory
  with the host.

Signed-off-by: Paolo Bonzini <[email protected]>
2 parents 0dc4a75 + 4218866 commit a6ad541
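
For context, a minimal VMM-side sketch of what this merge enables (hypothetical code, not part of the commit): create a guest_memfd with the new GUEST_MEMFD_FLAG_MMAP flag and map it into the host so the VMM can touch guest memory through an ordinary mmap()ed view. Error handling and unrelated VM setup are omitted.

/*
 * Hypothetical VMM snippet: KVM_CREATE_GUEST_MEMFD and the struct are existing
 * uAPI; GUEST_MEMFD_FLAG_MMAP is the flag added by this series.
 */
#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

static void *map_guest_memfd(int vm_fd, uint64_t size, int *gmem_fd)
{
        struct kvm_create_guest_memfd gmem = {
                .size  = size,
                .flags = GUEST_MEMFD_FLAG_MMAP, /* flag added by this series */
        };

        *gmem_fd = ioctl(vm_fd, KVM_CREATE_GUEST_MEMFD, &gmem);
        if (*gmem_fd < 0)
                return MAP_FAILED;

        /* With the flag set, the fd supports mmap() and host page faults. */
        return mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                    *gmem_fd, 0);
}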

File tree

26 files changed: +648 -214 lines changed

Documentation/virt/kvm/api.rst

Lines changed: 9 additions & 0 deletions

@@ -6414,6 +6414,15 @@ most one mapping per page, i.e. binding multiple memory regions to a single
 guest_memfd range is not allowed (any number of memory regions can be bound to
 a single guest_memfd file, but the bound ranges must not overlap).

+When the capability KVM_CAP_GUEST_MEMFD_MMAP is supported, the 'flags' field
+supports GUEST_MEMFD_FLAG_MMAP. Setting this flag on guest_memfd creation
+enables mmap() and faulting of guest_memfd memory to host userspace.
+
+When the KVM MMU performs a PFN lookup to service a guest fault and the backing
+guest_memfd has the GUEST_MEMFD_FLAG_MMAP set, then the fault will always be
+consumed from guest_memfd, regardless of whether it is a shared or a private
+fault.
+
 See KVM_SET_USER_MEMORY_REGION2 for additional details.

 4.143 KVM_PRE_FAULT_MEMORY
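
To illustrate the documentation above, a hedged userspace sketch (assumed flow, not taken from this commit): probe KVM_CAP_GUEST_MEMFD_MMAP, then bind a mappable guest_memfd and its mmap()ed host view to a memslot via KVM_SET_USER_MEMORY_REGION2. The slot number, guest physical address, and the gmem_fd/host_va/size arguments are placeholders (e.g. from the map_guest_memfd() sketch earlier).

#include <linux/kvm.h>
#include <stdint.h>
#include <sys/ioctl.h>

/* Returns 0 on success; slot 0 and GPA 0 are placeholder values. */
static int bind_mappable_gmem_slot(int vm_fd, int gmem_fd, void *host_va,
                                   uint64_t size)
{
        struct kvm_userspace_memory_region2 region = {
                .slot               = 0,
                .flags              = KVM_MEM_GUEST_MEMFD,
                .guest_phys_addr    = 0,
                .memory_size        = size,
                .userspace_addr     = (uintptr_t)host_va, /* the mmap()ed view */
                .guest_memfd        = gmem_fd,
                .guest_memfd_offset = 0,
        };

        /* Bail out if the kernel does not advertise mmap support. */
        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_MEMFD_MMAP) <= 0)
                return -1;

        return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}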

arch/arm64/kvm/Kconfig

Lines changed: 1 addition & 0 deletions

@@ -37,6 +37,7 @@ menuconfig KVM
         select HAVE_KVM_VCPU_RUN_PID_CHANGE
         select SCHED_INFO
         select GUEST_PERF_EVENTS if PERF_EVENTS
+        select KVM_GUEST_MEMFD
         help
           Support hosting virtualized guest machines.

arch/arm64/kvm/mmu.c

Lines changed: 149 additions & 54 deletions

@@ -1477,13 +1477,132 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
         }
 }

+static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
+                                void **memcache)
+{
+        int min_pages;
+
+        if (!is_protected_kvm_enabled())
+                *memcache = &vcpu->arch.mmu_page_cache;
+        else
+                *memcache = &vcpu->arch.pkvm_memcache;
+
+        if (!topup_memcache)
+                return 0;
+
+        min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
+
+        if (!is_protected_kvm_enabled())
+                return kvm_mmu_topup_memory_cache(*memcache, min_pages);
+
+        return topup_hyp_memcache(*memcache, min_pages);
+}
+
+/*
+ * Potentially reduce shadow S2 permissions to match the guest's own S2. For
+ * exec faults, we'd only reach this point if the guest actually allowed it (see
+ * kvm_s2_handle_perm_fault).
+ *
+ * Also encode the level of the original translation in the SW bits of the leaf
+ * entry as a proxy for the span of that translation. This will be retrieved on
+ * TLB invalidation from the guest and used to limit the invalidation scope if a
+ * TTL hint or a range isn't provided.
+ */
+static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
+                                      enum kvm_pgtable_prot *prot,
+                                      bool *writable)
+{
+        *writable &= kvm_s2_trans_writable(nested);
+        if (!kvm_s2_trans_readable(nested))
+                *prot &= ~KVM_PGTABLE_PROT_R;
+
+        *prot |= kvm_encode_nested_level(nested);
+}
+
+#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)
+
+static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+                      struct kvm_s2_trans *nested,
+                      struct kvm_memory_slot *memslot, bool is_perm)
+{
+        bool write_fault, exec_fault, writable;
+        enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
+        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+        struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+        unsigned long mmu_seq;
+        struct page *page;
+        struct kvm *kvm = vcpu->kvm;
+        void *memcache;
+        kvm_pfn_t pfn;
+        gfn_t gfn;
+        int ret;
+
+        ret = prepare_mmu_memcache(vcpu, true, &memcache);
+        if (ret)
+                return ret;
+
+        if (nested)
+                gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
+        else
+                gfn = fault_ipa >> PAGE_SHIFT;
+
+        write_fault = kvm_is_write_fault(vcpu);
+        exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
+
+        VM_WARN_ON_ONCE(write_fault && exec_fault);
+
+        mmu_seq = kvm->mmu_invalidate_seq;
+        /* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+        smp_rmb();
+
+        ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
+        if (ret) {
+                kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
+                                              write_fault, exec_fault, false);
+                return ret;
+        }
+
+        writable = !(memslot->flags & KVM_MEM_READONLY);
+
+        if (nested)
+                adjust_nested_fault_perms(nested, &prot, &writable);
+
+        if (writable)
+                prot |= KVM_PGTABLE_PROT_W;
+
+        if (exec_fault ||
+            (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
+             (!nested || kvm_s2_trans_executable(nested))))
+                prot |= KVM_PGTABLE_PROT_X;
+
+        kvm_fault_lock(kvm);
+        if (mmu_invalidate_retry(kvm, mmu_seq)) {
+                ret = -EAGAIN;
+                goto out_unlock;
+        }
+
+        ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
+                                                 __pfn_to_phys(pfn), prot,
+                                                 memcache, flags);
+
+out_unlock:
+        kvm_release_faultin_page(kvm, page, !!ret, writable);
+        kvm_fault_unlock(kvm);
+
+        if (writable && !ret)
+                mark_page_dirty_in_slot(kvm, memslot, gfn);
+
+        return ret != -EAGAIN ? ret : 0;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                           struct kvm_s2_trans *nested,
                           struct kvm_memory_slot *memslot, unsigned long hva,
                           bool fault_is_perm)
 {
         int ret = 0;
-        bool write_fault, writable, force_pte = false;
+        bool topup_memcache;
+        bool write_fault, writable;
         bool exec_fault, mte_allowed, is_vma_cacheable;
         bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
         unsigned long mmu_seq;
@@ -1495,46 +1614,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         gfn_t gfn;
         kvm_pfn_t pfn;
         bool logging_active = memslot_is_logging(memslot);
+        bool force_pte = logging_active;
         long vma_pagesize, fault_granule;
         enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
         struct kvm_pgtable *pgt;
         struct page *page;
         vm_flags_t vm_flags;
-        enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
+        enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;

         if (fault_is_perm)
                 fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
         write_fault = kvm_is_write_fault(vcpu);
         exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
-        VM_BUG_ON(write_fault && exec_fault);
-
-        if (fault_is_perm && !write_fault && !exec_fault) {
-                kvm_err("Unexpected L2 read permission error\n");
-                return -EFAULT;
-        }
-
-        if (!is_protected_kvm_enabled())
-                memcache = &vcpu->arch.mmu_page_cache;
-        else
-                memcache = &vcpu->arch.pkvm_memcache;
+        VM_WARN_ON_ONCE(write_fault && exec_fault);

         /*
          * Permission faults just need to update the existing leaf entry,
          * and so normally don't require allocations from the memcache. The
          * only exception to this is when dirty logging is enabled at runtime
          * and a write fault needs to collapse a block entry into a table.
          */
-        if (!fault_is_perm || (logging_active && write_fault)) {
-                int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
-
-                if (!is_protected_kvm_enabled())
-                        ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
-                else
-                        ret = topup_hyp_memcache(memcache, min_pages);
-
-                if (ret)
-                        return ret;
-        }
+        topup_memcache = !fault_is_perm || (logging_active && write_fault);
+        ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
+        if (ret)
+                return ret;

         /*
          * Let's check if we will get back a huge page backed by hugetlbfs, or
@@ -1548,16 +1651,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                 return -EFAULT;
         }

-        /*
-         * logging_active is guaranteed to never be true for VM_PFNMAP
-         * memslots.
-         */
-        if (logging_active) {
-                force_pte = true;
+        if (force_pte)
                 vma_shift = PAGE_SHIFT;
-        } else {
+        else
                 vma_shift = get_vma_page_shift(vma, hva);
-        }

         switch (vma_shift) {
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -1609,7 +1706,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                         max_map_size = PAGE_SIZE;

                 force_pte = (max_map_size == PAGE_SIZE);
-                vma_pagesize = min(vma_pagesize, (long)max_map_size);
+                vma_pagesize = min_t(long, vma_pagesize, max_map_size);
         }

         /*
@@ -1642,7 +1739,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
          * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
          * with the smp_wmb() in kvm_mmu_invalidate_end().
          */
-        mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+        mmu_seq = kvm->mmu_invalidate_seq;
         mmap_read_unlock(current->mm);

         pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
@@ -1698,24 +1795,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
         if (exec_fault && s2_force_noncacheable)
                 return -ENOEXEC;

-        /*
-         * Potentially reduce shadow S2 permissions to match the guest's own
-         * S2. For exec faults, we'd only reach this point if the guest
-         * actually allowed it (see kvm_s2_handle_perm_fault).
-         *
-         * Also encode the level of the original translation in the SW bits
-         * of the leaf entry as a proxy for the span of that translation.
-         * This will be retrieved on TLB invalidation from the guest and
-         * used to limit the invalidation scope if a TTL hint or a range
-         * isn't provided.
-         */
-        if (nested) {
-                writable &= kvm_s2_trans_writable(nested);
-                if (!kvm_s2_trans_readable(nested))
-                        prot &= ~KVM_PGTABLE_PROT_R;
-
-                prot |= kvm_encode_nested_level(nested);
-        }
+        if (nested)
+                adjust_nested_fault_perms(nested, &prot, &writable);

         kvm_fault_lock(kvm);
         pgt = vcpu->arch.hw_mmu->pgt;
@@ -1981,8 +2062,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
                         goto out_unlock;
                 }

-                ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
-                                     esr_fsc_is_permission_fault(esr));
+                VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
+                                !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
+
+                if (kvm_slot_has_gmem(memslot))
+                        ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
+                                         esr_fsc_is_permission_fault(esr));
+                else
+                        ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
+                                             esr_fsc_is_permission_fault(esr));
                 if (ret == 0)
                         ret = 1;
 out:
@@ -2214,6 +2302,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
         if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
                 return -EFAULT;

+        /*
+         * Only support guest_memfd backed memslots with mappable memory, since
+         * there aren't any CoCo VMs that support only private memory on arm64.
+         */
+        if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
+                return -EINVAL;
+
         hva = new->userspace_addr;
         reg_end = hva + (new->npages << PAGE_SHIFT);

arch/arm64/kvm/nested.c

Lines changed: 35 additions & 6 deletions

@@ -1172,8 +1172,9 @@ static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
         return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
 }

-static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
+static int kvm_translate_vncr(struct kvm_vcpu *vcpu, bool *is_gmem)
 {
+        struct kvm_memory_slot *memslot;
         bool write_fault, writable;
         unsigned long mmu_seq;
         struct vncr_tlb *vt;
@@ -1216,10 +1217,25 @@ static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
         smp_rmb();

         gfn = vt->wr.pa >> PAGE_SHIFT;
-        pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page);
-        if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
+        memslot = gfn_to_memslot(vcpu->kvm, gfn);
+        if (!memslot)
                 return -EFAULT;

+        *is_gmem = kvm_slot_has_gmem(memslot);
+        if (!*is_gmem) {
+                pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
+                                        &writable, &page);
+                if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
+                        return -EFAULT;
+        } else {
+                ret = kvm_gmem_get_pfn(vcpu->kvm, memslot, gfn, &pfn, &page, NULL);
+                if (ret) {
+                        kvm_prepare_memory_fault_exit(vcpu, vt->wr.pa, PAGE_SIZE,
+                                                      write_fault, false, false);
+                        return ret;
+                }
+        }
+
         scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
                 if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
                         return -EAGAIN;
@@ -1292,23 +1308,36 @@ int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
         if (esr_fsc_is_permission_fault(esr)) {
                 inject_vncr_perm(vcpu);
         } else if (esr_fsc_is_translation_fault(esr)) {
-                bool valid;
+                bool valid, is_gmem = false;
                 int ret;

                 scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
                         valid = kvm_vncr_tlb_lookup(vcpu);

                 if (!valid)
-                        ret = kvm_translate_vncr(vcpu);
+                        ret = kvm_translate_vncr(vcpu, &is_gmem);
                 else
                         ret = -EPERM;

                 switch (ret) {
                 case -EAGAIN:
-                case -ENOMEM:
                         /* Let's try again... */
                         break;
+                case -ENOMEM:
+                        /*
+                         * For guest_memfd, this indicates that it failed to
+                         * create a folio to back the memory. Inform userspace.
+                         */
+                        if (is_gmem)
+                                return 0;
+                        /* Otherwise, let's try again... */
+                        break;
                 case -EFAULT:
+                case -EIO:
+                case -EHWPOISON:
+                        if (is_gmem)
+                                return 0;
+                        fallthrough;
                 case -EINVAL:
                 case -ENOENT:
                 case -EACCES:
arch/x86/include/asm/kvm-x86-ops.h

Lines changed: 1 addition & 1 deletion

@@ -145,7 +145,7 @@ KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
 KVM_X86_OP_OPTIONAL(get_untagged_addr)
 KVM_X86_OP_OPTIONAL(alloc_apic_backing_page)
 KVM_X86_OP_OPTIONAL_RET0(gmem_prepare)
-KVM_X86_OP_OPTIONAL_RET0(private_max_mapping_level)
+KVM_X86_OP_OPTIONAL_RET0(gmem_max_mapping_level)
 KVM_X86_OP_OPTIONAL(gmem_invalidate)

 #undef KVM_X86_OP
