@@ -1477,13 +1477,132 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
 	}
 }
 
+static int prepare_mmu_memcache(struct kvm_vcpu *vcpu, bool topup_memcache,
+				void **memcache)
+{
+	int min_pages;
+
+	if (!is_protected_kvm_enabled())
+		*memcache = &vcpu->arch.mmu_page_cache;
+	else
+		*memcache = &vcpu->arch.pkvm_memcache;
+
+	if (!topup_memcache)
+		return 0;
+
+	min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
+
+	if (!is_protected_kvm_enabled())
+		return kvm_mmu_topup_memory_cache(*memcache, min_pages);
+
+	return topup_hyp_memcache(*memcache, min_pages);
+}
+
+/*
+ * Potentially reduce shadow S2 permissions to match the guest's own S2. For
+ * exec faults, we'd only reach this point if the guest actually allowed it (see
+ * kvm_s2_handle_perm_fault).
+ *
+ * Also encode the level of the original translation in the SW bits of the leaf
+ * entry as a proxy for the span of that translation. This will be retrieved on
+ * TLB invalidation from the guest and used to limit the invalidation scope if a
+ * TTL hint or a range isn't provided.
+ */
+static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
+				      enum kvm_pgtable_prot *prot,
+				      bool *writable)
+{
+	*writable &= kvm_s2_trans_writable(nested);
+	if (!kvm_s2_trans_readable(nested))
+		*prot &= ~KVM_PGTABLE_PROT_R;
+
+	*prot |= kvm_encode_nested_level(nested);
+}
+
+#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)
+
+static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
+		      struct kvm_s2_trans *nested,
+		      struct kvm_memory_slot *memslot, bool is_perm)
+{
+	bool write_fault, exec_fault, writable;
+	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
+	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
+	struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
+	unsigned long mmu_seq;
+	struct page *page;
+	struct kvm *kvm = vcpu->kvm;
+	void *memcache;
+	kvm_pfn_t pfn;
+	gfn_t gfn;
+	int ret;
+
+	ret = prepare_mmu_memcache(vcpu, true, &memcache);
+	if (ret)
+		return ret;
+
+	if (nested)
+		gfn = kvm_s2_trans_output(nested) >> PAGE_SHIFT;
+	else
+		gfn = fault_ipa >> PAGE_SHIFT;
+
+	write_fault = kvm_is_write_fault(vcpu);
+	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
+
+	VM_WARN_ON_ONCE(write_fault && exec_fault);
+
+	mmu_seq = kvm->mmu_invalidate_seq;
+	/* Pairs with the smp_wmb() in kvm_mmu_invalidate_end(). */
+	smp_rmb();
+
+	ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
+	if (ret) {
+		kvm_prepare_memory_fault_exit(vcpu, fault_ipa, PAGE_SIZE,
+					      write_fault, exec_fault, false);
+		return ret;
+	}
+
+	writable = !(memslot->flags & KVM_MEM_READONLY);
+
+	if (nested)
+		adjust_nested_fault_perms(nested, &prot, &writable);
+
+	if (writable)
+		prot |= KVM_PGTABLE_PROT_W;
+
+	if (exec_fault ||
+	    (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
+	     (!nested || kvm_s2_trans_executable(nested))))
+		prot |= KVM_PGTABLE_PROT_X;
+
+	kvm_fault_lock(kvm);
+	if (mmu_invalidate_retry(kvm, mmu_seq)) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
+	ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, PAGE_SIZE,
+						 __pfn_to_phys(pfn), prot,
+						 memcache, flags);
+
+out_unlock:
+	kvm_release_faultin_page(kvm, page, !!ret, writable);
+	kvm_fault_unlock(kvm);
+
+	if (writable && !ret)
+		mark_page_dirty_in_slot(kvm, memslot, gfn);
+
+	return ret != -EAGAIN ? ret : 0;
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_s2_trans *nested,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  bool fault_is_perm)
 {
 	int ret = 0;
-	bool write_fault, writable, force_pte = false;
+	bool topup_memcache;
+	bool write_fault, writable;
 	bool exec_fault, mte_allowed, is_vma_cacheable;
 	bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
 	unsigned long mmu_seq;
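The gmem_abort() path added above keeps the same invalidation-retry discipline as user_mem_abort(): sample mmu_invalidate_seq before resolving the pfn, give up with -EAGAIN if an invalidation raced in under the fault lock, and treat that -EAGAIN as "done, let the vCPU re-fault". The following standalone sketch illustrates the contract; the struct and helpers are stand-ins, not the kernel's API.

/* Sketch of the snapshot/retry pattern used by gmem_abort(); all names
 * here are illustrative stand-ins, not kernel definitions. */
#include <errno.h>
#include <stdio.h>

struct toy_kvm {
	unsigned long mmu_invalidate_seq;	/* bumped by MMU invalidations */
};

static int toy_invalidate_retry(struct toy_kvm *kvm, unsigned long seq)
{
	/* The real helper also accounts for in-progress invalidations. */
	return kvm->mmu_invalidate_seq != seq;
}

static int toy_handle_fault(struct toy_kvm *kvm)
{
	unsigned long mmu_seq = kvm->mmu_invalidate_seq;	/* snapshot */
	int ret;

	/* ...resolve the pfn here; this step may sleep... */

	if (toy_invalidate_retry(kvm, mmu_seq))
		ret = -EAGAIN;	/* mapping went stale, don't install it */
	else
		ret = 0;	/* safe to install the stage-2 mapping */

	/* -EAGAIN is not an error: returning 0 makes the vCPU re-fault. */
	return ret != -EAGAIN ? ret : 0;
}

int main(void)
{
	struct toy_kvm kvm = { .mmu_invalidate_seq = 0 };

	printf("fault handled: %d\n", toy_handle_fault(&kvm));
	return 0;
}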
@@ -1495,46 +1614,30 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	gfn_t gfn;
 	kvm_pfn_t pfn;
 	bool logging_active = memslot_is_logging(memslot);
+	bool force_pte = logging_active;
 	long vma_pagesize, fault_granule;
 	enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
 	struct kvm_pgtable *pgt;
 	struct page *page;
 	vm_flags_t vm_flags;
-	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
+	enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_MEMABORT_FLAGS;
 
 	if (fault_is_perm)
 		fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
 	write_fault = kvm_is_write_fault(vcpu);
 	exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
-	VM_BUG_ON(write_fault && exec_fault);
-
-	if (fault_is_perm && !write_fault && !exec_fault) {
-		kvm_err("Unexpected L2 read permission error\n");
-		return -EFAULT;
-	}
-
-	if (!is_protected_kvm_enabled())
-		memcache = &vcpu->arch.mmu_page_cache;
-	else
-		memcache = &vcpu->arch.pkvm_memcache;
+	VM_WARN_ON_ONCE(write_fault && exec_fault);
 
 	/*
 	 * Permission faults just need to update the existing leaf entry,
 	 * and so normally don't require allocations from the memcache. The
 	 * only exception to this is when dirty logging is enabled at runtime
 	 * and a write fault needs to collapse a block entry into a table.
 	 */
-	if (!fault_is_perm || (logging_active && write_fault)) {
-		int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
-
-		if (!is_protected_kvm_enabled())
-			ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
-		else
-			ret = topup_hyp_memcache(memcache, min_pages);
-
-		if (ret)
-			return ret;
-	}
+	topup_memcache = !fault_is_perm || (logging_active && write_fault);
+	ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
+	if (ret)
+		return ret;
 
 	/*
 	 * Let's check if we will get back a huge page backed by hugetlbfs, or
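The memcache top-up condition is now computed once as a boolean and handed to prepare_mmu_memcache(). A small standalone check of that predicate, written in plain C to mirror the expression above rather than call any kernel code:

/* Mirrors: topup_memcache = !fault_is_perm || (logging_active && write_fault); */
#include <stdbool.h>
#include <stdio.h>

static bool needs_topup(bool fault_is_perm, bool logging_active, bool write_fault)
{
	return !fault_is_perm || (logging_active && write_fault);
}

int main(void)
{
	/* Permission fault without dirty logging: no allocation needed. */
	printf("%d\n", needs_topup(true, false, true));		/* 0 */
	/* Write permission fault with dirty logging: may split a block. */
	printf("%d\n", needs_topup(true, true, true));		/* 1 */
	/* Translation fault: always top up the cache. */
	printf("%d\n", needs_topup(false, false, false));	/* 1 */
	return 0;
}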
@@ -1548,16 +1651,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		return -EFAULT;
 	}
 
-	/*
-	 * logging_active is guaranteed to never be true for VM_PFNMAP
-	 * memslots.
-	 */
-	if (logging_active) {
-		force_pte = true;
+	if (force_pte)
 		vma_shift = PAGE_SHIFT;
-	} else {
+	else
 		vma_shift = get_vma_page_shift(vma, hva);
-	}
 
 	switch (vma_shift) {
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -1609,7 +1706,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			max_map_size = PAGE_SIZE;
 
 		force_pte = (max_map_size == PAGE_SIZE);
-		vma_pagesize = min(vma_pagesize, (long)max_map_size);
+		vma_pagesize = min_t(long, vma_pagesize, max_map_size);
 	}
 
 	/*
@@ -1642,7 +1739,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	 * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
 	 * with the smp_wmb() in kvm_mmu_invalidate_end().
 	 */
-	mmu_seq = vcpu->kvm->mmu_invalidate_seq;
+	mmu_seq = kvm->mmu_invalidate_seq;
 	mmap_read_unlock(current->mm);
 
 	pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
@@ -1698,24 +1795,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (exec_fault && s2_force_noncacheable)
 		return -ENOEXEC;
 
-	/*
-	 * Potentially reduce shadow S2 permissions to match the guest's own
-	 * S2. For exec faults, we'd only reach this point if the guest
-	 * actually allowed it (see kvm_s2_handle_perm_fault).
-	 *
-	 * Also encode the level of the original translation in the SW bits
-	 * of the leaf entry as a proxy for the span of that translation.
-	 * This will be retrieved on TLB invalidation from the guest and
-	 * used to limit the invalidation scope if a TTL hint or a range
-	 * isn't provided.
-	 */
-	if (nested) {
-		writable &= kvm_s2_trans_writable(nested);
-		if (!kvm_s2_trans_readable(nested))
-			prot &= ~KVM_PGTABLE_PROT_R;
-
-		prot |= kvm_encode_nested_level(nested);
-	}
+	if (nested)
+		adjust_nested_fault_perms(nested, &prot, &writable);
 
 	kvm_fault_lock(kvm);
 	pgt = vcpu->arch.hw_mmu->pgt;
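Both abort paths now share adjust_nested_fault_perms(), which only ever narrows the shadow stage-2 permissions to what the guest's own stage-2 grants. A toy, self-contained illustration of that "narrow, never widen" rule follows; the flag values and helper are hypothetical, not the kernel's KVM_PGTABLE_PROT_* definitions.

/* Hypothetical permission bits, for illustration only. */
#include <assert.h>
#include <stdbool.h>

#define TOY_PROT_R	(1u << 0)
#define TOY_PROT_W	(1u << 1)

static void toy_narrow_perms(unsigned int *prot, bool *writable,
			     bool guest_r, bool guest_w)
{
	*writable &= guest_w;		/* can only lose write permission */
	if (!guest_r)
		*prot &= ~TOY_PROT_R;	/* drop read if the guest S2 lacks it */
}

int main(void)
{
	unsigned int prot = TOY_PROT_R;
	bool writable = true;

	/* Guest S2 maps the page read-only: the shadow entry must follow. */
	toy_narrow_perms(&prot, &writable, true, false);
	assert(prot == TOY_PROT_R && !writable);
	return 0;
}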
@@ -1981,8 +2062,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
 		goto out_unlock;
 	}
 
-	ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
-			     esr_fsc_is_permission_fault(esr));
+	VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
+			!write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
+
+	if (kvm_slot_has_gmem(memslot))
+		ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
+				 esr_fsc_is_permission_fault(esr));
+	else
+		ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
+				     esr_fsc_is_permission_fault(esr));
 	if (ret == 0)
 		ret = 1;
 out:
@@ -2214,6 +2302,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
 		return -EFAULT;
 
+	/*
+	 * Only support guest_memfd backed memslots with mappable memory, since
+	 * there aren't any CoCo VMs that support only private memory on arm64.
+	 */
+	if (kvm_slot_has_gmem(new) && !kvm_memslot_is_gmem_only(new))
+		return -EINVAL;
+
 	hva = new->userspace_addr;
 	reg_end = hva + (new->npages << PAGE_SHIFT);
 
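Memslot registration also gains an up-front check: a guest_memfd-backed slot is accepted on arm64 only when guest_memfd can back all of the slot's memory, since no arm64 CoCo VM type uses it for private memory alone. A stub-typed sketch of that validation rule follows; the struct and fields are illustrative stand-ins for kvm_slot_has_gmem() and kvm_memslot_is_gmem_only().

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for struct kvm_memory_slot. */
struct toy_slot {
	bool has_gmem;		/* slot is backed by a guest_memfd */
	bool gmem_only;		/* guest_memfd backs every access (mappable) */
};

static int toy_check_slot(const struct toy_slot *slot)
{
	if (slot->has_gmem && !slot->gmem_only)
		return -EINVAL;	/* reject private-only gmem slots on arm64 */
	return 0;
}

int main(void)
{
	struct toy_slot rejected = { .has_gmem = true, .gmem_only = false };
	struct toy_slot accepted = { .has_gmem = true, .gmem_only = true };

	printf("%d %d\n", toy_check_slot(&rejected), toy_check_slot(&accepted));
	return 0;
}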