@@ -451,33 +451,84 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
 }
 
+static int __dax_invalidate_mapping_entry(struct address_space *mapping,
+					  pgoff_t index, bool trunc)
+{
+	int ret = 0;
+	void *entry;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!entry || !radix_tree_exceptional_entry(entry))
+		goto out;
+	if (!trunc &&
+	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
+		goto out;
+	radix_tree_delete(page_tree, index);
+	mapping->nrexceptional--;
+	ret = 1;
+out:
+	put_unlocked_mapping_entry(mapping, index, entry);
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
 /*
  * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
  * entry to get unlocked before deleting it.
  */
 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
-	void *entry;
+	int ret = __dax_invalidate_mapping_entry(mapping, index, true);
 
-	spin_lock_irq(&mapping->tree_lock);
-	entry = get_unlocked_mapping_entry(mapping, index, NULL);
 	/*
 	 * This gets called from truncate / punch_hole path. As such, the caller
 	 * must hold locks protecting against concurrent modifications of the
 	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
 	 * caller has seen exceptional entry for this index, we better find it
 	 * at that index as well...
 	 */
-	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
-		spin_unlock_irq(&mapping->tree_lock);
-		return 0;
-	}
-	radix_tree_delete(&mapping->page_tree, index);
+	WARN_ON_ONCE(!ret);
+	return ret;
+}
+
+/*
+ * Invalidate exceptional DAX entry if easily possible. This handles DAX
+ * entries for invalidate_inode_pages() so we evict the entry only if we can
+ * do so without blocking.
+ */
+int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	int ret = 0;
+	void *entry, **slot;
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
+	if (!entry || !radix_tree_exceptional_entry(entry) ||
+	    slot_locked(mapping, slot))
+		goto out;
+	if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
+	    radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto out;
+	radix_tree_delete(page_tree, index);
 	mapping->nrexceptional--;
+	ret = 1;
+out:
 	spin_unlock_irq(&mapping->tree_lock);
-	dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	if (ret)
+		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
+	return ret;
+}
 
-	return 1;
+/*
+ * Invalidate exceptional DAX entry if it is clean.
+ */
+int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
+				      pgoff_t index)
+{
+	return __dax_invalidate_mapping_entry(mapping, index, false);
 }
 
 /*
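The two helpers exported by this hunk are aimed at the generic page cache invalidation paths (the comment above mentions invalidate_inode_pages()). As a rough sketch of how a caller might choose between them, consider the hypothetical dispatcher below; the wrapper name and the assumed call sites (such as invalidate_mapping_pages() and invalidate_inode_pages2_range() in mm/truncate.c) are illustrative assumptions, not part of this diff.

/*
 * Illustration only, not part of this patch: a hypothetical dispatcher
 * for DAX exceptional entries met during page cache invalidation.
 * dax_invalidate_mapping_entry() never blocks and simply skips entries
 * that are locked or dirty, which suits best-effort callers; the _sync
 * variant waits for a locked entry but still refuses to drop a dirty
 * one.  Both return 1 when the entry was removed and 0 otherwise, so a
 * caller can map a zero return to -EBUSY or "try again later".
 */
static int dax_try_invalidate_entry(struct address_space *mapping,
				    pgoff_t index, bool sync)
{
	if (sync)
		return dax_invalidate_mapping_entry_sync(mapping, index);
	return dax_invalidate_mapping_entry(mapping, index);
}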
@@ -488,24 +539,34 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, void *entry,
+static int dax_load_hole(struct address_space *mapping, void **entry,
 			 struct vm_fault *vmf)
 {
 	struct page *page;
+	int ret;
 
 	/* Hole page already exists? Return it... */
-	if (!radix_tree_exceptional_entry(entry)) {
-		vmf->page = entry;
-		return VM_FAULT_LOCKED;
+	if (!radix_tree_exceptional_entry(*entry)) {
+		page = *entry;
+		goto out;
 	}
 
 	/* This will replace locked radix tree entry with a hole page */
 	page = find_or_create_page(mapping, vmf->pgoff,
 				   vmf->gfp_mask | __GFP_ZERO);
 	if (!page)
 		return VM_FAULT_OOM;
+ out:
 	vmf->page = page;
-	return VM_FAULT_LOCKED;
+	ret = finish_fault(vmf);
+	vmf->page = NULL;
+	*entry = page;
+	if (!ret) {
+		/* Grab reference for PTE that is now referencing the page */
+		get_page(page);
+		return VM_FAULT_NOPAGE;
+	}
+	return ret;
 }
 
 static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
 		return -EIO;
 
+	/*
+	 * Write can allocate block for an area which has a hole page mapped
+	 * into page tables. We have to tear down these mappings so that data
+	 * written by write(2) is visible in mmap.
+	 */
+	if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
+		invalidate_inode_pages2_range(inode->i_mapping,
+					      pos >> PAGE_SHIFT,
+					      (end - 1) >> PAGE_SHIFT);
+	}
+
 	while (pos < end) {
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		struct blk_dax_ctl dax = { 0 };
@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 	if (iov_iter_rw(iter) == WRITE)
 		flags |= IOMAP_WRITE;
 
-	/*
-	 * Yes, even DAX files can have page cache attached to them: A zeroed
-	 * page is inserted into the pagecache when we have to serve a write
-	 * fault on a hole. It should never be dirtied and can simply be
-	 * dropped from the pagecache once we get real data for the page.
-	 *
-	 * XXX: This is racy against mmap, and there's nothing we can do about
-	 * it. We'll eventually need to shift this down even further so that
-	 * we can check if we allocated blocks over a hole first.
-	 */
-	if (mapping->nrpages) {
-		ret = invalidate_inode_pages2_range(mapping,
-					pos >> PAGE_SHIFT,
-					(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
-		WARN_ON_ONCE(ret);
-	}
-
 	while (iov_iter_count(iter)) {
 		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
 				iter, dax_iomap_actor);
@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;		/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
+		goto finish_iomap;
+	}
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
 		goto finish_iomap;
 	}
 
@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
-			vmf_ret = dax_load_hole(mapping, entry, vmf);
-			break;
+			vmf_ret = dax_load_hole(mapping, &entry, vmf);
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1128,31 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
+ error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
-	}
- unlock_entry:
-	if (vmf_ret != VM_FAULT_LOCKED || error)
-		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error);	/* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
-	return VM_FAULT_NOPAGE | major;
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
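With the exit paths above, the errno-to-VM_FAULT translation that used to live at the old out: label is centralized in dax_fault_return(), and ->iomap_end is now told how much of the range was actually consumed (0 on a failed fault, the full size otherwise) while its return value is deliberately ignored, because the installed PTE may already be in use. The snippet below is only a sketch of what that convention means on the filesystem side; the function is hypothetical and not taken from this patch or any real filesystem.

/*
 * Hypothetical ->iomap_end, for illustration of the "copied" convention
 * used by the fault handlers: written == 0 signals that the fault
 * failed (or fell back), so a filesystem could undo speculative work
 * done in ->iomap_begin; any error returned here is ignored by the
 * fault path because another thread may already be using the mapping.
 */
static int example_dax_iomap_end(struct inode *inode, loff_t pos,
				 loff_t length, ssize_t written,
				 unsigned flags, struct iomap *iomap)
{
	if ((flags & IOMAP_WRITE) && !written) {
		/* e.g. trim blocks that were reserved for this fault */
	}
	return 0;
}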
@@ -1276,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
 		goto fallback;
 
-	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry. If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
 	/*
 	 * Note that we don't use iomap_apply here. We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry. If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 					   &entry);
 		break;
@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		break;
 	}
 
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
 	}
- unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);