Skip to content

Commit 570a335

Browse files
Hugh Dickins authored and torvalds committed
swap_info: swap count continuations
Swap is duplicated (reference count incremented by one) whenever the same swap page is inserted into another mm (when forking finds a swap entry in place of a pte, or when reclaim unmaps a pte to insert the swap entry).

swap_info_struct's vmalloc'ed swap_map is the array of these reference counts: but what happens when the unsigned short (or unsigned char since the preceding patch) is full? (and its high bit is kept for a cache flag)

We then lose track of it, never freeing, leaving it in use until swapoff: at which point we _hope_ that a single pass will have found all instances, assume there are no more, and will lose user data if we're wrong.

Swapping of KSM pages has not yet been enabled; but it is implemented, and makes it very easy for a user to overflow the maximum swap count: possible with ordinary process pages, but unlikely, even when pid_max has been raised from PID_MAX_DEFAULT.

This patch implements swap count continuations: when the count overflows, a continuation page is allocated and linked to the original vmalloc'ed map page, and this is used to hold the continuation counts for that entry and its neighbours. These continuation pages are seldom referenced: the common paths all work on the original swap_map, only referring to a continuation page when the low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.

Signed-off-by: Hugh Dickins <[email protected]>
Cc: KAMEZAWA Hiroyuki <[email protected]>
Cc: Rik van Riel <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent 8d69aae commit 570a335

File tree

4 files changed

+287
-64
lines changed

4 files changed

+287
-64
lines changed

include/linux/swap.h

Lines changed: 16 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -145,15 +145,18 @@ enum {
145145
SWP_DISCARDABLE = (1 << 2), /* blkdev supports discard */
146146
SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */
147147
SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */
148+
SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */
148149
/* add others here before... */
149150
SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */
150151
};
151152

152153
#define SWAP_CLUSTER_MAX 32
153154

154-
#define SWAP_MAP_MAX 0x7e
155-
#define SWAP_MAP_BAD 0x7f
156-
#define SWAP_HAS_CACHE 0x80 /* There is a swap cache of entry. */
155+
#define SWAP_MAP_MAX 0x3e /* Max duplication count, in first swap_map */
156+
#define SWAP_MAP_BAD 0x3f /* Note pageblock is bad, in first swap_map */
157+
#define SWAP_HAS_CACHE 0x40 /* Flag page is cached, in first swap_map */
158+
#define SWAP_CONT_MAX 0x7f /* Max count, in each swap_map continuation */
159+
#define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */
157160

158161
/*
159162
* The in-memory structure used to track swap areas.
@@ -311,9 +314,10 @@ extern long total_swap_pages;
311314
extern void si_swapinfo(struct sysinfo *);
312315
extern swp_entry_t get_swap_page(void);
313316
extern swp_entry_t get_swap_page_of_type(int);
314-
extern void swap_duplicate(swp_entry_t);
315-
extern int swapcache_prepare(swp_entry_t);
316317
extern int valid_swaphandles(swp_entry_t, unsigned long *);
318+
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
319+
extern int swap_duplicate(swp_entry_t);
320+
extern int swapcache_prepare(swp_entry_t);
317321
extern void swap_free(swp_entry_t);
318322
extern void swapcache_free(swp_entry_t, struct page *page);
319323
extern int free_swap_and_cache(swp_entry_t);
@@ -385,8 +389,14 @@ static inline void show_swap_cache_info(void)
385389
#define free_swap_and_cache(swp) is_migration_entry(swp)
386390
#define swapcache_prepare(swp) is_migration_entry(swp)
387391

388-
static inline void swap_duplicate(swp_entry_t swp)
392+
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
389393
{
394+
return 0;
395+
}
396+
397+
static inline int swap_duplicate(swp_entry_t swp)
398+
{
399+
return 0;
390400
}
391401

392402
static inline void swap_free(swp_entry_t swp)

mm/memory.c

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -572,7 +572,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
572572
* covered by this vma.
573573
*/
574574

575-
static inline void
575+
static inline unsigned long
576576
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
577577
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
578578
unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
586586
if (!pte_file(pte)) {
587587
swp_entry_t entry = pte_to_swp_entry(pte);
588588

589-
swap_duplicate(entry);
589+
if (swap_duplicate(entry) < 0)
590+
return entry.val;
591+
590592
/* make sure dst_mm is on swapoff's mmlist. */
591593
if (unlikely(list_empty(&dst_mm->mmlist))) {
592594
spin_lock(&mmlist_lock);
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
635637

636638
out_set_pte:
637639
set_pte_at(dst_mm, addr, dst_pte, pte);
640+
return 0;
638641
}
639642

640643
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
646649
spinlock_t *src_ptl, *dst_ptl;
647650
int progress = 0;
648651
int rss[2];
652+
swp_entry_t entry = (swp_entry_t){0};
649653

650654
again:
651655
rss[1] = rss[0] = 0;
@@ -674,7 +678,10 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
674678
progress++;
675679
continue;
676680
}
677-
copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
681+
entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
682+
vma, addr, rss);
683+
if (entry.val)
684+
break;
678685
progress += 8;
679686
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
680687

@@ -684,6 +691,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
684691
add_mm_rss(dst_mm, rss[0], rss[1]);
685692
pte_unmap_unlock(orig_dst_pte, dst_ptl);
686693
cond_resched();
694+
695+
if (entry.val) {
696+
if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
697+
return -ENOMEM;
698+
progress = 0;
699+
}
687700
if (addr != end)
688701
goto again;
689702
return 0;

mm/rmap.c

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -822,7 +822,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
822822
* Store the swap location in the pte.
823823
* See handle_pte_fault() ...
824824
*/
825-
swap_duplicate(entry);
825+
if (swap_duplicate(entry) < 0) {
826+
set_pte_at(mm, address, pte, pteval);
827+
ret = SWAP_FAIL;
828+
goto out_unmap;
829+
}
826830
if (list_empty(&mm->mmlist)) {
827831
spin_lock(&mmlist_lock);
828832
if (list_empty(&mm->mmlist))

0 commit comments

Comments
 (0)