Skip to content

Commit 6769183

Browse files
ryncsn authored and akpm00 committed
mm/swap_cgroup: decouple swap cgroup recording and clearing
The current implementation of swap cgroup tracking is a bit complex and fragile: On charging path, swap_cgroup_record always records an actual memcg id, and it depends on the caller to make sure all entries passed in must belong to one single folio. As folios are always charged or uncharged as a whole, and always charged and uncharged in order, swap_cgroup doesn't need an extra lock. On uncharging path, swap_cgroup_record always sets the record to zero. These entries won't be charged again until uncharging is done. So there is no extra lock needed either. Worth noting that swap cgroup clearing may happen without folio involved, eg. exiting processes will zap its page table without swapin. The xchg/cmpxchg provides atomic operations and barriers to ensure no tearing or synchronization issue of these swap cgroup records. It works but is quite error-prone. Things can be much clearer and more robust by decoupling recording and clearing into two helpers. Recording takes the actual folio being charged as argument, and clearing always sets the record to zero, and the debug sanity checks are refined to better reflect their usage. Benchmark even showed a very slight improvement as it saved some extra arguments and lookups: make -j96 with defconfig on tmpfs in 1.5G memory cgroup using 4k folios: Before: sys 9617.23 (stdev 37.764062) After : sys 9541.54 (stdev 42.973976) make -j96 with defconfig on tmpfs in 2G memory cgroup using 64k folios: Before: sys 7358.98 (stdev 54.927593) After : sys 7337.82 (stdev 39.398956) Link: https://lkml.kernel.org/r/[email protected] Signed-off-by: Kairui Song <[email protected]> Suggested-by: Chris Li <[email protected]> Cc: Barry Song <[email protected]> Cc: Hugh Dickins <[email protected]> Cc: Johannes Weiner <[email protected]> Cc: Michal Hocko <[email protected]> Cc: Roman Gushchin <[email protected]> Cc: Shakeel Butt <[email protected]> Cc: Yosry Ahmed <[email protected]> Signed-off-by: Andrew Morton <[email protected]>
1 parent 2b3a58b commit 6769183

File tree

3 files changed

+55
-36
lines changed

3 files changed

+55
-36
lines changed

include/linux/swap_cgroup.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,17 +6,21 @@
66

77
#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
88

9-
extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
10-
unsigned int nr_ents);
9+
extern void swap_cgroup_record(struct folio *folio, swp_entry_t ent);
10+
extern unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents);
1111
extern unsigned short lookup_swap_cgroup_id(swp_entry_t ent);
1212
extern int swap_cgroup_swapon(int type, unsigned long max_pages);
1313
extern void swap_cgroup_swapoff(int type);
1414

1515
#else
1616

1717
static inline
18-
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
19-
unsigned int nr_ents)
18+
void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
19+
{
20+
}
21+
22+
static inline
23+
unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents)
2024
{
2125
return 0;
2226
}

mm/memcontrol.c

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4959,7 +4959,6 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
49594959
{
49604960
struct mem_cgroup *memcg, *swap_memcg;
49614961
unsigned int nr_entries;
4962-
unsigned short oldid;
49634962

49644963
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
49654964
VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
@@ -4986,11 +4985,10 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
49864985
/* Get references for the tail pages, too */
49874986
if (nr_entries > 1)
49884987
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
4989-
oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
4990-
nr_entries);
4991-
VM_BUG_ON_FOLIO(oldid, folio);
49924988
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
49934989

4990+
swap_cgroup_record(folio, entry);
4991+
49944992
folio_unqueue_deferred_split(folio);
49954993
folio->memcg_data = 0;
49964994

@@ -5021,7 +5019,6 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
50215019
unsigned int nr_pages = folio_nr_pages(folio);
50225020
struct page_counter *counter;
50235021
struct mem_cgroup *memcg;
5024-
unsigned short oldid;
50255022

50265023
if (do_memsw_account())
50275024
return 0;
@@ -5050,10 +5047,10 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry)
50505047
/* Get references for the tail pages, too */
50515048
if (nr_pages > 1)
50525049
mem_cgroup_id_get_many(memcg, nr_pages - 1);
5053-
oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
5054-
VM_BUG_ON_FOLIO(oldid, folio);
50555050
mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
50565051

5052+
swap_cgroup_record(folio, entry);
5053+
50575054
return 0;
50585055
}
50595056

@@ -5067,7 +5064,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
50675064
struct mem_cgroup *memcg;
50685065
unsigned short id;
50695066

5070-
id = swap_cgroup_record(entry, 0, nr_pages);
5067+
id = swap_cgroup_clear(entry, nr_pages);
50715068
rcu_read_lock();
50725069
memcg = mem_cgroup_from_id(id);
50735070
if (memcg) {

mm/swap_cgroup.c

Lines changed: 42 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,6 @@ struct swap_cgroup_ctrl {
2121

2222
static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
2323

24-
/*
25-
* SwapCgroup implements "lookup" and "exchange" operations.
26-
* In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
27-
* against SwapCache. At swap_free(), this is accessed directly from swap.
28-
*
29-
* This means,
30-
* - we have no race in "exchange" when we're accessed via SwapCache because
31-
* SwapCache(and its swp_entry) is under lock.
32-
* - When called via swap_free(), there is no user of this entry and no race.
33-
* Then, we don't need lock around "exchange".
34-
*/
3524
static unsigned short __swap_cgroup_id_lookup(struct swap_cgroup *map,
3625
pgoff_t offset)
3726
{
@@ -63,29 +52,58 @@ static unsigned short __swap_cgroup_id_xchg(struct swap_cgroup *map,
6352
}
6453

6554
/**
66-
* swap_cgroup_record - record mem_cgroup for a set of swap entries
55+
* swap_cgroup_record - record mem_cgroup for a set of swap entries.
56+
* These entries must belong to one single folio, and that folio
57+
* must be being charged for swap space (swap out), and these
58+
* entries must not have been charged
59+
*
60+
* @folio: the folio that the swap entry belongs to
61+
* @ent: the first swap entry to be recorded
62+
*/
63+
void swap_cgroup_record(struct folio *folio, swp_entry_t ent)
64+
{
65+
unsigned int nr_ents = folio_nr_pages(folio);
66+
struct swap_cgroup *map;
67+
pgoff_t offset, end;
68+
unsigned short old;
69+
70+
offset = swp_offset(ent);
71+
end = offset + nr_ents;
72+
map = swap_cgroup_ctrl[swp_type(ent)].map;
73+
74+
do {
75+
old = __swap_cgroup_id_xchg(map, offset,
76+
mem_cgroup_id(folio_memcg(folio)));
77+
VM_BUG_ON(old);
78+
} while (++offset != end);
79+
}
80+
81+
/**
82+
* swap_cgroup_clear - clear mem_cgroup for a set of swap entries.
83+
* These entries must be being uncharged from swap. They either
84+
* belongs to one single folio in the swap cache (swap in for
85+
* cgroup v1), or no longer have any users (slot freeing).
86+
*
6787
* @ent: the first swap entry to be recorded into
68-
* @id: mem_cgroup to be recorded
6988
* @nr_ents: number of swap entries to be recorded
7089
*
71-
* Returns old value at success, 0 at failure.
72-
* (Of course, old value can be 0.)
90+
* Returns the existing old value.
7391
*/
74-
unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
75-
unsigned int nr_ents)
92+
unsigned short swap_cgroup_clear(swp_entry_t ent, unsigned int nr_ents)
7693
{
77-
struct swap_cgroup_ctrl *ctrl;
7894
pgoff_t offset = swp_offset(ent);
7995
pgoff_t end = offset + nr_ents;
80-
unsigned short old, iter;
8196
struct swap_cgroup *map;
97+
unsigned short old, iter = 0;
8298

83-
ctrl = &swap_cgroup_ctrl[swp_type(ent)];
84-
map = ctrl->map;
99+
offset = swp_offset(ent);
100+
end = offset + nr_ents;
101+
map = swap_cgroup_ctrl[swp_type(ent)].map;
85102

86-
old = __swap_cgroup_id_lookup(map, offset);
87103
do {
88-
iter = __swap_cgroup_id_xchg(map, offset, id);
104+
old = __swap_cgroup_id_xchg(map, offset, 0);
105+
if (!iter)
106+
iter = old;
89107
VM_BUG_ON(iter != old);
90108
} while (++offset != end);
91109

@@ -119,7 +137,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
119137

120138
BUILD_BUG_ON(sizeof(unsigned short) * ID_PER_SC !=
121139
sizeof(struct swap_cgroup));
122-
map = vcalloc(DIV_ROUND_UP(max_pages, ID_PER_SC),
140+
map = vzalloc(DIV_ROUND_UP(max_pages, ID_PER_SC) *
123141
sizeof(struct swap_cgroup));
124142
if (!map)
125143
goto nomem;

0 commit comments

Comments
 (0)