
Commit bf8d5d5

rgushchin authored and torvalds committed
memcg: introduce memory.min
Memory controller implements the memory.low best-effort memory protection mechanism, which works perfectly in many cases and allows protecting working sets of important workloads from sudden reclaim.

But its semantics has a significant limitation: it works only as long as there is a supply of reclaimable memory. This makes it pretty useless against any sort of slow memory leaks or memory usage increases. This is especially true for swapless systems. If swap is enabled, memory soft protection effectively postpones problems, allowing a leaking application to fill all swap area, which makes no sense. The only effective way to guarantee the memory protection in this case is to invoke the OOM killer.

It's possible to handle this case in userspace by reacting on MEMCG_LOW events; but there is still a place for a fail-safe in-kernel mechanism to provide stronger guarantees.

This patch introduces the memory.min interface for cgroup v2 memory controller. It works very similarly to memory.low (sharing the same hierarchical behavior), except that it's not disabled if there is no more reclaimable memory in the system.

If cgroup is not populated, its memory.min is ignored, because otherwise even the OOM killer wouldn't be able to reclaim the protected memory, and the system can stall.

[[email protected]: s/low/min/ in docs]
Link: http://lkml.kernel.org/r/[email protected]
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Roman Gushchin <[email protected]>
Reviewed-by: Randy Dunlap <[email protected]>
Acked-by: Johannes Weiner <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Vladimir Davydov <[email protected]>
Cc: Tejun Heo <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent fb52bba commit bf8d5d5
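As a usage illustration (not part of this commit): the new knob is an ordinary cgroup v2 control file, so a workload manager sets it by writing a byte value (or "max") to memory.min. The path below is hypothetical and assumes cgroup v2 is mounted at /sys/fs/cgroup with an existing "workload" cgroup:

/*
 * Illustrative only: set memory.min for a cgroup from userspace.
 * The cgroup path is an assumption, not taken from the commit.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/workload/memory.min";
	const char *val = "536870912\n";	/* protect ~512 MiB from reclaim */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open memory.min");
		return 1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write memory.min");
	close(fd);
	return 0;
}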

File tree

6 files changed: +202 -50 lines changed

Documentation/admin-guide/cgroup-v2.rst

Lines changed: 25 additions & 2 deletions
@@ -1001,6 +1001,29 @@ PAGE_SIZE multiple when read back.
 	The total amount of memory currently being used by the cgroup
 	and its descendants.
 
+  memory.min
+	A read-write single value file which exists on non-root
+	cgroups.  The default is "0".
+
+	Hard memory protection.  If the memory usage of a cgroup
+	is within its effective min boundary, the cgroup's memory
+	won't be reclaimed under any conditions. If there is no
+	unprotected reclaimable memory available, OOM killer
+	is invoked.
+
+	Effective min boundary is limited by memory.min values of
+	all ancestor cgroups. If there is memory.min overcommitment
+	(child cgroup or cgroups are requiring more protected memory
+	than parent will allow), then each child cgroup will get
+	the part of parent's protection proportional to its
+	actual memory usage below memory.min.
+
+	Putting more memory than generally available under this
+	protection is discouraged and may lead to constant OOMs.
+
+	If a memory cgroup is not populated with processes,
+	its memory.min is ignored.
+
   memory.low
 	A read-write single value file which exists on non-root
 	cgroups.  The default is "0".
@@ -1012,9 +1035,9 @@ PAGE_SIZE multiple when read back.
 
 	Effective low boundary is limited by memory.low values of
 	all ancestor cgroups. If there is memory.low overcommitment
-	(child cgroup or cgroups are requiring more protected memory,
+	(child cgroup or cgroups are requiring more protected memory
 	than parent will allow), then each child cgroup will get
-	the part of parent's protection proportional to the its
+	the part of parent's protection proportional to its
 	actual memory usage below memory.low.
 
 	Putting more memory than generally available under this
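A small userspace-style sketch (an illustration only, not kernel code) of the proportional split the documentation above describes: when children overcommit the parent's memory.min, each child's effective protection is scaled by its protected usage relative to its siblings. The helper name and the sample numbers are assumptions; the arithmetic mirrors the formula in mm/memcontrol.c below.

/* Illustrative sketch of the proportional split; assumes 64-bit unsigned long. */
#include <stdio.h>

/* effective min = min(own min, parent_emin * own protected usage / siblings' protected usage) */
static unsigned long effective_min(unsigned long own_min, unsigned long usage,
				   unsigned long parent_emin,
				   unsigned long siblings_min_usage)
{
	unsigned long emin = own_min < parent_emin ? own_min : parent_emin;
	unsigned long min_usage = usage < own_min ? usage : own_min;

	if (min_usage && siblings_min_usage) {
		unsigned long share = parent_emin * min_usage / siblings_min_usage;

		if (share < emin)
			emin = share;
	}
	return emin;
}

int main(void)
{
	/* Parent guarantees 2 GiB; children A and B each request 2 GiB. */
	unsigned long parent_emin = 2UL << 30;
	/* A uses 3 GiB (2 GiB of it protected); B uses 1 GiB (all protected). */
	unsigned long siblings = (2UL << 30) + (1UL << 30);
	unsigned long a = effective_min(2UL << 30, 3UL << 30, parent_emin, siblings);
	unsigned long b = effective_min(2UL << 30, 1UL << 30, parent_emin, siblings);

	printf("A gets %lu MiB, B gets %lu MiB\n", a >> 20, b >> 20); /* ~1365 and ~682 */
	return 0;
}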

include/linux/memcontrol.h

Lines changed: 11 additions & 4 deletions
@@ -58,6 +58,12 @@ enum memcg_memory_event {
 	MEMCG_NR_MEMORY_EVENTS,
 };
 
+enum mem_cgroup_protection {
+	MEMCG_PROT_NONE,
+	MEMCG_PROT_LOW,
+	MEMCG_PROT_MIN,
+};
+
 struct mem_cgroup_reclaim_cookie {
 	pg_data_t *pgdat;
 	int priority;
@@ -289,7 +295,8 @@ static inline bool mem_cgroup_disabled(void)
 	return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
+enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
+						struct mem_cgroup *memcg);
 
 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
@@ -734,10 +741,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg,
 {
 }
 
-static inline bool mem_cgroup_low(struct mem_cgroup *root,
-				  struct mem_cgroup *memcg)
+static inline enum mem_cgroup_protection mem_cgroup_protected(
+	struct mem_cgroup *root, struct mem_cgroup *memcg)
 {
-	return false;
+	return MEMCG_PROT_NONE;
 }
 
 static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
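The reclaim-side consumer of the new enum lives in mm/vmscan.c, one of the six changed files but not included in this excerpt. A hedged sketch of how a per-cgroup reclaim loop such as shrink_node() might act on the three protection levels (the sc fields are assumptions carried over from the existing memory.low handling):

	/* Sketch only: inside the per-memcg iteration of the reclaim path. */
	switch (mem_cgroup_protected(root, memcg)) {
	case MEMCG_PROT_MIN:
		/* Hard protection: never reclaim from this cgroup. */
		continue;
	case MEMCG_PROT_LOW:
		/* Soft protection: skip unless low-protected reclaim was allowed. */
		if (!sc->memcg_low_reclaim) {
			sc->memcg_low_skipped = 1;
			continue;
		}
		memcg_memory_event(memcg, MEMCG_LOW);
		break;
	case MEMCG_PROT_NONE:
		/* Not protected: reclaim normally. */
		break;
	}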

include/linux/page_counter.h

Lines changed: 9 additions & 2 deletions
@@ -8,10 +8,16 @@
 
 struct page_counter {
 	atomic_long_t usage;
-	unsigned long max;
+	unsigned long min;
 	unsigned long low;
+	unsigned long max;
 	struct page_counter *parent;
 
+	/* effective memory.min and memory.min usage tracking */
+	unsigned long emin;
+	atomic_long_t min_usage;
+	atomic_long_t children_min_usage;
+
 	/* effective memory.low and memory.low usage tracking */
 	unsigned long elow;
 	atomic_long_t low_usage;
@@ -47,8 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter,
 			     unsigned long nr_pages,
 			     struct page_counter **fail);
 void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
-int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
 void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
+int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
 int page_counter_memparse(const char *buf, const char *max,
 			  unsigned long *nr_pages);
 
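The matching mm/page_counter.c change is not included in this excerpt. A sketch of how the setter declared above could look, assuming a propagate_protected_usage() helper that keeps min_usage/children_min_usage in sync up the hierarchy (the same role propagate_low_usage() played for memory.low):

/* Sketch of the setter declared above; not the verbatim committed code. */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
	struct page_counter *c;

	counter->min = nr_pages;

	/* Re-propagate protected usage at every level above this counter. */
	for (c = counter; c; c = c->parent)
		propagate_protected_usage(c, atomic_long_read(&c->usage));
}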

mm/memcontrol.c

Lines changed: 94 additions & 24 deletions
@@ -4275,6 +4275,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	}
 	spin_unlock(&memcg->event_list_lock);
 
+	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
 
 	memcg_offline_kmem(memcg);
@@ -4329,6 +4330,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
 	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
 	memcg->high = PAGE_COUNTER_MAX;
 	memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -5066,6 +5068,36 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
 	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
 }
 
+static int memory_min_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+	unsigned long min = READ_ONCE(memcg->memory.min);
+
+	if (min == PAGE_COUNTER_MAX)
+		seq_puts(m, "max\n");
+	else
+		seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
+
+	return 0;
+}
+
+static ssize_t memory_min_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned long min;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "max", &min);
+	if (err)
+		return err;
+
+	page_counter_set_min(&memcg->memory, min);
+
+	return nbytes;
+}
+
 static int memory_low_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -5300,6 +5332,12 @@ static struct cftype memory_files[] = {
 		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_u64 = memory_current_read,
 	},
+	{
+		.name = "min",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_min_show,
+		.write = memory_min_write,
+	},
 	{
 		.name = "low",
 		.flags = CFTYPE_NOT_ON_ROOT,
@@ -5349,19 +5387,24 @@ struct cgroup_subsys memory_cgrp_subsys = {
 };
 
 /**
- * mem_cgroup_low - check if memory consumption is in the normal range
+ * mem_cgroup_protected - check if memory consumption is in the normal range
  * @root: the top ancestor of the sub-tree being checked
  * @memcg: the memory cgroup to check
  *
 * WARNING: This function is not stateless! It can only be used as part
 * of a top-down tree iteration, not for isolated queries.
 *
- * Returns %true if memory consumption of @memcg is in the normal range.
+ * Returns one of the following:
+ *   MEMCG_PROT_NONE: cgroup memory is not protected
+ *   MEMCG_PROT_LOW: cgroup memory is protected as long there is
+ *     an unprotected supply of reclaimable memory from other cgroups.
+ *   MEMCG_PROT_MIN: cgroup memory is protected
 *
- * @root is exclusive; it is never low when looked at directly
+ * @root is exclusive; it is never protected when looked at directly
 *
- * To provide a proper hierarchical behavior, effective memory.low value
- * is used.
+ * To provide a proper hierarchical behavior, effective memory.min/low values
+ * are used. Below is the description of how effective memory.low is calculated.
+ * Effective memory.min values is calculated in the same way.
 *
 * Effective memory.low is always equal or less than the original memory.low.
 * If there is no memory.low overcommittment (which is always true for
@@ -5406,51 +5449,78 @@ struct cgroup_subsys memory_cgrp_subsys = {
 * E/memory.current = 0
 *
 * These calculations require constant tracking of the actual low usages
- * (see propagate_low_usage()), as well as recursive calculation of
- * effective memory.low values. But as we do call mem_cgroup_low()
+ * (see propagate_protected_usage()), as well as recursive calculation of
+ * effective memory.low values. But as we do call mem_cgroup_protected()
 * path for each memory cgroup top-down from the reclaim,
 * it's possible to optimize this part, and save calculated elow
 * for next usage. This part is intentionally racy, but it's ok,
 * as memory.low is a best-effort mechanism.
 */
-bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
+enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
+						struct mem_cgroup *memcg)
 {
-	unsigned long usage, low_usage, siblings_low_usage;
-	unsigned long elow, parent_elow;
 	struct mem_cgroup *parent;
+	unsigned long emin, parent_emin;
+	unsigned long elow, parent_elow;
+	unsigned long usage;
 
 	if (mem_cgroup_disabled())
-		return false;
+		return MEMCG_PROT_NONE;
 
 	if (!root)
 		root = root_mem_cgroup;
 	if (memcg == root)
-		return false;
+		return MEMCG_PROT_NONE;
 
-	elow = memcg->memory.low;
 	usage = page_counter_read(&memcg->memory);
-	parent = parent_mem_cgroup(memcg);
+	if (!usage)
+		return MEMCG_PROT_NONE;
+
+	emin = memcg->memory.min;
+	elow = memcg->memory.low;
 
+	parent = parent_mem_cgroup(memcg);
 	if (parent == root)
 		goto exit;
 
+	parent_emin = READ_ONCE(parent->memory.emin);
+	emin = min(emin, parent_emin);
+	if (emin && parent_emin) {
+		unsigned long min_usage, siblings_min_usage;
+
+		min_usage = min(usage, memcg->memory.min);
+		siblings_min_usage = atomic_long_read(
+			&parent->memory.children_min_usage);
+
+		if (min_usage && siblings_min_usage)
+			emin = min(emin, parent_emin * min_usage /
+				   siblings_min_usage);
+	}
+
 	parent_elow = READ_ONCE(parent->memory.elow);
 	elow = min(elow, parent_elow);
+	if (elow && parent_elow) {
+		unsigned long low_usage, siblings_low_usage;
 
-	if (!elow || !parent_elow)
-		goto exit;
+		low_usage = min(usage, memcg->memory.low);
+		siblings_low_usage = atomic_long_read(
+			&parent->memory.children_low_usage);
 
-	low_usage = min(usage, memcg->memory.low);
-	siblings_low_usage = atomic_long_read(
-		&parent->memory.children_low_usage);
-
-	if (!low_usage || !siblings_low_usage)
-		goto exit;
+		if (low_usage && siblings_low_usage)
+			elow = min(elow, parent_elow * low_usage /
+				   siblings_low_usage);
+	}
 
-	elow = min(elow, parent_elow * low_usage / siblings_low_usage);
 exit:
+	memcg->memory.emin = emin;
 	memcg->memory.elow = elow;
-	return usage && usage <= elow;
+
+	if (usage <= emin)
		return MEMCG_PROT_MIN;
+	else if (usage <= elow)
+		return MEMCG_PROT_LOW;
+	else
+		return MEMCG_PROT_NONE;
 }
 
 /**
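The kernel-doc comment above points at propagate_protected_usage(), which is added in mm/page_counter.c and not shown in this excerpt. A hedged sketch of the min-side bookkeeping it is expected to perform, mirroring the existing low-side tracking; the committed version may differ in detail:

/* Sketch only; helper and field names follow the page_counter declarations above. */
static void propagate_protected_usage(struct page_counter *c,
				      unsigned long usage)
{
	unsigned long protected, old_protected;
	long delta;

	if (!c->parent)
		return;

	if (c->min || atomic_long_read(&c->min_usage)) {
		/* Only usage below memory.min counts as protected. */
		protected = (usage <= c->min) ? usage : 0;

		old_protected = atomic_long_xchg(&c->min_usage, protected);
		delta = protected - old_protected;
		if (delta)
			atomic_long_add(delta, &c->parent->children_min_usage);
	}

	/* ... an analogous block updates low_usage/children_low_usage ... */
}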
