Skip to content

Commit 8d0ba6b

Browse files
committed
zig build: add an OOM-prevention system
The problem is that one may execute too many subprocesses concurrently that, together, exceed an RSS value that causes the OOM killer to kill something problematic such as the window manager. Or worse, nothing, and the system freezes. This is a real-world problem. For example, when building LLVM, a simple `ninja install` will bring your system to its knees if you don't know that you should add `-DLLVM_PARALLEL_LINK_JOBS=1`. In particular: compiling the zig std lib tests takes about 2G each, which at 16x at once (8 cores + hyperthreading) is using all 32GB of my RAM, causing the OOM killer to kill my window manager. The idea here is that you can annotate steps that might use a high amount of system resources with an upper bound. So for example I could mark the std lib tests as having an upper bound peak RSS of 3 GiB. Then the build system will do 2 things: 1. ulimit the child process, so that it will fail if it would exceed that memory limit. 2. Notice how much system RAM is available and avoid running too many concurrent jobs at once that would total more than that. This implements (1) not with an operating system enforced limit, but by checking the maxrss after a child process exits. However it does implement (2) correctly. The available memory used by the build system defaults to the total system memory, regardless of whether it is used by other processes at the time of spawning the build runner. This value can be overridden with the new --maxrss flag to `zig build`. This mechanism will ensure that the sum total of upper bound RSS memory of concurrent tasks will not exceed this value. This system makes it so that project maintainers can annotate problematic subprocesses, avoiding bug reports from users, who can blissfully execute `zig build` without worrying about the project's internals. Nobody's computer crashes, and the build system uses as much parallelism as possible without risking OOM.
Users do not need to unnecessarily resort to -j1 when the build system can figure this out for them.
1 parent 97385b9 commit 8d0ba6b

File tree

4 files changed

+207
-41
lines changed

4 files changed

+207
-41
lines changed

lib/build_runner.zig

Lines changed: 154 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -84,20 +84,21 @@ pub fn main() !void {
8484
);
8585
defer builder.destroy();
8686

87+
const Color = enum { auto, off, on };
88+
8789
var targets = ArrayList([]const u8).init(arena);
8890
var debug_log_scopes = ArrayList([]const u8).init(arena);
8991
var thread_pool_options: std.Thread.Pool.Options = .{ .allocator = arena };
9092

91-
const stderr_stream = io.getStdErr().writer();
92-
const stdout_stream = io.getStdOut().writer();
93-
9493
var install_prefix: ?[]const u8 = null;
9594
var dir_list = std.Build.DirList{};
9695
var enable_summary: ?bool = null;
97-
98-
const Color = enum { auto, off, on };
96+
var max_rss: usize = 0;
9997
var color: Color = .auto;
10098

99+
const stderr_stream = io.getStdErr().writer();
100+
const stdout_stream = io.getStdOut().writer();
101+
101102
while (nextArg(args, &arg_idx)) |arg| {
102103
if (mem.startsWith(u8, arg, "-D")) {
103104
const option_contents = arg[2..];
@@ -147,6 +148,18 @@ pub fn main() !void {
147148
usageAndErr(builder, false, stderr_stream);
148149
};
149150
builder.sysroot = sysroot;
151+
} else if (mem.eql(u8, arg, "--maxrss")) {
152+
const max_rss_text = nextArg(args, &arg_idx) orelse {
153+
std.debug.print("Expected argument after --sysroot\n\n", .{});
154+
usageAndErr(builder, false, stderr_stream);
155+
};
156+
// TODO: support shorthand such as "2GiB", "2GB", or "2G"
157+
max_rss = std.fmt.parseInt(usize, max_rss_text, 10) catch |err| {
158+
std.debug.print("invalid byte size: '{s}': {s}\n", .{
159+
max_rss_text, @errorName(err),
160+
});
161+
process.exit(1);
162+
};
150163
} else if (mem.eql(u8, arg, "--search-prefix")) {
151164
const search_prefix = nextArg(args, &arg_idx) orelse {
152165
std.debug.print("Expected argument after --search-prefix\n\n", .{});
@@ -280,30 +293,55 @@ pub fn main() !void {
280293
if (builder.validateUserInputDidItFail())
281294
usageAndErr(builder, true, stderr_stream);
282295

296+
var run: Run = .{
297+
.max_rss = max_rss,
298+
.max_rss_is_default = false,
299+
.max_rss_mutex = .{},
300+
.memory_blocked_steps = std.ArrayList(*Step).init(arena),
301+
302+
.claimed_rss = 0,
303+
.enable_summary = enable_summary,
304+
.ttyconf = ttyconf,
305+
.stderr = stderr,
306+
};
307+
308+
if (run.max_rss == 0) {
309+
run.max_rss = process.totalSystemMemory() catch std.math.maxInt(usize);
310+
run.max_rss_is_default = true;
311+
}
312+
283313
runStepNames(
284314
arena,
285315
builder,
286316
targets.items,
287317
main_progress_node,
288318
thread_pool_options,
289-
ttyconf,
290-
stderr,
291-
enable_summary,
319+
&run,
292320
) catch |err| switch (err) {
293321
error.UncleanExit => process.exit(1),
294322
else => return err,
295323
};
296324
}
297325

326+
/// State shared by `runStepNames` and every `workerMakeOneStep` invocation
/// for a single build run. A single instance is created in `main` and passed
/// around by pointer.
const Run = struct {
327+
/// Upper bound, in bytes, on the sum of `max_rss` of steps that may be
/// running at the same time. `main` replaces a value of 0 with
/// `process.totalSystemMemory()` (falling back to `maxInt(usize)`).
max_rss: usize,
328+
/// True when `max_rss` was filled in by `main` rather than taken from a
/// non-zero `--maxrss` value; only used to decide whether to print the
/// "use --maxrss" note.
max_rss_is_default: bool,
329+
/// Held by `workerMakeOneStep` while it reads or updates `claimed_rss` and
/// `memory_blocked_steps`.
max_rss_mutex: std.Thread.Mutex,
330+
/// Steps that were not started because adding their `max_rss` to
/// `claimed_rss` would have exceeded `max_rss`. They are re-spawned when a
/// memory-claiming step finishes and enough budget is free again.
memory_blocked_steps: std.ArrayList(*Step),
331+
332+
/// Sum of `max_rss` of the steps that have claimed memory and not yet
/// given it back.
claimed_rss: usize,
333+
/// Tri-state summary setting: `false` suppresses the build summary, `true`
/// forces it, and `null` prints it only on failure together with a
/// "(disable with -fno-summary)" hint.
enable_summary: ?bool,
334+
/// Terminal color configuration used when writing to `stderr`.
ttyconf: std.debug.TTY.Config,
335+
/// File that error messages and the build summary are written to.
stderr: std.fs.File,
336+
};
337+
298338
fn runStepNames(
299339
arena: std.mem.Allocator,
300340
b: *std.Build,
301341
step_names: []const []const u8,
302342
parent_prog_node: *std.Progress.Node,
303343
thread_pool_options: std.Thread.Pool.Options,
304-
ttyconf: std.debug.TTY.Config,
305-
stderr: std.fs.File,
306-
enable_summary: ?bool,
344+
run: *Run,
307345
) !void {
308346
const gpa = b.allocator;
309347
var step_stack: std.AutoArrayHashMapUnmanaged(*Step, void) = .{};
@@ -331,6 +369,26 @@ fn runStepNames(
331369
};
332370
}
333371

372+
{
373+
// Check that we have enough memory to complete the build.
374+
var any_problems = false;
375+
for (step_stack.keys()) |s| {
376+
if (s.max_rss == 0) continue;
377+
if (s.max_rss > run.max_rss) {
378+
std.debug.print("{s}{s}: this step declares an upper bound of {d} bytes of memory, exceeding the available {d} bytes of memory\n", .{
379+
s.owner.dep_prefix, s.name, s.max_rss, run.max_rss,
380+
});
381+
any_problems = true;
382+
}
383+
}
384+
if (any_problems) {
385+
if (run.max_rss_is_default) {
386+
std.debug.print("note: use --maxrss to override the default", .{});
387+
}
388+
return error.UncleanExit;
389+
}
390+
}
391+
334392
var thread_pool: std.Thread.Pool = undefined;
335393
try thread_pool.init(thread_pool_options);
336394
defer thread_pool.deinit();
@@ -353,10 +411,11 @@ fn runStepNames(
353411

354412
wait_group.start();
355413
thread_pool.spawn(workerMakeOneStep, .{
356-
&wait_group, &thread_pool, b, step, &step_prog, ttyconf,
414+
&wait_group, &thread_pool, b, step, &step_prog, run,
357415
}) catch @panic("OOM");
358416
}
359417
}
418+
assert(run.memory_blocked_steps.items.len == 0);
360419

361420
var success_count: usize = 0;
362421
var skipped_count: usize = 0;
@@ -396,9 +455,12 @@ fn runStepNames(
396455

397456
// A proper command line application defaults to silently succeeding.
398457
// The user may request verbose mode if they have a different preference.
399-
if (failure_count == 0 and enable_summary != true) return cleanExit();
458+
if (failure_count == 0 and run.enable_summary != true) return cleanExit();
459+
460+
const ttyconf = run.ttyconf;
461+
const stderr = run.stderr;
400462

401-
if (enable_summary != false) {
463+
if (run.enable_summary != false) {
402464
const total_count = success_count + failure_count + pending_count + skipped_count;
403465
ttyconf.setColor(stderr, .Cyan) catch {};
404466
stderr.writeAll("Build Summary:") catch {};
@@ -407,7 +469,7 @@ fn runStepNames(
407469
if (skipped_count > 0) stderr.writer().print("; {d} skipped", .{skipped_count}) catch {};
408470
if (failure_count > 0) stderr.writer().print("; {d} failed", .{failure_count}) catch {};
409471

410-
if (enable_summary == null) {
472+
if (run.enable_summary == null) {
411473
ttyconf.setColor(stderr, .Dim) catch {};
412474
stderr.writeAll(" (disable with -fno-summary)") catch {};
413475
ttyconf.setColor(stderr, .Reset) catch {};
@@ -623,7 +685,7 @@ fn workerMakeOneStep(
623685
b: *std.Build,
624686
s: *Step,
625687
prog_node: *std.Progress.Node,
626-
ttyconf: std.debug.TTY.Config,
688+
run: *Run,
627689
) void {
628690
defer wg.finish();
629691

@@ -646,10 +708,32 @@ fn workerMakeOneStep(
646708
}
647709
}
648710

649-
// Avoid running steps twice.
650-
if (@cmpxchgStrong(Step.State, &s.state, .precheck_done, .running, .SeqCst, .SeqCst) != null) {
651-
// Another worker got the job.
652-
return;
711+
if (s.max_rss != 0) {
712+
run.max_rss_mutex.lock();
713+
defer run.max_rss_mutex.unlock();
714+
715+
// Avoid running steps twice.
716+
if (s.state != .precheck_done) {
717+
// Another worker got the job.
718+
return;
719+
}
720+
721+
const new_claimed_rss = run.claimed_rss + s.max_rss;
722+
if (new_claimed_rss > run.max_rss) {
723+
// Running this step right now could possibly exceed the allotted RSS.
724+
// Add this step to the queue of memory-blocked steps.
725+
run.memory_blocked_steps.append(s) catch @panic("OOM");
726+
return;
727+
}
728+
729+
run.claimed_rss = new_claimed_rss;
730+
s.state = .running;
731+
} else {
732+
// Avoid running steps twice.
733+
if (@cmpxchgStrong(Step.State, &s.state, .precheck_done, .running, .SeqCst, .SeqCst) != null) {
734+
// Another worker got the job.
735+
return;
736+
}
653737
}
654738

655739
var sub_prog_node = prog_node.start(s.name, 0);
@@ -667,7 +751,8 @@ fn workerMakeOneStep(
667751
sub_prog_node.context.lock_stderr();
668752
defer sub_prog_node.context.unlock_stderr();
669753

670-
const stderr = std.io.getStdErr();
754+
const stderr = run.stderr;
755+
const ttyconf = run.ttyconf;
671756

672757
for (s.result_error_msgs.items) |msg| {
673758
// Sometimes it feels like you just can't catch a break. Finally,
@@ -684,22 +769,55 @@ fn workerMakeOneStep(
684769
}
685770
}
686771

687-
if (make_result) |_| {
688-
@atomicStore(Step.State, &s.state, .success, .SeqCst);
689-
} else |err| switch (err) {
690-
error.MakeFailed => {
691-
@atomicStore(Step.State, &s.state, .failure, .SeqCst);
692-
return;
693-
},
694-
error.MakeSkipped => @atomicStore(Step.State, &s.state, .skipped, .SeqCst),
772+
handle_result: {
773+
if (make_result) |_| {
774+
@atomicStore(Step.State, &s.state, .success, .SeqCst);
775+
} else |err| switch (err) {
776+
error.MakeFailed => {
777+
@atomicStore(Step.State, &s.state, .failure, .SeqCst);
778+
break :handle_result;
779+
},
780+
error.MakeSkipped => @atomicStore(Step.State, &s.state, .skipped, .SeqCst),
781+
}
782+
783+
// Successful completion of a step, so we queue up its dependants as well.
784+
for (s.dependants.items) |dep| {
785+
wg.start();
786+
thread_pool.spawn(workerMakeOneStep, .{
787+
wg, thread_pool, b, dep, prog_node, run,
788+
}) catch @panic("OOM");
789+
}
695790
}
696791

697-
// Successful completion of a step, so we queue up its dependants as well.
698-
for (s.dependants.items) |dep| {
699-
wg.start();
700-
thread_pool.spawn(workerMakeOneStep, .{
701-
wg, thread_pool, b, dep, prog_node, ttyconf,
702-
}) catch @panic("OOM");
792+
// If this is a step that claims resources, we must now queue up other
793+
// steps that are waiting for resources.
794+
if (s.max_rss != 0) {
795+
run.max_rss_mutex.lock();
796+
defer run.max_rss_mutex.unlock();
797+
798+
// Give the memory back to the scheduler.
799+
run.claimed_rss -= s.max_rss;
800+
// Avoid kicking off too many tasks that we already know will not have
801+
// enough resources.
802+
var remaining = run.max_rss - run.claimed_rss;
803+
var i: usize = 0;
804+
var j: usize = 0;
805+
while (j < run.memory_blocked_steps.items.len) : (j += 1) {
806+
const dep = run.memory_blocked_steps.items[j];
807+
assert(dep.max_rss != 0);
808+
if (dep.max_rss <= remaining) {
809+
remaining -= dep.max_rss;
810+
811+
wg.start();
812+
thread_pool.spawn(workerMakeOneStep, .{
813+
wg, thread_pool, b, dep, prog_node, run,
814+
}) catch @panic("OOM");
815+
} else {
816+
run.memory_blocked_steps.items[i] = dep;
817+
i += 1;
818+
}
819+
}
820+
run.memory_blocked_steps.shrinkRetainingCapacity(i);
703821
}
704822
}
705823

@@ -770,6 +888,7 @@ fn usage(builder: *std.Build, already_ran_build: bool, out_stream: anytype) !voi
770888
\\ --color [auto|off|on] Enable or disable colored error messages
771889
\\ --prominent-compile-errors Output compile errors formatted for a human to read
772890
\\ -j<N> Limit concurrent jobs (default is to use all CPU cores)
891+
\\ --maxrss <bytes> Limit memory usage (default is to use available memory)
773892
\\
774893
\\Project-Specific Options:
775894
\\

0 commit comments

Comments
 (0)