Skip to content

Commit 3b534bd

Browse files
committed
Ensuring that backoff restarts work in process supervision
Minor bash fixups rebase fixups
1 parent 658ffe4 commit 3b534bd

File tree

18 files changed

+214
-102
lines changed

18 files changed

+214
-102
lines changed
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift Distributed Actors open source project
4+
//
5+
// Copyright (c) 2018-2019 Apple Inc. and the Swift Distributed Actors project authors
6+
// Licensed under Apache License v2.0
7+
//
8+
// See LICENSE.txt for license information
9+
// See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors
10+
//
11+
// SPDX-License-Identifier: Apache-2.0
12+
//
13+
//===----------------------------------------------------------------------===//
14+
15+
#if os(OSX)
16+
import Darwin.C
17+
#else
18+
import Glibc
19+
#endif
20+
21+
import DistributedActors
22+
23+
let isolated = ProcessIsolated { boot in
24+
boot.settings.defaultLogLevel = .info
25+
boot.runOn(role: .servant) {
26+
boot.settings.failure.onGuardianFailure = .systemExit(-1)
27+
}
28+
return ActorSystem(settings: boot.settings)
29+
}
30+
31+
pprint("Started process: \(getpid()) with roles: \(isolated.roles)")
32+
33+
struct OnPurposeBoom: Error {}
34+
35+
isolated.run(on: .master) {
36+
isolated.spawnServantProcess(supervision:
37+
.respawn(
38+
atMost: 5, within: nil,
39+
backoff: Backoff.exponential(
40+
initialInterval: .milliseconds(100),
41+
multiplier: 1.5,
42+
randomFactor: 0
43+
)
44+
)
45+
)
46+
}
47+
48+
try isolated.run(on: .servant) {
49+
isolated.system.log.info("ISOLATED RUNNING: \(CommandLine.arguments)")
50+
51+
_ = try isolated.system.spawn("failed", of: String.self,
52+
props: Props().supervision(strategy: .escalate),
53+
.setup { context in
54+
context.log.info("Spawned \(context.path) on servant node it will fail soon...")
55+
context.timers.startSingle(key: "explode", message: "Boom", delay: .seconds(1))
56+
57+
return .receiveMessage { message in
58+
context.log.error("Time to crash with: fatalError")
59+
// crashes process since we do not isolate faults
60+
fatalError("FATAL ERROR ON PURPOSE")
61+
}
62+
})
63+
}
64+
65+
// finally, once prepared, you have to invoke the following:
66+
// which will BLOCK on the master process and use the main thread to
67+
// process any incoming process commands (e.g. spawn another servant)
68+
isolated.blockAndSuperviseServants()
69+
70+
// ~~~ unreachable ~~~

IntegrationTests/tests_02_process_isolated/it_ProcessIsolated_escalatingWorkers/main.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ pprint("Started process: \(getpid()) with roles: \(isolated.roles)")
3333
struct OnPurposeBoom: Error {}
3434

3535
isolated.run(on: .master) {
36-
isolated.spawnServantProcess(supervision: .replace(atMost: 1, within: nil), args: ["fatalError"])
37-
isolated.spawnServantProcess(supervision: .replace(atMost: 1, within: nil), args: ["escalateError"])
36+
isolated.spawnServantProcess(supervision: .respawn(atMost: 1, within: nil), args: ["fatalError"])
37+
isolated.spawnServantProcess(supervision: .respawn(atMost: 1, within: nil), args: ["escalateError"])
3838
}
3939

4040
try isolated.run(on: .servant) {

IntegrationTests/tests_02_process_isolated/it_ProcessIsolated_noLeaking/main.swift

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ let isolated = ProcessIsolated { boot in
3232
pprint("Started process: \(getpid()) with roles: \(isolated.roles)")
3333

3434
// though one can ensure to only run if in a process of a given role:
35-
try isolated.run(on: .master) {
35+
isolated.run(on: .master) {
3636
// open some fds, hope to not leak them into children!
3737
var fds: [Int] = []
3838
for i in 1 ... 1000 {
@@ -43,7 +43,7 @@ try isolated.run(on: .master) {
4343

4444
/// spawn a servant
4545

46-
isolated.spawnServantProcess(supervision: .replace(atMost: 100, within: .seconds(1)), args: ["ALPHA"])
46+
isolated.spawnServantProcess(supervision: .respawn(atMost: 100, within: .seconds(1)), args: ["ALPHA"])
4747
}
4848

4949
// finally, once prepared, you have to invoke the following:

IntegrationTests/tests_02_process_isolated/it_ProcessIsolated_respawnsServants/main.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ try isolated.run(on: .master) {
4949
})
5050

5151
// should we allow anyone to issue this, or only on master? we could `runOnMaster { control` etc
52-
isolated.spawnServantProcess(supervision: .replace(atMost: 100, within: .seconds(1)), args: ["ALPHA"])
52+
isolated.spawnServantProcess(supervision: .respawn(atMost: 100, within: .seconds(1)), args: ["ALPHA"])
5353
}
5454

5555
// Notice that master has no workers, just the pool...

IntegrationTests/tests_02_process_isolated/shared.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,11 @@
1313
##
1414
##===----------------------------------------------------------------------===##
1515

16+
RED='\033[0;31m'
17+
RST='\033[0m'
18+
1619
function echoerr() {
17-
echo "$@" 1>&2;
20+
echo "${RED}$@${RST}" 1>&2;
1821
}
1922

2023
function _killall() {

IntegrationTests/tests_02_process_isolated/test_03_servant_spawning_not_leak_fds.sh

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,6 @@
1616
set -e
1717
#set -x # verbose
1818

19-
declare -r RED='\033[0;31m'
20-
declare -r RST='\033[0m'
21-
2219
declare -r my_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
2320
declare -r root_path="$my_path/.."
2421

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,6 @@
1616
set -e
1717
#set -x # verbose
1818

19-
declare -r RED='\033[0;31m'
20-
declare -r RST='\033[0m'
21-
2219
declare -r my_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
2320
declare -r root_path="$my_path/.."
2421

@@ -41,14 +38,14 @@ declare -r log_file="/tmp/${app_name}.log"
4138
rm -f ${log_file}
4239
swift run ${app_name} > ${log_file} &
4340

44-
declare -r supervision_replace_grep_txt='supervision: REPLACE'
41+
declare -r supervision_respawn_grep_txt='supervision: RESPAWN'
4542
declare -r supervision_stop_grep_txt='supervision: STOP'
4643

47-
# we want to wait until 2 STOPs are found in the logs; then we can check if the other conditions are as we expect
48-
echo "Waiting for servants to REPLACE and STOP..."
44+
# we want to wait until 3 STOPs are found in the logs; then we can check if the other conditions are as we expect
45+
echo "Waiting for servant to RESPAWN a few times..."
4946
spin=1 # spin counter
5047
max_spins=20
51-
while [[ $(cat ${log_file} | grep "${supervision_stop_grep_txt}" | wc -l) -ne 2 ]]; do
48+
while [[ $(cat ${log_file} | grep "${supervision_stop_grep_txt}" | wc -l) -ne 3 ]]; do
5249
sleep 1
5350
spin=$((spin+1))
5451
if [[ ${spin} -eq ${max_spins} ]]; then
@@ -59,15 +56,15 @@ while [[ $(cat ${log_file} | grep "${supervision_stop_grep_txt}" | wc -l) -ne 2
5956
done
6057

6158
echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
62-
cat ${log_file} | grep "${supervision_replace_grep_txt}"
59+
cat ${log_file} | grep "${supervision_respawn_grep_txt}"
6360
echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
6461

6562
echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
6663
cat ${log_file} | grep "${supervision_stop_grep_txt}"
6764
echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
6865

69-
if [[ $(cat ${log_file} | grep "${supervision_replace_grep_txt}" | wc -l) -ne 2 ]]; then
70-
echoerr "ERROR: We expected 2 servants to only restart once, yet more restarts were detected!"
66+
if [[ $(cat ${log_file} | grep "${supervision_respawn_grep_txt}" | wc -l) -ne 2 ]]; then
67+
echoerr "ERROR: We expected 2 servants to only respawn once, yet other number of respawns was detected!"
7168
cat ${log_file}
7269

7370
_killall ${app_name}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#!/bin/bash
2+
##===----------------------------------------------------------------------===##
3+
##
4+
## This source file is part of the Swift Distributed Actors open source project
5+
##
6+
## Copyright (c) 2018-2019 Apple Inc. and the Swift Distributed Actors project authors
7+
## Licensed under Apache License v2.0
8+
##
9+
## See LICENSE.txt for license information
10+
## See CONTRIBUTORS.md for the list of Swift Distributed Actors project authors
11+
##
12+
## SPDX-License-Identifier: Apache-2.0
13+
##
14+
##===----------------------------------------------------------------------===##
15+
16+
set -e
17+
#set -x # verbose
18+
19+
declare -r my_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
20+
declare -r root_path="$my_path/.."
21+
22+
declare -r app_name='it_ProcessIsolated_backoffRespawn'
23+
24+
cd ${root_path}
25+
26+
source ${my_path}/shared.sh
27+
28+
_killall ${app_name}
29+
30+
# ====------------------------------------------------------------------------------------------------------------------
31+
# MARK: the app has workers which fail so hard that the failures reach the top level actors which then terminate the system
32+
# when the system terminates we kill the process; once the process terminates, the servant supervision kicks in and
33+
# restarts the entire process; layered supervision for the win!
34+
35+
swift build # synchronously ensure built
36+
37+
declare -r log_file="/tmp/${app_name}.log"
38+
rm -f ${log_file}
39+
swift run ${app_name} > ${log_file} &
40+
41+
declare -r supervision_respawn_grep_txt='supervision: RESPAWN BACKOFF'
42+
43+
# we want to wait until more than 2 RESPAWN BACKOFFs are found in the logs; then we can check if the other conditions are as we expect
44+
echo "Waiting for servants to RESPAWN BACKOFFs..."
45+
spin=1 # spin counter
46+
max_spins=20
47+
while [[ $(cat ${log_file} | grep "${supervision_respawn_grep_txt}" | wc -l) -le 2 ]]; do
48+
sleep 1
49+
spin=$((spin+1))
50+
if [[ ${spin} -eq ${max_spins} ]]; then
51+
echoerr "Never saw enough '${supervision_respawn_grep_txt}' in logs."
52+
cat ${log_file}
53+
exit -1
54+
fi
55+
done
56+
57+
echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
58+
cat ${log_file} | grep "${supervision_respawn_grep_txt}"
59+
echo '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
60+
61+
if [[ $(cat ${log_file} | grep "${supervision_respawn_grep_txt}" | wc -l) -lt 3 ]]; then
62+
echoerr "ERROR: We expected servant to respawn many times..."
63+
cat ${log_file}
64+
65+
_killall ${app_name}
66+
exit -1
67+
fi
68+
69+
if [[ $(cat ${log_file} | grep "restartsWithinCurrentPeriod: 1" | wc -l) -ne 1 ]]; then
70+
echoerr "Expected the backoff supervision to have logged: restartsWithinCurrentPeriod: 1"
71+
fi
72+
if [[ $(cat ${log_file} | grep "restartsWithinCurrentPeriod: 2" | wc -l) -ne 1 ]]; then
73+
echoerr "Expected the backoff supervision to have logged: restartsWithinCurrentPeriod: 2"
74+
fi
75+
if [[ $(cat ${log_file} | grep "restartsWithinCurrentPeriod: 3" | wc -l) -ne 1 ]]; then
76+
echoerr "Expected the backoff supervision to have logged: restartsWithinCurrentPeriod: 3"
77+
fi
78+
79+
80+
# === cleanup ----------------------------------------------------------------------------------------------------------
81+
82+
_killall ${app_name}

Package.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,13 @@ let targets: [PackageDescription.Target] = [
8585
],
8686
path: "IntegrationTests/tests_02_process_isolated/it_ProcessIsolated_noLeaking"
8787
),
88+
.target(
89+
name: "it_ProcessIsolated_backoffRespawn",
90+
dependencies: [
91+
"DistributedActors",
92+
],
93+
path: "IntegrationTests/tests_02_process_isolated/it_ProcessIsolated_backoffRespawn"
94+
),
8895

8996
// ==== ----------------------------------------------------------------------------------------------------------------
9097
// MARK: Performance / Benchmarks

0 commit comments

Comments
 (0)