Skip to content

Commit 0864b35

Browse files
kpamnanyRAI CI (GitHub Action Automation)
authored and committed
Change heartbeat thread controls
When enabling heartbeats, the user must specify:
- heartbeat_s: jl_heartbeat() must be called at least once every heartbeat_s seconds; if it isn't, a one-line heartbeat loss report is printed.
- show_tasks_after_n: after this many heartbeat_s intervals have passed without jl_heartbeat() being called, print task backtraces and stop all reporting.
- reset_after_n: after this many heartbeat_s intervals have passed with jl_heartbeat() being called, print a "heartbeats recovered" message and reset reporting.
1 parent 847a209 commit 0864b35

File tree

1 file changed

+46
-49
lines changed

1 file changed

+46
-49
lines changed

src/threading.c

Lines changed: 46 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,9 +1001,9 @@ volatile int heartbeat_enabled;
10011001
uv_sem_t heartbeat_on_sem, // jl_heartbeat_enable -> thread
10021002
heartbeat_off_sem; // thread -> jl_heartbeat_enable
10031003
int heartbeat_interval_s,
1004-
n_loss_reports,
1005-
reset_reporting_s;
1006-
int last_report_s, report_interval_s, n_reported;
1004+
tasks_after_n,
1005+
reset_tasks_after_n;
1006+
int tasks_showed, n_hbs_missed, n_hbs_recvd;
10071007
_Atomic(int) heartbeats;
10081008

10091009
JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT;
@@ -1022,21 +1022,19 @@ void jl_init_heartbeat(void)
10221022

10231023
// enable/disable heartbeats
10241024
// heartbeat_s: interval within which jl_heartbeat() must be called
1025-
// n_reports: for one heartbeat loss interval, how many times to report
1026-
// reset_reporting_after_s: how long to wait after a heartbeat loss
1027-
// interval and a return to steady heartbeats, before resetting
1028-
// reporting behavior
1025+
// show_tasks_after_n: number of heartbeats missed before printing task backtraces
1026+
// reset_after_n: number of consecutive intervals with heartbeats, after task backtraces were printed, before reporting is reset
10291027
//
10301028
// When disabling heartbeats, the heartbeat thread must wake up,
10311029
// find out that heartbeats are now disabled, and reset. For now, we
10321030
// handle this by preventing re-enabling of heartbeats until this
10331031
// completes.
1034-
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
1035-
int reset_reporting_after_s)
1032+
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n,
1033+
int reset_after_n)
10361034
{
10371035
if (heartbeat_s <= 0) {
10381036
heartbeat_enabled = 0;
1039-
heartbeat_interval_s = n_loss_reports = reset_reporting_s = 0;
1037+
heartbeat_interval_s = tasks_after_n = reset_tasks_after_n = 0;
10401038
}
10411039
else {
10421040
// must disable before enabling
@@ -1050,10 +1048,11 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
10501048

10511049
jl_atomic_store_relaxed(&heartbeats, 0);
10521050
heartbeat_interval_s = heartbeat_s;
1053-
n_loss_reports = n_reports;
1054-
reset_reporting_s = reset_reporting_after_s;
1055-
last_report_s = 0;
1056-
report_interval_s = heartbeat_interval_s;
1051+
tasks_after_n = show_tasks_after_n;
1052+
reset_tasks_after_n = reset_after_n;
1053+
tasks_showed = 0;
1054+
n_hbs_missed = 0;
1055+
n_hbs_recvd = 0;
10571056
heartbeat_enabled = 1;
10581057
uv_sem_post(&heartbeat_on_sem); // wake the heartbeat thread
10591058
}
@@ -1089,44 +1088,42 @@ void sleep_for(int secs, int nsecs)
10891088
uint8_t check_heartbeats(uint8_t gc_state)
10901089
{
10911090
int hb = jl_atomic_exchange(&heartbeats, 0);
1092-
uint64_t curr_s = jl_hrtime() / 1e9;
10931091

10941092
if (hb <= 0) {
1095-
// we didn't get a heartbeat in the last interval; should we report?
1096-
if (n_reported < n_loss_reports &&
1097-
curr_s - last_report_s >= report_interval_s) {
1098-
jl_task_t *ct = jl_current_task;
1099-
jl_ptls_t ptls = ct->ptls;
1100-
1101-
// exit GC-safe region to report then re-enter
1102-
jl_gc_safe_leave(ptls, gc_state);
1103-
jl_safe_printf("==== heartbeat loss ====\n");
1104-
jl_print_task_backtraces(0);
1105-
gc_state = jl_gc_safe_enter(ptls);
1106-
1107-
// we've reported
1108-
n_reported++;
1109-
1110-
// record the reporting time _after_ the report
1111-
last_report_s = jl_hrtime() / 1e9;
1112-
1113-
// double the reporting interval up to a maximum
1114-
if (report_interval_s < 60 * heartbeat_interval_s) {
1115-
report_interval_s *= 2;
1093+
// we didn't get a heartbeat
1094+
n_hbs_recvd = 0;
1095+
n_hbs_missed++;
1096+
1097+
// if we've printed task backtraces already, do nothing
1098+
if (!tasks_showed) {
1099+
// otherwise, at least show this message
1100+
jl_safe_printf("==== heartbeat loss (%ds) ====\n",
1101+
n_hbs_missed * heartbeat_interval_s);
1102+
// if we've missed enough heartbeats, print task backtraces
1103+
if (n_hbs_missed >= tasks_after_n) {
1104+
jl_task_t *ct = jl_current_task;
1105+
jl_ptls_t ptls = ct->ptls;
1106+
1107+
// exit GC-safe region to report then re-enter
1108+
jl_gc_safe_leave(ptls, gc_state);
1109+
jl_print_task_backtraces(0);
1110+
gc_state = jl_gc_safe_enter(ptls);
1111+
1112+
// we printed task backtraces
1113+
tasks_showed = 1;
11161114
}
11171115
}
1118-
// no heartbeats, don't change reporting state
1119-
return gc_state;
11201116
}
11211117
else {
1122-
// we got a heartbeat; reset the report count
1123-
n_reported = 0;
1124-
}
1125-
1126-
// reset the reporting interval only once we're steadily getting
1127-
// heartbeats for the requested reset interval
1128-
if (curr_s - reset_reporting_s > last_report_s) {
1129-
report_interval_s = heartbeat_interval_s;
1118+
// got a heartbeat
1119+
n_hbs_recvd++;
1120+
// if we'd printed task backtraces, check for reset
1121+
if (tasks_showed && n_hbs_recvd >= reset_tasks_after_n) {
1122+
tasks_showed = 0;
1123+
jl_safe_printf("==== heartbeats recovered (lost for %ds) ====\n",
1124+
n_hbs_missed * heartbeat_interval_s);
1125+
}
1126+
n_hbs_missed = 0;
11301127
}
11311128

11321129
return gc_state;
@@ -1135,7 +1132,7 @@ uint8_t check_heartbeats(uint8_t gc_state)
11351132
// heartbeat thread function
11361133
void jl_heartbeat_threadfun(void *arg)
11371134
{
1138-
int s, ns = 1e9 - 1, rs;
1135+
int s = 59, ns = 1e9 - 1, rs;
11391136
uint64_t t0, tchb;
11401137

11411138
// We need a TLS because backtraces are accumulated into ptls->bt_size
@@ -1193,8 +1190,8 @@ void jl_init_heartbeat(void)
11931190
{
11941191
}
11951192

1196-
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
1197-
int reset_reporting_after_s)
1193+
JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n,
1194+
int reset_after_n)
11981195
{
11991196
return -1;
12001197
}

0 commit comments

Comments
 (0)