@@ -1001,9 +1001,9 @@ volatile int heartbeat_enabled;
10011001uv_sem_t heartbeat_on_sem , // jl_heartbeat_enable -> thread
10021002 heartbeat_off_sem ; // thread -> jl_heartbeat_enable
10031003int heartbeat_interval_s ,
1004- n_loss_reports ,
1005- reset_reporting_s ;
1006- int last_report_s , report_interval_s , n_reported ;
1004+ tasks_after_n ,
1005+ reset_tasks_after_n ;
1006+ int tasks_showed , n_hbs_missed , n_hbs_recvd ;
10071007_Atomic(int ) heartbeats ;
10081008
10091009JL_DLLEXPORT void jl_print_task_backtraces (int show_done ) JL_NOTSAFEPOINT ;
@@ -1022,21 +1022,19 @@ void jl_init_heartbeat(void)
10221022
10231023// enable/disable heartbeats
10241024// heartbeat_s: interval within which jl_heartbeat() must be called
1025- // n_reports: for one heartbeat loss interval, how many times to report
1026- // reset_reporting_after_s: how long to wait after a heartbeat loss
1027- // interval and a return to steady heartbeats, before resetting
1028- // reporting behavior
1025+ // show_tasks_after_n: number of heartbeats missed before printing task backtraces
1026+ // reset_after_n: number of consecutive heartbeats received after which task backtrace printing is re-armed
10291027//
10301028// When disabling heartbeats, the heartbeat thread must wake up,
10311029// find out that heartbeats are now disabled, and reset. For now, we
10321030// handle this by preventing re-enabling of heartbeats until this
10331031// completes.
1034- JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int n_reports ,
1035- int reset_reporting_after_s )
1032+ JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int show_tasks_after_n ,
1033+ int reset_after_n )
10361034{
10371035 if (heartbeat_s <= 0 ) {
10381036 heartbeat_enabled = 0 ;
1039- heartbeat_interval_s = n_loss_reports = reset_reporting_s = 0 ;
1037+ heartbeat_interval_s = tasks_after_n = reset_tasks_after_n = 0 ;
10401038 }
10411039 else {
10421040 // must disable before enabling
@@ -1050,10 +1048,11 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
10501048
10511049 jl_atomic_store_relaxed (& heartbeats , 0 );
10521050 heartbeat_interval_s = heartbeat_s ;
1053- n_loss_reports = n_reports ;
1054- reset_reporting_s = reset_reporting_after_s ;
1055- last_report_s = 0 ;
1056- report_interval_s = heartbeat_interval_s ;
1051+ tasks_after_n = show_tasks_after_n ;
1052+ reset_tasks_after_n = reset_after_n ;
1053+ tasks_showed = 0 ;
1054+ n_hbs_missed = 0 ;
1055+ n_hbs_recvd = 0 ;
10571056 heartbeat_enabled = 1 ;
10581057 uv_sem_post (& heartbeat_on_sem ); // wake the heartbeat thread
10591058 }
@@ -1089,44 +1088,42 @@ void sleep_for(int secs, int nsecs)
10891088uint8_t check_heartbeats (uint8_t gc_state )
10901089{
10911090 int hb = jl_atomic_exchange (& heartbeats , 0 );
1092- uint64_t curr_s = jl_hrtime () / 1e9 ;
10931091
10941092 if (hb <= 0 ) {
1095- // we didn't get a heartbeat in the last interval; should we report?
1096- if ( n_reported < n_loss_reports &&
1097- curr_s - last_report_s >= report_interval_s ) {
1098- jl_task_t * ct = jl_current_task ;
1099- jl_ptls_t ptls = ct -> ptls ;
1100-
1101- // exit GC-safe region to report then re-enter
1102- jl_gc_safe_leave ( ptls , gc_state );
1103- jl_safe_printf ( "==== heartbeat loss ====\n" );
1104- jl_print_task_backtraces ( 0 );
1105- gc_state = jl_gc_safe_enter ( ptls );
1106-
1107- // we've reported
1108- n_reported ++ ;
1109-
1110- // record the reporting time _after_ the report
1111- last_report_s = jl_hrtime () / 1e9 ;
1112-
1113- // double the reporting interval up to a maximum
1114- if ( report_interval_s < 60 * heartbeat_interval_s ) {
1115- report_interval_s *= 2 ;
1093+ // we didn't get a heartbeat
1094+ n_hbs_recvd = 0 ;
1095+ n_hbs_missed ++ ;
1096+
1097+ // if we've printed task backtraces already, do nothing
1098+ if (! tasks_showed ) {
1099+ // otherwise, at least show this message
1100+ jl_safe_printf ( "==== heartbeat loss (%ds) ====\n" ,
1101+ n_hbs_missed * heartbeat_interval_s );
1102+ // if we've missed enough heartbeats, print task backtraces
1103+ if ( n_hbs_missed >= tasks_after_n ) {
1104+ jl_task_t * ct = jl_current_task ;
1105+ jl_ptls_t ptls = ct -> ptls ;
1106+
1107+ // exit GC-safe region to report then re-enter
1108+ jl_gc_safe_leave ( ptls , gc_state );
1109+ jl_print_task_backtraces ( 0 ) ;
1110+ gc_state = jl_gc_safe_enter ( ptls );
1111+
1112+ // we printed task backtraces
1113+ tasks_showed = 1 ;
11161114 }
11171115 }
1118- // no heartbeats, don't change reporting state
1119- return gc_state ;
11201116 }
11211117 else {
1122- // we got a heartbeat; reset the report count
1123- n_reported = 0 ;
1124- }
1125-
1126- // reset the reporting interval only once we're steadily getting
1127- // heartbeats for the requested reset interval
1128- if (curr_s - reset_reporting_s > last_report_s ) {
1129- report_interval_s = heartbeat_interval_s ;
1118+ // got a heartbeat
1119+ n_hbs_recvd ++ ;
1120+ // if we'd printed task backtraces, check for reset
1121+ if (tasks_showed && n_hbs_recvd >= reset_tasks_after_n ) {
1122+ tasks_showed = 0 ;
1123+ jl_safe_printf ("==== heartbeats recovered (lost for %ds) ====\n" ,
1124+ n_hbs_missed * heartbeat_interval_s );
1125+ }
1126+ n_hbs_missed = 0 ;
11301127 }
11311128
11321129 return gc_state ;
@@ -1135,7 +1132,7 @@ uint8_t check_heartbeats(uint8_t gc_state)
11351132// heartbeat thread function
11361133void jl_heartbeat_threadfun (void * arg )
11371134{
1138- int s , ns = 1e9 - 1 , rs ;
1135+ int s = 59 , ns = 1e9 - 1 , rs ;
11391136 uint64_t t0 , tchb ;
11401137
11411138 // We need a TLS because backtraces are accumulated into ptls->bt_size
@@ -1193,8 +1190,8 @@ void jl_init_heartbeat(void)
11931190{
11941191}
11951192
1196- JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int n_reports ,
1197- int reset_reporting_after_s )
1193+ JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int show_tasks_after_n ,
1194+ int reset_after_n )
11981195{
11991196 return -1 ;
12001197}
0 commit comments