2222PROGRESS_UPDATE = 30.0 # seconds
2323assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME
2424
25+ # Kill the main process after 5 minutes. It is supposed to write an update
26+ # every PROGRESS_UPDATE seconds. Tolerate 5 minutes for Python slowest
27+ # buildbot workers.
28+ MAIN_PROCESS_TIMEOUT = 5 * 60.0
29+ assert MAIN_PROCESS_TIMEOUT >= PROGRESS_UPDATE
30+
2531# Time to wait until a worker completes: should be immediate
2632JOIN_TIMEOUT = 30.0 # seconds
2733
@@ -159,20 +165,6 @@ def mp_result_error(self, test_name, error_type, stdout='', stderr='',
159165 result = TestResult (test_name , error_type , test_time , None )
160166 return MultiprocessResult (result , stdout , stderr , err_msg )
161167
162- def _timedout (self , test_name ):
163- self ._kill ()
164-
165- stdout = stderr = ''
166- popen = self ._popen
167- try :
168- stdout , stderr = popen .communicate (timeout = JOIN_TIMEOUT )
169- except (subprocess .TimeoutExpired , OSError ) as exc :
170- print_warning (f"Failed to read { self } output "
171- f"(timeout={ format_duration (JOIN_TIMEOUT )} ): "
172- f"{ exc !r} " )
173-
174- return self .mp_result_error (test_name , TIMEOUT , stdout , stderr )
175-
176168 def _run_process (self , test_name ):
177169 self .start_time = time .monotonic ()
178170
@@ -195,23 +187,32 @@ def _run_process(self, test_name):
195187
196188 try :
197189 stdout , stderr = popen .communicate (timeout = self .timeout )
190+ retcode = popen .returncode
191+ assert retcode is not None
198192 except subprocess .TimeoutExpired :
199193 if self ._stopped :
200194 # kill() has been called: communicate() fails
201195 # on reading closed stdout/stderr
202196 raise ExitThread
203197
204- return self ._timedout (test_name )
198+ # On timeout, kill the process
199+ self ._kill ()
200+
201+ # None means TIMEOUT for the caller
202+ retcode = None
203+ # bpo-38207: Don't attempt to call communicate() again: on it
204+ # can hang until all child processes using stdout and stderr
205+ # pipes completes.
206+ stdout = stderr = ''
205207 except OSError :
206208 if self ._stopped :
207209 # kill() has been called: communicate() fails
208210 # on reading closed stdout/stderr
209211 raise ExitThread
210212 raise
211-
212- retcode = popen .returncode
213- stdout = stdout .strip ()
214- stderr = stderr .rstrip ()
213+ else :
214+ stdout = stdout .strip ()
215+ stderr = stderr .rstrip ()
215216
216217 return (retcode , stdout , stderr )
217218 except :
@@ -223,13 +224,10 @@ def _run_process(self, test_name):
223224 self .current_test_name = None
224225
225226 def _runtest (self , test_name ):
226- result = self ._run_process (test_name )
227-
228- if isinstance (result , MultiprocessResult ):
229- # _timedout() case
230- return result
227+ retcode , stdout , stderr = self ._run_process (test_name )
231228
232- retcode , stdout , stderr = result
229+ if retcode is None :
230+ return self .mp_result_error (test_name , TIMEOUT , stdout , stderr )
233231
234232 err_msg = None
235233 if retcode != 0 :
@@ -248,7 +246,8 @@ def _runtest(self, test_name):
248246 err_msg = "Failed to parse worker JSON: %s" % exc
249247
250248 if err_msg is not None :
251- return self .mp_result_error (test_name , CHILD_ERROR , stdout , stderr , err_msg )
249+ return self .mp_result_error (test_name , CHILD_ERROR ,
250+ stdout , stderr , err_msg )
252251
253252 return MultiprocessResult (result , stdout , stderr , err_msg )
254253
@@ -287,6 +286,16 @@ def _wait_completed(self):
287286 f"{ exc !r} " )
288287
289288 def wait_stopped (self , start_time ):
289+ # bpo-38207: MultiprocessTestRunner.stop_workers() called self.stop()
290+ # which killed the process. Sometimes, killing the process from the
291+ # main thread does not interrupt popen.communicate() in
292+ # TestWorkerProcess thread. This loop with a timeout is a workaround
293+ # for that.
294+ #
295+ # Moreover, if this method fails to join the thread, it is likely
296+ # that Python will hang at exit while calling threading._shutdown()
297+ # which tries again to join the blocked thread. Regrtest.main()
298+ # uses EXIT_TIMEOUT to workaround this second bug.
290299 while True :
291300 # Write a message every second
292301 self .join (1.0 )
@@ -353,7 +362,8 @@ def _get_result(self):
353362 timeout = PROGRESS_UPDATE
354363 while True :
355364 if use_faulthandler :
356- faulthandler .dump_traceback_later (timeout * 2.0 , exit = True )
365+ faulthandler .dump_traceback_later (MAIN_PROCESS_TIMEOUT ,
366+ exit = True )
357367
358368 # wait for a thread
359369 try :
0 commit comments