2222PROGRESS_UPDATE = 30.0 # seconds
2323assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME
2424
25+ # Kill the main process after 5 minutes. It is supposed to write an update
26+ # every PROGRESS_UPDATE seconds. Tolerate 5 minutes for Python slowest
27+ # buildbot workers.
28+ MAIN_PROCESS_TIMEOUT = 5 * 60.0
29+ assert MAIN_PROCESS_TIMEOUT >= PROGRESS_UPDATE
30+
2531# Time to wait until a worker completes: should be immediate
2632JOIN_TIMEOUT = 30.0 # seconds
2733
@@ -160,20 +166,6 @@ def mp_result_error(self, test_name, error_type, stdout='', stderr='',
160166 result = TestResult (test_name , error_type , test_time , None )
161167 return MultiprocessResult (result , stdout , stderr , err_msg )
162168
163- def _timedout (self , test_name ):
164- self ._kill ()
165-
166- stdout = stderr = ''
167- popen = self ._popen
168- try :
169- stdout , stderr = popen .communicate (timeout = JOIN_TIMEOUT )
170- except (subprocess .TimeoutExpired , OSError ) as exc :
171- print_warning (f"Failed to read { self } output "
172- f"(timeout={ format_duration (JOIN_TIMEOUT )} ): "
173- f"{ exc !r} " )
174-
175- return self .mp_result_error (test_name , TIMEOUT , stdout , stderr )
176-
177169 def _run_process (self , test_name ):
178170 self .start_time = time .monotonic ()
179171
@@ -197,23 +189,32 @@ def _run_process(self, test_name):
197189
198190 try :
199191 stdout , stderr = popen .communicate (timeout = self .timeout )
192+ retcode = popen .returncode
193+ assert retcode is not None
200194 except subprocess .TimeoutExpired :
201195 if self ._stopped :
202196 # kill() has been called: communicate() fails
203197 # on reading closed stdout/stderr
204198 raise ExitThread
205199
206- return self ._timedout (test_name )
200+ # On timeout, kill the process
201+ self ._kill ()
202+
203+ # None means TIMEOUT for the caller
204+ retcode = None
205+ # bpo-38207: Don't attempt to call communicate() again: on it
206+ # can hang until all child processes using stdout and stderr
207+ # pipes completes.
208+ stdout = stderr = ''
207209 except OSError :
208210 if self ._stopped :
209211 # kill() has been called: communicate() fails
210212 # on reading closed stdout/stderr
211213 raise ExitThread
212214 raise
213-
214- retcode = popen .returncode
215- stdout = stdout .strip ()
216- stderr = stderr .rstrip ()
215+ else :
216+ stdout = stdout .strip ()
217+ stderr = stderr .rstrip ()
217218
218219 return (retcode , stdout , stderr )
219220 except :
@@ -225,13 +226,10 @@ def _run_process(self, test_name):
225226 self .current_test_name = None
226227
227228 def _runtest (self , test_name ):
228- result = self ._run_process (test_name )
229-
230- if isinstance (result , MultiprocessResult ):
231- # _timedout() case
232- return result
229+ retcode , stdout , stderr = self ._run_process (test_name )
233230
234- retcode , stdout , stderr = result
231+ if retcode is None :
232+ return self .mp_result_error (test_name , TIMEOUT , stdout , stderr )
235233
236234 err_msg = None
237235 if retcode != 0 :
@@ -250,7 +248,8 @@ def _runtest(self, test_name):
250248 err_msg = "Failed to parse worker JSON: %s" % exc
251249
252250 if err_msg is not None :
253- return self .mp_result_error (test_name , CHILD_ERROR , stdout , stderr , err_msg )
251+ return self .mp_result_error (test_name , CHILD_ERROR ,
252+ stdout , stderr , err_msg )
254253
255254 return MultiprocessResult (result , stdout , stderr , err_msg )
256255
@@ -289,6 +288,16 @@ def _wait_completed(self):
289288 f"{ exc !r} " )
290289
291290 def wait_stopped (self , start_time ):
291+ # bpo-38207: MultiprocessTestRunner.stop_workers() called self.stop()
292+ # which killed the process. Sometimes, killing the process from the
293+ # main thread does not interrupt popen.communicate() in
294+ # TestWorkerProcess thread. This loop with a timeout is a workaround
295+ # for that.
296+ #
297+ # Moreover, if this method fails to join the thread, it is likely
298+ # that Python will hang at exit while calling threading._shutdown()
299+ # which tries again to join the blocked thread. Regrtest.main()
300+ # uses EXIT_TIMEOUT to workaround this second bug.
292301 while True :
293302 # Write a message every second
294303 self .join (1.0 )
@@ -355,7 +364,8 @@ def _get_result(self):
355364 timeout = PROGRESS_UPDATE
356365 while True :
357366 if use_faulthandler :
358- faulthandler .dump_traceback_later (timeout * 2.0 , exit = True )
367+ faulthandler .dump_traceback_later (MAIN_PROCESS_TIMEOUT ,
368+ exit = True )
359369
360370 # wait for a thread
361371 try :
0 commit comments