2525#ifdef  HAVE_UNISTD_H 
2626#include  <unistd.h> 
2727#endif 
28+ #ifdef  HAVE_SYS_TYPES_H 
29+ #include  <sys/types.h> 
30+ #endif 
31+ #ifdef  HAVE_SYS_STAT_H 
32+ #include  <sys/stat.h> 
33+ #endif 
34+ #ifdef  HAVE_SYS_FCNTL_H 
35+ #include  <fcntl.h> 
36+ #endif 
2837
2938#include  <string.h> 
3039#include  <signal.h> 
3544#include  "opal/util/output.h" 
3645#include  "opal/util/show_help.h" 
3746#include  "opal/util/argv.h" 
47+ #include  "opal/util/proc.h" 
3848#include  "opal/runtime/opal_params.h" 
3949
4050#ifndef  _NSIG 
4353
4454#define  HOSTFORMAT  "[%s:%05d] "
4555
/* Descriptor the stack-trace text is write()n to; negative while no
 * descriptor is selected (output disabled, or file not opened yet). */
56+ int     opal_stacktrace_output_fileno  =  -1 ;
/* Prefix of the per-process stack-trace file name ("stacktrace" or the text
 * following "file:"); stays NULL unless file output was requested. */
57+ static  char   * opal_stacktrace_output_filename_base  =  NULL ;
/* Size in bytes of the opal_stacktrace_output_filename buffer; a non-zero
 * value also marks that the trace goes to a per-process file. */
58+ static  size_t  opal_stacktrace_output_filename_max_len  =  0 ;
/* Host name passed as the %s argument of the HOSTFORMAT message prefix. */
4659static  char  stacktrace_hostname [OPAL_MAXHOSTNAMELEN ];
/* Fallback text written when the backtrace or footer cannot be produced. */
4760static  char  * unable_to_print_msg  =  "Unable to print stack trace!\n" ;
4861
62+ /* 
63+  * Set the stacktrace filename: 
64+  * stacktrace.PID 
65+  * -or, if VPID is available- 
66+  * stacktrace.VPID.PID 
67+  */ 
68+ static  void  set_stacktrace_filename (void ) {
69+     opal_proc_t  * my_proc  =  opal_proc_local_get ();
70+ 
71+     if ( NULL  ==  my_proc  ) {
72+         snprintf (opal_stacktrace_output_filename , opal_stacktrace_output_filename_max_len ,
73+                  "%s.%lu" ,
74+                  opal_stacktrace_output_filename_base , (unsigned long )getpid ());
75+     }
76+     else  {
77+         snprintf (opal_stacktrace_output_filename , opal_stacktrace_output_filename_max_len ,
78+                  "%s.%lu.%lu" ,
79+                  opal_stacktrace_output_filename_base , (unsigned long )my_proc -> proc_name .vpid , (unsigned long )getpid ());
80+     }
81+ 
82+     return ;
83+ }
84+ 
4985/** 
5086 * This function is being called as a signal-handler in response 
5187 * to a user-specified signal (e.g. SIGFPE or SIGSEGV). 
@@ -69,12 +105,37 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
69105    int  ret ;
70106    char  * si_code_str  =  "" ;
71107
108+     /* Do not print the stack trace */ 
109+     if ( 0  >  opal_stacktrace_output_fileno  &&  0  ==  opal_stacktrace_output_filename_max_len  ) {
110+         /* Raise the signal again, so we don't accidentally mask critical signals. 
111+          * For critical signals, it is preferred that we call 'raise' instead of 
112+          * 'exit' or 'abort' so that the return status is set properly for this 
113+          * process. 
114+          */ 
115+         signal (signo , SIG_DFL );
116+         raise (signo );
117+ 
118+         return ;
119+     }
120+ 
121+     /* Update the file name with the RANK, if available */ 
122+     if ( 0  <  opal_stacktrace_output_filename_max_len  ) {
123+         set_stacktrace_filename ();
124+         opal_stacktrace_output_fileno  =  open (opal_stacktrace_output_filename ,
125+                                              O_CREAT |O_WRONLY |O_TRUNC , S_IRUSR |S_IWUSR );
126+         if ( 0  >  opal_stacktrace_output_fileno  ) {
127+             opal_output (0 , "Error: Failed to open the stacktrace output file. Default: stderr\n\tFilename: %s\n\tErrno: %s" ,
128+                         opal_stacktrace_output_filename , strerror (errno ));
129+             opal_stacktrace_output_fileno  =  fileno (stderr );
130+         }
131+     }
132+ 
72133    /* write out the footer information */ 
73134    memset  (print_buffer , 0 , sizeof  (print_buffer ));
74135    ret  =  snprintf (print_buffer , sizeof (print_buffer ),
75136                   HOSTFORMAT  "*** Process received signal ***\n" ,
76137                   stacktrace_hostname , getpid ());
77-     write (fileno ( stderr ) , print_buffer , ret );
138+     write (opal_stacktrace_output_fileno , print_buffer , ret );
78139
79140
80141    memset  (print_buffer , 0 , sizeof  (print_buffer ));
@@ -324,14 +385,14 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
324385    }
325386
326387    /* write out the signal information generated above */ 
327-     write (fileno ( stderr ) , print_buffer , sizeof (print_buffer )- size );
388+     write (opal_stacktrace_output_fileno , print_buffer , sizeof (print_buffer )- size );
328389
329390    /* print out the stack trace */ 
330391    snprintf (print_buffer , sizeof (print_buffer ), HOSTFORMAT ,
331392             stacktrace_hostname , getpid ());
332-     ret  =  opal_backtrace_print (stderr , print_buffer , 2 );
393+     ret  =  opal_backtrace_print (NULL , print_buffer , 2 );
333394    if  (OPAL_SUCCESS  !=  ret ) {
334-         write (fileno ( stderr ) , unable_to_print_msg , strlen (unable_to_print_msg ));
395+         write (opal_stacktrace_output_fileno , unable_to_print_msg , strlen (unable_to_print_msg ));
335396    }
336397
337398    /* write out the footer information */ 
@@ -340,9 +401,15 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
340401                   HOSTFORMAT  "*** End of error message ***\n" ,
341402                   stacktrace_hostname , getpid ());
342403    if  (ret  >  0 ) {
343-         write (fileno ( stderr ) , print_buffer , ret );
404+         write (opal_stacktrace_output_fileno , print_buffer , ret );
344405    } else  {
345-         write (fileno (stderr ), unable_to_print_msg , strlen (unable_to_print_msg ));
406+         write (opal_stacktrace_output_fileno , unable_to_print_msg , strlen (unable_to_print_msg ));
407+     }
408+ 
409+     if ( fileno (stdout ) !=  opal_stacktrace_output_fileno  && 
410+         fileno (stderr ) !=  opal_stacktrace_output_fileno  ) {
411+         close (opal_stacktrace_output_fileno );
412+         opal_stacktrace_output_fileno  =  -1 ;
346413    }
347414
348415    /* Raise the signal again, so we don't accidentally mask critical signals. 
@@ -373,7 +440,30 @@ void opal_stackframe_output(int stream)
373440            opal_output (stream , "%s" , traces [i ]);
374441        }
375442    } else  {
376-         opal_backtrace_print (stderr , NULL , 2 );
443+         /* Do not print the stack trace */ 
444+         if ( 0  >  opal_stacktrace_output_fileno  &&  0  ==  opal_stacktrace_output_filename_max_len  ) {
445+             return ;
446+         }
447+ 
448+         /* Update the file name with the RANK, if available */ 
449+         if ( 0  <  opal_stacktrace_output_filename_max_len  ) {
450+             set_stacktrace_filename ();
451+             opal_stacktrace_output_fileno  =  open (opal_stacktrace_output_filename ,
452+                                                  O_CREAT |O_WRONLY |O_TRUNC , S_IRUSR |S_IWUSR );
453+             if ( 0  >  opal_stacktrace_output_fileno  ) {
454+                 opal_output (0 , "Error: Failed to open the stacktrace output file. Default: stderr\n\tFilename: %s\n\tErrno: %s" ,
455+                             opal_stacktrace_output_filename , strerror (errno ));
456+                 opal_stacktrace_output_fileno  =  fileno (stderr );
457+             }
458+         }
459+ 
460+         opal_backtrace_print (NULL , NULL , 2 );
461+ 
462+         if ( fileno (stdout ) !=  opal_stacktrace_output_fileno  && 
463+             fileno (stderr ) !=  opal_stacktrace_output_fileno  ) {
464+             close (opal_stacktrace_output_fileno );
465+             opal_stacktrace_output_fileno  =  -1 ;
466+         }
377467    }
378468}
379469
@@ -444,6 +534,50 @@ int opal_util_register_stackhandlers (void)
444534        }
445535    }
446536
537+     /* Setup the output stream to use */ 
538+     if ( NULL  ==  opal_stacktrace_output_filename  || 
539+         0  ==  strcasecmp (opal_stacktrace_output_filename , "none" ) ) {
540+         opal_stacktrace_output_fileno  =  -1 ;
541+     }
542+     else  if ( 0  ==  strcasecmp (opal_stacktrace_output_filename , "stdout" ) ) {
543+         opal_stacktrace_output_fileno  =  fileno (stdout );
544+     }
545+     else  if ( 0  ==  strcasecmp (opal_stacktrace_output_filename , "stderr" ) ) {
546+         opal_stacktrace_output_fileno  =  fileno (stdout );
547+     }
548+     else  if ( 0  ==  strcasecmp (opal_stacktrace_output_filename , "file"  ) || 
549+              0  ==  strcasecmp (opal_stacktrace_output_filename , "file:" ) ) {
550+         opal_stacktrace_output_filename_base  =  strdup ("stacktrace" );
551+ 
552+         free (opal_stacktrace_output_filename );
553+         // Magic number: 8 = space for .PID and .RANK (allow 7 digits each) 
554+         opal_stacktrace_output_filename_max_len  =  strlen ("stacktrace" ) +  8  +  8 ;
555+         opal_stacktrace_output_filename  =  (char * )malloc (sizeof (char ) *  opal_stacktrace_output_filename_max_len );
556+         set_stacktrace_filename ();
557+         opal_stacktrace_output_fileno  =  -1 ;
558+     }
559+     else  if ( 0  ==  strncasecmp (opal_stacktrace_output_filename , "file:" , 5 ) ) {
560+         char  * filename_cpy  =  NULL ;
561+         next  =  strchr (opal_stacktrace_output_filename , ':' );
562+         next ++ ; // move past the ':' to the filename specified 
563+ 
564+         opal_stacktrace_output_filename_base  =  strdup (next );
565+ 
566+         free (opal_stacktrace_output_filename );
567+         // Magic number: 8 = space for .PID and .RANK (allow 7 digits each) 
568+         opal_stacktrace_output_filename_max_len  =  strlen (opal_stacktrace_output_filename_base ) +  8  +  8 ;
569+         opal_stacktrace_output_filename  =  (char * )malloc (sizeof (char ) *  opal_stacktrace_output_filename_max_len );
570+         set_stacktrace_filename ();
571+         opal_stacktrace_output_fileno  =  -1 ;
572+ 
573+         free (filename_cpy );
574+     }
575+     else  {
576+         opal_stacktrace_output_fileno  =  fileno (stderr );
577+     }
578+ 
579+ 
580+     /* Setup the signals to catch */ 
447581    memset (& act , 0 , sizeof (act ));
448582    act .sa_sigaction  =  show_stackframe ;
449583    act .sa_flags  =  SA_SIGINFO ;
0 commit comments