From e52122f348e222b06b581df323c306825e3fb108 Mon Sep 17 00:00:00 2001
From: SvenKlaassen
Date: Mon, 16 Jun 2025 11:04:59 +0200
Subject: [PATCH 1/2] add flexible summary with multiple formats

---
 doubleml/did/did_binary.py    |  63 ++++----------------
 doubleml/did/did_cs_binary.py |  65 ++++----------------
 doubleml/double_ml.py         | 109 +++++++++++++++++++++++-----------
 doubleml/irm/iivm.py          |  15 +----
 4 files changed, 100 insertions(+), 152 deletions(-)

diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py
index 99e18e28..99ce7ef9 100644
--- a/doubleml/did/did_binary.py
+++ b/doubleml/did/did_binary.py
@@ -239,58 +239,17 @@ def __init__(
         self._sensitivity_implemented = True
         self._external_predictions_implemented = True
 
-    def __str__(self):
-        class_name = self.__class__.__name__
-        header = f"================== {class_name} Object ==================\n"
-        data_summary = self._dml_data._data_summary_str()
-        score_info = (
-            f"Score function: {str(self.score)}\n"
-            f"Treatment group: {str(self.g_value)}\n"
-            f"Pre-treatment period: {str(self.t_value_pre)}\n"
-            f"Evaluation period: {str(self.t_value_eval)}\n"
-            f"Control group: {str(self.control_group)}\n"
-            f"Anticipation periods: {str(self.anticipation_periods)}\n"
-            f"Effective sample size: {str(self.n_obs_subset)}\n"
-        )
-        learner_info = ""
-        for key, value in self.learner.items():
-            learner_info += f"Learner {key}: {str(value)}\n"
-        if self.nuisance_loss is not None:
-            learner_info += "Out-of-sample Performance:\n"
-            is_classifier = [value for value in self._is_classifier.values()]
-            is_regressor = [not value for value in is_classifier]
-            if any(is_regressor):
-                learner_info += "Regression:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is False]:
-                    learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n"
-            if any(is_classifier):
-                learner_info += "Classification:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is True]:
-                    learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n"
-
-        if self._is_cluster_data:
-            resampling_info = (
-                f"No. folds per cluster: {self._n_folds_per_cluster}\n"
-                f"No. folds: {self.n_folds}\n"
-                f"No. repeated sample splits: {self.n_rep}\n"
-            )
-        else:
-            resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n"
-        fit_summary = str(self.summary)
-        res = (
-            header
-            + "\n------------------ Data summary ------------------\n"
-            + data_summary
-            + "\n------------------ Score & algorithm ------------------\n"
-            + score_info
-            + "\n------------------ Machine learner ------------------\n"
-            + learner_info
-            + "\n------------------ Resampling ------------------\n"
-            + resampling_info
-            + "\n------------------ Fit summary ------------------\n"
-            + fit_summary
-        )
-        return res
+    def _format_score_info_str(self):
+        lines = [
+            f"Score function: {str(self.score)}",
+            f"Treatment group: {str(self.g_value)}",
+            f"Pre-treatment period: {str(self.t_value_pre)}",
+            f"Evaluation period: {str(self.t_value_eval)}",
+            f"Control group: {str(self.control_group)}",
+            f"Anticipation periods: {str(self.anticipation_periods)}",
+            f"Effective sample size: {str(self.n_obs_subset)}",
+        ]
+        return "\n".join(lines)
 
     @property
     def g_value(self):
diff --git a/doubleml/did/did_cs_binary.py b/doubleml/did/did_cs_binary.py
index a6005d53..73b9152f 100644
--- a/doubleml/did/did_cs_binary.py
+++ b/doubleml/did/did_cs_binary.py
@@ -156,58 +156,19 @@ def __init__(
         self._sensitivity_implemented = True
         self._external_predictions_implemented = True
 
-    def __str__(self):
-        class_name = self.__class__.__name__
-        header = f"================== {class_name} Object ==================\n"
-        data_summary = self._dml_data._data_summary_str()
-        score_info = (
-            f"Score function: {str(self.score)}\n"
-            f"Treatment group: {str(self.g_value)}\n"
-            f"Pre-treatment period: {str(self.t_value_pre)}\n"
-            f"Evaluation period: {str(self.t_value_eval)}\n"
-            f"Control group: {str(self.control_group)}\n"
-            f"Anticipation periods: {str(self.anticipation_periods)}\n"
-            f"Effective sample size: {str(self.n_obs_subset)}\n"
-        )
-        learner_info = ""
-        for key, value in self.learner.items():
-            learner_info += f"Learner {key}: {str(value)}\n"
-        if self.nuisance_loss is not None:
-            learner_info += "Out-of-sample Performance:\n"
-            is_classifier = [value for value in self._is_classifier.values()]
-            is_regressor = [not value for value in is_classifier]
-            if any(is_regressor):
-                learner_info += "Regression:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is False]:
-                    learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n"
-            if any(is_classifier):
-                learner_info += "Classification:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is True]:
-                    learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n"
-
-        if self._is_cluster_data:
-            resampling_info = (
-                f"No. folds per cluster: {self._n_folds_per_cluster}\n"
-                f"No. folds: {self.n_folds}\n"
-                f"No. repeated sample splits: {self.n_rep}\n"
-            )
-        else:
-            resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n"
-        fit_summary = str(self.summary)
-        res = (
-            header
-            + "\n------------------ Data summary ------------------\n"
-            + data_summary
-            + "\n------------------ Score & algorithm ------------------\n"
-            + score_info
-            + "\n------------------ Machine learner ------------------\n"
-            + learner_info
-            + "\n------------------ Resampling ------------------\n"
-            + resampling_info
-            + "\n------------------ Fit summary ------------------\n"
-            + fit_summary
-        )
-        return res
+    def _format_score_info_str(self):
+        lines = [
+            f"Score function: {str(self.score)}",
+            f"Treatment group: {str(self.g_value)}",
+            f"Pre-treatment period: {str(self.t_value_pre)}",
+            f"Evaluation period: {str(self.t_value_eval)}",
+            f"Control group: {str(self.control_group)}",
+            f"Anticipation periods: {str(self.anticipation_periods)}",
+            f"Effective sample size: {str(self.n_obs_subset)}",
+        ]
+        return "\n".join(lines)
+
+    # _format_learner_info_str method is inherited from DoubleML base class.
 
     @property
     def g_value(self):
diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index 88f677ef..72f3b44a 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -110,50 +110,87 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting):
         self._i_rep = None
         self._i_treat = None
 
-    def __str__(self):
+    def _format_header_str(self):
         class_name = self.__class__.__name__
-        header = f"================== {class_name} Object ==================\n"
-        data_summary = self._dml_data._data_summary_str()
-        score_info = f"Score function: {str(self.score)}\n"
+        return f"================== {class_name} Object =================="
+
+    def _format_score_info_str(self):
+        return f"Score function: {str(self.score)}"
+
+    def _format_learner_info_str(self):
         learner_info = ""
-        for key, value in self.learner.items():
-            learner_info += f"Learner {key}: {str(value)}\n"
+        if self.learner is not None:
+            for key, value in self.learner.items():
+                learner_info += f"Learner {key}: {str(value)}\\n"
         if self.nuisance_loss is not None:
-            learner_info += "Out-of-sample Performance:\n"
-            is_classifier = [value for value in self._is_classifier.values()]
-            is_regressor = [not value for value in is_classifier]
-            if any(is_regressor):
-                learner_info += "Regression:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is False]:
-                    learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n"
-            if any(is_classifier):
-                learner_info += "Classification:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is True]:
-                    learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n"
+            learner_info += "Out-of-sample Performance:\\n"
+            # Check if _is_classifier is populated, otherwise, it might be called before fit
+            if self._is_classifier:
+                is_classifier_any = any(self._is_classifier.values())
+                is_regressor_any = any(not v for v in self._is_classifier.values())
+
+                if is_regressor_any:
+                    learner_info += "Regression:\\n"
+                    for learner_name in self.params_names:  # Iterate through known learners
+                        if not self._is_classifier.get(learner_name, True):  # Default to not regressor if not found
+                            loss_val = self.nuisance_loss.get(learner_name, "N/A")
+                            learner_info += f"Learner {learner_name} RMSE: {loss_val}\\n"
+                if is_classifier_any:
+                    learner_info += "Classification:\\n"
+                    for learner_name in self.params_names:  # Iterate through known learners
+                        if self._is_classifier.get(learner_name, False):  # Default to not classifier if not found
+                            loss_val = self.nuisance_loss.get(learner_name, "N/A")
+                            learner_info += f"Learner {learner_name} Log Loss: {loss_val}\\n"
+            else:
+                learner_info += " (Run .fit() to see out-of-sample performance)\\n"
+        return learner_info.strip()
 
+    def _format_resampling_info_str(self):
         if self._is_cluster_data:
-            resampling_info = (
-                f"No. folds per cluster: {self._n_folds_per_cluster}\n"
-                f"No. folds: {self.n_folds}\n"
-                f"No. repeated sample splits: {self.n_rep}\n"
+            return (
+                f"No. folds per cluster: {self._n_folds_per_cluster}\\n"
+                f"No. folds: {self.n_folds}\\n"
+                f"No. repeated sample splits: {self.n_rep}"
             )
         else:
-            resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n"
-        fit_summary = str(self.summary)
-        res = (
-            header
-            + "\n------------------ Data summary ------------------\n"
-            + data_summary
-            + "\n------------------ Score & algorithm ------------------\n"
-            + score_info
-            + "\n------------------ Machine learner ------------------\n"
-            + learner_info
-            + "\n------------------ Resampling ------------------\n"
-            + resampling_info
-            + "\n------------------ Fit summary ------------------\n"
-            + fit_summary
+            return f"No. folds: {self.n_folds}\\nNo. repeated sample splits: {self.n_rep}"
+
+    def _format_additional_info_str(self):
+        """
+        Hook for subclasses to add additional information to the string representation.
+        Returns an empty string by default.
+        Subclasses should override this method to provide content.
+        The content should not include the 'Additional Information' header itself.
+        """
+        return ""
+
+    def __str__(self):
+        header = self._format_header_str()
+        # Assumes self._dml_data._data_summary_str() exists and is well-formed
+        data_summary = self._dml_data._data_summary_str()
+        score_info = self._format_score_info_str()
+        learner_info = self._format_learner_info_str()
+        resampling_info = self._format_resampling_info_str()
+        fit_summary = str(self.summary)  # Assumes self.summary is well-formed
+
+        representation = (
+            f"{header}\\n"
+            f"\\n------------------ Data Summary ------------------\\n"
+            f"{data_summary}\\n"
+            f"\\n------------------ Score & Algorithm ------------------\\n"
+            f"{score_info}\\n"
+            f"\\n------------------ Machine Learner ------------------\\n"
+            f"{learner_info}\\n"
+            f"\\n------------------ Resampling ------------------\\n"
+            f"{resampling_info}\\n"
+            f"\\n------------------ Fit Summary ------------------\\n"
+            f"{fit_summary}"
         )
-        return res
+
+        additional_info = self._format_additional_info_str()
+        if additional_info:
+            representation += f"\\n\\n------------------ Additional Information ------------------\\n" f"{additional_info}"
+        return representation
 
     @property
     def n_folds(self):
diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py
index a43c0a03..b3cc11e7 100644
--- a/doubleml/irm/iivm.py
+++ b/doubleml/irm/iivm.py
@@ -197,22 +197,13 @@ def __init__(
         self.subgroups = subgroups
         self._external_predictions_implemented = True
 
-    def __str__(self):
-        parent_str = super().__str__()
-
-        # add robust confset
+    def _format_additional_info_str(self):
         if self.framework is None:
-            confset_str = ""
+            return ""
         else:
             confset = self.robust_confset()
             formatted_confset = ", ".join([f"[{lower:.4f}, {upper:.4f}]" for lower, upper in confset])
-            confset_str = (
-                "\n\n--------------- Additional Information ----------------\n"
-                + f"Robust Confidence Set: {formatted_confset}\n"
-            )
-
-        res = parent_str + confset_str
-        return res
+            return f"Robust Confidence Set: {formatted_confset}"
 
     @property
     def normalize_ipw(self):

From bf7e16af8a6b3dde11f7fd80c76549659b1e11a7 Mon Sep 17 00:00:00 2001
From: SvenKlaassen
Date: Mon, 16 Jun 2025 12:09:09 +0200
Subject: [PATCH 2/2] fix format

---
 doubleml/double_ml.py | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index 72f3b44a..694968bc 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -121,39 +121,39 @@ def _format_learner_info_str(self):
         learner_info = ""
         if self.learner is not None:
             for key, value in self.learner.items():
-                learner_info += f"Learner {key}: {str(value)}\\n"
+                learner_info += f"Learner {key}: {str(value)}\n"
         if self.nuisance_loss is not None:
-            learner_info += "Out-of-sample Performance:\\n"
+            learner_info += "Out-of-sample Performance:\n"
             # Check if _is_classifier is populated, otherwise, it might be called before fit
             if self._is_classifier:
                 is_classifier_any = any(self._is_classifier.values())
                 is_regressor_any = any(not v for v in self._is_classifier.values())
 
                 if is_regressor_any:
-                    learner_info += "Regression:\\n"
+                    learner_info += "Regression:\n"
                     for learner_name in self.params_names:  # Iterate through known learners
                         if not self._is_classifier.get(learner_name, True):  # Default to not regressor if not found
                             loss_val = self.nuisance_loss.get(learner_name, "N/A")
-                            learner_info += f"Learner {learner_name} RMSE: {loss_val}\\n"
+                            learner_info += f"Learner {learner_name} RMSE: {loss_val}\n"
                 if is_classifier_any:
-                    learner_info += "Classification:\\n"
+                    learner_info += "Classification:\n"
                     for learner_name in self.params_names:  # Iterate through known learners
                         if self._is_classifier.get(learner_name, False):  # Default to not classifier if not found
                             loss_val = self.nuisance_loss.get(learner_name, "N/A")
-                            learner_info += f"Learner {learner_name} Log Loss: {loss_val}\\n"
+                            learner_info += f"Learner {learner_name} Log Loss: {loss_val}\n"
             else:
-                learner_info += " (Run .fit() to see out-of-sample performance)\\n"
+                learner_info += " (Run .fit() to see out-of-sample performance)\n"
         return learner_info.strip()
 
     def _format_resampling_info_str(self):
         if self._is_cluster_data:
             return (
-                f"No. folds per cluster: {self._n_folds_per_cluster}\\n"
-                f"No. folds: {self.n_folds}\\n"
+                f"No. folds per cluster: {self._n_folds_per_cluster}\n"
+                f"No. folds: {self.n_folds}\n"
                 f"No. repeated sample splits: {self.n_rep}"
             )
         else:
-            return f"No. folds: {self.n_folds}\\nNo. repeated sample splits: {self.n_rep}"
+            return f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}"
 
     def _format_additional_info_str(self):
         """
@@ -174,22 +174,22 @@ def __str__(self):
         fit_summary = str(self.summary)  # Assumes self.summary is well-formed
 
         representation = (
-            f"{header}\\n"
-            f"\\n------------------ Data Summary ------------------\\n"
-            f"{data_summary}\\n"
-            f"\\n------------------ Score & Algorithm ------------------\\n"
-            f"{score_info}\\n"
-            f"\\n------------------ Machine Learner ------------------\\n"
-            f"{learner_info}\\n"
-            f"\\n------------------ Resampling ------------------\\n"
-            f"{resampling_info}\\n"
-            f"\\n------------------ Fit Summary ------------------\\n"
+            f"{header}\n"
+            f"\n------------------ Data Summary ------------------\n"
+            f"{data_summary}\n"
+            f"\n------------------ Score & Algorithm ------------------\n"
+            f"{score_info}\n"
+            f"\n------------------ Machine Learner ------------------\n"
+            f"{learner_info}\n"
+            f"\n------------------ Resampling ------------------\n"
+            f"{resampling_info}\n"
+            f"\n------------------ Fit Summary ------------------\n"
            f"{fit_summary}"
         )
 
         additional_info = self._format_additional_info_str()
         if additional_info:
-            representation += f"\\n\\n------------------ Additional Information ------------------\\n" f"{additional_info}"
+            representation += f"\n\n------------------ Additional Information ------------------\n" f"{additional_info}"
         return representation
 
     @property
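
Note for reviewers: the sketch below is a minimal, self-contained illustration of the hook pattern these two patches introduce, i.e. a base __str__ assembled from small _format_*_str helpers, with _format_additional_info_str as the only override point a subclass needs (as DoubleMLIIVM does above). DemoBase and DemoChild are hypothetical stand-ins, not part of the DoubleML API.

# Hypothetical, simplified sketch of the refactored __str__ composition.
# DemoBase / DemoChild are illustrative only; they are not DoubleML classes.


class DemoBase:
    def _format_header_str(self):
        return f"================== {self.__class__.__name__} Object =================="

    def _format_score_info_str(self):
        return "Score function: <score>"

    def _format_additional_info_str(self):
        # Hook: subclasses return extra content, or "" for no extra section.
        return ""

    def __str__(self):
        representation = (
            f"{self._format_header_str()}\n"
            f"\n------------------ Score & Algorithm ------------------\n"
            f"{self._format_score_info_str()}"
        )
        additional_info = self._format_additional_info_str()
        if additional_info:
            representation += (
                "\n\n------------------ Additional Information ------------------\n"
                f"{additional_info}"
            )
        return representation


class DemoChild(DemoBase):
    def _format_additional_info_str(self):
        # Mirrors DoubleMLIIVM: return only the extra content, without the section header.
        return "Robust Confidence Set: [0.1000, 0.9000]"


if __name__ == "__main__":
    print(DemoBase())   # no Additional Information section
    print(DemoChild())  # Additional Information section appended by the base __str__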