
Commit 62a6838

Merge pull request #336 from DoubleML/s-update-summary
Update DoubleML __str__ method
2 parents: 77b1a6b + bf7e16a

4 files changed: +97 −149 lines

doubleml/did/did_binary.py

Lines changed: 11 additions & 52 deletions
@@ -239,58 +239,17 @@ def __init__(
         self._sensitivity_implemented = True
         self._external_predictions_implemented = True

-    def __str__(self):
-        class_name = self.__class__.__name__
-        header = f"================== {class_name} Object ==================\n"
-        data_summary = self._dml_data._data_summary_str()
-        score_info = (
-            f"Score function: {str(self.score)}\n"
-            f"Treatment group: {str(self.g_value)}\n"
-            f"Pre-treatment period: {str(self.t_value_pre)}\n"
-            f"Evaluation period: {str(self.t_value_eval)}\n"
-            f"Control group: {str(self.control_group)}\n"
-            f"Anticipation periods: {str(self.anticipation_periods)}\n"
-            f"Effective sample size: {str(self.n_obs_subset)}\n"
-        )
-        learner_info = ""
-        for key, value in self.learner.items():
-            learner_info += f"Learner {key}: {str(value)}\n"
-        if self.nuisance_loss is not None:
-            learner_info += "Out-of-sample Performance:\n"
-            is_classifier = [value for value in self._is_classifier.values()]
-            is_regressor = [not value for value in is_classifier]
-            if any(is_regressor):
-                learner_info += "Regression:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is False]:
-                    learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n"
-            if any(is_classifier):
-                learner_info += "Classification:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is True]:
-                    learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n"
-
-        if self._is_cluster_data:
-            resampling_info = (
-                f"No. folds per cluster: {self._n_folds_per_cluster}\n"
-                f"No. folds: {self.n_folds}\n"
-                f"No. repeated sample splits: {self.n_rep}\n"
-            )
-        else:
-            resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n"
-        fit_summary = str(self.summary)
-        res = (
-            header
-            + "\n------------------ Data summary ------------------\n"
-            + data_summary
-            + "\n------------------ Score & algorithm ------------------\n"
-            + score_info
-            + "\n------------------ Machine learner ------------------\n"
-            + learner_info
-            + "\n------------------ Resampling ------------------\n"
-            + resampling_info
-            + "\n------------------ Fit summary ------------------\n"
-            + fit_summary
-        )
-        return res
+    def _format_score_info_str(self):
+        lines = [
+            f"Score function: {str(self.score)}",
+            f"Treatment group: {str(self.g_value)}",
+            f"Pre-treatment period: {str(self.t_value_pre)}",
+            f"Evaluation period: {str(self.t_value_eval)}",
+            f"Control group: {str(self.control_group)}",
+            f"Anticipation periods: {str(self.anticipation_periods)}",
+            f"Effective sample size: {str(self.n_obs_subset)}",
+        ]
+        return "\n".join(lines)

     @property
     def g_value(self):

doubleml/did/did_cs_binary.py

Lines changed: 13 additions & 52 deletions
@@ -156,58 +156,19 @@ def __init__(
         self._sensitivity_implemented = True
         self._external_predictions_implemented = True

-    def __str__(self):
-        class_name = self.__class__.__name__
-        header = f"================== {class_name} Object ==================\n"
-        data_summary = self._dml_data._data_summary_str()
-        score_info = (
-            f"Score function: {str(self.score)}\n"
-            f"Treatment group: {str(self.g_value)}\n"
-            f"Pre-treatment period: {str(self.t_value_pre)}\n"
-            f"Evaluation period: {str(self.t_value_eval)}\n"
-            f"Control group: {str(self.control_group)}\n"
-            f"Anticipation periods: {str(self.anticipation_periods)}\n"
-            f"Effective sample size: {str(self.n_obs_subset)}\n"
-        )
-        learner_info = ""
-        for key, value in self.learner.items():
-            learner_info += f"Learner {key}: {str(value)}\n"
-        if self.nuisance_loss is not None:
-            learner_info += "Out-of-sample Performance:\n"
-            is_classifier = [value for value in self._is_classifier.values()]
-            is_regressor = [not value for value in is_classifier]
-            if any(is_regressor):
-                learner_info += "Regression:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is False]:
-                    learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n"
-            if any(is_classifier):
-                learner_info += "Classification:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is True]:
-                    learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n"
-
-        if self._is_cluster_data:
-            resampling_info = (
-                f"No. folds per cluster: {self._n_folds_per_cluster}\n"
-                f"No. folds: {self.n_folds}\n"
-                f"No. repeated sample splits: {self.n_rep}\n"
-            )
-        else:
-            resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n"
-        fit_summary = str(self.summary)
-        res = (
-            header
-            + "\n------------------ Data summary ------------------\n"
-            + data_summary
-            + "\n------------------ Score & algorithm ------------------\n"
-            + score_info
-            + "\n------------------ Machine learner ------------------\n"
-            + learner_info
-            + "\n------------------ Resampling ------------------\n"
-            + resampling_info
-            + "\n------------------ Fit summary ------------------\n"
-            + fit_summary
-        )
-        return res
+    def _format_score_info_str(self):
+        lines = [
+            f"Score function: {str(self.score)}",
+            f"Treatment group: {str(self.g_value)}",
+            f"Pre-treatment period: {str(self.t_value_pre)}",
+            f"Evaluation period: {str(self.t_value_eval)}",
+            f"Control group: {str(self.control_group)}",
+            f"Anticipation periods: {str(self.anticipation_periods)}",
+            f"Effective sample size: {str(self.n_obs_subset)}",
+        ]
+        return "\n".join(lines)
+
+    # _format_learner_info_str method is inherited from DoubleML base class.

     @property
     def g_value(self):

doubleml/double_ml.py

Lines changed: 70 additions & 33 deletions
@@ -110,50 +110,87 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting):
         self._i_rep = None
         self._i_treat = None

-    def __str__(self):
+    def _format_header_str(self):
         class_name = self.__class__.__name__
-        header = f"================== {class_name} Object ==================\n"
-        data_summary = self._dml_data._data_summary_str()
-        score_info = f"Score function: {str(self.score)}\n"
+        return f"================== {class_name} Object =================="
+
+    def _format_score_info_str(self):
+        return f"Score function: {str(self.score)}"
+
+    def _format_learner_info_str(self):
         learner_info = ""
-        for key, value in self.learner.items():
-            learner_info += f"Learner {key}: {str(value)}\n"
+        if self.learner is not None:
+            for key, value in self.learner.items():
+                learner_info += f"Learner {key}: {str(value)}\n"
         if self.nuisance_loss is not None:
             learner_info += "Out-of-sample Performance:\n"
-            is_classifier = [value for value in self._is_classifier.values()]
-            is_regressor = [not value for value in is_classifier]
-            if any(is_regressor):
-                learner_info += "Regression:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is False]:
-                    learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n"
-            if any(is_classifier):
-                learner_info += "Classification:\n"
-                for learner in [key for key, value in self._is_classifier.items() if value is True]:
-                    learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n"
+            # Check if _is_classifier is populated, otherwise, it might be called before fit
+            if self._is_classifier:
+                is_classifier_any = any(self._is_classifier.values())
+                is_regressor_any = any(not v for v in self._is_classifier.values())
+
+                if is_regressor_any:
+                    learner_info += "Regression:\n"
+                    for learner_name in self.params_names:  # Iterate through known learners
+                        if not self._is_classifier.get(learner_name, True):  # Default to not regressor if not found
+                            loss_val = self.nuisance_loss.get(learner_name, "N/A")
+                            learner_info += f"Learner {learner_name} RMSE: {loss_val}\n"
+                if is_classifier_any:
+                    learner_info += "Classification:\n"
+                    for learner_name in self.params_names:  # Iterate through known learners
+                        if self._is_classifier.get(learner_name, False):  # Default to not classifier if not found
+                            loss_val = self.nuisance_loss.get(learner_name, "N/A")
+                            learner_info += f"Learner {learner_name} Log Loss: {loss_val}\n"
+            else:
+                learner_info += " (Run .fit() to see out-of-sample performance)\n"
+        return learner_info.strip()

+    def _format_resampling_info_str(self):
         if self._is_cluster_data:
-            resampling_info = (
+            return (
                 f"No. folds per cluster: {self._n_folds_per_cluster}\n"
                 f"No. folds: {self.n_folds}\n"
-                f"No. repeated sample splits: {self.n_rep}\n"
+                f"No. repeated sample splits: {self.n_rep}"
             )
         else:
-            resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n"
-        fit_summary = str(self.summary)
-        res = (
-            header
-            + "\n------------------ Data summary ------------------\n"
-            + data_summary
-            + "\n------------------ Score & algorithm ------------------\n"
-            + score_info
-            + "\n------------------ Machine learner ------------------\n"
-            + learner_info
-            + "\n------------------ Resampling ------------------\n"
-            + resampling_info
-            + "\n------------------ Fit summary ------------------\n"
-            + fit_summary
+            return f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}"
+
+    def _format_additional_info_str(self):
+        """
+        Hook for subclasses to add additional information to the string representation.
+        Returns an empty string by default.
+        Subclasses should override this method to provide content.
+        The content should not include the 'Additional Information' header itself.
+        """
+        return ""
+
+    def __str__(self):
+        header = self._format_header_str()
+        # Assumes self._dml_data._data_summary_str() exists and is well-formed
+        data_summary = self._dml_data._data_summary_str()
+        score_info = self._format_score_info_str()
+        learner_info = self._format_learner_info_str()
+        resampling_info = self._format_resampling_info_str()
+        fit_summary = str(self.summary)  # Assumes self.summary is well-formed
+
+        representation = (
+            f"{header}\n"
+            f"\n------------------ Data Summary ------------------\n"
+            f"{data_summary}\n"
+            f"\n------------------ Score & Algorithm ------------------\n"
+            f"{score_info}\n"
+            f"\n------------------ Machine Learner ------------------\n"
+            f"{learner_info}\n"
+            f"\n------------------ Resampling ------------------\n"
+            f"{resampling_info}\n"
+            f"\n------------------ Fit Summary ------------------\n"
+            f"{fit_summary}"
         )
-        return res
+
+        additional_info = self._format_additional_info_str()
+        if additional_info:
+            representation += f"\n\n------------------ Additional Information ------------------\n" f"{additional_info}"
+        return representation

     @property
     def n_folds(self):
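
The change above turns __str__ into a template method: the base class assembles the section layout, and subclasses customize it by overriding small _format_* hooks instead of rebuilding the whole string. A minimal standalone sketch of that pattern follows; the class names and the confidence-set value are illustrative only, not the DoubleML code itself.

# Standalone sketch of the template-method pattern introduced in double_ml.py.
# _Base and _WithConfset are illustrative stand-ins, not DoubleML classes.

class _Base:
    def _format_header_str(self):
        return f"===== {self.__class__.__name__} ====="

    def _format_additional_info_str(self):
        # Hook: subclasses return extra content; "" suppresses the section.
        return ""

    def __str__(self):
        representation = self._format_header_str()
        additional_info = self._format_additional_info_str()
        if additional_info:
            representation += "\n--- Additional Information ---\n" + additional_info
        return representation


class _WithConfset(_Base):
    def _format_additional_info_str(self):
        # Mirrors the DoubleMLIIVM override below: report extra results only.
        return "Robust Confidence Set: [0.1234, 0.5678]"


print(_Base())          # header only
print(_WithConfset())   # header plus an Additional Information section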

doubleml/irm/iivm.py

Lines changed: 3 additions & 12 deletions
@@ -197,22 +197,13 @@ def __init__(
         self.subgroups = subgroups
         self._external_predictions_implemented = True

-    def __str__(self):
-        parent_str = super().__str__()
-
-        # add robust confset
+    def _format_additional_info_str(self):
         if self.framework is None:
-            confset_str = ""
+            return ""
         else:
             confset = self.robust_confset()
             formatted_confset = ", ".join([f"[{lower:.4f}, {upper:.4f}]" for lower, upper in confset])
-            confset_str = (
-                "\n\n--------------- Additional Information ----------------\n"
-                + f"Robust Confidence Set: {formatted_confset}\n"
-            )
-
-        res = parent_str + confset_str
-        return res
+            return f"Robust Confidence Set: {formatted_confset}"

     @property
     def normalize_ipw(self):
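
With this hook, the robust confidence set appears in the printed summary only once a framework exists, i.e. after fitting. A rough usage sketch, assuming the usual DoubleML helpers (make_iivm_data and the DoubleMLIIVM(data, ml_g, ml_m, ml_r) signature); exact helper names and arguments may differ.

# Hedged usage sketch: assumes doubleml's make_iivm_data helper and the
# DoubleMLIIVM(obj_dml_data, ml_g, ml_m, ml_r) signature; adjust to the actual API.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from doubleml import DoubleMLIIVM
from doubleml.datasets import make_iivm_data

dml_data = make_iivm_data(n_obs=500, dim_x=20)
dml_iivm = DoubleMLIIVM(
    dml_data,
    ml_g=RandomForestRegressor(),
    ml_m=RandomForestClassifier(),
    ml_r=RandomForestClassifier(),
)

dml_iivm.fit()
# The printed summary now ends with an Additional Information section
# listing the robust confidence set from robust_confset().
print(dml_iivm)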
