diff --git a/.gitignore b/.gitignore
index ba091906..306442b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,4 +30,4 @@ MANIFEST
 *.idea
 *.vscode
 .flake8
-.coverage
\ No newline at end of file
+.coverage
diff --git a/doubleml/__init__.py b/doubleml/__init__.py
index a86735c8..102ea995 100644
--- a/doubleml/__init__.py
+++ b/doubleml/__init__.py
@@ -1,8 +1,8 @@
 import importlib.metadata

+from .data import DoubleMLClusterData, DoubleMLData
 from .did.did import DoubleMLDID
 from .did.did_cs import DoubleMLDIDCS
-from .double_ml_data import DoubleMLClusterData, DoubleMLData
 from .double_ml_framework import DoubleMLFramework, concat
 from .irm.apo import DoubleMLAPO
 from .irm.apos import DoubleMLAPOS
diff --git a/doubleml/data/__init__.py b/doubleml/data/__init__.py
new file mode 100644
index 00000000..d8a920c6
--- /dev/null
+++ b/doubleml/data/__init__.py
@@ -0,0 +1,13 @@
+"""
+The :mod:`doubleml.data` module implements data classes for double machine learning.
+"""
+
+from .base_data import DoubleMLData
+from .cluster_data import DoubleMLClusterData
+from .panel_data import DoubleMLPanelData
+
+__all__ = [
+    "DoubleMLData",
+    "DoubleMLClusterData",
+    "DoubleMLPanelData",
+]
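Editor's note: with the data backends split into the new ``doubleml.data`` subpackage, the public import paths follow directly from the two ``__init__.py`` hunks above. A minimal sketch of the resulting imports (``DoubleMLData`` and ``DoubleMLClusterData`` stay re-exported from the package root; ``DoubleMLPanelData`` is exposed via ``doubleml.data``):

# Import paths implied by the __init__.py hunks in this diff.
from doubleml import DoubleMLClusterData, DoubleMLData
from doubleml.data import DoubleMLPanelData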
diff --git a/doubleml/double_ml_data.py b/doubleml/data/base_data.py
similarity index 57%
rename from doubleml/double_ml_data.py
rename to doubleml/data/base_data.py
index 3ebf2f76..318508e9 100644
--- a/doubleml/double_ml_data.py
+++ b/doubleml/data/base_data.py
@@ -7,8 +7,7 @@
 from sklearn.utils.multiclass import type_of_target
 from sklearn.utils.validation import check_array, check_consistent_length, column_or_1d

-from .utils._checks import _check_set
-from .utils._estimation import _assure_2d_array
+from doubleml.utils._estimation import _assure_2d_array


 class DoubleMLBaseData(ABC):
@@ -127,6 +126,14 @@ class DoubleMLData(DoubleMLBaseData):
         in the covariates ``x``.
         Default is ``True``.

+    force_all_d_finite : bool or str
+        Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``.
+        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+        Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the model used allows for missing
+        and / or infinite values in the treatment variables ``d`` (e.g. panel data models).
+        Default is ``True``.
+
     Examples
     --------
     >>> from doubleml import DoubleMLData
@@ -150,6 +157,7 @@ def __init__(
         s_col=None,
         use_other_treat_as_covariate=True,
         force_all_x_finite=True,
+        force_all_d_finite=True,
     ):
         DoubleMLBaseData.__init__(self, data)

@@ -159,9 +167,10 @@ def __init__(
         self.t_col = t_col
         self.s_col = s_col
         self.x_cols = x_cols
-        self._check_disjoint_sets_y_d_x_z_t_s()
+        self._check_disjoint_sets()
         self.use_other_treat_as_covariate = use_other_treat_as_covariate
         self.force_all_x_finite = force_all_x_finite
+        self.force_all_d_finite = force_all_d_finite
         self._binary_treats = self._check_binary_treats()
         self._binary_outcome = self._check_binary_outcome()
         self._set_y_z_t_s()
@@ -197,7 +206,18 @@ def _data_summary_str(self):
         return data_summary

     @classmethod
-    def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True):
+    def from_arrays(
+        cls,
+        x,
+        y,
+        d,
+        z=None,
+        t=None,
+        s=None,
+        use_other_treat_as_covariate=True,
+        force_all_x_finite=True,
+        force_all_d_finite=True,
+    ):
         """
         Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s.

@@ -237,6 +257,14 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria
             in the covariates ``x``.
             Default is ``True``.

+        force_all_d_finite : bool or str
+            Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``.
+            Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+            allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+            Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the model used allows for missing
+            and / or infinite values in the treatment variables ``d`` (e.g. panel data models).
+            Default is ``True``.
+
         Examples
         --------
         >>> from doubleml import DoubleMLData
@@ -255,8 +283,19 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria
         elif not isinstance(force_all_x_finite, bool):
             raise TypeError("Invalid force_all_x_finite. " + "force_all_x_finite must be True, False or 'allow-nan'.")

+        if isinstance(force_all_d_finite, str):
+            if force_all_d_finite != "allow-nan":
+                raise ValueError(
+                    "Invalid force_all_d_finite "
+                    + force_all_d_finite
+                    + ". "
+                    + "force_all_d_finite must be True, False or 'allow-nan'."
+                )
+        elif not isinstance(force_all_d_finite, bool):
+            raise TypeError("Invalid force_all_d_finite. " + "force_all_d_finite must be True, False or 'allow-nan'.")
+
         x = check_array(x, ensure_2d=False, allow_nd=False, force_all_finite=force_all_x_finite)
-        d = check_array(d, ensure_2d=False, allow_nd=False)
+        d = check_array(d, ensure_2d=False, allow_nd=False, force_all_finite=force_all_d_finite)
         y = column_or_1d(y, warn=True)

         x = _assure_2d_array(x)
@@ -296,7 +335,7 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria

         x_cols = [f"X{i + 1}" for i in np.arange(x.shape[1])]

-        # basline version with features, outcome and treatments
+        # baseline version with features, outcome and treatments
         data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + [y_col] + d_cols)

         if z is not None:
@@ -309,7 +348,18 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria
         if s is not None:
             data[s_col] = s

-        return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite)
+        return cls(
+            data,
+            y_col,
+            d_cols,
+            x_cols,
+            z_cols,
+            t_col,
+            s_col,
+            use_other_treat_as_covariate,
+            force_all_x_finite,
+            force_all_d_finite,
+        )

     @property
     def x(self):
" + "force_all_d_finite must be True, False or 'allow-nan'.") + self._force_all_d_finite = value + if reset_value: + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) - if self.s_col is None: - self._s = None - else: - assert_all_finite(self.data.loc[:, self.s_col]) - self._s = self.data.loc[:, self.s_col] + def _set_y_z_t_s(self): + def _set_attr(col): + if col is None: + return None + assert_all_finite(self.data.loc[:, col]) + return self.data.loc[:, col] + + self._y = _set_attr(self.y_col) + self._z = _set_attr(self.z_cols) + self._t = _set_attr(self.t_col) + self._s = _set_attr(self.s_col) def set_x_d(self, treatment_var): """ @@ -655,19 +720,31 @@ def set_x_d(self, treatment_var): xd_list.remove(treatment_var) else: xd_list = self.x_cols - assert_all_finite(self.data.loc[:, treatment_var]) + if self.force_all_d_finite: + assert_all_finite(self.data.loc[:, self.d_cols], allow_nan=self.force_all_d_finite == "allow-nan") if self.force_all_x_finite: assert_all_finite(self.data.loc[:, xd_list], allow_nan=self.force_all_x_finite == "allow-nan") self._d = self.data.loc[:, treatment_var] self._X = self.data.loc[:, xd_list] + def _get_optional_col_sets(self): + # this function can be extended in inherited subclasses + z_cols_set = set(self.z_cols or []) + t_col_set = {self.t_col} if self.t_col else set() + s_col_set = {self.s_col} if self.s_col else set() + + return [z_cols_set, t_col_set, s_col_set] + def _check_binary_treats(self): is_binary = pd.Series(dtype=bool, index=self.d_cols) - for treatment_var in self.d_cols: - this_d = self.data.loc[:, treatment_var] - binary_treat = type_of_target(this_d) == "binary" - zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0) - is_binary[treatment_var] = binary_treat & zero_one_treat + if not self.force_all_d_finite: + is_binary[:] = False # if we allow infinite values, we cannot check for binary + else: + for treatment_var in self.d_cols: + this_d = self.data.loc[:, treatment_var] + binary_treat = type_of_target(this_d) == "binary" + zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0) + is_binary[treatment_var] = binary_treat & zero_one_treat return is_binary def _check_binary_outcome(self): @@ -677,11 +754,18 @@ def _check_binary_outcome(self): is_binary = binary_outcome & zero_one_outcome return is_binary + @staticmethod + def _check_disjoint(set1, set2, name1, arg1, name2, arg2): + """Helper method to check for disjoint sets.""" + if not set1.isdisjoint(set2): + raise ValueError(f"At least one variable/column is set as {name1} ({arg1}) and {name2} ({arg2}).") + def _check_disjoint_sets(self): # this function can be extended in inherited subclasses - self._check_disjoint_sets_y_d_x_z_t_s() + self._check_disjoint_sets_y_d_x() + self._check_disjoint_sets_z_t_s() - def _check_disjoint_sets_y_d_x_z_t_s(self): + def _check_disjoint_sets_y_d_x(self): y_col_set = {self.y_col} x_cols_set = set(self.x_cols) d_cols_set = set(self.d_cols) @@ -700,396 +784,31 @@ def _check_disjoint_sets_y_d_x_z_t_s(self): "(``x_cols``). Consider using parameter ``use_other_treat_as_covariate``." ) - if self.z_cols is not None: - z_cols_set = set(self.z_cols) - if not y_col_set.isdisjoint(z_cols_set): - raise ValueError( - f"{str(self.y_col)} cannot be set as outcome variable ``y_col`` and instrumental variable in ``z_cols``." - ) - if not d_cols_set.isdisjoint(z_cols_set): - raise ValueError( - "At least one variable/column is set as treatment variable (``d_cols``) and " - "instrumental variable in ``z_cols``." 
@@ -677,11 +754,18 @@ def _check_binary_outcome(self):
         is_binary = binary_outcome & zero_one_outcome
         return is_binary

+    @staticmethod
+    def _check_disjoint(set1, set2, name1, arg1, name2, arg2):
+        """Helper method to check for disjoint sets."""
+        if not set1.isdisjoint(set2):
+            raise ValueError(f"At least one variable/column is set as {name1} ({arg1}) and {name2} ({arg2}).")
+
     def _check_disjoint_sets(self):
         # this function can be extended in inherited subclasses
-        self._check_disjoint_sets_y_d_x_z_t_s()
+        self._check_disjoint_sets_y_d_x()
+        self._check_disjoint_sets_z_t_s()

-    def _check_disjoint_sets_y_d_x_z_t_s(self):
+    def _check_disjoint_sets_y_d_x(self):
         y_col_set = {self.y_col}
         x_cols_set = set(self.x_cols)
         d_cols_set = set(self.d_cols)
@@ -700,396 +784,31 @@
                 "(``x_cols``). Consider using parameter ``use_other_treat_as_covariate``."
             )

-        if self.z_cols is not None:
-            z_cols_set = set(self.z_cols)
-            if not y_col_set.isdisjoint(z_cols_set):
-                raise ValueError(
-                    f"{str(self.y_col)} cannot be set as outcome variable ``y_col`` and instrumental variable in ``z_cols``."
-                )
-            if not d_cols_set.isdisjoint(z_cols_set):
-                raise ValueError(
-                    "At least one variable/column is set as treatment variable (``d_cols``) and "
-                    "instrumental variable in ``z_cols``."
-                )
-            if not x_cols_set.isdisjoint(z_cols_set):
-                raise ValueError(
-                    "At least one variable/column is set as covariate (``x_cols``) and instrumental variable in ``z_cols``."
-                )
-
-        self._check_disjoint_sets_t_s()
-
-    def _check_disjoint_sets_t_s(self):
+    def _check_disjoint_sets_z_t_s(self):
         y_col_set = {self.y_col}
         x_cols_set = set(self.x_cols)
         d_cols_set = set(self.d_cols)

-        if self.t_col is not None:
-            t_col_set = {self.t_col}
-            if not t_col_set.isdisjoint(x_cols_set):
-                raise ValueError(f"{str(self.t_col)} cannot be set as time variable ``t_col`` and covariate in ``x_cols``.")
-            if not t_col_set.isdisjoint(d_cols_set):
-                raise ValueError(
-                    f"{str(self.t_col)} cannot be set as time variable ``t_col`` and treatment variable in ``d_cols``."
-                )
-            if not t_col_set.isdisjoint(y_col_set):
-                raise ValueError(f"{str(self.t_col)} cannot be set as time variable ``t_col`` and outcome variable ``y_col``.")
-            if self.z_cols is not None:
-                z_cols_set = set(self.z_cols)
-                if not t_col_set.isdisjoint(z_cols_set):
-                    raise ValueError(
-                        f"{str(self.t_col)} cannot be set as time variable ``t_col`` and instrumental variable in ``z_cols``."
-                    )
-
-        if self.s_col is not None:
-            s_col_set = {self.s_col}
-            if not s_col_set.isdisjoint(x_cols_set):
-                raise ValueError(
-                    f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and covariate in ``x_cols``."
-                )
-            if not s_col_set.isdisjoint(d_cols_set):
-                raise ValueError(
-                    f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and treatment "
-                    "variable in ``d_cols``."
-                )
-            if not s_col_set.isdisjoint(y_col_set):
-                raise ValueError(
-                    f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and outcome variable ``y_col``."
-                )
-            if self.z_cols is not None:
-                z_cols_set = set(self.z_cols)
-                if not s_col_set.isdisjoint(z_cols_set):
-                    raise ValueError(
-                        f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and "
-                        "instrumental variable in ``z_cols``."
-                    )
-            if self.t_col is not None:
-                t_col_set = {self.t_col}
-                if not s_col_set.isdisjoint(t_col_set):
-                    raise ValueError(
-                        f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and time "
-                        "variable ``t_col``."
-                    )
-
-
-class DoubleMLClusterData(DoubleMLData):
-    """Double machine learning data-backend for data with cluster variables.
-
-    :class:`DoubleMLClusterData` objects can be initialized from
-    :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s.
-
-    Parameters
-    ----------
-    data : :class:`pandas.DataFrame`
-        The data.
-
-    y_col : str
-        The outcome variable.
-
-    d_cols : str or list
-        The treatment variable(s).
-
-    cluster_cols : str or list
-        The cluster variable(s).
-
-    x_cols : None, str or list
-        The covariates.
-        If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor
-        treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates.
-        Default is ``None``.
-
-    z_cols : None, str or list
-        The instrumental variable(s).
-        Default is ``None``.
-
-    t_col : None or str
-        The time variable (only relevant/used for DiD Estimators).
-        Default is ``None``.
-
-    s_col : None or str
-        The score or selection variable (only relevant/used for RDD and SSM Estimatiors).
-        Default is ``None``.
-
-    use_other_treat_as_covariate : bool
-        Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
-        Default is ``True``.
-
-    force_all_x_finite : bool or str
-        Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
-        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
-        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
-        Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
-        for the nuisance functions are capable to provide valid predictions with missings and / or infinite values
-        in the covariates ``x``.
-        Default is ``True``.
-
-    Examples
-    --------
-    >>> from doubleml import DoubleMLClusterData
-    >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
-    >>> # initialization from pandas.DataFrame
-    >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame')
-    >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z')
-    >>> # initialization from np.ndarray
-    >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
-    >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
-    """
-
-    def __init__(
-        self,
-        data,
-        y_col,
-        d_cols,
-        cluster_cols,
-        x_cols=None,
-        z_cols=None,
-        t_col=None,
-        s_col=None,
-        use_other_treat_as_covariate=True,
-        force_all_x_finite=True,
-    ):
-        DoubleMLBaseData.__init__(self, data)
-
-        # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter
-        self.cluster_cols = cluster_cols
-        self._set_cluster_vars()
-        DoubleMLData.__init__(
-            self, data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite
-        )
-        self._check_disjoint_sets_cluster_cols()
-
-    def __str__(self):
-        data_summary = self._data_summary_str()
-        buf = io.StringIO()
-        self.data.info(verbose=False, buf=buf)
-        df_info = buf.getvalue()
-        res = (
-            "================== DoubleMLClusterData Object ==================\n"
-            + "\n------------------ Data summary      ------------------\n"
-            + data_summary
-            + "\n------------------ DataFrame info    ------------------\n"
-            + df_info
-        )
-        return res
-
-    def _data_summary_str(self):
-        data_summary = (
-            f"Outcome variable: {self.y_col}\n"
-            f"Treatment variable(s): {self.d_cols}\n"
-            f"Cluster variable(s): {self.cluster_cols}\n"
-            f"Covariates: {self.x_cols}\n"
-            f"Instrument variable(s): {self.z_cols}\n"
-        )
-        if self.t_col is not None:
-            data_summary += f"Time variable: {self.t_col}\n"
-        if self.s_col is not None:
-            data_summary += f"Score/Selection variable: {self.s_col}\n"
-
-        data_summary += f"No. Observations: {self.n_obs}\n"
-        return data_summary
-
-    @classmethod
-    def from_arrays(
-        cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True
-    ):
-        """
-        Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s.
-
-        Parameters
-        ----------
-        x : :class:`numpy.ndarray`
-            Array of covariates.
-
-        y : :class:`numpy.ndarray`
-            Array of the outcome variable.
-
-        d : :class:`numpy.ndarray`
-            Array of treatment variables.
-
-        cluster_vars : :class:`numpy.ndarray`
-            Array of cluster variables.
-
-        z : None or :class:`numpy.ndarray`
-            Array of instrumental variables.
-            Default is ``None``.
-
-        t : :class:`numpy.ndarray`
-            Array of the time variable (only relevant/used for DiD models).
-            Default is ``None``.
-
-        s : :class:`numpy.ndarray`
-            Array of the score or selection variable (only relevant/used for RDD or SSM models).
-            Default is ``None``.
-
-        use_other_treat_as_covariate : bool
-            Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
-            Default is ``True``.
-
-        force_all_x_finite : bool or str
-            Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
-            Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
-            allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
-            Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
-            for the nuisance functions are capable to provide valid predictions with missings and / or infinite values
-            in the covariates ``x``.
-            Default is ``True``.
-
-        Examples
-        --------
-        >>> from doubleml import DoubleMLClusterData
-        >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
-        >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
-        >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
-        """
-        dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite)
-        cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False)
-        cluster_vars = _assure_2d_array(cluster_vars)
-        if cluster_vars.shape[1] == 1:
-            cluster_cols = ["cluster_var"]
-        else:
-            cluster_cols = [f"cluster_var{i + 1}" for i in np.arange(cluster_vars.shape[1])]
-
-        data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1)
-
-        return cls(
-            data,
-            dml_data.y_col,
-            dml_data.d_cols,
-            cluster_cols,
-            dml_data.x_cols,
-            dml_data.z_cols,
-            dml_data.t_col,
-            dml_data.s_col,
-            dml_data.use_other_treat_as_covariate,
-            dml_data.force_all_x_finite,
-        )
-
-    @property
-    def cluster_cols(self):
-        """
-        The cluster variable(s).
-        """
-        return self._cluster_cols
-
-    @cluster_cols.setter
-    def cluster_cols(self, value):
-        reset_value = hasattr(self, "_cluster_cols")
-        if isinstance(value, str):
-            value = [value]
-        if not isinstance(value, list):
-            raise TypeError(
-                "The cluster variable(s) cluster_cols must be of str or list type. "
-                f"{str(value)} of type {str(type(value))} was passed."
-            )
-        if not len(set(value)) == len(value):
-            raise ValueError("Invalid cluster variable(s) cluster_cols: Contains duplicate values.")
-        if not set(value).issubset(set(self.all_variables)):
-            raise ValueError("Invalid cluster variable(s) cluster_cols. At least one cluster variable is no data column.")
-        self._cluster_cols = value
-        if reset_value:
-            self._check_disjoint_sets()
-            self._set_cluster_vars()
-
-    @property
-    def n_cluster_vars(self):
-        """
-        The number of cluster variables.
-        """
-        return len(self.cluster_cols)
-
-    @property
-    def cluster_vars(self):
-        """
-        Array of cluster variable(s).
-        """
-        return self._cluster_vars.values
-
-    @DoubleMLData.x_cols.setter
-    def x_cols(self, value):
-        if value is not None:
-            # this call might become much easier with https://github.com/python/cpython/pull/26194
-            super(self.__class__, self.__class__).x_cols.__set__(self, value)
-        else:
-            if self.s_col is None:
-                if (self.z_cols is not None) & (self.t_col is not None):
-                    y_d_z_t = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_z_t]
-                elif self.z_cols is not None:
-                    y_d_z = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_z]
-                elif self.t_col is not None:
-                    y_d_t = set.union({self.y_col}, set(self.d_cols), {self.t_col}, set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_t]
-                else:
-                    y_d = set.union({self.y_col}, set(self.d_cols), set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d]
-            else:
-                if (self.z_cols is not None) & (self.t_col is not None):
-                    y_d_z_t_s = set.union(
-                        {self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, {self.s_col}, set(self.cluster_cols)
-                    )
-                    x_cols = [col for col in self.data.columns if col not in y_d_z_t_s]
-                elif self.z_cols is not None:
-                    y_d_z_s = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.s_col}, set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_z_s]
-                elif self.t_col is not None:
-                    y_d_t_s = set.union({self.y_col}, set(self.d_cols), {self.t_col}, {self.s_col}, set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_t_s]
-                else:
-                    y_d_s = set.union({self.y_col}, set(self.d_cols), {self.s_col}, set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_s]
-            # this call might become much easier with https://github.com/python/cpython/pull/26194
-            super(self.__class__, self.__class__).x_cols.__set__(self, x_cols)
-
-    def _check_disjoint_sets(self):
-        # apply the standard checks from the DoubleMLData class
-        super(DoubleMLClusterData, self)._check_disjoint_sets()
-        self._check_disjoint_sets_cluster_cols()
-
-    def _check_disjoint_sets_cluster_cols(self):
-        # apply the standard checks from the DoubleMLData class
-        super(DoubleMLClusterData, self)._check_disjoint_sets()
-        # special checks for the additional cluster variables
-        cluster_cols_set = set(self.cluster_cols)
-        y_col_set = {self.y_col}
-        x_cols_set = set(self.x_cols)
-        d_cols_set = set(self.d_cols)
-        t_col_set = {self.t_col}
-        s_col_set = {self.s_col}
+        z_cols_set = set(self.z_cols or [])
+        t_col_set = {self.t_col} if self.t_col else set()
+        s_col_set = {self.s_col} if self.s_col else set()
+
+        instrument_checks_args = [
+            (y_col_set, "outcome variable", "``y_col``"),
+            (d_cols_set, "treatment variable", "``d_cols``"),
+            (x_cols_set, "covariate", "``x_cols``"),
+        ]
+        for set1, name, argument in instrument_checks_args:
+            self._check_disjoint(
+                set1=set1, name1=name, arg1=argument, set2=z_cols_set, name2="instrumental variable", arg2="``z_cols``"
+            )

+        time_check_args = instrument_checks_args + [(z_cols_set, "instrumental variable", "``z_cols``")]
+        for set1, name, argument in time_check_args:
+            self._check_disjoint(set1=set1, name1=name, arg1=argument, set2=t_col_set, name2="time variable", arg2="``t_col``")

-        if not y_col_set.isdisjoint(cluster_cols_set):
-            raise ValueError(
-                f"{str(self.y_col)} cannot be set as outcome variable ``y_col`` and cluster variable in ``cluster_cols``."
-            )
-        if not d_cols_set.isdisjoint(cluster_cols_set):
-            raise ValueError(
-                "At least one variable/column is set as treatment variable (``d_cols``) and "
-                "cluster variable in ``cluster_cols``."
-            )
+        score_check_args = time_check_args + [(t_col_set, "time variable", "``t_col``")]
+        for set1, name, argument in score_check_args:
+            self._check_disjoint(
+                set1=set1, name1=name, arg1=argument, set2=s_col_set, name2="score or selection variable", arg2="``s_col``"
+            )
-        # TODO: Is the following combination allowed, or not?
-        if not x_cols_set.isdisjoint(cluster_cols_set):
-            raise ValueError(
-                "At least one variable/column is set as covariate (``x_cols``) and cluster variable in ``cluster_cols``."
-            )
-        if self.z_cols is not None:
-            z_cols_set = set(self.z_cols)
-            if not z_cols_set.isdisjoint(cluster_cols_set):
-                raise ValueError(
-                    "At least one variable/column is set as instrumental variable (``z_cols``) and "
-                    "cluster variable in ``cluster_cols``."
-                )
-        if self.t_col is not None:
-            if not t_col_set.isdisjoint(cluster_cols_set):
-                raise ValueError(
-                    f"{str(self.t_col)} cannot be set as time variable ``t_col`` and cluster variable in ``cluster_cols``."
-                )
-        if self.s_col is not None:
-            if not s_col_set.isdisjoint(cluster_cols_set):
-                raise ValueError(
-                    f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and "
-                    "cluster variable in ``cluster_cols``."
-                )
-
-    def _set_cluster_vars(self):
-        assert_all_finite(self.data.loc[:, self.cluster_cols])
-        self._cluster_vars = self.data.loc[:, self.cluster_cols]
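Editor's note: a minimal sketch of the consolidated disjointness validation above, assuming the behaviour added in this diff. Reusing a column for two roles now goes through the single ``_check_disjoint`` helper instead of the old hand-written checks:

import numpy as np
import pandas as pd
from doubleml import DoubleMLData

df = pd.DataFrame(np.ones((5, 4)), columns=["y", "d", "x1", "x2"])
try:
    # "x1" is set both as covariate and as instrument
    DoubleMLData(df, y_col="y", d_cols="d", x_cols=["x1", "x2"], z_cols="x1")
except ValueError as err:
    print(err)  # raised by _check_disjoint via _check_disjoint_sets_z_t_s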
diff --git a/doubleml/data/cluster_data.py b/doubleml/data/cluster_data.py
new file mode 100644
index 00000000..658ab0cc
--- /dev/null
+++ b/doubleml/data/cluster_data.py
@@ -0,0 +1,289 @@
+import io
+
+import numpy as np
+import pandas as pd
+from sklearn.utils import assert_all_finite
+from sklearn.utils.validation import check_array
+
+from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData
+from doubleml.utils._estimation import _assure_2d_array
+
+
+class DoubleMLClusterData(DoubleMLData):
+    """Double machine learning data-backend for data with cluster variables.
+
+    :class:`DoubleMLClusterData` objects can be initialized from
+    :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s.
+
+    Parameters
+    ----------
+    data : :class:`pandas.DataFrame`
+        The data.
+
+    y_col : str
+        The outcome variable.
+
+    d_cols : str or list
+        The treatment variable(s).
+
+    cluster_cols : str or list
+        The cluster variable(s).
+
+    x_cols : None, str or list
+        The covariates.
+        If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor
+        treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates.
+        Default is ``None``.
+
+    z_cols : None, str or list
+        The instrumental variable(s).
+        Default is ``None``.
+
+    t_col : None or str
+        The time variable (only relevant/used for DiD Estimators).
+        Default is ``None``.
+
+    s_col : None or str
+        The score or selection variable (only relevant/used for RDD and SSM Estimators).
+        Default is ``None``.
+
+    use_other_treat_as_covariate : bool
+        Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
+        Default is ``True``.
+
+    force_all_x_finite : bool or str
+        Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
+        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+        Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
+        for the nuisance functions are capable of providing valid predictions with missings and / or infinite values
+        in the covariates ``x``.
+        Default is ``True``.
+
+    Examples
+    --------
+    >>> from doubleml import DoubleMLClusterData
+    >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
+    >>> # initialization from pandas.DataFrame
+    >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame')
+    >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z')
+    >>> # initialization from np.ndarray
+    >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
+    >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
+    """
+
+    def __init__(
+        self,
+        data,
+        y_col,
+        d_cols,
+        cluster_cols,
+        x_cols=None,
+        z_cols=None,
+        t_col=None,
+        s_col=None,
+        use_other_treat_as_covariate=True,
+        force_all_x_finite=True,
+    ):
+        DoubleMLBaseData.__init__(self, data)
+
+        # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter
+        self.cluster_cols = cluster_cols
+        self._set_cluster_vars()
+        DoubleMLData.__init__(
+            self, data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite
+        )
+        self._check_disjoint_sets_cluster_cols()
+
+    def __str__(self):
+        data_summary = self._data_summary_str()
+        buf = io.StringIO()
+        self.data.info(verbose=False, buf=buf)
+        df_info = buf.getvalue()
+        res = (
+            "================== DoubleMLClusterData Object ==================\n"
+            + "\n------------------ Data summary      ------------------\n"
+            + data_summary
+            + "\n------------------ DataFrame info    ------------------\n"
+            + df_info
+        )
+        return res
+
+    def _data_summary_str(self):
+        data_summary = (
+            f"Outcome variable: {self.y_col}\n"
+            f"Treatment variable(s): {self.d_cols}\n"
+            f"Cluster variable(s): {self.cluster_cols}\n"
+            f"Covariates: {self.x_cols}\n"
+            f"Instrument variable(s): {self.z_cols}\n"
+        )
+        if self.t_col is not None:
+            data_summary += f"Time variable: {self.t_col}\n"
+        if self.s_col is not None:
+            data_summary += f"Score/Selection variable: {self.s_col}\n"
+
+        data_summary += f"No. Observations: {self.n_obs}\n"
+        return data_summary
+
+    @classmethod
+    def from_arrays(
+        cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True
+    ):
+        """
+        Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s.
+
+        Parameters
+        ----------
+        x : :class:`numpy.ndarray`
+            Array of covariates.
+
+        y : :class:`numpy.ndarray`
+            Array of the outcome variable.
+
+        d : :class:`numpy.ndarray`
+            Array of treatment variables.
+
+        cluster_vars : :class:`numpy.ndarray`
+            Array of cluster variables.
+
+        z : None or :class:`numpy.ndarray`
+            Array of instrumental variables.
+            Default is ``None``.
+
+        t : None or :class:`numpy.ndarray`
+            Array of the time variable (only relevant/used for DiD models).
+            Default is ``None``.
+
+        s : None or :class:`numpy.ndarray`
+            Array of the score or selection variable (only relevant/used for RDD or SSM models).
+            Default is ``None``.
+
+        use_other_treat_as_covariate : bool
+            Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
+            Default is ``True``.
+
+        force_all_x_finite : bool or str
+            Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
+            Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+            allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+            Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
+            for the nuisance functions are capable of providing valid predictions with missings and / or infinite values
+            in the covariates ``x``.
+            Default is ``True``.
+
+        Examples
+        --------
+        >>> from doubleml import DoubleMLClusterData
+        >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
+        >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
+        >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
+        """
+        dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite)
+        cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False)
+        cluster_vars = _assure_2d_array(cluster_vars)
+        if cluster_vars.shape[1] == 1:
+            cluster_cols = ["cluster_var"]
+        else:
+            cluster_cols = [f"cluster_var{i + 1}" for i in np.arange(cluster_vars.shape[1])]
+
+        data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1)
+
+        return cls(
+            data,
+            dml_data.y_col,
+            dml_data.d_cols,
+            cluster_cols,
+            dml_data.x_cols,
+            dml_data.z_cols,
+            dml_data.t_col,
+            dml_data.s_col,
+            dml_data.use_other_treat_as_covariate,
+            dml_data.force_all_x_finite,
+        )
+
+    @property
+    def cluster_cols(self):
+        """
+        The cluster variable(s).
+        """
+        return self._cluster_cols
+
+    @cluster_cols.setter
+    def cluster_cols(self, value):
+        reset_value = hasattr(self, "_cluster_cols")
+        if isinstance(value, str):
+            value = [value]
+        if not isinstance(value, list):
+            raise TypeError(
+                "The cluster variable(s) cluster_cols must be of str or list type. "
+                f"{str(value)} of type {str(type(value))} was passed."
+            )
+        if not len(set(value)) == len(value):
+            raise ValueError("Invalid cluster variable(s) cluster_cols: Contains duplicate values.")
+        if not set(value).issubset(set(self.all_variables)):
+            raise ValueError("Invalid cluster variable(s) cluster_cols. At least one cluster variable is no data column.")
+        self._cluster_cols = value
+        if reset_value:
+            self._check_disjoint_sets()
+            self._set_cluster_vars()
+
+    @property
+    def n_cluster_vars(self):
+        """
+        The number of cluster variables.
+        """
+        return len(self.cluster_cols)
+
+    @property
+    def cluster_vars(self):
+        """
+        Array of cluster variable(s).
+        """
+        return self._cluster_vars.values
+
+    def _get_optional_col_sets(self):
+        base_optional_col_sets = super()._get_optional_col_sets()
+        cluster_cols_set = set(self.cluster_cols)
+        return [cluster_cols_set] + base_optional_col_sets
+
+    def _check_disjoint_sets(self):
+        # apply the standard checks from the DoubleMLData class
+        super(DoubleMLClusterData, self)._check_disjoint_sets()
+        self._check_disjoint_sets_cluster_cols()
+
+    def _check_disjoint_sets_cluster_cols(self):
+        # apply the standard checks from the DoubleMLData class
+        super(DoubleMLClusterData, self)._check_disjoint_sets()
+
+        # special checks for the additional cluster variables
+        cluster_cols_set = set(self.cluster_cols)
+        y_col_set = {self.y_col}
+        x_cols_set = set(self.x_cols)
+        d_cols_set = set(self.d_cols)
+
+        z_cols_set = set(self.z_cols or [])
+        t_col_set = {self.t_col} if self.t_col else set()
+        s_col_set = {self.s_col} if self.s_col else set()
+
+        # TODO: X can not be used as cluster variable
+        cluster_checks_args = [
+            (y_col_set, "outcome variable", "``y_col``"),
+            (d_cols_set, "treatment variable", "``d_cols``"),
+            (x_cols_set, "covariate", "``x_cols``"),
+            (z_cols_set, "instrumental variable", "``z_cols``"),
+            (t_col_set, "time variable", "``t_col``"),
+            (s_col_set, "score or selection variable", "``s_col``"),
+        ]
+        for set1, name, argument in cluster_checks_args:
+            self._check_disjoint(
+                set1=set1,
+                name1=name,
+                arg1=argument,
+                set2=cluster_cols_set,
+                name2="cluster variable(s)",
+                arg2="``cluster_cols``",
+            )
+
+    def _set_cluster_vars(self):
+        assert_all_finite(self.data.loc[:, self.cluster_cols])
+        self._cluster_vars = self.data.loc[:, self.cluster_cols]
diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py
new file mode 100644
index 00000000..f548ae6a
--- /dev/null
+++ b/doubleml/data/panel_data.py
@@ -0,0 +1,315 @@
+import io
+
+import numpy as np
+import pandas as pd
+from sklearn.utils import assert_all_finite
+
+from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData
+from doubleml.data.utils.panel_data_utils import _is_valid_datetime_unit
+
+
+class DoubleMLPanelData(DoubleMLData):
+    """Double machine learning data-backend for panel data in long format.
+
+    :class:`DoubleMLPanelData` objects can be initialized from
+    :class:`pandas.DataFrame` as well as :class:`numpy.ndarray` objects.
+
+    Parameters
+    ----------
+    data : :class:`pandas.DataFrame`
+        The data.
+
+    y_col : str
+        The outcome variable.
+
+    d_cols : str or list
+        The treatment variable(s) indicating the treatment groups in terms of the first time of treatment exposure.
+
+    t_col : str
+        The time variable indicating the time period.
+
+    id_col : str
+        Unique unit identifier.
+
+    x_cols : None, str or list
+        The covariates.
+        If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor
+        treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates.
+        Default is ``None``.
+
+    z_cols : None, str or list
+        The instrumental variable(s).
+        Default is ``None``.
+
+    use_other_treat_as_covariate : bool
+        Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
+        Default is ``True``.
+
+    force_all_x_finite : bool or str
+        Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
+        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+        Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
+        for the nuisance functions are capable of providing valid predictions with missings and / or infinite values
+        in the covariates ``x``.
+        Default is ``True``.
+
+    datetime_unit : str
+        The unit of the time and treatment variable (if datetime type).
+
+    Examples
+    --------
+    >>> from doubleml.did.datasets import make_did_CS2021
+    >>> from doubleml import DoubleMLPanelData
+    >>> df = make_did_CS2021(n_obs=500)
+    >>> dml_data = DoubleMLPanelData(
+    ...     df,
+    ...     y_col="y",
+    ...     d_cols="d",
+    ...     id_col="id",
+    ...     t_col="t",
+    ...     x_cols=["Z1", "Z2", "Z3", "Z4"],
+    ...     datetime_unit="M"
+    ... )
+    """
+
+    def __init__(
+        self,
+        data,
+        y_col,
+        d_cols,
+        t_col,
+        id_col,
+        x_cols=None,
+        z_cols=None,
+        use_other_treat_as_covariate=True,
+        force_all_x_finite=True,
+        datetime_unit="M",
+    ):
+        DoubleMLBaseData.__init__(self, data)
+
+        # we need to set id_col (needs _data) before call to the super __init__ because of the x_cols setter
+        self.id_col = id_col
+        self._datetime_unit = _is_valid_datetime_unit(datetime_unit)
+        self._set_id_var()
+
+        DoubleMLData.__init__(
+            self,
+            data=data,
+            y_col=y_col,
+            d_cols=d_cols,
+            x_cols=x_cols,
+            z_cols=z_cols,
+            t_col=t_col,
+            s_col=None,
+            use_other_treat_as_covariate=use_other_treat_as_covariate,
+            force_all_x_finite=force_all_x_finite,
+            force_all_d_finite=False,
+        )
+        if self.n_treat != 1:
+            raise ValueError("Only one treatment column is allowed for panel data.")
+
+        self._check_disjoint_sets_id_col()
+
+        # initialize the unique values of g and t
+        self._g_values = np.sort(np.unique(self.d))  # unique values of g
+        self._t_values = np.sort(np.unique(self.t))  # unique values of t
+
+    def __str__(self):
+        data_summary = self._data_summary_str()
+        buf = io.StringIO()
+        self.data.info(verbose=False, buf=buf)
+        df_info = buf.getvalue()
+        res = (
+            "================== DoubleMLPanelData Object ==================\n"
+            + "\n------------------ Data summary      ------------------\n"
+            + data_summary
+            + "\n------------------ DataFrame info    ------------------\n"
+            + df_info
+        )
+        return res
+
+    def _data_summary_str(self):
+        data_summary = (
+            f"Outcome variable: {self.y_col}\n"
+            f"Treatment variable(s): {self.d_cols}\n"
+            f"Covariates: {self.x_cols}\n"
+            f"Instrument variable(s): {self.z_cols}\n"
+            f"Time variable: {self.t_col}\n"
+            f"Id variable: {self.id_col}\n"
+        )
+
+        data_summary += f"No. Observations: {self.n_obs}\n"
+        return data_summary
+
+    @classmethod
+    def from_arrays(cls, x, y, d, t, identifier, z=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True):
+        # TODO: Implement initialization from arrays
+        raise NotImplementedError("from_arrays is not implemented for DoubleMLPanelData")
+
+    @property
+    def datetime_unit(self):
+        """
+        The unit of the time variable.
+        """
+        return self._datetime_unit
+
+    @property
+    def d(self):
+        """
+        Array of treatment variable;
+        Dynamic! Depends on the currently set treatment variable;
+        To get an array of all treatment variables (independent of the currently set treatment variable)
+        call ``obj.data[obj.d_cols].values``.
+        """
+        if pd.api.types.is_datetime64_any_dtype(self._d):
+            return self._d.values.astype(f"datetime64[{self.datetime_unit}]")
+        else:
+            return self._d.values
+
+    @property
+    def t(self):
+        """
+        Array of time variable.
+        """
+        if pd.api.types.is_datetime64_any_dtype(self._t):
+            return self._t.values.astype(f"datetime64[{self.datetime_unit}]")
+        else:
+            return self._t.values
+
+    @property
+    def id_col(self):
+        """
+        The id variable.
+        """
+        return self._id_col
+
+    @id_col.setter
+    def id_col(self, value):
+        reset_value = hasattr(self, "_id_col")
+        if not isinstance(value, str):
+            raise TypeError(
+                "The id variable id_col must be of str type. " f"{str(value)} of type {str(type(value))} was passed."
+            )
+        if value not in self.all_variables:
+            raise ValueError("Invalid id variable id_col. " f"{value} is no data column.")
+        self._id_col = value
+        if reset_value:
+            self._check_disjoint_sets()
+            self._set_id_var()
+
+    @property
+    def id_var(self):
+        """
+        Array of id variable.
+        """
+        return self._id_var.values
+
+    @property
+    def id_var_unique(self):
+        """
+        Unique values of id variable.
+        """
+        return self._id_var_unique
+
+    @property
+    def n_obs(self):
+        """
+        The number of observations. For panel data, the number of unique values for id_col.
+        """
+        return len(self._id_var_unique)
+
+    @property
+    def g_col(self):
+        """
+        The treatment variable indicating the time of treatment exposure.
+        """
+        return self._d_cols[0]
+
+    @DoubleMLData.d_cols.setter
+    def d_cols(self, value):
+        super(self.__class__, self.__class__).d_cols.__set__(self, value)
+        if hasattr(self, "_g_values"):
+            self._g_values = np.sort(np.unique(self.d))  # update unique values of g
+
+    @property
+    def g_values(self):
+        """
+        The unique values of the treatment variable (groups) ``d``.
+        """
+        return self._g_values
+
+    @property
+    def n_groups(self):
+        """
+        The number of groups.
+        """
+        return len(self.g_values)
+
+    @DoubleMLData.t_col.setter
+    def t_col(self, value):
+        if value is None:
+            raise TypeError("Invalid time variable t_col. Time variable required for panel data.")
+        super(self.__class__, self.__class__).t_col.__set__(self, value)
+        if hasattr(self, "_t_values"):
+            self._t_values = np.sort(np.unique(self.t))  # update unique values of t
+
+    @property
+    def t_values(self):
+        """
+        The unique values of the time variable ``t``.
+        """
+        return self._t_values
+
+    @property
+    def n_t_periods(self):
+        """
+        The number of time periods.
+        """
+        return len(self.t_values)
+
+    def _get_optional_col_sets(self):
+        base_optional_col_sets = super()._get_optional_col_sets()
+        id_col_set = {self.id_col}
+        return [id_col_set] + base_optional_col_sets
+
+    def _check_disjoint_sets(self):
+        # apply the standard checks from the DoubleMLData class
+        super(DoubleMLPanelData, self)._check_disjoint_sets()
+        self._check_disjoint_sets_id_col()
+
+    def _check_disjoint_sets_id_col(self):
+        # apply the standard checks from the DoubleMLData class
+        super(DoubleMLPanelData, self)._check_disjoint_sets()
+
+        # special checks for the additional id variable (and the time variable)
+        id_col_set = {self.id_col}
+        y_col_set = {self.y_col}
+        x_cols_set = set(self.x_cols)
+        d_cols_set = set(self.d_cols)
+
+        z_cols_set = set(self.z_cols or [])
+        t_col_set = {self.t_col}  # t_col is not None for panel data
+        # s_col not tested as not relevant for panel data
+
+        id_col_check_args = [
+            (y_col_set, "outcome variable", "``y_col``"),
+            (d_cols_set, "treatment variable", "``d_cols``"),
+            (x_cols_set, "covariate", "``x_cols``"),
+            (z_cols_set, "instrumental variable", "``z_cols``"),
+            (t_col_set, "time variable", "``t_col``"),
+        ]
+        for set1, name, argument in id_col_check_args:
+            self._check_disjoint(
+                set1=set1,
+                name1=name,
+                arg1=argument,
+                set2=id_col_set,
+                name2="identifier variable",
+                arg2="``id_col``",
+            )
+
+    def _set_id_var(self):
+        assert_all_finite(self.data.loc[:, self.id_col])
+        self._id_var = self.data.loc[:, self.id_col]
+        self._id_var_unique = np.unique(self._id_var.values)
diff --git a/doubleml/data/tests/__init__.py b/doubleml/data/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/doubleml/data/tests/conftest.py b/doubleml/data/tests/conftest.py
new file mode 100644
index 00000000..6960b58a
--- /dev/null
+++ b/doubleml/data/tests/conftest.py
@@ -0,0 +1,40 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from doubleml.datasets import make_irm_data, make_plr_turrell2018
+
+
+@pytest.fixture(scope="session", params=[(500, 10), (1000, 20), (1000, 100)])
+def generate_data1(request):
+    n_p = request.param
+    np.random.seed(1111)
+    # setting parameters
+    n = n_p[0]
+    p = n_p[1]
+    theta = 0.5
+
+    # generating data
+    data = make_plr_turrell2018(n, p, theta, return_type=pd.DataFrame)
+
+    return data
+
+
+@pytest.fixture(scope="session", params=[(500, 10), (1000, 20)])
+def generate_data_irm_w_missings(request):
+    n_p = request.param
+    np.random.seed(1111)
+    # setting parameters
+    n = n_p[0]
+    p = n_p[1]
+    theta = 0.5
+
+    # generating data
+    (x, y, d) = make_irm_data(n, p, theta, return_type="array")
+
+    # randomly set some entries to np.nan
+    ind = np.random.choice(np.arange(x.size), replace=False, size=int(x.size * 0.05))
+    x[np.unravel_index(ind, x.shape)] = np.nan
+    data = (x, y, d)
+
+    return data
diff --git a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py
new file mode 100644
index 00000000..e95dfa03
--- /dev/null
+++ b/doubleml/data/tests/test_cluster_data.py
@@ -0,0 +1,230 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from doubleml import DoubleMLClusterData
+from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018
+
+
+@pytest.mark.ci
+def test_obj_vs_from_arrays():
+    np.random.seed(3141)
+    dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
+    dml_data_from_array = DoubleMLClusterData.from_arrays(
+        dml_data.data[dml_data.x_cols],
+        dml_data.data[dml_data.y_col],
+        dml_data.data[dml_data.d_cols],
+        dml_data.data[dml_data.cluster_cols],
+        dml_data.data[dml_data.z_cols],
+    )
+    df = dml_data.data.copy()
+    df.rename(
+        columns={"cluster_var_i": "cluster_var1", "cluster_var_j": "cluster_var2", "Y": "y", "D": "d", "Z": "z"}, inplace=True
+    )
+    assert dml_data_from_array.data.equals(df)
+
+    # with a single cluster variable
+    dml_data_from_array = DoubleMLClusterData.from_arrays(
+        dml_data.data[dml_data.x_cols],
+        dml_data.data[dml_data.y_col],
+        dml_data.data[dml_data.d_cols],
+        dml_data.data[dml_data.cluster_cols[1]],
+        dml_data.data[dml_data.z_cols],
+    )
+    df = dml_data.data.copy().drop(columns="cluster_var_i")
+    df.rename(columns={"cluster_var_j": "cluster_var", "Y": "y", "D": "d", "Z": "z"}, inplace=True)
+    assert dml_data_from_array.data.equals(df)
+
+
+@pytest.mark.ci
+def test_x_cols_setter_defaults_w_cluster():
+    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "xx3", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1")
+    assert dml_data.x_cols == ["xx1", "xx2", "xx3"]
+    dml_data.x_cols = ["xx1", "xx3"]
+    assert dml_data.x_cols == ["xx1", "xx3"]
+    dml_data.x_cols = None
+    assert dml_data.x_cols == ["xx1", "xx2", "xx3"]
+
+    # with instrument
+    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "z", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="z")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # without instrument and with time
+    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # with instrument and with time
+    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # without instrument and with selection
+    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "ss", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", s_col="ss")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # with instrument and with selection
+    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "ss", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", s_col="ss")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # without instrument with time with selection
+    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "ss", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt", s_col="ss")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # with instrument with time with selection
+    df = pd.DataFrame(np.tile(np.arange(8), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "ss", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt", s_col="ss")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+
+@pytest.mark.ci
+def test_cluster_cols_setter():
+    np.random.seed(3141)
+    dml_data = make_plr_CCDDHNR2018(n_obs=100)
+    df = dml_data.data.copy().iloc[:, :10]
+    df.columns = [f"X{i + 1}" for i in np.arange(7)] + ["y", "d1", "d2"]
+    dml_data = DoubleMLClusterData(
+        df, "y", ["d1", "d2"], cluster_cols=[f"X{i + 1}" for i in [5, 6]], x_cols=[f"X{i + 1}" for i in np.arange(5)]
+    )
+
+    cluster_vars = df[["X6", "X7"]].values
+    assert np.array_equal(dml_data.cluster_vars, cluster_vars)
+    assert dml_data.n_cluster_vars == 2
+
+    # check that after changing cluster_cols, the cluster_vars array gets updated
+    cluster_vars = df[["X7", "X6"]].values
+    dml_data.cluster_cols = ["X7", "X6"]
+    assert np.array_equal(dml_data.cluster_vars, cluster_vars)
+
+    msg = r"Invalid cluster variable\(s\) cluster_cols. At least one cluster variable is no data column."
+    with pytest.raises(ValueError, match=msg):
+        dml_data.cluster_cols = ["X6", "X13"]
+    with pytest.raises(ValueError, match=msg):
+        dml_data.cluster_cols = "X13"
+
+    msg = r"The cluster variable\(s\) cluster_cols must be of str or list type. " "5 of type <class 'int'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_data.cluster_cols = 5
+
+    # check single cluster variable
+    cluster_vars = df[["X7"]].values
+    dml_data.cluster_cols = "X7"
+    assert np.array_equal(dml_data.cluster_vars, cluster_vars)
+    assert dml_data.n_cluster_vars == 1
+
+
+@pytest.mark.ci
+def test_disjoint_sets():
+    np.random.seed(3141)
+    df = pd.DataFrame(np.tile(np.arange(6), (4, 1)), columns=["yy", "dd1", "xx1", "xx2", "zz", "tt"])
+
+    # cluster data
+    msg = (
+        r"At least one variable/column is set as outcome variable \(``y_col``\) "
+        r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="yy")
+    msg = (
+        r"At least one variable/column is set as treatment variable \(``d_cols``\) "
+        r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="dd1")
+    msg = (
+        r"At least one variable/column is set as covariate \(``x_cols``\) " r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="xx2")
+
+    msg = (
+        r"At least one variable/column is set as instrumental variable \(``z_cols``\) "
+        r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], z_cols=["xx2"], cluster_cols="xx2")
+
+    msg = (
+        r"At least one variable/column is set as time variable \(``t_col``\) "
+        r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], t_col="xx2", cluster_cols="xx2")
+
+    msg = (
+        r"At least one variable/column is set as score or selection variable \(``s_col``\) "
+        r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], s_col="xx2", cluster_cols="xx2")
+
+
+@pytest.mark.ci
+def test_duplicates():
+    np.random.seed(3141)
+    dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
+
+    msg = r"Invalid cluster variable\(s\) cluster_cols: Contains duplicate values."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(dml_cluster_data.data, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2", "X3"])
+    with pytest.raises(ValueError, match=msg):
+        dml_cluster_data.cluster_cols = ["X3", "X2", "X3"]
+
+    msg = "Invalid pd.DataFrame: Contains duplicate column names."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(
+            pd.DataFrame(np.zeros((100, 5)), columns=["y", "d", "X3", "X2", "y"]), y_col="y", d_cols=["d"], cluster_cols=["X2"]
+        )
+
+
+@pytest.mark.ci
+def test_dml_datatype():
+    data_array = np.zeros((100, 10))
+    with pytest.raises(TypeError):
+        _ = DoubleMLClusterData(data_array, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2"])
+
+
+@pytest.mark.ci
+def test_cluster_data_str():
+    np.random.seed(3141)
+    dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
+
+    # Convert the object to string
+    dml_str = str(dml_data)
+
+    # Check that all important sections are present in the string
+    assert "================== DoubleMLClusterData Object ==================" in dml_str
+    assert "------------------ Data summary      ------------------" in dml_str
+    assert "------------------ DataFrame info    ------------------" in dml_str
+
+    # Check that specific data attributes are correctly included
+    assert "Outcome variable: Y" in dml_str
+    assert "Treatment variable(s): ['D']" in dml_str
+    assert "Cluster variable(s): ['cluster_var_i', 'cluster_var_j']" in dml_str
+    assert "Covariates: " in dml_str
+    assert "Instrument variable(s): ['Z']" in dml_str
+    assert "No. Observations:" in dml_str
+
+    # Test with additional optional attributes
+    df = dml_data.data.copy()
+    df["time_var"] = 1
+    df["score_var"] = 0.5
+
+    dml_data_with_optional = DoubleMLClusterData(
+        data=df,
+        y_col="Y",
+        d_cols="D",
+        cluster_cols=["cluster_var_i", "cluster_var_j"],
+        z_cols="Z",
+        t_col="time_var",
+        s_col="score_var",
+    )
+
+    dml_str_optional = str(dml_data_with_optional)
+    assert "Time variable: time_var" in dml_str_optional
+    assert "Score/Selection variable: score_var" in dml_str_optional
diff --git a/doubleml/tests/test_dml_data.py b/doubleml/data/tests/test_dml_data.py
similarity index 73%
rename from doubleml/tests/test_dml_data.py
rename to doubleml/data/tests/test_dml_data.py
index d89e802a..7cf394b5 100644
--- a/doubleml/tests/test_dml_data.py
+++ b/doubleml/data/tests/test_dml_data.py
@@ -3,16 +3,15 @@
 import pytest
 from sklearn.linear_model import Lasso, LogisticRegression

-from doubleml import DoubleMLClusterData, DoubleMLData, DoubleMLDIDCS, DoubleMLPLR, DoubleMLSSM
+from doubleml import DoubleMLData, DoubleMLDIDCS, DoubleMLPLR, DoubleMLSSM
+from doubleml.data.base_data import DoubleMLBaseData
 from doubleml.datasets import (
     _make_pliv_data,
-    make_did_SZ2020,
     make_pliv_CHS2015,
-    make_pliv_multiway_cluster_CKMS2021,
     make_plr_CCDDHNR2018,
     make_ssm_data,
 )
-from doubleml.double_ml_data import DoubleMLBaseData
+from doubleml.did.datasets import make_did_SZ2020


 class DummyDataClass(DoubleMLBaseData):
@@ -123,32 +122,6 @@ def test_obj_vs_from_arrays():
     )
     assert np.array_equal(dml_data_from_array.data, dml_data.data)

-    dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
-    dml_data_from_array = DoubleMLClusterData.from_arrays(
-        dml_data.data[dml_data.x_cols],
-        dml_data.data[dml_data.y_col],
-        dml_data.data[dml_data.d_cols],
-        dml_data.data[dml_data.cluster_cols],
-        dml_data.data[dml_data.z_cols],
-    )
-    df = dml_data.data.copy()
-    df.rename(
-        columns={"cluster_var_i": "cluster_var1", "cluster_var_j": "cluster_var2", "Y": "y", "D": "d", "Z": "z"}, inplace=True
-    )
-    assert dml_data_from_array.data.equals(df)
-
-    # with a single cluster variable
-    dml_data_from_array = DoubleMLClusterData.from_arrays(
-        dml_data.data[dml_data.x_cols],
-        dml_data.data[dml_data.y_col],
-        dml_data.data[dml_data.d_cols],
-        dml_data.data[dml_data.cluster_cols[1]],
-        dml_data.data[dml_data.z_cols],
-    )
-    df = dml_data.data.copy().drop(columns="cluster_var_i")
-    df.rename(columns={"cluster_var_j": "cluster_var", "Y": "y", "D": "d", "Z": "z"}, inplace=True)
-    assert dml_data_from_array.data.equals(df)
-

 @pytest.mark.ci
 def test_add_vars_in_df():
@@ -249,52 +222,6 @@ def test_x_cols_setter_defaults():
     assert dml_data.x_cols == ["xx1", "xx2"]


-@pytest.mark.ci
-def test_x_cols_setter_defaults_w_cluster():
-    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "xx3", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1")
-    assert dml_data.x_cols == ["xx1", "xx2", "xx3"]
-    dml_data.x_cols = ["xx1", "xx3"]
-    assert dml_data.x_cols == ["xx1", "xx3"]
-    dml_data.x_cols = None
-    assert dml_data.x_cols == ["xx1", "xx2", "xx3"]
-
-    # with instrument
-    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "z", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="z")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # without instrument and with time
-    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # with instrument and with time
-    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # without instrument and with selection
-    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "ss", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", s_col="ss")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # with instrument and with selection
-    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "ss", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", s_col="ss")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # without instrument with time with selection
-    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "ss", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt", s_col="ss")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # with instrument with time with selection
-    df = pd.DataFrame(np.tile(np.arange(8), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "ss", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt", s_col="ss")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-
 @pytest.mark.ci
 def test_x_cols_setter():
     np.random.seed(3141)
@@ -442,42 +369,6 @@ def test_s_col_setter():
     assert dml_data.s is None


-@pytest.mark.ci
-def test_cluster_cols_setter():
-    np.random.seed(3141)
-    dml_data = make_plr_CCDDHNR2018(n_obs=100)
-    df = dml_data.data.copy().iloc[:, :10]
- df.columns = [f"X{i + 1}" for i in np.arange(7)] + ["y", "d1", "d2"] - dml_data = DoubleMLClusterData( - df, "y", ["d1", "d2"], cluster_cols=[f"X{i + 1}" for i in [5, 6]], x_cols=[f"X{i + 1}" for i in np.arange(5)] - ) - - cluster_vars = df[["X6", "X7"]].values - assert np.array_equal(dml_data.cluster_vars, cluster_vars) - assert dml_data.n_cluster_vars == 2 - - # check that after changing cluster_cols, the cluster_vars array gets updated - cluster_vars = df[["X7", "X6"]].values - dml_data.cluster_cols = ["X7", "X6"] - assert np.array_equal(dml_data.cluster_vars, cluster_vars) - - msg = r"Invalid cluster variable\(s\) cluster_cols. At least one cluster variable is no data column." - with pytest.raises(ValueError, match=msg): - dml_data.cluster_cols = ["X6", "X13"] - with pytest.raises(ValueError, match=msg): - dml_data.cluster_cols = "X13" - - msg = r"The cluster variable\(s\) cluster_cols must be of str or list type. " "5 of type was passed." - with pytest.raises(TypeError, match=msg): - dml_data.cluster_cols = 5 - - # check single cluster variable - cluster_vars = df[["X7"]].values - dml_data.cluster_cols = "X7" - assert np.array_equal(dml_data.cluster_vars, cluster_vars) - assert dml_data.n_cluster_vars == 1 - - @pytest.mark.ci def test_y_col_setter(): np.random.seed(3141) @@ -556,79 +447,62 @@ def test_disjoint_sets(): msg = "yy cannot be set as outcome variable ``y_col`` and covariate in ``x_cols``" with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "yy", "xx2"]) - msg = "yy cannot be set as outcome variable ``y_col`` and instrumental variable in ``z_cols``" + + # instrumental variable + msg = r"At least one variable/column is set as outcome variable \(``y_col``\) and instrumental variable \(``z_cols``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="yy") - msg = ( - r"At least one variable/column is set as treatment variable \(``d_cols``\) and instrumental variable in " "``z_cols``." - ) + msg = r"At least one variable/column is set as treatment variable \(``d_cols``\) and instrumental variable \(``z_cols``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols=["dd1"]) - msg = r"At least one variable/column is set as covariate \(``x_cols``\) and instrumental variable in " "``z_cols``." + msg = r"At least one variable/column is set as covariate \(``x_cols``\) and instrumental variable \(``z_cols``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="xx2") - msg = "xx2 cannot be set as time variable ``t_col`` and covariate in ``x_cols``." + # time variable + msg = r"At least one variable/column is set as outcome variable \(``y_col``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="xx2") - msg = "dd1 cannot be set as time variable ``t_col`` and treatment variable in ``d_cols``." + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="yy") + msg = r"At least one variable/column is set as treatment variable \(``d_cols``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="dd1") - msg = "yy cannot be set as time variable ``t_col`` and outcome variable ``y_col``." 
+ msg = r"At least one variable/column is set as covariate \(``x_cols``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="yy") - msg = "zz cannot be set as time variable ``t_col`` and instrumental variable in ``z_cols``." + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="xx2") + msg = r"At least one variable/column is set as instrumental variable \(``z_cols``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", t_col="zz") - msg = "xx2 cannot be set as score or selection variable ``s_col`` and covariate in ``x_cols``." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="xx2") - msg = "dd1 cannot be set as score or selection variable ``s_col`` and treatment variable in ``d_cols``." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="dd1") - msg = "yy cannot be set as score or selection variable ``s_col`` and outcome variable ``y_col``." + # score or selection variable + msg = ( + r"At least one variable/column is set as outcome variable \(``y_col``\) and score or selection variable \(``s_col``\)." + ) with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="yy") - msg = "zz cannot be set as score or selection variable ``s_col`` and instrumental variable in ``z_cols``." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", s_col="zz") - msg = "tt cannot be set as score or selection variable ``s_col`` and time variable ``t_col``." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", s_col="tt") - - # cluster data - msg = "yy cannot be set as outcome variable ``y_col`` and cluster variable in ``cluster_cols``" - with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="yy") msg = ( - r"At least one variable/column is set as treatment variable \(``d_cols``\) and cluster variable in " - "``cluster_cols``." + r"At least one variable/column is set as treatment variable \(``d_cols``\) " + r"and score or selection variable \(``s_col``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="dd1") - msg = r"At least one variable/column is set as covariate \(``x_cols``\) and cluster variable in " "``cluster_cols``." + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="dd1") + msg = r"At least one variable/column is set as covariate \(``x_cols``\) and score or selection variable \(``s_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="xx2") + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="xx2") msg = ( - r"At least one variable/column is set as instrumental variable \(``z_cols``\) and cluster variable in " - "``cluster_cols``." + r"At least one variable/column is set as instrumental variable \(``z_cols``\) " + r"and score or selection variable \(``s_col``\)." 
) with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], z_cols=["xx2"], cluster_cols="xx2") - msg = "xx2 cannot be set as time variable ``t_col`` and cluster variable in ``cluster_cols``." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], t_col="xx2", cluster_cols="xx2") - msg = "xx2 cannot be set as score or selection variable ``s_col`` and cluster variable in ``cluster_cols``." + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", s_col="zz") + msg = r"At least one variable/column is set as time variable \(``t_col``\) and score or selection variable \(``s_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], s_col="xx2", cluster_cols="xx2") + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", s_col="tt") @pytest.mark.ci def test_duplicates(): np.random.seed(3141) dml_data = make_plr_CCDDHNR2018(n_obs=100) - dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) msg = r"Invalid treatment variable\(s\) d_cols: Contains duplicate values." with pytest.raises(ValueError, match=msg): @@ -648,21 +522,11 @@ def test_duplicates(): with pytest.raises(ValueError, match=msg): dml_data.z_cols = ["X15", "X12", "X12", "X15"] - msg = r"Invalid cluster variable\(s\) cluster_cols: Contains duplicate values." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(dml_cluster_data.data, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2", "X3"]) - with pytest.raises(ValueError, match=msg): - dml_cluster_data.cluster_cols = ["X3", "X2", "X3"] - msg = "Invalid pd.DataFrame: Contains duplicate column names." with pytest.raises(ValueError, match=msg): _ = DoubleMLData( pd.DataFrame(np.zeros((100, 5)), columns=["y", "d", "X3", "X2", "y"]), y_col="y", d_cols=["d"], x_cols=["X3", "X2"] ) - with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData( - pd.DataFrame(np.zeros((100, 5)), columns=["y", "d", "X3", "X2", "y"]), y_col="y", d_cols=["d"], cluster_cols=["X2"] - ) @pytest.mark.ci @@ -672,8 +536,6 @@ def test_dml_datatype(): # f'{str(data_array)} of type {str(type(data_array))} was passed.') with pytest.raises(TypeError): _ = DoubleMLData(data_array, y_col="y", d_cols=["d"], x_cols=["X3", "X2"]) - with pytest.raises(TypeError): - _ = DoubleMLClusterData(data_array, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2"]) @pytest.mark.ci @@ -724,3 +586,57 @@ def test_dml_data_w_missings(generate_data_irm_w_missings): assert dml_data.force_all_x_finite is False dml_data.force_all_x_finite = "allow-nan" assert dml_data.force_all_x_finite == "allow-nan" + + +def test_dml_data_w_missing_d(generate_data1): + data = generate_data1 + np.random.seed(3141) + x_cols = data.columns[data.columns.str.startswith("X")].tolist() + + pd_args = { + "data": data, + "y_col": "y", + "d_cols": ["d"], + "x_cols": x_cols, + } + dml_data = DoubleMLData(force_all_d_finite=True, **pd_args) + + data["d"] = np.nan + np_args = { + "x": data.loc[:, x_cols].values, + "y": data["y"].values, + "d": data["d"].values, + } + msg = r"Input contains NaN." 
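+ # the message above is sklearn's standard validation error; the finiteness check
+ # is presumably delegated to sklearn's check_array in the data backend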
+ with pytest.raises(ValueError, match=msg):
+ dml_data2 = DoubleMLData(force_all_d_finite=False, **pd_args)
+ dml_data2.force_all_d_finite = True
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLData.from_arrays(force_all_d_finite=True, **np_args)
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLData(force_all_d_finite=True, **pd_args)
+
+ data["d"] = np.inf
+ np_args = {
+ "x": data.loc[:, x_cols].values,
+ "y": data["y"].values,
+ "d": data["d"].values,
+ }
+ msg = r"Input contains infinity or a value too large for dtype\('float64'\)."
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLData.from_arrays(force_all_d_finite=True, **np_args)
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLData(force_all_d_finite=True, **pd_args)
+
+ msg = "Invalid force_all_d_finite. force_all_d_finite must be True, False or 'allow-nan'."
+ with pytest.raises(TypeError, match=msg):
+ _ = DoubleMLData(force_all_d_finite=1, **pd_args)
+ with pytest.raises(TypeError, match=msg):
+ _ = DoubleMLData.from_arrays(force_all_d_finite=1, **np_args)
+
+ data["d"] = 1.0
+ assert dml_data.force_all_d_finite is True
+ dml_data.force_all_d_finite = False
+ assert dml_data.force_all_d_finite is False
+ dml_data.force_all_d_finite = "allow-nan"
+ assert dml_data.force_all_d_finite == "allow-nan"
diff --git a/doubleml/data/tests/test_panel_data.py b/doubleml/data/tests/test_panel_data.py
new file mode 100644
index 00000000..2f2250ba
--- /dev/null
+++ b/doubleml/data/tests/test_panel_data.py
@@ -0,0 +1,177 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from doubleml.data import DoubleMLPanelData
+from doubleml.did.datasets import make_did_SZ2020
+
+
+@pytest.mark.ci
+def test_dml_datatype():
+ data_array = np.zeros((100, 10))
+ with pytest.raises(TypeError):
+ _ = DoubleMLPanelData(data_array, y_col="y", d_cols=["d"], t_col="t", id_col="id")
+
+
+@pytest.mark.ci
+def test_t_col_setter():
+ np.random.seed(3141)
+ df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data
+ df["t_new"] = 1.0
+ dml_data = DoubleMLPanelData(
+ data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)]
+ )
+
+ # check that after changing t_col, the t array gets updated
+ t_comp = dml_data.data["t_new"].values
+ dml_data.t_col = "t_new"
+ assert np.array_equal(dml_data.t, t_comp)
+ assert dml_data._t_values == np.unique(t_comp)
+ assert dml_data.n_t_periods == 1
+
+ msg = "Invalid time variable t_col. a13 is no data column."
+ with pytest.raises(ValueError, match=msg):
+ dml_data.t_col = "a13"
+
+ msg = r"The time variable t_col must be of str type \(or None\). " "5 of type <class 'int'> was passed."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.t_col = 5
+
+ msg = "Invalid time variable t_col. Time variable required for panel data."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.t_col = None
+
+
+@pytest.mark.ci
+def test_id_col_setter():
+ np.random.seed(3141)
+ df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data
+ df["id_new"] = 1.0
+ dml_data = DoubleMLPanelData(
+ data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)]
+ )
+
+ # check that after changing id_col, the id array etc. gets updated
+ id_comp = dml_data.data["id_new"].values
+ dml_data.id_col = "id_new"
+ assert np.array_equal(dml_data.id_var, id_comp)
+ assert dml_data._id_var_unique == np.unique(id_comp)
+ assert dml_data.n_obs == 1
+
+ msg = "Invalid id variable id_col. a13 is no data column."
+ with pytest.raises(ValueError, match=msg):
+ dml_data.id_col = "a13"
+
+ msg = "The id variable id_col must be of str type. " "5 of type <class 'int'> was passed."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.id_col = 5
+
+ msg = "The id variable id_col must be of str type. None of type <class 'NoneType'> was passed."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.id_col = None
+
+
+@pytest.mark.ci
+def test_d_col_setter():
+ np.random.seed(3141)
+ df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data
+ df["d_new"] = 1.0
+ dml_data = DoubleMLPanelData(
+ data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)]
+ )
+
+ # check that after changing d_cols, the d array etc. gets updated
+ d_comp = dml_data.data["d_new"].values
+ dml_data.d_cols = "d_new"
+ assert dml_data.d_cols == ["d_new"]
+ assert np.array_equal(dml_data.d, d_comp)
+ assert dml_data.g_col == "d_new"
+ assert dml_data._g_values == np.unique(d_comp)
+ assert dml_data.n_groups == 1
+
+ msg = r"Invalid treatment variable\(s\) d_cols. At least one treatment variable is no data column."
+ with pytest.raises(ValueError, match=msg):
+ dml_data.d_cols = "a13"
+
+ msg = r"The treatment variable\(s\) d_cols must be of str or list type. 5 of type <class 'int'> was passed."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.d_cols = 5
+
+ msg = r"The treatment variable\(s\) d_cols must be of str or list type. None of type <class 'NoneType'> was passed."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.d_cols = None
+
+
+@pytest.mark.ci
+def test_disjoint_sets():
+ np.random.seed(3141)
+ df = pd.DataFrame(np.tile(np.arange(7), (4, 1)), columns=["yy", "dd1", "xx1", "xx2", "zz", "tt", "id"])
+
+ msg = r"At least one variable/column is set as outcome variable \(``y_col``\) " r"and identifier variable \(``id_col``\)."
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLPanelData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", id_col="yy")
+
+ msg = (
+ r"At least one variable/column is set as treatment variable \(``d_cols``\) " r"and identifier variable \(``id_col``\)."
+ )
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLPanelData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", id_col="dd1")
+
+ msg = r"At least one variable/column is set as covariate \(``x_cols``\) " r"and identifier variable \(``id_col``\)."
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLPanelData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", id_col="xx1")
+
+ msg = r"At least one variable/column is set as time variable \(``t_col``\) " r"and identifier variable \(``id_col``\)."
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLPanelData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", id_col="tt")
+
+ msg = (
+ r"At least one variable/column is set as instrumental variable \(``z_cols``\) "
+ r"and identifier variable \(``id_col``\)."
+ ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLPanelData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", z_cols=["zz"], id_col="zz") + + +@pytest.mark.ci +def test_panel_data_str(): + np.random.seed(3141) + df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data + dml_data = DoubleMLPanelData( + data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)] + ) + + # Convert the object to string + dml_str = str(dml_data) + + # Check that all important sections are present in the string + assert "================== DoubleMLPanelData Object ==================" in dml_str + assert "------------------ Data summary ------------------" in dml_str + assert "------------------ DataFrame info ------------------" in dml_str + + # Check that specific data attributes are correctly included + assert "Outcome variable: y" in dml_str + assert "Treatment variable(s): ['d']" in dml_str + assert "Covariates: ['Z1', 'Z2', 'Z3', 'Z4']" in dml_str + assert "Instrument variable(s): None" in dml_str + assert "Time variable: t" in dml_str + assert "Id variable: id" in dml_str + assert "No. Observations:" in dml_str + + +@pytest.mark.ci +def test_panel_data_properties(): + np.random.seed(3141) + df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data + dml_data = DoubleMLPanelData( + data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)] + ) + + assert np.array_equal(dml_data.id_var, df["id"].values) + assert np.array_equal(dml_data.id_var_unique, np.unique(df["id"].values)) + assert dml_data.n_obs == len(np.unique(df["id"].values)) + assert dml_data.g_col == "d" + assert np.array_equal(dml_data.g_values, np.sort(np.unique(df["d"].values))) + assert dml_data.n_groups == len(np.unique(df["d"].values)) + assert np.array_equal(dml_data.t_values, np.sort(np.unique(df["t"].values))) + assert dml_data.n_t_periods == len(np.unique(df["t"].values)) diff --git a/doubleml/data/tests/test_panel_data_exceptions.py b/doubleml/data/tests/test_panel_data_exceptions.py new file mode 100644 index 00000000..fab648fe --- /dev/null +++ b/doubleml/data/tests/test_panel_data_exceptions.py @@ -0,0 +1,113 @@ +import numpy as np +import pandas as pd +import pytest + +from doubleml.data import DoubleMLPanelData + + +@pytest.fixture +def sample_data(): + n_ids = 3 + n_periods = 4 + + data = [] + for id_val in range(n_ids): + for t in range(n_periods): + data.append( + { + "id": f"ID_{id_val}", + "time": t, + "y": np.random.normal(), + "treatment": int(t >= 2), + "x1": np.random.normal(), + "x2": np.random.normal(), + "z": np.random.normal(), + } + ) + + return pd.DataFrame(data) + + +@pytest.mark.ci +def test_multiple_treatments_exception(sample_data): + # Test exception when more than one treatment column is provided + with pytest.raises(ValueError, match="Only one treatment column is allowed for panel data."): + # Create copy of data with an additional treatment column + data_multi = sample_data.copy() + data_multi["treatment2"] = np.random.binomial(1, 0.5, size=len(data_multi)) + DoubleMLPanelData(data=data_multi, y_col="y", d_cols=["treatment", "treatment2"], t_col="time", id_col="id") + + +@pytest.mark.ci +def test_id_col_type_exception(sample_data): + # Test exception when id_col is not a string + with pytest.raises(TypeError, match="The id variable id_col must be of str type."): + DoubleMLPanelData(data=sample_data, y_col="y", d_cols="treatment", t_col="time", id_col=123) # Should be a 
string + + +@pytest.mark.ci +def test_id_col_not_in_data(sample_data): + # Test exception when id_col doesn't exist in data + with pytest.raises(ValueError, match="Invalid id variable id_col. non_existent_id is no data column."): + DoubleMLPanelData(data=sample_data, y_col="y", d_cols="treatment", t_col="time", id_col="non_existent_id") + + +@pytest.mark.ci +def test_time_col_none_exception(sample_data): + # Test exception when t_col is None + with pytest.raises(TypeError, match="Invalid time variable t_col. Time variable required for panel data."): + DoubleMLPanelData( + data=sample_data, y_col="y", d_cols="treatment", t_col=None, id_col="id" # Should not be None for panel data + ) + + +@pytest.mark.ci +def test_overlapping_variables_exception(sample_data): + # Test exception when id_col overlaps with another variable + msg = r"At least one variable/column is set as outcome variable \(``y_col``\) and identifier variable \(``id_col``\)." + with pytest.raises(ValueError, match=msg): + DoubleMLPanelData( + data=sample_data, + y_col="id", # Using id as outcome variable + d_cols="treatment", + t_col="time", + id_col="id", # Same as y_col + ) + + # Test treatment variable overlapping + msg = r"At least one variable/column is set as treatment variable \(``d_cols``\) and identifier variable \(``id_col``\)." + with pytest.raises(ValueError, match=msg): + DoubleMLPanelData(data=sample_data, y_col="y", d_cols="id", t_col="time", id_col="id") # Using id as treatment + + # Test time variable overlapping + msg = r"At least one variable/column is set as time variable \(``t_col``\) and identifier variable \(``id_col``\)." + with pytest.raises(ValueError, match=msg): + DoubleMLPanelData(data=sample_data, y_col="y", d_cols="treatment", t_col="id", id_col="id") # Using id as time + + +@pytest.mark.ci +def test_from_arrays_not_implemented(): + # Test that from_arrays raises NotImplementedError + with pytest.raises(NotImplementedError, match="from_arrays is not implemented for DoubleMLPanelData"): + DoubleMLPanelData.from_arrays( + x=np.random.normal(size=(10, 2)), + y=np.random.normal(size=10), + d=np.random.binomial(1, 0.5, size=10), + t=np.arange(10), + identifier=np.arange(10), + ) + + +@pytest.mark.ci +def test_invalid_datetime_unit(sample_data): + with pytest.raises(ValueError, match="Invalid datetime unit."): + DoubleMLPanelData( + data=sample_data, y_col="y", d_cols="treatment", t_col="time", id_col="id", datetime_unit="invalid_unit" + ) + + +# test if no exception is raised +@pytest.mark.ci +def test_no_exception(sample_data): + DoubleMLPanelData(data=sample_data, y_col="y", d_cols="treatment", t_col="time", id_col="id") + assert True diff --git a/doubleml/data/utils/panel_data_utils.py b/doubleml/data/utils/panel_data_utils.py new file mode 100644 index 00000000..abd365eb --- /dev/null +++ b/doubleml/data/utils/panel_data_utils.py @@ -0,0 +1,8 @@ +valid_datetime_units = {"Y", "M", "D", "h", "m", "s", "ms", "us", "ns"} + + +def _is_valid_datetime_unit(unit): + if unit not in valid_datetime_units: + raise ValueError("Invalid datetime unit.") + else: + return unit diff --git a/doubleml/data/utils/tests/test_panel_data_utils.py b/doubleml/data/utils/tests/test_panel_data_utils.py new file mode 100644 index 00000000..e5201384 --- /dev/null +++ b/doubleml/data/utils/tests/test_panel_data_utils.py @@ -0,0 +1,32 @@ +import pytest + +from doubleml.data.utils.panel_data_utils import _is_valid_datetime_unit + + +@pytest.mark.ci +def test_is_valid_datetime_unit(): + # Test all valid units + for unit in ["Y", 
"M", "D", "h", "m", "s", "ms", "us", "ns"]: + assert _is_valid_datetime_unit(unit) == unit, f"Unit {unit} should be valid and return itself" + + # Test invalid units + invalid_units = ["", "minutes", "d", "H", "S", "MS", "y", "seconds", "days"] + for unit in invalid_units: + with pytest.raises(ValueError, match="Invalid datetime unit."): + _is_valid_datetime_unit(unit) + + # Test case sensitivity + assert _is_valid_datetime_unit("m") == "m" # minute is valid + assert _is_valid_datetime_unit("M") == "M" # month is valid + + with pytest.raises(ValueError, match="Invalid datetime unit."): + _is_valid_datetime_unit("d") # lowercase day is invalid + + assert _is_valid_datetime_unit("D") == "D" # uppercase day is valid + + # Test edge cases + with pytest.raises(ValueError, match="Invalid datetime unit."): + _is_valid_datetime_unit(None) + + with pytest.raises(ValueError, match="Invalid datetime unit."): + _is_valid_datetime_unit(123) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index a73d8216..0dcd33c7 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -7,12 +7,13 @@ from sklearn.datasets import make_spd_matrix from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures -from .double_ml_data import DoubleMLClusterData, DoubleMLData +from doubleml.data import DoubleMLClusterData, DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_cluster_data_alias, _get_dml_data_alias -_array_alias = ["array", "np.ndarray", "np.array", np.ndarray] -_data_frame_alias = ["DataFrame", "pd.DataFrame", pd.DataFrame] -_dml_data_alias = ["DoubleMLData", DoubleMLData] -_dml_cluster_data_alias = ["DoubleMLClusterData", DoubleMLClusterData] +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() +_dml_cluster_data_alias = _get_dml_cluster_data_alias() def fetch_401K(return_type="DoubleMLData", polynomial_features=False): @@ -856,197 +857,6 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return raise ValueError("Invalid return_type.") -def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type="DoubleMLData", **kwargs): - """ - Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020). - The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let - - .. math:: - - f_{reg}(W) &= 210 + 27.4 \\cdot W_1 +13.7 \\cdot (W_2 + W_3 + W_4), - - f_{ps}(W) &= 0.75 \\cdot (-W_1 + 0.5 \\cdot W_2 -0.25 \\cdot W_3 - 0.1 \\cdot W_4). - - - Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, - :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. - At first define - - .. 
math:: - - Y_0(0) &= f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_0, - - Y_1(d) &= 2 \\cdot f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_1(d), - - p(W_{ps}) &= \\frac{\\exp(f_{ps}(W_{ps}))}{1 + \\exp(f_{ps}(W_{ps}))}, - - D &= 1\\{p(W_{ps}) \\ge U\\}, - - where :math:`\\varepsilon_0, \\varepsilon_1(d), d=0, 1` are independent standard normal random variables, - :math:`U \\sim \\mathcal{U}[0, 1]` is a independent standard uniform - and :math:`\\nu(W_{reg}, D)\\sim \\mathcal{N}(D \\cdot f_{reg}(W_{reg}),1)`. - The different data generating processes are defined via - - .. math:: - - DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z - - DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X - - DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z - - DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X - - DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0 - - DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0, - - such that the last two settings correspond to an experimental setting with treatment probability - of :math:`P(D=1) = \\frac{1}{2}.` - For the panel data the outcome is already defined as the difference :math:`Y = Y_1(D) - Y_0(0)`. - For cross-sectional data the flag ``cross_sectional_data`` has to be set to ``True``. - Then the outcome will be defined to be - - .. math:: - - Y = T \\cdot Y_1(D) + (1-T) \\cdot Y_0(0), - - where :math:`T = 1\\{U_T\\le \\lambda_T \\}` with :math:`U_T\\sim \\mathcal{U}[0, 1]` and :math:`\\lambda_T=0.5`. - The true average treatment effect on the treated is zero for all data generating processes. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dgp_type : - The DGP to be used. Default value is ``1`` (integer). - cross_sectional_data : - Indicates whether the setting is uses cross-sectional or panel data. Default value is ``False``. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)`` - or ``(x, y, d, t)``. - **kwargs - Additional keyword arguments to set non-default values for the parameter - :math:`xi=0.75`, :math:`c=0.0` and :math:`\\lambda_T=0.5`. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. 
- """ - xi = kwargs.get("xi", 0.75) - c = kwargs.get("c", 0.0) - lambda_t = kwargs.get("lambda_t", 0.5) - - def f_reg(w): - res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) - return res - - def f_ps(w, xi): - res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - dim_x = 4 - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4)) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - epsilon_0 = np.random.normal(loc=0, scale=1, size=n_obs) - epsilon_1 = np.random.normal(loc=0, scale=1, size=[n_obs, 2]) - - if dgp_type == 1: - features_ps = z - features_reg = z - elif dgp_type == 2: - features_ps = x - features_reg = z - elif dgp_type == 3: - features_ps = z - features_reg = x - elif dgp_type == 4: - features_ps = x - features_reg = x - elif dgp_type == 5: - features_ps = None - features_reg = z - elif dgp_type == 6: - features_ps = None - features_reg = x - else: - raise ValueError("The dgp_type is not valid.") - - # treatment and propensities - is_experimental = (dgp_type == 5) or (dgp_type == 6) - if is_experimental: - # Set D to be experimental - p = 0.5 * np.ones(n_obs) - else: - p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) - u = np.random.uniform(low=0, high=1, size=n_obs) - d = 1.0 * (p >= u) - - # potential outcomes - nu = np.random.normal(loc=d * f_reg(features_reg), scale=1, size=n_obs) - y0 = f_reg(features_reg) + nu + epsilon_0 - y1_d0 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 0] - y1_d1 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 1] - y1 = d * y1_d1 + (1 - d) * y1_d0 - - if not cross_sectional_data: - y = y1 - y0 - - if return_type in _array_alias: - return z, y, d, None - elif return_type in _data_frame_alias + _dml_data_alias: - z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((z, y, d)), columns=z_cols + ["y", "d"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", z_cols) - else: - raise ValueError("Invalid return_type.") - - else: - u_t = np.random.uniform(low=0, high=1, size=n_obs) - t = 1.0 * (u_t <= lambda_t) - y = t * y1 + (1 - t) * y0 - - if return_type in _array_alias: - return z, y, d, t - elif return_type in _data_frame_alias + _dml_data_alias: - z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((z, y, d, t)), columns=z_cols + ["y", "d", "t"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", z_cols, t_col="t") - else: - raise ValueError("Invalid return_type.") - - def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs): """ Generates counfounded data from an interactive regression model. 
diff --git a/doubleml/did/__init__.py b/doubleml/did/__init__.py
index 594fa680..354ffaa5 100644
--- a/doubleml/did/__init__.py
+++ b/doubleml/did/__init__.py
@@ -3,9 +3,15 @@
 """
 from .did import DoubleMLDID
+from .did_aggregation import DoubleMLDIDAggregation
+from .did_binary import DoubleMLDIDBinary
 from .did_cs import DoubleMLDIDCS
+from .did_multi import DoubleMLDIDMulti
 
 __all__ = [
+ "DoubleMLDIDAggregation",
 "DoubleMLDID",
 "DoubleMLDIDCS",
+ "DoubleMLDIDBinary",
+ "DoubleMLDIDMulti",
 ]
diff --git a/doubleml/did/datasets/__init__.py b/doubleml/did/datasets/__init__.py
new file mode 100644
index 00000000..aaa5fc0a
--- /dev/null
+++ b/doubleml/did/datasets/__init__.py
@@ -0,0 +1,11 @@
+"""
+The :mod:`doubleml.did.datasets` module implements data generating processes for difference-in-differences.
+"""
+
+from .dgp_did_CS2021 import make_did_CS2021
+from .dgp_did_SZ2020 import make_did_SZ2020
+
+__all__ = [
+ "make_did_SZ2020",
+ "make_did_CS2021",
+]
diff --git a/doubleml/did/datasets/dgp_did_CS2021.py b/doubleml/did/datasets/dgp_did_CS2021.py
new file mode 100644
index 00000000..50336cdb
--- /dev/null
+++ b/doubleml/did/datasets/dgp_did_CS2021.py
@@ -0,0 +1,301 @@
+import numpy as np
+import pandas as pd
+
+from .dgp_did_SZ2020 import _generate_features, _select_features
+
+# Based on https://doi.org/10.1016/j.jeconom.2020.12.001 (see Appendix SC)
+# and https://d2cml-ai.github.io/csdid/examples/csdid_basic.html#Examples-with-simulated-data
+
+
+def _f_ps_groups(w, xi, n_groups):
+ # Create coefficient matrix: 4 features x n_groups
+ coef_vec = np.array([-1.0, 0.5, -0.25, -0.2])
+
+ # scale the coefficients by 1 - i_group/n_groups for each column
+ coef_matrix = np.array([coef_vec * (1.0 - (i_group / n_groups)) for i_group in range(n_groups)]).T
+
+ res = xi * (w @ coef_matrix)
+ return res
+
+
+def _f_reg_time(w, n_time_periods):
+ coef_vec = np.array([27.4, 13.7, 13.7, 13.7])
+
+ # scale the coefficients by the time period for each column
+ coef_matrix = np.array([coef_vec * (i_time / n_time_periods) for i_time in range(1, n_time_periods + 1)]).T
+
+ res = 210 + w @ coef_matrix
+ return res
+
+
+def make_did_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, time_type="datetime", **kwargs):
+ """
+ Generate synthetic panel data for difference-in-differences analysis based on Callaway and Sant'Anna (2021).
+
+ This function creates panel data with heterogeneous treatment effects across time periods and groups.
+ The data includes pre-treatment periods, multiple treatment groups that receive treatment at different times,
+ and optionally a never-treated group that serves as a control. The true average treatment effect on the
+ treated (ATT) has a heterogeneous structure dependent on covariates and exposure time.
+
+ The data generating process offers six variations (``dgp_type`` 1-6) that differ in how the regression features
+ and propensity score features are derived:
+
+ - DGP 1: Outcome and propensity score are linear (in Z)
+ - DGP 2: Outcome is linear, propensity score is nonlinear
+ - DGP 3: Outcome is nonlinear, propensity score is linear
+ - DGP 4: Outcome and propensity score are nonlinear
+ - DGP 5: Outcome is linear, propensity score is constant (experimental setting)
+ - DGP 6: Outcome is nonlinear, propensity score is constant (experimental setting)
+
+ Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries
+ :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix.
+ + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, + :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. + + For a feature vector :math:`W=(W_1, W_2, W_3, W_4)^T` (either X or Z based on ``dgp_type``), the core functions are: + + 1. Time-varying outcome regression function for each time period :math:`t`: + + .. math:: + + f_{reg,t}(W) = 210 + \\frac{t}{T} \\cdot (27.4 \\cdot W_1 + 13.7 \\cdot W_2 + 13.7 \\cdot W_3 + 13.7 \\cdot W_4) + + 2. Group-specific propensity function for each treatment group :math:`g`: + + .. math:: + + f_{ps,g}(W) = \\xi \\cdot \\left(1-\\frac{g}{G}\\right) \\cdot + (-W_1 + 0.5 \\cdot W_2 - 0.25 \\cdot W_3 - 0.2\\cdot W_4) + + where :math:`T` is the number of time periods, :math:`G` is the number of treatment groups, and :math:`\\xi` is a + scale parameter (default: 0.9). + + The panel data model is defined with the following components: + + 1. Time effects: :math:`\\delta_t = t` for time period :math:`t` + + 2. Individual effects: :math:`\\eta_i \\sim \\mathcal{N}(g_i, 1)` where :math:`g_i` is unit :math:`i`'s treatment group + + 3. Treatment effects: For a unit in treatment group :math:`g`, the effect in period :math:`t` is: + + .. math:: + + \\theta_{i,t,g} = \\max(t - t_g + 1, 0) + 0.1 \\cdot X_{i,1} \\cdot \\max(t - t_g + 1, 0) + + where :math:`t_g` is the first treatment period for group :math:`g`, :math:`X_{i,1}` is the first covariate for unit + :math:`i`, and :math:`\\max(t - t_g + 1, 0)` represents the exposure time (0 for pre-treatment periods). + + 4. Potential outcomes for unit :math:`i` in period :math:`t`: + + .. math:: + + Y_{i,t}(0) &= f_{reg,t}(W_{reg}) + \\delta_t + \\eta_i + \\varepsilon_{i,0,t} + + Y_{i,t}(1) &= Y_{i,t}(0) + \\theta_{i,t,g} + (\\varepsilon_{i,1,t} - \\varepsilon_{i,0,t}) + + where :math:`\\varepsilon_{i,0,t}, \\varepsilon_{i,1,t} \\sim \\mathcal{N}(0, 1)`. + + 5. Observed outcomes: + + .. math:: + + Y_{i,t} = Y_{i,t}(1) \\cdot 1\\{t \\geq t_g\\} + Y_{i,t}(0) \\cdot 1\\{t < t_g\\} + + 6. Treatment assignment: + + For non-experimental settings (DGP 1-4), the probability of being in treatment group :math:`g` is: + + .. math:: + + P(G_i = g) = \\frac{\\exp(f_{ps,g}(W_{ps}))}{\\sum_{g'} \\exp(f_{ps,g'}(W_{ps}))} + + For experimental settings (DGP 5-6), each treatment group (including never-treated) has equal probability: + + .. math:: + + P(G_i = g) = \\frac{1}{G} \\text{ for all } g + + The variables :math:`W_{reg}` and :math:`W_{ps}` are selected based on the DGP type: + + .. math:: + + DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z + + DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X + + DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z + + DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X + + DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0 + + DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0 + + where settings 5-6 correspond to experimental designs with equal probability across treatment groups. + + + Parameters + ---------- + n_obs : int, default=1000 + The number of observations to simulate. + + dgp_type : int, default=1 + The data generating process to be used (1-6). + + include_never_treated : bool, default=True + Whether to include units that are never treated. + + time_type : str, default="datetime" + Type of time variable. Either "datetime" or "float". + + **kwargs + Additional keyword arguments. 
Accepts the following parameters:
+
+ `c` (float, default=0.0):
+ Parameter for correlation structure in X.
+
+ `dim_x` (int, default=4):
+ Dimension of feature vectors.
+
+ `xi` (float, default=0.9):
+ Scale parameter for the propensity score function.
+
+ `n_periods` (int, default=5):
+ Number of time periods.
+
+ `anticipation_periods` (int, default=0):
+ Number of periods before treatment where anticipation effects occur.
+
+ `n_pre_treat_periods` (int, default=2):
+ Number of pre-treatment periods.
+
+ `start_date` (str, default="2025-01"):
+ Start date for datetime time variables.
+
+ Returns
+ -------
+ pandas.DataFrame
+ DataFrame containing the simulated panel data.
+
+ References
+ ----------
+ Callaway, B. and Sant’Anna, P. H. (2021),
+ Difference-in-Differences with multiple time periods. Journal of Econometrics, 225(2), 200-230.
+ doi:`10.1016/j.jeconom.2020.12.001 <https://doi.org/10.1016/j.jeconom.2020.12.001>`_.
+ """
+
+ c = kwargs.get("c", 0.0)
+ dim_x = kwargs.get("dim_x", 4)
+ xi = kwargs.get("xi", 0.9)
+ n_periods = kwargs.get("n_periods", 5)
+ anticipation_periods = kwargs.get("anticipation_periods", 0)
+ n_pre_treat_periods = kwargs.get("n_pre_treat_periods", 2)
+ start_date = kwargs.get("start_date", "2025-01")
+
+ if anticipation_periods > 0:
+ n_periods += anticipation_periods # increase number of periods
+
+ expected_time_types = ("datetime", "float")
+ if time_type not in expected_time_types:
+ raise ValueError(f"time_type must be one of {expected_time_types}. Got {time_type}.")
+
+ x, z = _generate_features(n_obs, c, dim_x=dim_x)
+ features_ps, features_reg = _select_features(dgp_type, x, z)
+
+ # generate possible time periods
+ if time_type == "datetime":
+ time_periods = np.array([np.datetime64(start_date) + np.timedelta64(i, "M") for i in range(n_periods)])
+ never_treated_value = np.datetime64("NaT")
+ else:
+ assert time_type == "float"
+ time_periods = np.arange(n_periods)
+ never_treated_value = np.inf
+ n_time_periods = len(time_periods)
+
+ # set treatment values for time periods greater than n_pre_treat_periods
+ treatment_values = time_periods[time_periods >= time_periods[n_pre_treat_periods]]
+ max_exposure = len(treatment_values) # exclude never treated
+ if include_never_treated:
+ treatment_values = np.append(treatment_values, never_treated_value)
+ n_treatment_groups = len(treatment_values)
+
+ # treatment assignment and propensities (shape (n_obs,))
+ is_experimental = (dgp_type == 5) or (dgp_type == 6)
+ if is_experimental:
+ # Set D to be experimental
+ p = np.ones(n_treatment_groups) / n_treatment_groups
+ d_index = np.random.choice(n_treatment_groups, size=n_obs, p=p)
+ else:
+ unnormalized_p = np.exp(_f_ps_groups(features_ps, xi, n_groups=n_treatment_groups))
+ p = unnormalized_p / unnormalized_p.sum(1, keepdims=True)
+ d_index = np.array([np.random.choice(n_treatment_groups, p=p_row) for p_row in p])
+
+ # fixed effects (shape (n_obs, n_time_periods))
+ time_effects = np.arange(n_time_periods)
+ delta_t = np.tile(time_effects, (n_obs, 1))
+ individual_effects = np.random.normal(loc=d_index, scale=1, size=(n_obs,))
+ eta_i = np.tile(individual_effects, (n_time_periods, 1)).T
+
+ # error terms (shape (n_obs, n_time_periods))
+ epsilon_0 = np.random.normal(loc=0, scale=1, size=(n_obs, n_time_periods))
+ epsilon_1 = np.random.normal(loc=0, scale=1, size=(n_obs, n_time_periods))
+
+ # regression function (shape (n_obs, n_time_periods))
+ f_reg = _f_reg_time(features_reg, n_time_periods)
+
+ # treatment effects (shape (n_obs, n_time_periods))
+ exposure_pre_period = np.zeros((n_obs,
n_pre_treat_periods)) + exposure_post_first_treatment = np.clip(np.arange(max_exposure) - d_index.reshape(-1, 1) + 1, a_min=0, a_max=None) + exposure_time = np.column_stack((exposure_pre_period, exposure_post_first_treatment)) + delta_e = exposure_time + + # add heterogeneity in treatment effects + heterogeneity_x = 0.1 * x[:, 0] + heterogeneity = heterogeneity_x.reshape(-1, 1) * exposure_time + delta_e += heterogeneity + + # potential outcomes (shape (n_obs, n_time_periods)) + y0 = f_reg + delta_t + eta_i + epsilon_0 + y1 = y0 + delta_e + (epsilon_1 - epsilon_0) + + # observed outcomes (shape (n_obs, n_time_periods)) + is_exposed = exposure_time > 0 + y = y1 * is_exposed + y0 * ~is_exposed + + # map treatment index to values + d = np.array([treatment_values[i] for i in d_index]) + d_matrix = np.tile(d, (n_time_periods, 1)).T + + # create matrices to flatten the data + id_matrix = np.tile(np.arange(n_obs), (n_time_periods, 1)).T + time_matrix = np.tile(time_periods, (n_obs, 1)) + + df = pd.DataFrame( + { + "id": id_matrix.flatten(), + "y": y.flatten(), + "y0": y0.flatten(), + "y1": y1.flatten(), + "d": d_matrix.flatten(), + "t": time_matrix.flatten(), + **{f"Z{i + 1}": z[:, i].repeat(n_time_periods) for i in range(dim_x)}, + } + ) + if anticipation_periods > 0: + # filter time periods + df = df[df["t"] >= time_periods[anticipation_periods]] + # filter treatment after anticipation periods + df = df[(df["d"] <= time_periods[-(anticipation_periods + 1)]) | pd.isna(df["d"])] + + # update time periods by subtracting time delta + if time_type == "datetime": + df = df[(df["d"] <= time_periods[-(anticipation_periods + 1)]) | pd.isna(df["d"])] + df["t"] = df["t"].apply(lambda x: x - pd.DateOffset(months=anticipation_periods)) + else: + assert time_type == "float" + df = df[(df["d"] <= time_periods[-(anticipation_periods + 1)]) | np.isinf(df["d"])] + df["t"] = df["t"] - anticipation_periods + + return df diff --git a/doubleml/did/datasets/dgp_did_SZ2020.py b/doubleml/did/datasets/dgp_did_SZ2020.py new file mode 100644 index 00000000..ccfd4a80 --- /dev/null +++ b/doubleml/did/datasets/dgp_did_SZ2020.py @@ -0,0 +1,238 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from ...data.base_data import DoubleMLData +from ...data.panel_data import DoubleMLPanelData +from ...utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def _generate_features(n_obs, c, dim_x=4): + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=n_obs) + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4)) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + return x, z + + +def _select_features(dgp_type, x, z): + if dgp_type == 1: + features_ps = z + features_reg = z + elif dgp_type == 2: + features_ps = x + features_reg = z + elif dgp_type == 3: + features_ps = z + features_reg = x + elif dgp_type == 4: + features_ps = x + features_reg = x + elif dgp_type == 5: + features_ps = None + features_reg = z + elif dgp_type == 6: + features_ps = None + features_reg = x + else: + raise ValueError("The dgp_type is not valid.") + return features_ps, features_reg 
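For orientation, a minimal sketch of how the helpers above map a DGP type to its feature sets, following the W_reg/W_ps table in the docstrings; the seed and sample size are illustrative assumptions:

    import numpy as np
    from doubleml.did.datasets.dgp_did_SZ2020 import _generate_features, _select_features

    np.random.seed(0)
    x, z = _generate_features(n_obs=5, c=0.0, dim_x=4)  # raw X and standardized nonlinear Z
    # DGP 2: the propensity score uses the raw features X, the outcome regression uses Z
    features_ps, features_reg = _select_features(2, x, z)
    assert features_ps is x and features_reg is z
    # DGP 5/6 (experimental settings): no propensity features are used
    assert _select_features(5, x, z)[0] is None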
+
+
+def _f_reg(w):
+ res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3])
+ return res
+
+
+def _f_ps(w, xi):
+ res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3])
+ return res
+
+
+def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type="DoubleMLData", **kwargs):
+ """
+ Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020).
+ The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let
+
+ .. math::
+
+ f_{reg}(W) &= 210 + 27.4 \\cdot W_1 +13.7 \\cdot (W_2 + W_3 + W_4),
+
+ f_{ps}(W) &= 0.75 \\cdot (-W_1 + 0.5 \\cdot W_2 -0.25 \\cdot W_3 - 0.1 \\cdot W_4).
+
+
+ Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries
+ :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix.
+ Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`,
+ where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`,
+ :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`.
+ At first define
+
+ .. math::
+
+ Y_0(0) &= f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_0,
+
+ Y_1(d) &= 2 \\cdot f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_1(d),
+
+ p(W_{ps}) &= \\frac{\\exp(f_{ps}(W_{ps}))}{1 + \\exp(f_{ps}(W_{ps}))},
+
+ D &= 1\\{p(W_{ps}) \\ge U\\},
+
+ where :math:`\\varepsilon_0, \\varepsilon_1(d), d=0, 1` are independent standard normal random variables,
+ :math:`U \\sim \\mathcal{U}[0, 1]` is an independent standard uniform
+ and :math:`\\nu(W_{reg}, D)\\sim \\mathcal{N}(D \\cdot f_{reg}(W_{reg}),1)`.
+ The different data generating processes are defined via
+
+ .. math::
+
+ DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z
+
+ DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X
+
+ DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z
+
+ DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X
+
+ DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0
+
+ DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0,
+
+ such that the last two settings correspond to an experimental setting with treatment probability
+ of :math:`P(D=1) = \\frac{1}{2}.`
+ For the panel data the outcome is already defined as the difference :math:`Y = Y_1(D) - Y_0(0)`.
+ For cross-sectional data the flag ``cross_sectional_data`` has to be set to ``True``.
+ Then the outcome will be defined to be
+
+ .. math::
+
+ Y = T \\cdot Y_1(D) + (1-T) \\cdot Y_0(0),
+
+ where :math:`T = 1\\{U_T\\le \\lambda_T \\}` with :math:`U_T\\sim \\mathcal{U}[0, 1]` and :math:`\\lambda_T=0.5`.
+ The true average treatment effect on the treated is zero for all data generating processes.
+
+ Parameters
+ ----------
+ n_obs :
+ The number of observations to simulate.
+ dgp_type :
+ The DGP to be used. Default value is ``1`` (integer).
+ cross_sectional_data :
+ Indicates whether the setting uses cross-sectional or panel data. Default value is ``False``.
+ return_type :
+ If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object.
+
+ If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``.
+
+ If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``
+ or ``(x, y, d, t)``.
+ **kwargs
+ Additional keyword arguments to set non-default values for the parameters
+ :math:`xi=0.75`, :math:`c=0.0` and :math:`\\lambda_T=0.5`.
+
+ References
+ ----------
+ Sant’Anna, P. H. and Zhao, J. (2020),
+ Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122.
+ doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
+ """
+ xi = kwargs.get("xi", 0.75)
+ c = kwargs.get("c", 0.0)
+ lambda_t = kwargs.get("lambda_t", 0.5)
+
+ dim_x = 4
+ x, z = _generate_features(n_obs, c, dim_x=dim_x)
+
+ # error terms
+ epsilon_0 = np.random.normal(loc=0, scale=1, size=n_obs)
+ epsilon_1 = np.random.normal(loc=0, scale=1, size=[n_obs, 2])
+
+ features_ps, features_reg = _select_features(dgp_type, x, z)
+
+ # treatment and propensities
+ is_experimental = (dgp_type == 5) or (dgp_type == 6)
+ if is_experimental:
+ # Set D to be experimental
+ p = 0.5 * np.ones(n_obs)
+ else:
+ p = np.exp(_f_ps(features_ps, xi)) / (1 + np.exp(_f_ps(features_ps, xi)))
+ u = np.random.uniform(low=0, high=1, size=n_obs)
+ d = 1.0 * (p >= u)
+
+ # potential outcomes
+ nu = np.random.normal(loc=d * _f_reg(features_reg), scale=1, size=n_obs)
+ y0 = _f_reg(features_reg) + nu + epsilon_0
+ y1_d0 = 2 * _f_reg(features_reg) + nu + epsilon_1[:, 0]
+ y1_d1 = 2 * _f_reg(features_reg) + nu + epsilon_1[:, 1]
+ y1 = d * y1_d1 + (1 - d) * y1_d0
+
+ if not cross_sectional_data:
+ y = y1 - y0
+
+ if return_type in _array_alias:
+ return z, y, d, None
+ elif return_type in _data_frame_alias + _dml_data_alias:
+ z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)]
+ data = pd.DataFrame(np.column_stack((z, y, d)), columns=z_cols + ["y", "d"])
+ if return_type in _data_frame_alias:
+ return data
+ else:
+ return DoubleMLData(data, "y", "d", z_cols)
+ elif return_type == "DoubleMLPanelData":
+ z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)]
+ df0 = (
+ pd.DataFrame(
+ {
+ "y": y0,
+ "d": d.astype(np.int32),
+ "t": np.zeros_like(y0, dtype=np.int32),
+ **{col: z[:, i] for i, col in enumerate(z_cols)},
+ }
+ )
+ .reset_index()
+ .rename(columns={"index": "id"})
+ )
+ df1 = (
+ pd.DataFrame(
+ {
+ "y": y1,
+ "d": d.astype(np.int32),
+ "t": np.ones_like(y0, dtype=np.int32),
+ **{col: z[:, i] for i, col in enumerate(z_cols)},
+ }
+ )
+ .reset_index()
+ .rename(columns={"index": "id"})
+ )
+ df = pd.concat([df0, df1], axis=0)
+
+ return DoubleMLPanelData(df, "y", "d", t_col="t", id_col="id", x_cols=z_cols)
+ else:
+ raise ValueError("Invalid return_type.")
+
+ else:
+ u_t = np.random.uniform(low=0, high=1, size=n_obs)
+ t = 1.0 * (u_t <= lambda_t)
+ y = t * y1 + (1 - t) * y0
+
+ if return_type in _array_alias:
+ return z, y, d, t
+ elif return_type in _data_frame_alias + _dml_data_alias:
+ z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)]
+ data = pd.DataFrame(np.column_stack((z, y, d, t)), columns=z_cols + ["y", "d", "t"])
+ if return_type in _data_frame_alias:
+ return data
+ else:
+ return DoubleMLData(data, "y", "d", z_cols, t_col="t")
+ else:
+ raise ValueError("Invalid return_type.")
diff --git a/doubleml/did/did.py b/doubleml/did/did.py
index e71068f2..7a671993 100644
--- a/doubleml/did/did.py
+++ b/doubleml/did/did.py
@@ -4,8 +4,8 @@
 from sklearn.utils import check_X_y
 from sklearn.utils.multiclass import type_of_target
 
+from doubleml.data.base_data import DoubleMLData
 from doubleml.double_ml import DoubleML
-from doubleml.double_ml_data import DoubleMLData
 from doubleml.double_ml_score_mixins import LinearScoreMixin
 from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score, _check_trimming
 from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls
@@ -209,7 +209,9 @@ def _nuisance_est(self, smpls,
n_jobs_cv, external_predictions, return_models=Fa # nuisance g for d==0 if external_predictions["ml_g0"] is not None: - g_hat0 = {"preds": external_predictions["ml_g0"], "targets": None, "models": None} + ml_g0_targets = np.full_like(y, np.nan, dtype="float64") + ml_g0_targets[d == 0] = y[d == 0] + g_hat0 = {"preds": external_predictions["ml_g0"], "targets": ml_g0_targets, "models": None} else: g_hat0 = _dml_cv_predict( self._learner["ml_g"], @@ -229,7 +231,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa # nuisance g for d==1 if external_predictions["ml_g1"] is not None: - g_hat1 = {"preds": external_predictions["ml_g1"], "targets": None, "models": None} + ml_g1_targets = np.full_like(y, np.nan, dtype="float64") + ml_g1_targets[d == 1] = y[d == 1] + g_hat1 = {"preds": external_predictions["ml_g1"], "targets": ml_g1_targets, "models": None} else: g_hat1 = _dml_cv_predict( self._learner["ml_g"], @@ -252,7 +256,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa if self.score == "observational": # nuisance m if external_predictions["ml_m"] is not None: - m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None} + m_hat = {"preds": external_predictions["ml_m"], "targets": d, "models": None} else: m_hat = _dml_cv_predict( self._learner["ml_m"], @@ -269,10 +273,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa m_hat["preds"] = _trimm(m_hat["preds"], self.trimming_rule, self.trimming_threshold) # nuisance estimates of the uncond. treatment prob. - p_hat = np.full_like(d, np.nan, dtype="float64") - for train_index, test_index in smpls: - p_hat[test_index] = np.mean(d[train_index]) - + p_hat = np.full_like(d, d.mean(), dtype="float64") psi_a, psi_b = self._score_elements(y, d, g_hat0["preds"], g_hat1["preds"], m_hat["preds"], p_hat) psi_elements = {"psi_a": psi_a, "psi_b": psi_b} @@ -432,3 +433,31 @@ def _nuisance_tuning( res = {"params": params, "tune_res": tune_res} return res + + def sensitivity_benchmark(self, benchmarking_set, fit_args=None): + """ + Computes a benchmark for a given set of features. + Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates. + + Parameters + ---------- + benchmarking_set : list + List of features to be used for benchmarking. + + fit_args : dict, optional + Additional arguments for the fit method. + Default is None. + + Returns + ------- + benchmark_results : pandas.DataFrame + Benchmark results. + """ + if self.score == "experimental": + warnings.warn( + "Sensitivity benchmarking for experimental score may not be meaningful. " + "Consider using score='observational' for conditional treatment assignment.", + UserWarning, + ) + + return super().sensitivity_benchmark(benchmarking_set, fit_args) diff --git a/doubleml/did/did_aggregation.py b/doubleml/did/did_aggregation.py new file mode 100644 index 00000000..0e34aa37 --- /dev/null +++ b/doubleml/did/did_aggregation.py @@ -0,0 +1,391 @@ +import warnings +from functools import reduce +from operator import add + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + +from doubleml.double_ml_framework import DoubleMLFramework, concat + + +class DoubleMLDIDAggregation: + """ + Class for aggregating multiple difference-in-differences (DID) frameworks. 
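# Editorial sketch: the did.py hunks above (i) attach evaluation targets to externally
# supplied nuisance predictions and (ii) replace the fold-wise estimate of the
# unconditional treatment probability with a single full-sample mean. In miniature:
import numpy as np

y = np.array([1.0, 2.0, 3.0, 4.0])
d = np.array([0.0, 1.0, 0.0, 1.0])
ml_g0_targets = np.full_like(y, np.nan, dtype="float64")
ml_g0_targets[d == 0] = y[d == 0]                    # targets only on the d == 0 subsample
p_hat = np.full_like(d, d.mean(), dtype="float64")   # constant vector of the treatment share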
+ + This class enables weighted aggregation of multiple DoubleMLFramework objects, allowing for + both multiple separate aggregations and an overall aggregation across them. It provides + methods for summarizing and visualizing aggregated treatment effects. + + Parameters + ---------- + frameworks : list + List of DoubleMLFramework objects to aggregate. Each framework must be one-dimensional + (n_thetas = 1). + + aggregation_weights : numpy.ndarray + 2D array of weights for aggregating frameworks. Shape should be (n_aggregations, n_frameworks), + where each row corresponds to a separate aggregation of the frameworks. + + overall_aggregation_weights : numpy.ndarray, optional + 1D array of weights for aggregating across the aggregated frameworks. Length should + equal the number of rows in aggregation_weights. If None, equal weights are used. + Default is None. + + aggregation_names : list of str, optional + Names for each aggregation. Length should equal the number of rows in aggregation_weights. + If None, default names like "Aggregation_0", "Aggregation_1", etc. are used. + Default is None. + + aggregation_method_name : str, optional + Name describing the aggregation method used. + Default is "Custom". + + additional_information : dict, optional + Dictionary containing additional information to display in the string representation. + Default is None. + + additional_parameters : dict, optional + Dictionary containing additional parameters used by the class methods. + For example, can contain 'aggregation_color_idx' for plot_effects(). + Default is None. + """ + + def __init__( + self, + frameworks, + aggregation_weights, + overall_aggregation_weights=None, + aggregation_names=None, + aggregation_method_name="Custom", + additional_information=None, + additional_parameters=None, + ): + self._base_frameworks = self._check_frameworks(frameworks) + + self._aggregation_weights, self._overall_aggregation_weights = self._check_weights( + aggregation_weights, overall_aggregation_weights + ) + self._n_aggregations = self.aggregation_weights.shape[0] + + self._aggregation_names, self._aggregation_method_name = self._check_names(aggregation_names, aggregation_method_name) + + if additional_information is not None and not isinstance(additional_information, dict): + raise TypeError("'additional_information' must be a dictionary (or None)") + self._additional_information = additional_information + if additional_parameters is not None and not isinstance(additional_parameters, dict): + raise TypeError("'additional_parameters' must be a dictionary (or None)") + self._additional_parameters = additional_parameters + + agg_frameworks = [None] * self._n_aggregations + for idx_agg in range(self._n_aggregations): + weights = self.aggregation_weights[idx_agg, :] + weighted_frameworks = [w * f for w, f in zip(weights, self.base_frameworks)] + agg_frameworks[idx_agg] = reduce(add, weighted_frameworks) + + self._aggregated_frameworks = concat(agg_frameworks) + self._aggregated_frameworks.treatment_names = self._aggregation_names + + # overall framework + overall_weighted_frameworks = [w * f for w, f in zip(self.overall_aggregation_weights, agg_frameworks)] + self._overall_aggregated_framework = reduce(add, overall_weighted_frameworks) + + def __str__(self): + class_name = self.__class__.__name__ + header = ( + f"================== {class_name} Object ==================\n" + f" {self.aggregation_method_name} Aggregation \n" + ) + overall_summary = self.overall_summary.to_string(index=False) + 
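# Editorial sketch of the aggregation performed in __init__ above: each row of
# aggregation_weights defines one weighted sum of the base frameworks, and the overall
# effect averages the aggregations. fw0, fw1, fw2 are hypothetical, already fitted,
# one-dimensional DoubleMLFramework objects.
import numpy as np
from doubleml.did.did_aggregation import DoubleMLDIDAggregation

weights = np.array([[0.5, 0.5, 0.0],   # "early" = mean of fw0 and fw1
                    [0.0, 0.0, 1.0]])  # "late"  = fw2 alone
agg = DoubleMLDIDAggregation(
    frameworks=[fw0, fw1, fw2],
    aggregation_weights=weights,
    aggregation_names=["early", "late"],
    aggregation_method_name="Group",
)
print(agg.overall_summary)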
aggregated_effects_summary = self.aggregated_summary.to_string(index=True) + + res = ( + header + + "\n------------------ Overall Aggregated Effects ------------------\n" + + overall_summary + + "\n------------------ Aggregated Effects ------------------\n" + + aggregated_effects_summary + ) + if self.additional_information is not None: + res += "\n------------------ Additional Information ------------------\n" + res += self.additional_information + + return res + + @property + def base_frameworks(self): + """Underlying frameworks""" + return self._base_frameworks + + @property + def aggregated_frameworks(self): + """Aggregated frameworks""" + return self._aggregated_frameworks + + @property + def overall_aggregated_framework(self): + """Overall aggregated framework""" + return self._overall_aggregated_framework + + @property + def aggregation_weights(self): + """Aggregation weights""" + return self._aggregation_weights + + @property + def overall_aggregation_weights(self): + """Overall aggregation weights""" + return self._overall_aggregation_weights + + @property + def n_aggregations(self): + """Number of aggregations""" + return self._n_aggregations + + @property + def aggregation_names(self): + """Aggregation names""" + return self._aggregation_names + + @property + def aggregation_method_name(self): + """Aggregation method name""" + return self._aggregation_method_name + + @property + def aggregated_summary(self): + """ + A summary for the aggregated effects. + """ + return self.aggregated_frameworks.summary + + @property + def overall_summary(self): + """ + A summary for the overall aggregated effect. + """ + return self.overall_aggregated_framework.summary + + @property + def additional_information(self): + """Additional information""" + if self._additional_information is None: + add_info = None + else: + add_info = str() + for key, value in self._additional_information.items(): + add_info += f"{key}: {value}\n" + return add_info + + @property + def additional_parameters(self): + """Additional parameters""" + return self._additional_parameters + + def plot_effects( + self, + level=0.95, + joint=True, + figsize=(12, 6), + sort_by=None, + color_palette="colorblind", + title="Aggregated Treatment Effects", + y_label="Effect", + ): + """ + Plot aggregated treatment effect estimates with confidence intervals. + + Parameters + ---------- + level : float + Confidence level for the intervals. + Default is ``0.95``. + joint : bool + Indicates whether joint confidence intervals are computed. + Default is ``True``. + figsize : tuple + Figure size as (width, height). + Default is ``(12, 6)``. + sort_by : str or None + How to sort the results - 'estimate', 'name', or None. + Default is ``None``. + color_palette : str or list + Seaborn color palette name or list of colors. + Default is ``"colorblind"``. + title : str + Title for the plot. + Default is ``"Aggregated Treatment Effects"``. + y_label : str + Label for y-axis. + Default is ``"Effect"``. + + Returns + ------- + fig : matplotlib.figure.Figure + The created figure object. + ax : matplotlib.axes.Axes + The axes object for further customization. + + Notes + ----- + If ``joint=True`` and bootstrapping hasn't been performed, this method will automatically + perform bootstrapping with default parameters and issue a warning. 
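# Editorial sketch: plotting the aggregated effects from the sketch above. With
# joint=True and no prior bootstrap, plot_effects() bootstraps automatically and warns.
fig, ax = agg.plot_effects(level=0.95, sort_by="estimate", title="Aggregated ATTs")
fig.savefig("aggregated_atts.png", dpi=150)  # optional: persist the figure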
+ """ + df = self._create_ci_dataframe(level=level, joint=joint) + + # Validate sorting column + valid_sort_options = {"estimate", "name", None} + if sort_by not in valid_sort_options: + raise ValueError(f"Invalid sort_by value. Choose from {valid_sort_options}.") + + # Sort data if requested + if sort_by == "estimate": + df = df.sort_values(by="Estimate", ascending=False) + elif sort_by == "name": + df = df.sort_values(by="Aggregation_Names", ascending=True) + + # Handle color palette + colors = sns.color_palette(color_palette) if isinstance(color_palette, str) else color_palette + selected_colors = [colors[idx] for idx in df["color_idx"]] + + # Create figure + fig, ax = plt.subplots(figsize=figsize) + + # Plot zero reference line + ax.axhline(y=0, color="black", linestyle="--", alpha=0.5, label="Zero effect") + + # Calculate asymmetric error bars + x_positions = np.arange(len(df)) + yerr = np.array([df["Estimate"] - df["CI_Lower"], df["CI_Upper"] - df["Estimate"]]) # lower error # upper error + + for i, (x, y, color) in enumerate(zip(x_positions, df["Estimate"], selected_colors)): + ax.errorbar( + x, + y, + yerr=[[yerr[0, i]], [yerr[1, i]]], + fmt="o", + capsize=4, + color=color, + ecolor=color, + markersize=8, + markeredgewidth=1.5, + linewidth=1.5, + ) + + # Set labels and title + ax.set_xticks(x_positions) + ax.set_xticklabels(df["Aggregation_Names"]) + ax.set_ylabel(y_label) + ax.set_title(title) + + ax.grid(axis="y", alpha=0.3) + plt.tight_layout() + + return fig, ax + + def _check_frameworks(self, frameworks): + msg = "The 'frameworks' must be a list of DoubleMLFramework objects" + is_list = isinstance(frameworks, list) + all_frameworks = all(isinstance(framework, DoubleMLFramework) for framework in frameworks) + if not is_list or not all_frameworks: + raise TypeError(msg) + + if not all(framework.n_thetas == 1 for framework in frameworks): + raise ValueError("All frameworks must be one-dimensional") + + return frameworks + + def _check_weights(self, aggregation_weights, overall_aggregation_weights): + + # aggregation weights + if not isinstance(aggregation_weights, np.ndarray): + raise TypeError("'aggregation_weights' must be a numpy array") + + if not aggregation_weights.ndim == 2: + raise ValueError("'aggregation_weights' must be a 2-dimensional array") + + if not aggregation_weights.shape[1] == len(self.base_frameworks): + raise ValueError("The number of rows in 'aggregation_weights' must be equal to the number of frameworks") + + n_aggregations = aggregation_weights.shape[0] + # overall aggregation weights + if overall_aggregation_weights is None: + overall_aggregation_weights = np.ones(n_aggregations) / n_aggregations + + if not isinstance(overall_aggregation_weights, np.ndarray): + raise TypeError("'overall_aggregation_weights' must be a numpy array") + if not overall_aggregation_weights.ndim == 1: + raise ValueError("'overall_aggregation_weights' must be a 1-dimensional array") + if not len(overall_aggregation_weights) == n_aggregations: + raise ValueError( + "'overall_aggregation_weights' must have the same length as the number of aggregated frameworks " + "(number of rows in 'aggregation_weights')." 
+ ) + + return aggregation_weights, overall_aggregation_weights + + def _check_names(self, aggregation_names, aggregation_method_name): + if aggregation_names is None: + aggregation_names = [f"Aggregation_{i}" for i in range(self.n_aggregations)] + + if not isinstance(aggregation_names, list): + raise TypeError("'aggregation_names' must be a list of strings") + + if not all(isinstance(name, str) for name in aggregation_names): + raise TypeError("'aggregation_names' must be a list of strings") + + if not len(aggregation_names) == self.n_aggregations: + raise ValueError("'aggregation_names' must have the same length as the number of aggregations") + + if not isinstance(aggregation_method_name, str): + raise TypeError("'aggregation_method_name' must be a string") + + return aggregation_names, aggregation_method_name + + def _create_ci_dataframe(self, level=0.95, joint=True): + """ + Create a DataFrame with coefficient estimates and confidence intervals. + + Parameters + ---------- + level : float, default=0.95 + Confidence level for intervals. + joint : bool, default=True + Whether to use joint confidence intervals. + + Returns + ------- + pandas.DataFrame + DataFrame containing: + - Aggregation names + - Coefficient estimates + - Lower and upper confidence interval bounds + - Color indices for plotting + """ + + if joint and self.aggregated_frameworks.boot_t_stat is None: + self.aggregated_frameworks.bootstrap() + warnings.warn( + "Joint confidence intervals require bootstrapping which hasn't been performed yet. " + "Automatically applying '.aggregated_frameworks.bootstrap(method=\"normal\", n_rep_boot=500)' " + "with default values. For different bootstrap settings, call bootstrap() explicitly before plotting.", + UserWarning, + ) + ci = self.aggregated_frameworks.confint(level=level, joint=joint) + + default_color_idx = [0] * self._n_aggregations + if self.additional_parameters is None: + color_idx = default_color_idx + else: + color_idx = self.additional_parameters.get("aggregation_color_idx", default_color_idx) + + df = pd.DataFrame( + { + "Aggregation_Names": self.aggregation_names, + "Estimate": self.aggregated_frameworks.thetas, + "CI_Lower": ci.iloc[:, 0], + "CI_Upper": ci.iloc[:, 1], + "color_idx": color_idx, + } + ) + return df diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py new file mode 100644 index 00000000..e4d309db --- /dev/null +++ b/doubleml/did/did_binary.py @@ -0,0 +1,752 @@ +import warnings + +import numpy as np +from sklearn.utils import check_X_y + +from doubleml.data.panel_data import DoubleMLPanelData +from doubleml.did.utils._did_utils import ( + _check_anticipation_periods, + _check_control_group, + _check_gt_combination, + _check_gt_values, + _get_id_positions, + _get_never_treated_value, + _is_never_treated, + _set_id_positions, +) +from doubleml.double_ml import DoubleML +from doubleml.double_ml_score_mixins import LinearScoreMixin +from doubleml.utils._checks import ( + _check_bool, + _check_finite_predictions, + _check_is_propensity, + _check_score, + _check_trimming, +) +from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls +from doubleml.utils._propensity_score import _trimm + + +class DoubleMLDIDBinary(LinearScoreMixin, DoubleML): + """Double machine learning for difference-in-differences models with panel data (binary setting in terms of group and time + combinations). 
+ + Parameters + ---------- + obj_dml_data : :class:`DoubleMLPanelData` object + The :class:`DoubleMLPanelData` object providing the data and specifying the variables for the causal model. + + g_value : int + The value indicating the treatment group (first period with treatment). + + t_value_pre : int + The value indicating the baseline pre-treatment period. + + t_value_eval : int + The value indicating the period for evaluation. + + ml_g : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`g_0(d,X) = E[Y_1-Y_0|D=d, X]`. + For a binary outcome variable :math:`Y` (with values 0 and 1), a classifier implementing ``fit()`` and + ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` returns ``True``, + ``predict_proba()`` is used, otherwise ``predict()``. + + ml_m : classifier implementing ``fit()`` and ``predict_proba()`` + A machine learner implementing ``fit()`` and ``predict_proba()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance function :math:`m_0(X) = E[D=1|X]`. + Only relevant for ``score='observational'``. + + control_group : str + Specifies the control group. Either ``'never_treated'`` or ``'not_yet_treated'``. + Default is ``'never_treated'``. + + anticipation_periods : int + Number of anticipation periods. Default is ``0``. + + n_folds : int + Number of folds. + Default is ``5``. + + n_rep : int + Number of repetitions for the sample splitting. + Default is ``1``. + + score : str + A str (``'observational'`` or ``'experimental'``) specifying the score function. + The ``'experimental'`` score refers to an A/B setting, where the treatment is independent + from the pretreatment covariates. + Default is ``'observational'``. + + in_sample_normalization : bool + Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020). + Default is ``True``. + + trimming_rule : str + A str (``'truncate'`` is the only choice) specifying the trimming approach. + Default is ``'truncate'``. + + trimming_threshold : float + The threshold used for trimming. + Default is ``1e-2``. + + draw_sample_splitting : bool + Indicates whether the sample splitting should be drawn during initialization of the object. + Default is ``True``. + + print_periods : bool + Indicates whether to print information about the evaluated periods. + Default is ``False``.
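# Editorial sketch: estimating a single group-time ATT cell with the class defined here.
# dml_panel_data and the (g, t_pre, t_eval) values are hypothetical and must match the
# group/time values present in the DoubleMLPanelData object.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from doubleml.did.did_binary import DoubleMLDIDBinary

dml_att = DoubleMLDIDBinary(
    obj_dml_data=dml_panel_data,
    g_value=2,
    t_value_pre=1,
    t_value_eval=2,
    ml_g=RandomForestRegressor(n_estimators=100),
    ml_m=RandomForestClassifier(n_estimators=100),
    score="observational",
)
print(dml_att.fit().summary)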
+ + """ + + def __init__( + self, + obj_dml_data, + g_value, + t_value_pre, + t_value_eval, + ml_g, + ml_m=None, + control_group="never_treated", + anticipation_periods=0, + n_folds=5, + n_rep=1, + score="observational", + in_sample_normalization=True, + trimming_rule="truncate", + trimming_threshold=1e-2, + draw_sample_splitting=True, + print_periods=False, + ): + + super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting=False) + + self._check_data(self._dml_data) + g_values = self._dml_data.g_values + t_values = self._dml_data.t_values + + _check_bool(print_periods, "print_periods") + self._print_periods = print_periods + + self._control_group = _check_control_group(control_group) + self._never_treated_value = _get_never_treated_value(g_values) + self._anticipation_periods = _check_anticipation_periods(anticipation_periods) + + _check_gt_combination( + (g_value, t_value_pre, t_value_eval), g_values, t_values, self.never_treated_value, self.anticipation_periods + ) + self._g_value = g_value + self._t_value_pre = t_value_pre + self._t_value_eval = t_value_eval + + # check if post_treatment evaluation + if g_value <= t_value_eval: + post_treatment = True + else: + post_treatment = False + + self._post_treatment = post_treatment + + if self._print_periods: + print( + f"Evaluation of ATT({g_value}, {t_value_eval}), with pre-treatment period {t_value_pre},\n" + + f"post-treatment: {post_treatment}. Control group: {control_group}.\n" + ) + + # Preprocess data + # Y1, Y0 might be needed if we want to support custom estimators and scores; currently only output y_diff + self._panel_data_wide = self._preprocess_data(self._g_value, self._t_value_pre, self._t_value_eval) + + # Handling id values to match pairwise evaluation & simultaneous inference + id_panel_data = self._panel_data_wide[self._dml_data.id_col].values + id_original = self._dml_data.id_var_unique + if not np.all(np.isin(id_panel_data, id_original)): + raise ValueError("The id values in the panel data are not a subset of the original id values.") + + # Find position of id_panel_data in original data + # These entries should be replaced by nuisance predictions, all others should be set to 0. + self._id_positions = np.searchsorted(id_original, id_panel_data) + + # Numeric values for positions of the entries in id_panel_data inside id_original + # np.nonzero(np.isin(id_original, id_panel_data)) + self._n_subset = self._panel_data_wide.shape[0] + self._n_obs = self._n_subset # Effective sample size used for resampling + self._n_treated_subset = self._panel_data_wide["G_indicator"].sum() + + # Save x and y for later ML estimation + self._x_panel = self._panel_data_wide.loc[:, self._dml_data.x_cols].values + self._y_panel = self._panel_data_wide.loc[:, "y_diff"].values + self._g_panel = self._panel_data_wide.loc[:, "G_indicator"].values + + valid_scores = ["observational", "experimental"] + _check_score(self.score, valid_scores, allow_callable=False) + + self._in_sample_normalization = in_sample_normalization + if not isinstance(self.in_sample_normalization, bool): + raise TypeError( + "in_sample_normalization indicator has to be boolean. " + + f"Object of type {str(type(self.in_sample_normalization))} passed." 
+ ) + + # set stratification for resampling + self._strata = self._panel_data_wide["G_indicator"] + if draw_sample_splitting: + self.draw_sample_splitting() + + # check learners + ml_g_is_classifier = self._check_learner(ml_g, "ml_g", regressor=True, classifier=True) + if self.score == "observational": + _ = self._check_learner(ml_m, "ml_m", regressor=False, classifier=True) + self._learner = {"ml_g": ml_g, "ml_m": ml_m} + else: + assert self.score == "experimental" + if ml_m is not None: + warnings.warn( + ( + 'A learner ml_m has been provided for score = "experimental" but will be ignored. ' + "A learner ml_m is not required for estimation." + ) + ) + self._learner = {"ml_g": ml_g} + + if ml_g_is_classifier: + if obj_dml_data.binary_outcome: + self._predict_method = {"ml_g": "predict_proba"} + else: + raise ValueError( + f"The ml_g learner {str(ml_g)} was identified as classifier " + "but the outcome variable is not binary with values 0 and 1." + ) + else: + self._predict_method = {"ml_g": "predict"} + + if "ml_m" in self._learner: + self._predict_method["ml_m"] = "predict_proba" + self._initialize_ml_nuisance_params() + + self._trimming_rule = trimming_rule + self._trimming_threshold = trimming_threshold + _check_trimming(self._trimming_rule, self._trimming_threshold) + + self._sensitivity_implemented = True + self._external_predictions_implemented = True + + def __str__(self): + class_name = self.__class__.__name__ + header = f"================== {class_name} Object ==================\n" + data_summary = self._dml_data._data_summary_str() + score_info = ( + f"Score function: {str(self.score)}\n" + f"Treatment group: {str(self.g_value)}\n" + f"Pre-treatment period: {str(self.t_value_pre)}\n" + f"Evaluation period: {str(self.t_value_eval)}\n" + f"Control group: {str(self.control_group)}\n" + f"Anticipation periods: {str(self.anticipation_periods)}\n" + f"Effective sample size: {str(self.n_obs)}\n" + ) + learner_info = "" + for key, value in self.learner.items(): + learner_info += f"Learner {key}: {str(value)}\n" + if self.nuisance_loss is not None: + learner_info += "Out-of-sample Performance:\n" + is_classifier = [value for value in self._is_classifier.values()] + is_regressor = [not value for value in is_classifier] + if any(is_regressor): + learner_info += "Regression:\n" + for learner in [key for key, value in self._is_classifier.items() if value is False]: + learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n" + if any(is_classifier): + learner_info += "Classification:\n" + for learner in [key for key, value in self._is_classifier.items() if value is True]: + learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n" + + if self._is_cluster_data: + resampling_info = ( + f"No. folds per cluster: {self._n_folds_per_cluster}\n" + f"No. folds: {self.n_folds}\n" + f"No. repeated sample splits: {self.n_rep}\n" + ) + else: + resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n"
+ fit_summary = str(self.summary) + res = ( + header + + "\n------------------ Data summary ------------------\n" + + data_summary + + "\n------------------ Score & algorithm ------------------\n" + + score_info + + "\n------------------ Machine learner ------------------\n" + + learner_info + + "\n------------------ Resampling ------------------\n" + + resampling_info + + "\n------------------ Fit summary ------------------\n" + + fit_summary + ) + return res + + @property + def g_value(self): + """ + The value indicating the treatment group (first period with treatment). + """ + return self._g_value + + @property + def t_value_eval(self): + """ + The value indicating the evaluation period. + """ + return self._t_value_eval + + @property + def t_value_pre(self): + """ + The value indicating the pre-treatment period. + """ + return self._t_value_pre + + @property + def never_treated_value(self): + """ + The value indicating that a unit was never treated. + """ + return self._never_treated_value + + @property + def post_treatment(self): + """ + Indicates whether the evaluation period is after the treatment period. + """ + return self._post_treatment + + @property + def control_group(self): + """ + The control group. + """ + return self._control_group + + @property + def anticipation_periods(self): + """ + The number of anticipation periods. + """ + return self._anticipation_periods + + @property + def panel_data_wide(self): + """ + The preprocessed panel data in wide format. + """ + return self._panel_data_wide + + @property + def id_positions(self): + """ + The positions of the id values in the original data. + """ + return self._id_positions + + @property + def in_sample_normalization(self): + """ + Indicates whether the in-sample normalization of weights is used. + """ + return self._in_sample_normalization + + @property + def trimming_rule(self): + """ + Specifies the used trimming rule. + """ + return self._trimming_rule + + @property + def trimming_threshold(self): + """ + Specifies the used trimming threshold. + """ + return self._trimming_threshold + + @property + def n_obs(self): + """ + The number of observations used for estimation. + """ + return self._n_subset + + def _initialize_ml_nuisance_params(self): + if self.score == "observational": + valid_learner = ["ml_g0", "ml_g1", "ml_m"] + else: + assert self.score == "experimental" + valid_learner = ["ml_g0", "ml_g1"] + self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in valid_learner} + + def _check_data(self, obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLPanelData): + raise TypeError( + "For repeated outcomes the data must be of DoubleMLPanelData type. " + f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." + ) + if obj_dml_data.z_cols is not None: + raise NotImplementedError( + "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). " + "At the moment there are no DiD models with instruments implemented." + ) + + one_treat = obj_dml_data.n_treat == 1 + if not one_treat: + raise ValueError( + "Incompatible data. " + "To fit a DID model with DML, " + "exactly one variable needs to be specified as treatment variable."
+ ) + _check_gt_values(obj_dml_data.g_values, obj_dml_data.t_values) + return + + def _preprocess_data(self, g_value, pre_t, eval_t): + data = self._dml_data.data + + y_col = self._dml_data.y_col + t_col = self._dml_data.t_col + id_col = self._dml_data.id_col + g_col = self._dml_data.g_col + + # relevant data subset + data_subset_indicator = data[t_col].isin([pre_t, eval_t]) + data_subset = data[data_subset_indicator].sort_values(by=[id_col, t_col]) + + # Construct G (treatment group) indicating treatment period in g + G_indicator = (data_subset[g_col] == g_value).astype(int) + + # Construct C (control group) indicating never treated or not yet treated + never_treated = _is_never_treated(data_subset[g_col], self.never_treated_value).reshape(-1) + if self.control_group == "never_treated": + C_indicator = never_treated.astype(int) + + elif self.control_group == "not_yet_treated": + # adjust max_g_value for anticipation periods + t_values = self._dml_data.t_values + max_g_value = t_values[min(np.where(t_values == eval_t)[0][0] + self.anticipation_periods, len(t_values) - 1)] + # not in G, just as an additional check + later_treated = (data_subset[g_col] > max_g_value) & (G_indicator == 0) + not_yet_treated = never_treated | later_treated + C_indicator = not_yet_treated.astype(int) + + if np.sum(C_indicator) == 0: + raise ValueError("No observations in the control group.") + + data_subset = data_subset.assign(C_indicator=C_indicator, G_indicator=G_indicator) + # reduce to relevant subset + data_subset = data_subset[(data_subset["G_indicator"] == 1) | (data_subset["C_indicator"] == 1)] + # check if G and C are disjoint + assert sum(G_indicator & C_indicator) == 0 + + # Alternatively, use .shift() (check if time ordering is correct) + # y_diff = this_data.groupby(id_col)[y_col].shift(-1) + y_diff = ( + data_subset[data_subset[t_col] == eval_t][y_col].values - data_subset[data_subset[t_col] == pre_t][y_col].values + ) + + # keep only covariate observations from the first period + # Data processing from long to wide format + select_cols = [id_col, "G_indicator", "C_indicator"] + self._dml_data.x_cols + first_period = data_subset[t_col].min() + wide_data = data_subset[select_cols][data_subset[t_col] == first_period] + wide_data = wide_data.assign(y_diff=y_diff) + + return wide_data + + def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + + # Here: d is a binary treatment indicator + x, y = check_X_y(self._x_panel, self._y_panel, force_all_finite=False) + x, d = check_X_y(x, self._g_panel, force_all_finite=False) + # nuisance g + # get train indices for d == 0 and d == 1 + smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d) + + # nuisance g for d==0 + if external_predictions["ml_g0"] is not None: + ml_g0_targets = np.full_like(y, np.nan, dtype="float64") + ml_g0_targets[d == 0] = y[d == 0] + ml_g0_pred = _get_id_positions(external_predictions["ml_g0"], self.id_positions) + g_hat0 = {"preds": ml_g0_pred, "targets": ml_g0_targets, "models": None} + else: + g_hat0 = _dml_cv_predict( + self._learner["ml_g"], + x, + y, + smpls=smpls_d0, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_g0"), + method=self._predict_method["ml_g"], + return_models=return_models, + ) + + _check_finite_predictions(g_hat0["preds"], self._learner["ml_g"], "ml_g", smpls) + # adjust target values to consider only compatible subsamples + g_hat0["targets"] = g_hat0["targets"].astype(float) + g_hat0["targets"][d == 1] = np.nan + + # nuisance g for d==1 + if external_predictions["ml_g1"] is not None: +
ml_g1_targets = np.full_like(y, np.nan, dtype="float64") + ml_g1_targets[d == 1] = y[d == 1] + ml_g1_pred = _get_id_positions(external_predictions["ml_g1"], self.id_positions) + g_hat1 = {"preds": ml_g1_pred, "targets": ml_g1_targets, "models": None} + else: + g_hat1 = _dml_cv_predict( + self._learner["ml_g"], + x, + y, + smpls=smpls_d1, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_g1"), + method=self._predict_method["ml_g"], + return_models=return_models, + ) + + _check_finite_predictions(g_hat1["preds"], self._learner["ml_g"], "ml_g", smpls) + # adjust target values to consider only compatible subsamples + g_hat1["targets"] = g_hat1["targets"].astype(float) + g_hat1["targets"][d == 0] = np.nan + + # only relevant for observational setting + m_hat = {"preds": None, "targets": None, "models": None} + if self.score == "observational": + # nuisance m + if external_predictions["ml_m"] is not None: + ml_m_pred = _get_id_positions(external_predictions["ml_m"], self.id_positions) + m_hat = {"preds": ml_m_pred, "targets": d, "models": None} + else: + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + ) + _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls) + _check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12) + m_hat["preds"] = _trimm(m_hat["preds"], self.trimming_rule, self.trimming_threshold) + + # nuisance estimates of the uncond. treatment prob. + p_hat = np.full_like(d, d.mean(), dtype="float64") + psi_a, psi_b = self._score_elements(y, d, g_hat0["preds"], g_hat1["preds"], m_hat["preds"], p_hat) + + extend_kwargs = { + "n_obs": self._dml_data.n_obs, + "id_positions": self.id_positions, + } + psi_elements = { + "psi_a": _set_id_positions(psi_a, fill_value=0.0, **extend_kwargs), + "psi_b": _set_id_positions(psi_b, fill_value=0.0, **extend_kwargs), + } + preds = { + "predictions": { + "ml_g0": _set_id_positions(g_hat0["preds"], fill_value=np.nan, **extend_kwargs), + "ml_g1": _set_id_positions(g_hat1["preds"], fill_value=np.nan, **extend_kwargs), + "ml_m": _set_id_positions(m_hat["preds"], fill_value=np.nan, **extend_kwargs), + }, + "targets": { + "ml_g0": _set_id_positions(g_hat0["targets"], fill_value=np.nan, **extend_kwargs), + "ml_g1": _set_id_positions(g_hat1["targets"], fill_value=np.nan, **extend_kwargs), + "ml_m": _set_id_positions(m_hat["targets"], fill_value=np.nan, **extend_kwargs), + }, + "models": {"ml_g0": g_hat0["models"], "ml_g1": g_hat1["models"], "ml_m": m_hat["models"]}, + } + + return psi_elements, preds + + def _score_elements(self, y, d, g_hat0, g_hat1, m_hat, p_hat): + # calc residuals + resid_d0 = y - g_hat0 + + if self.score == "observational": + if self.in_sample_normalization: + weight_psi_a = np.divide(d, np.mean(d)) + propensity_weight = np.multiply(1.0 - d, np.divide(m_hat, 1.0 - m_hat)) + weight_resid_d0 = np.divide(d, np.mean(d)) - np.divide(propensity_weight, np.mean(propensity_weight)) + else: + weight_psi_a = np.divide(d, p_hat) + weight_resid_d0 = np.divide(d - m_hat, np.multiply(p_hat, 1.0 - m_hat)) + + psi_b_1 = np.zeros_like(y) + + else: + assert self.score == "experimental" + if self.in_sample_normalization: + weight_psi_a = np.ones_like(y) + weight_g0 = np.divide(d, np.mean(d)) - 1.0 + weight_g1 = 1.0 - np.divide(d, np.mean(d)) + weight_resid_d0 = np.divide(d, np.mean(d)) - np.divide(1.0 - d, np.mean(1.0 - d)) + else: + weight_psi_a = 
np.ones_like(y) + weight_g0 = np.divide(d, p_hat) - 1.0 + weight_g1 = 1.0 - np.divide(d, p_hat) + weight_resid_d0 = np.divide(d - p_hat, np.multiply(p_hat, 1.0 - p_hat)) + + psi_b_1 = np.multiply(weight_g0, g_hat0) + np.multiply(weight_g1, g_hat1) + + # set score elements + psi_a = -1.0 * weight_psi_a + psi_b = psi_b_1 + np.multiply(weight_resid_d0, resid_d0) + + return psi_a, psi_b + + def _nuisance_tuning( + self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search + ): + x, y = check_X_y(self._x_panel, self._y_panel, force_all_finite=False) + x, d = check_X_y(x, self._g_panel, force_all_finite=False) + + # get train indices for d == 0 and d == 1 + smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d) + + if scoring_methods is None: + scoring_methods = {"ml_g": None, "ml_m": None} + + train_inds = [train_index for (train_index, _) in smpls] + train_inds_d0 = [train_index for (train_index, _) in smpls_d0] + train_inds_d1 = [train_index for (train_index, _) in smpls_d1] + g0_tune_res = _dml_tune( + y, + x, + train_inds_d0, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + g1_tune_res = _dml_tune( + y, + x, + train_inds_d1, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + + g0_best_params = [xx.best_params_ for xx in g0_tune_res] + g1_best_params = [xx.best_params_ for xx in g1_tune_res] + + if self.score == "observational": + m_tune_res = _dml_tune( + d, + x, + train_inds, + self._learner["ml_m"], + param_grids["ml_m"], + scoring_methods["ml_m"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + m_best_params = [xx.best_params_ for xx in m_tune_res] + params = {"ml_g0": g0_best_params, "ml_g1": g1_best_params, "ml_m": m_best_params} + tune_res = {"g0_tune": g0_tune_res, "g1_tune": g1_tune_res, "m_tune": m_tune_res} + else: + params = {"ml_g0": g0_best_params, "ml_g1": g1_best_params} + tune_res = {"g0_tune": g0_tune_res, "g1_tune": g1_tune_res} + + res = {"params": params, "tune_res": tune_res} + + return res + + def _sensitivity_element_est(self, preds): + y = self._y_panel + d = self._g_panel + + m_hat = _get_id_positions(preds["predictions"]["ml_m"], self.id_positions) + g_hat0 = _get_id_positions(preds["predictions"]["ml_g0"], self.id_positions) + g_hat1 = _get_id_positions(preds["predictions"]["ml_g1"], self.id_positions) + + g_hat = np.multiply(d, g_hat1) + np.multiply(1.0 - d, g_hat0) + sigma2_score_element = np.square(y - g_hat) + sigma2 = np.mean(sigma2_score_element) + psi_sigma2 = sigma2_score_element - sigma2 + + # calc m(W,alpha) and Riesz representer + p_hat = np.mean(d) + if self.score == "observational": + propensity_weight_d0 = np.divide(m_hat, 1.0 - m_hat) + if self.in_sample_normalization: + weight_d0 = np.multiply(1.0 - d, propensity_weight_d0) + mean_weight_d0 = np.mean(weight_d0) + + m_alpha = np.multiply( + np.divide(d, p_hat), np.divide(1.0, p_hat) + np.divide(propensity_weight_d0, mean_weight_d0) + ) + rr = np.divide(d, p_hat) - np.divide(weight_d0, mean_weight_d0) + else: + m_alpha = np.multiply(np.divide(d, np.square(p_hat)), (1.0 + propensity_weight_d0)) + rr = np.divide(d, p_hat) - np.multiply(np.divide(1.0 - d, p_hat), propensity_weight_d0) + else: + assert self.score == "experimental" + # the same with or without self-normalization + m_alpha = np.divide(1.0, p_hat) + np.divide(1.0, 1.0 - 
p_hat) + rr = np.divide(d, p_hat) - np.divide(1.0 - d, 1.0 - p_hat) + + nu2_score_element = np.multiply(2.0, m_alpha) - np.square(rr) + nu2 = np.mean(nu2_score_element) + psi_nu2 = nu2_score_element - nu2 + + extend_kwargs = { + "n_obs": self._dml_data.n_obs, + "id_positions": self.id_positions, + "fill_value": 0.0, + } + + # add scaling to make variance estimation consistent (sample size difference) + scaling = self._dml_data.n_obs / self._n_subset + element_dict = { + "sigma2": sigma2, + "nu2": nu2, + "psi_sigma2": scaling * _set_id_positions(psi_sigma2, **extend_kwargs), + "psi_nu2": scaling * _set_id_positions(psi_nu2, **extend_kwargs), + "riesz_rep": scaling * _set_id_positions(rr, **extend_kwargs), + } + return element_dict + + def sensitivity_benchmark(self, benchmarking_set, fit_args=None): + """ + Computes a benchmark for a given set of features. + Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates. + + Parameters + ---------- + benchmarking_set : list + List of features to be used for benchmarking. + + fit_args : dict, optional + Additional arguments for the fit method. + Default is None. + + Returns + ------- + benchmark_results : pandas.DataFrame + Benchmark results. + """ + if self.score == "experimental": + warnings.warn( + "Sensitivity benchmarking for experimental score may not be meaningful. " + "Consider using score='observational' for conditional treatment assignment.", + UserWarning, + ) + + return super().sensitivity_benchmark(benchmarking_set, fit_args) diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index a198bcea..ab2af5b9 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -4,8 +4,8 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target +from doubleml.data.base_data import DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLData from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score, _check_trimming from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls_2d @@ -664,3 +664,31 @@ def _nuisance_tuning( res = {"params": params, "tune_res": tune_res} return res + + def sensitivity_benchmark(self, benchmarking_set, fit_args=None): + """ + Computes a benchmark for a given set of features. + Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates. + + Parameters + ---------- + benchmarking_set : list + List of features to be used for benchmarking. + + fit_args : dict, optional + Additional arguments for the fit method. + Default is None. + + Returns + ------- + benchmark_results : pandas.DataFrame + Benchmark results. + """ + if self.score == "experimental": + warnings.warn( + "Sensitivity benchmarking for experimental score may not be meaningful. 
" + "Consider using score='observational' for conditional treatment assignment.", + UserWarning, + ) + + return super().sensitivity_benchmark(benchmarking_set, fit_args) diff --git a/doubleml/did/did_multi.py b/doubleml/did/did_multi.py new file mode 100644 index 00000000..0243cca5 --- /dev/null +++ b/doubleml/did/did_multi.py @@ -0,0 +1,1367 @@ +import copy +import warnings + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from joblib import Parallel, delayed +from matplotlib.lines import Line2D +from sklearn.base import clone + +from doubleml.data import DoubleMLPanelData +from doubleml.did.did_aggregation import DoubleMLDIDAggregation +from doubleml.did.did_binary import DoubleMLDIDBinary +from doubleml.did.utils._aggregation import ( + _check_did_aggregation_dict, + _compute_did_eventstudy_aggregation_weights, + _compute_did_group_aggregation_weights, + _compute_did_time_aggregation_weights, +) +from doubleml.did.utils._did_utils import ( + _check_anticipation_periods, + _check_control_group, + _check_gt_combination, + _check_gt_values, + _construct_gt_combinations, + _construct_gt_index, + _construct_post_treatment_mask, + _get_never_treated_value, +) +from doubleml.did.utils._plot import add_jitter +from doubleml.double_ml import DoubleML +from doubleml.double_ml_framework import concat +from doubleml.utils._checks import _check_score, _check_trimming +from doubleml.utils._descriptive import generate_summary +from doubleml.utils.gain_statistics import gain_statistics + + +class DoubleMLDIDMulti: + """Double machine learning for multi-period difference-in-differences models. + + Parameters + ---------- + obj_dml_data : :class:`DoubleMLPanelData` object + The :class:`DoubleMLPanelData` object providing the data and specifying the variables for the causal model. + + ml_g : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function + :math:`g_0(0,X) = E[Y_{t_\text{eval}}-Y_{t_\text{pre}}|X, C__{t_\text{eval} + \\delta} = 1]`. + For a binary outcome variable :math:`Y` (with values 0 and 1), a classifier implementing ``fit()`` and + ``predict_proba()`` can also be specified. + + ml_m : classifier implementing ``fit()`` and ``predict_proba()`` + A machine learner implementing ``fit()`` and ``predict_proba()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance function :math:`m_0(X) = E[D=1|X]`. + Only relevant for ``score='observational'``. Default is ``None``. + + gt_combinations : array-like + A list of tuples with the group-time combinations to be evaluated. + + control_group : str + Specifies the control group. Either ``'never_treated'`` or ``'not_yet_treated'``. + Default is ``'never_treated'``. + + anticipation_periods : int + Number of anticipation periods. Default is ``0``. + + n_folds : int + Number of folds for cross-fitting. + Default is ``5``. + + n_rep : int + Number of repetitions for the sample splitting. + Default is ``1``. + + score : str + A str (``'observational'`` or ``'experimental'``) specifying the score function. + The ``'experimental'`` scores refers to an A/B setting, where the treatment is independent + from the pretreatment covariates. + Default is ``'observational'``. + + in_sample_normalization : bool + Indicates whether to use in-sample normalization of weights. + Default is ``True``. 
+ + trimming_rule : str + A str (``'truncate'`` is the only choice) specifying the trimming approach. + Default is ``'truncate'``. + + trimming_threshold : float + The threshold used for trimming. + Default is ``1e-2``. + + draw_sample_splitting : bool + Indicates whether the sample splitting should be drawn during initialization. + Default is ``True``. + + print_periods : bool + Indicates whether to print information about the evaluated periods. + Default is ``False``. + + Examples + -------- + >>> import numpy as np + >>> import doubleml as dml + >>> from doubleml.did.datasets import make_did_CS2021 + >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier + >>> np.random.seed(42) + >>> df = make_did_CS2021(n_obs=500) + >>> dml_data = dml.data.DoubleMLPanelData( + ... df, + ... y_col="y", + ... d_cols="d", + ... id_col="id", + ... t_col="t", + ... x_cols=["Z1", "Z2", "Z3", "Z4"], + ... datetime_unit="M" + ... ) + >>> ml_g = RandomForestRegressor(n_estimators=100, max_depth=5) + >>> ml_m = RandomForestClassifier(n_estimators=100, max_depth=5) + >>> dml_did_obj = dml.did.DoubleMLDIDMulti( + ... obj_dml_data=dml_data, + ... ml_g=ml_g, + ... ml_m=ml_m, + ... gt_combinations="standard", + ... control_group="never_treated", + ... ) + >>> print(dml_did_obj.fit()) + """ + + def __init__( + self, + obj_dml_data, + ml_g, + ml_m=None, + gt_combinations="standard", + control_group="never_treated", + anticipation_periods=0, + n_folds=5, + n_rep=1, + score="observational", + in_sample_normalization=True, + trimming_rule="truncate", + trimming_threshold=1e-2, + draw_sample_splitting=True, + print_periods=False, + ): + + self._dml_data = obj_dml_data + self._is_cluster_data = False + self._is_panel_data = isinstance(obj_dml_data, DoubleMLPanelData) + self._check_data(self._dml_data) + self._g_values = self._dml_data.g_values + self._t_values = self._dml_data.t_values + self._print_periods = print_periods + + self._control_group = _check_control_group(control_group) + self._never_treated_value = _get_never_treated_value(self.g_values) + self._anticipation_periods = _check_anticipation_periods(anticipation_periods) + + self._gt_combinations = self._validate_gt_combinations(gt_combinations) + self._gt_index = _construct_gt_index(self.gt_combinations, self.g_values, self.t_values) + self._post_treatment_mask = _construct_post_treatment_mask(self.g_values, self.t_values) + self._gt_labels = [f"ATT({g},{t_pre},{t_eval})" for g, t_pre, t_eval in self.gt_combinations] + + self._in_sample_normalization = in_sample_normalization + if not isinstance(self.in_sample_normalization, bool): + raise TypeError( + "in_sample_normalization indicator has to be boolean. " + + f"Object of type {str(type(self.in_sample_normalization))} passed." 
+ ) + + self._n_folds = n_folds + self._n_rep = n_rep + + # check score + self._score = score + valid_scores = ["observational", "experimental"] + _check_score(self.score, valid_scores, allow_callable=False) + + # initialize framework which is constructed after the fit method is called + self._framework = None + + # initialize and check trimming + self._trimming_rule = trimming_rule + self._trimming_threshold = trimming_threshold + _check_trimming(self._trimming_rule, self._trimming_threshold) + + ml_g_is_classifier = DoubleML._check_learner(ml_g, "ml_g", regressor=True, classifier=True) + if self.score == "observational": + _ = DoubleML._check_learner(ml_m, "ml_m", regressor=False, classifier=True) + self._learner = {"ml_g": clone(ml_g), "ml_m": clone(ml_m)} + else: + assert self.score == "experimental" + if ml_m is not None: + warnings.warn( + ( + 'A learner ml_m has been provided for score = "experimental" but will be ignored. ' + "A learner ml_m is not required for estimation." + ) + ) + self._learner = {"ml_g": ml_g, "ml_m": None} + + if ml_g_is_classifier: + if obj_dml_data.binary_outcome: + self._predict_method = {"ml_g": "predict_proba", "ml_m": "predict_proba"} + else: + raise ValueError( + f"The ml_g learner {str(ml_g)} was identified as classifier " + "but the outcome variable is not binary with values 0 and 1." + ) + else: + self._predict_method = {"ml_g": "predict", "ml_m": "predict_proba"} + + # perform sample splitting + self._smpls = None + self._draw_sample_splitting = draw_sample_splitting + + # initialize all models if splits are known + self._modellist = self._initialize_models() + self._nuisance_loss = None + + def __str__(self): + class_name = self.__class__.__name__ + header = f"================== {class_name} Object ==================\n" + data_summary = self._dml_data._data_summary_str() + score_info = ( + f"Score function: {str(self.score)}\n" + f"Control group: {str(self.control_group)}\n" + f"Anticipation periods: {str(self.anticipation_periods)}\n" + ) + resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n" + learner_info = "" + for key, value in self._learner.items(): + learner_info += f"Learner {key}: {str(value)}\n" + if self.nuisance_loss is not None: + learner_info += "Out-of-sample Performance:\n" + is_classifier = [value for value in self.modellist[0]._is_classifier.values()] + is_regressor = [not value for value in is_classifier] + if any(is_regressor): + learner_info += "Regression:\n" + for learner in [key for key, value in self.modellist[0]._is_classifier.items() if value is False]: + learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n" + if any(is_classifier): + learner_info += "Classification:\n" + for learner in [key for key, value in self.modellist[0]._is_classifier.items() if value is True]: + learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n" + fit_summary = str(self.summary) + res = ( + header + + "\n------------------ Data summary ------------------\n" + + data_summary + + "\n------------------ Score & algorithm ------------------\n" + + score_info + + "\n------------------ Machine learner ------------------\n" + + learner_info + + "\n------------------ Resampling ------------------\n" + + resampling_info + + "\n------------------ Fit summary ------------------\n" + + fit_summary + ) + return res + + @property + def score(self): + """ + The score function. + """ + return self._score + + @property + def control_group(self): + """ + The control group. 
+ """ + return self._control_group + + @property + def anticipation_periods(self): + """ + The number of anticipation periods. + """ + return self._anticipation_periods + + @property + def gt_combinations(self): + """ + The combinations of g and t values. + """ + return self._gt_combinations + + @property + def gt_index(self): + """ + The index of the combinations of g and t values. + """ + return self._gt_index + + @property + def n_gt_atts(self): + """ + The number of evaluated combinations of the treatment variable and the period. + """ + return len(self.gt_combinations) + + @property + def gt_labels(self): + """ + The evaluated labels of the treatment effects 'ATT(g, t_pre, t_eval)' and the period. + """ + return self._gt_labels + + @property + def g_values(self): + """ + The values of the treatment variable. + """ + return self._g_values + + @property + def t_values(self): + """ + The values of the time periods. + """ + return self._t_values + + @property + def never_treated_value(self): + """ + The value indicating that a unit was never treated. + """ + return self._never_treated_value + + @property + def in_sample_normalization(self): + """ + Indicates whether the in sample normalization of weights are used. + """ + return self._in_sample_normalization + + @property + def trimming_rule(self): + """ + Specifies the used trimming rule. + """ + return self._trimming_rule + + @property + def trimming_threshold(self): + """ + Specifies the used trimming threshold. + """ + return self._trimming_threshold + + @property + def n_folds(self): + """ + Number of folds. + """ + return self._n_folds + + @property + def n_rep(self): + """ + Number of repetitions for the sample splitting. + """ + return self._n_rep + + @property + def n_rep_boot(self): + """ + The number of bootstrap replications. + """ + if self._framework is None: + n_rep_boot = None + else: + n_rep_boot = self._framework.n_rep_boot + return n_rep_boot + + @property + def boot_method(self): + """ + The method to construct the bootstrap replications. + """ + if self._framework is None: + method = None + else: + method = self._framework.boot_method + return method + + @property + def coef(self): + """ + Estimates for the causal parameter(s) after calling :meth:`fit` (shape (``n_gt_atts``,)). + """ + if self._framework is None: + coef = None + else: + coef = self.framework.thetas + return coef + + @property + def all_coef(self): + """ + Estimates of the causal parameter(s) for the ``n_rep`` different sample splits after calling :meth:`fit` + (shape (``n_gt_atts``, ``n_rep``)). + """ + if self._framework is None: + all_coef = None + else: + all_coef = self.framework.all_thetas + return all_coef + + @property + def se(self): + """ + Standard errors for the causal parameter(s) after calling :meth:`fit` (shape (``n_gt_atts``,)). + """ + if self._framework is None: + se = None + else: + se = self.framework.ses + return se + + @property + def all_se(self): + """ + Standard errors of the causal parameter(s) for the ``n_rep`` different sample splits after calling :meth:`fit` + (shape (``n_gt_atts``, ``n_rep``)). + """ + if self._framework is None: + all_se = None + else: + all_se = self.framework.all_ses + return all_se + + @property + def t_stat(self): + """ + t-statistics for the causal parameter(s) after calling :meth:`fit` (shape (``n_gt_atts``,)). 
+ """ + if self._framework is None: + t_stats = None + else: + t_stats = self.framework.t_stats + return t_stats + + @property + def pval(self): + """ + p-values for the causal parameter(s) (shape (``n_gt_atts``,)). + """ + if self._framework is None: + pvals = None + else: + pvals = self.framework.pvals + return pvals + + @property + def boot_t_stat(self): + """ + Bootstrapped t-statistics for the causal parameter(s) after calling :meth:`fit` and :meth:`bootstrap` + (shape (``n_rep_boot``, ``n_gt_atts``, ``n_rep``)). + """ + if self._framework is None: + boot_t_stat = None + else: + boot_t_stat = self._framework.boot_t_stat + return boot_t_stat + + @property + def nuisance_loss(self): + """ + The losses of the nuisance models (root-mean-squared-errors or logloss). + """ + return self._nuisance_loss + + @property + def framework(self): + """ + The corresponding :class:`doubleml.DoubleMLFramework` object. + """ + return self._framework + + @property + def modellist(self): + """ + The list of DoubleMLDIDBinary models. + """ + return self._modellist + + @property + def sensitivity_elements(self): + """ + Values of the sensitivity components after calling :meth:`fit`; + If available (e.g., PLR, IRM) a dictionary with entries ``sigma2``, ``nu2``, ``psi_sigma2``, ``psi_nu2`` + and ``riesz_rep``. + """ + if self._framework is None: + sensitivity_elements = None + else: + sensitivity_elements = self._framework.sensitivity_elements + return sensitivity_elements + + @property + def sensitivity_params(self): + """ + Values of the sensitivity parameters after calling :meth:`sesitivity_analysis`; + If available (e.g., PLR, IRM) a dictionary with entries ``theta``, ``se``, ``ci``, ``rv`` + and ``rva``. + """ + if self._framework is None: + sensitivity_params = None + else: + sensitivity_params = self._framework.sensitivity_params + return sensitivity_params + + @property + def summary(self): + """ + A summary for the estimated causal effect after calling :meth:`fit`. + """ + if self.framework is None: + col_names = ["coef", "std err", "t", "P>|t|"] + df_summary = pd.DataFrame(columns=col_names) + else: + ci = self.confint() + df_summary = generate_summary(self.coef, self.se, self.t_stat, self.pval, ci, self.gt_labels) + return df_summary + + @property + def sensitivity_summary(self): + """ + Returns a summary for the sensitivity analysis after calling :meth:`sensitivity_analysis`. + Returns + ------- + res : str + Summary for the sensitivity analysis. + """ + if self._framework is None: + raise ValueError("Apply sensitivity_analysis() before sensitivity_summary.") + else: + sensitivity_summary = self._framework.sensitivity_summary + return sensitivity_summary + + def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_models=False, external_predictions=None): + """ + Estimate DoubleMLDIDMulti models. + + Parameters + ---------- + n_jobs_models : None or int + The number of CPUs to use to fit the group-time ATTs. ``None`` means ``1``. + Default is ``None``. + + n_jobs_cv : None or int + The number of CPUs to use to fit the learners. ``None`` means ``1``. + Does not speed up computation for quantile models. + Default is ``None``. + + store_predictions : bool + Indicates whether the predictions for the nuisance functions should be stored in ``predictions``. + Default is ``True``. + + store_models : bool + Indicates whether the fitted models for the nuisance functions should be stored in ``models``. 
This allows + to analyze the fitted models or extract information like variable importance. + Default is ``False``. + + external_predictions : dict or None + A nested dictionary where the keys correspond the the treatment levels and can contain predictions according to + each treatment level. The values have to be dictionaries which can contain keys ``'ml_g0'``, ``'ml_g1'`` + and ``'ml_m'``. + Default is `None`. + + Returns + ------- + self : object + """ + + if external_predictions is not None: + self._check_external_predictions(external_predictions) + ext_pred_dict = self._rename_external_predictions(external_predictions) + else: + ext_pred_dict = None + + # parallel estimation of the models + parallel = Parallel(n_jobs=n_jobs_models, verbose=0, pre_dispatch="2*n_jobs") + fitted_models = parallel( + delayed(self._fit_model)(i_gt, n_jobs_cv, store_predictions, store_models, ext_pred_dict) + for i_gt in range(self.n_gt_atts) + ) + + # combine the estimates and scores + framework_list = [None] * self.n_gt_atts + + for i_gt in range(self.n_gt_atts): + self._modellist[i_gt] = fitted_models[i_gt] + framework_list[i_gt] = self._modellist[i_gt].framework + + # aggregate all frameworks + self._framework = concat(framework_list) + self._framework.treatment_names = self._gt_labels + + # store the nuisance losses + self._nuisance_loss = self._calc_nuisance_loss() + + return self + + def confint(self, joint=False, level=0.95): + """ + Confidence intervals for DoubleML models. + + Parameters + ---------- + joint : bool + Indicates whether joint confidence intervals are computed. + Default is ``False`` + level : float + The confidence level. + Default is ``0.95``. + + Returns + ------- + df_ci : pd.DataFrame + A data frame with the confidence interval(s). + """ + + if self.framework is None: + raise ValueError("Apply fit() before confint().") + + df_ci = self.framework.confint(joint=joint, level=level) + df_ci.set_index(pd.Index(self.gt_labels), inplace=True) + + return df_ci + + def p_adjust(self, method="romano-wolf"): + """ + Multiple testing adjustment for DoubleML models. + + Parameters + ---------- + method : str + A str (``'romano-wolf''``, ``'bonferroni'``, ``'holm'``, etc) specifying the adjustment method. + In addition to ``'romano-wolf''``, all methods implemented in + :py:func:`statsmodels.stats.multitest.multipletests` can be applied. + Default is ``'romano-wolf'``. + + Returns + ------- + p_val : pd.DataFrame + A data frame with adjusted p-values. + """ + + if self.framework is None: + raise ValueError("Apply fit() before p_adjust().") + + p_val, _ = self.framework.p_adjust(method=method) + p_val.set_index(pd.Index(self.gt_labels), inplace=True) + + return p_val + + def bootstrap(self, method="normal", n_rep_boot=500): + """ + Multiplier bootstrap for DoubleML models. + + Parameters + ---------- + method : str + A str (``'Bayes'``, ``'normal'`` or ``'wild'``) specifying the multiplier bootstrap method. + Default is ``'normal'`` + n_rep_boot : int + The number of bootstrap replications. + + Returns + ------- + self : object + """ + if self._framework is None: + raise ValueError("Apply fit() before bootstrap().") + self._framework.bootstrap(method=method, n_rep_boot=n_rep_boot) + + return self + + def sensitivity_analysis(self, cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95, null_hypothesis=0.0): + """ + Performs a sensitivity analysis to account for unobserved confounders. + The evaluated scenario is stored as a dictionary in the property ``sensitivity_params``. 
+
+        Parameters
+        ----------
+        cf_y : float
+            Percentage of the residual variation of the outcome explained by latent/confounding variables.
+            Default is ``0.03``.
+        cf_d : float
+            Percentage gains in the variation of the Riesz representer generated by latent/confounding variables.
+            Default is ``0.03``.
+        rho : float
+            The correlation between the differences in short and long representations in the main regression and
+            Riesz representer. Has to be in [-1,1]. The absolute value determines the adversarial strength of the
+            confounding (maximizes at 1.0).
+            Default is ``1.0``.
+        level : float
+            The confidence level.
+            Default is ``0.95``.
+        null_hypothesis : float or numpy.ndarray
+            Null hypothesis for the effect. Determines the robustness values.
+            If it is a single float, the same null hypothesis is used for all estimated parameters.
+            Otherwise, the array has to be of shape (n_coefs,).
+            Default is ``0.0``.
+
+        Returns
+        -------
+        self : object
+        """
+
+        if self._framework is None:
+            raise ValueError("Apply fit() before sensitivity_analysis().")
+        self._framework.sensitivity_analysis(cf_y=cf_y, cf_d=cf_d, rho=rho, level=level, null_hypothesis=null_hypothesis)
+
+        return self
+
+    def sensitivity_plot(
+        self,
+        idx_treatment=0,
+        value="theta",
+        rho=1.0,
+        level=0.95,
+        null_hypothesis=0.0,
+        include_scenario=True,
+        benchmarks=None,
+        fill=True,
+        grid_bounds=(0.15, 0.15),
+        grid_size=100,
+    ):
+        """
+        Contour plot of the sensitivity with respect to latent/confounding variables.
+
+        Parameters
+        ----------
+        idx_treatment : int
+            Index of the treatment to perform the sensitivity analysis.
+            Default is ``0``.
+        value : str
+            Determines which contours to plot. Valid values are ``'theta'`` (refers to the bounds)
+            and ``'ci'`` (refers to the bounds including statistical uncertainty).
+            Default is ``'theta'``.
+        rho : float
+            The correlation between the differences in short and long representations in the main regression and
+            Riesz representer. Has to be in [-1,1]. The absolute value determines the adversarial strength of the
+            confounding (maximizes at 1.0).
+            Default is ``1.0``.
+        level : float
+            The confidence level.
+            Default is ``0.95``.
+        null_hypothesis : float
+            Null hypothesis for the effect. Determines the direction of the contour lines.
+            Default is ``0.0``.
+        include_scenario : bool
+            Indicates whether to highlight the scenario from the call of :meth:`sensitivity_analysis`.
+            Default is ``True``.
+        benchmarks : dict or None
+            Dictionary of benchmarks to be included in the plot. The keys are ``cf_y``, ``cf_d`` and ``name``.
+            Default is ``None``.
+        fill : bool
+            Indicates whether to use a heatmap style or only contour lines.
+            Default is ``True``.
+        grid_bounds : tuple
+            Determines the evaluation bounds of the grid for ``cf_d`` and ``cf_y``. Has to contain two floats in [0, 1).
+            Default is ``(0.15, 0.15)``.
+        grid_size : int
+            Determines the number of evaluation points of the grid.
+            Default is ``100``.
+
+        Returns
+        -------
+        fig : object
+            Plotly figure of the sensitivity contours.
+        """
+        if self._framework is None:
+            raise ValueError("Apply fit() before sensitivity_plot().")
+        fig = self._framework.sensitivity_plot(
+            idx_treatment=idx_treatment,
+            value=value,
+            rho=rho,
+            level=level,
+            null_hypothesis=null_hypothesis,
+            include_scenario=include_scenario,
+            benchmarks=benchmarks,
+            fill=fill,
+            grid_bounds=grid_bounds,
+            grid_size=grid_size,
+        )
+
+        return fig
+
+    def sensitivity_benchmark(self, benchmarking_set, fit_args=None):
+        """
+        Computes a benchmark for a given set of features.
+        Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates.
+
+        Parameters
+        ----------
+        benchmarking_set : list
+            List of feature names used for benchmarking. Has to be a non-empty subset of the covariates ``x_cols``.
+        fit_args : dict or None
+            Additional keyword arguments passed to :meth:`fit` when refitting the short model.
+            Default is ``None``.
+
+        Returns
+        -------
+        benchmark_results : pandas.DataFrame
+            Benchmark results.
+        """
+        x_list_long = self._dml_data.x_cols
+
+        # input checks
+        if self.sensitivity_elements is None:
+            raise NotImplementedError(f"Sensitivity analysis not yet implemented for {self.__class__.__name__}.")
+        if not isinstance(benchmarking_set, list):
+            raise TypeError(
+                "benchmarking_set must be a list. " f"{str(benchmarking_set)} of type {type(benchmarking_set)} was passed."
+            )
+        if len(benchmarking_set) == 0:
+            raise ValueError("benchmarking_set must not be empty.")
+        if not set(benchmarking_set) <= set(x_list_long):
+            raise ValueError(
+                f"benchmarking_set must be a subset of features {str(self._dml_data.x_cols)}. "
+                f"{str(benchmarking_set)} was passed."
+            )
+        if fit_args is not None and not isinstance(fit_args, dict):
+            raise TypeError("fit_args must be a dict. " f"{str(fit_args)} of type {type(fit_args)} was passed.")
+
+        # refit short form of the model
+        x_list_short = [x for x in x_list_long if x not in benchmarking_set]
+        dml_short = copy.deepcopy(self)
+        dml_short._dml_data.x_cols = x_list_short
+        if fit_args is not None:
+            dml_short.fit(**fit_args)
+        else:
+            dml_short.fit()
+
+        benchmark_dict = gain_statistics(dml_long=self, dml_short=dml_short)
+        df_benchmark = pd.DataFrame(benchmark_dict, index=self.gt_labels)
+        return df_benchmark
+
+    def aggregate(self, aggregation="group"):
+        """
+        Aggregates treatment effects.
+
+        Parameters
+        ----------
+        aggregation : str or dict
+            Method to aggregate treatment effects or dictionary with aggregation weights (masked numpy array).
+            Has to be one of ``'group'``, ``'time'``, ``'eventstudy'`` or a masked numpy array.
+            Default is ``'group'``.
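+            If a dictionary is passed, it is expected to provide the aggregation weights directly;
+            see the helper ``_get_agg_weights`` and the checks in ``_check_did_aggregation_dict``
+            (such a dictionary carries entries like ``'weight_masks'``, ``'method'``, ``'agg_names'``
+            and ``'agg_weights'``).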
+
+        Returns
+        -------
+        DoubleMLDIDAggregation
+            Aggregated treatment effects.
+
+        """
+        if self.framework is None:
+            raise ValueError("Apply fit() before aggregate().")
+
+        # select all non-masked values
+        selected_gt_mask = ~self.gt_index.mask
+
+        # get aggregation weights
+        aggregation_dict = self._get_agg_weights(selected_gt_mask, aggregation)
+        aggregation_dict = _check_did_aggregation_dict(aggregation_dict, self.gt_index)
+        # set elements for readability
+        weight_masks = aggregation_dict["weight_masks"]
+
+        # ordered frameworks
+        all_frameworks = [self.modellist[idx].framework for idx in self.gt_index.compressed()]
+        # ordered weights
+        n_aggregations = weight_masks.shape[-1]
+        weight_list = [weight_masks[..., idx_agg].compressed() for idx_agg in range(n_aggregations)]
+        all_agg_weights = np.stack(weight_list, axis=0)
+
+        additional_info = {
+            "Score function": self.score,
+            "Control group": self.control_group,
+            "Anticipation periods": self.anticipation_periods,
+        }
+
+        additional_params = {
+            "gt_combinations": self.gt_combinations,
+            "gt_index": self.gt_index,
+            "weight_masks": weight_masks,
+        }
+
+        # set plotting colors for eventstudy
+        if aggregation_dict["method"] == "Event Study":
+            additional_params["aggregation_color_idx"] = [0 if "-" in name else 1 for name in aggregation_dict["agg_names"]]
+        else:
+            additional_params["aggregation_color_idx"] = [1] * n_aggregations
+
+        aggregation_args = {
+            "frameworks": all_frameworks,
+            "aggregation_weights": all_agg_weights,
+            "overall_aggregation_weights": aggregation_dict.get("agg_weights", None),
+            "aggregation_names": aggregation_dict.get("agg_names", None),
+            "aggregation_method_name": aggregation_dict["method"],
+            "additional_information": additional_info,
+            "additional_parameters": additional_params,
+        }
+
+        agg_obj = DoubleMLDIDAggregation(**aggregation_args)
+        return agg_obj
+
+    def plot_effects(
+        self,
+        level=0.95,
+        joint=True,
+        figsize=(12, 8),
+        color_palette="colorblind",
+        date_format=None,
+        y_label="Effect",
+        title="Estimated ATTs by Group",
+        jitter_value=None,
+        default_jitter=0.1,
+    ):
+        """
+        Plots coefficient estimates with confidence intervals over time, grouped by first treated period.
+
+        Parameters
+        ----------
+        level : float
+            The confidence level for the intervals.
+            Default is ``0.95``.
+        joint : bool
+            Indicates whether joint confidence intervals are computed.
+            Default is ``True``.
+        figsize : tuple
+            Figure size as (width, height).
+            Default is ``(12, 8)``.
+        color_palette : str
+            Name of seaborn color palette to use for distinguishing pre and post treatment effects.
+            Default is ``"colorblind"``.
+        date_format : str or None
+            Format string for date ticks if x-axis contains datetime values.
+            Default is ``None``.
+        y_label : str
+            Label for y-axis.
+            Default is ``"Effect"``.
+        title : str
+            Title for the entire plot.
+            Default is ``"Estimated ATTs by Group"``.
+        jitter_value : float or None
+            Amount of jitter to apply to points. If ``None``, the jitter is derived from the
+            spacing of the time periods and ``default_jitter``.
+            Default is ``None``.
+        default_jitter : float
+            Default amount of jitter to apply to points.
+            Default is ``0.1``.
+
+        Returns
+        -------
+        fig : matplotlib.figure.Figure
+            The created figure object.
+        axes : list
+            List of matplotlib axis objects for further customization.
+
+        Notes
+        -----
+        If ``joint=True`` and bootstrapping hasn't been performed, this method will automatically
+        perform bootstrapping with default parameters and issue a warning.
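+
+        Examples
+        --------
+        A minimal sketch of the intended workflow (the data generation and learners follow this
+        PR's tests; column names and the constructor call are assumptions, and estimates depend
+        on the simulated data):
+
+        >>> import doubleml as dml  # doctest: +SKIP
+        >>> from sklearn.linear_model import LinearRegression, LogisticRegression  # doctest: +SKIP
+        >>> df = dml.did.datasets.make_did_CS2021(n_obs=500, dgp_type=1, time_type="float")  # doctest: +SKIP
+        >>> dml_data = dml.data.DoubleMLPanelData(
+        ...     df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]
+        ... )  # doctest: +SKIP
+        >>> dml_obj = dml.did.DoubleMLDIDMulti(dml_data, ml_g=LinearRegression(), ml_m=LogisticRegression())  # doctest: +SKIP
+        >>> fig, axes = dml_obj.fit().plot_effects()  # doctest: +SKIP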
+ """ + if self.framework is None: + raise ValueError("Apply fit() before plot_effects().") + df = self._create_ci_dataframe(level=level, joint=joint) + + # Sort time periods and treatment groups + first_treated_periods = sorted(df["First Treated"].unique()) + n_periods = len(first_treated_periods) + + # Set up colors + colors = dict(zip(["pre", "post"], sns.color_palette(color_palette)[:2])) + + # Check if x-axis is datetime or convert to float + is_datetime = pd.api.types.is_datetime64_any_dtype(df["Evaluation Period"]) + if pd.api.types.is_integer_dtype(df["Evaluation Period"]): + df["Evaluation Period"] = df["Evaluation Period"].astype(float) + + # Create figure and subplots + fig = plt.figure(figsize=figsize) + gs = fig.add_gridspec(n_periods + 1, 1, height_ratios=[3] * n_periods + [0.5]) + axes = [fig.add_subplot(gs[i]) for i in range(n_periods)] + + # Auto-calculate jitter if not specified + if jitter_value is None: + all_values = self.t_values + if is_datetime: + jitter_value = (all_values[1] - all_values[0]).astype("timedelta64[s]").astype(int) * default_jitter + else: + jitter_value = (all_values[1] - all_values[0]) * default_jitter + + # Plot each treatment group + for idx, period in enumerate(first_treated_periods): + period_df = df[df["First Treated"] == period] + ax = axes[idx] + + self._plot_single_group(ax, period_df, period, colors, is_datetime, jitter_value) + + # Set axis labels + if idx == n_periods - 1: # Only bottom plot gets x label + ax.set_xlabel("Evaluation Period") + ax.set_ylabel(y_label) + + # Format date ticks if needed + if is_datetime and date_format: + ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter(date_format)) + plt.setp(ax.xaxis.get_majorticklabels()) + + # Add legend + legend_ax = fig.add_subplot(gs[-1]) + legend_ax.axis("off") + legend_elements = [ + Line2D([0], [0], color="red", linestyle=":", alpha=0.7, label="Treatment start"), + Line2D([0], [0], color="black", linestyle="--", alpha=0.5, label="Zero effect"), + Line2D([0], [0], marker="o", color=colors["pre"], linestyle="None", label="Pre-treatment", markersize=5), + Line2D([0], [0], marker="o", color=colors["post"], linestyle="None", label="Post-treatment", markersize=5), + ] + legend_ax.legend(handles=legend_elements, loc="center", ncol=4, mode="expand", borderaxespad=0.0) + + # Set title and layout + plt.suptitle(title, y=1.02) + plt.tight_layout() + + return fig, axes + + def _plot_single_group(self, ax, period_df, period, colors, is_datetime, jitter_value): + """ + Plot estimates for a single treatment group on the given axis. + + Parameters + ---------- + ax : matplotlib.axes.Axes + Matplotlib axis to plot on. + period_df : pandas.DataFrame + DataFrame containing estimates for a specific time period. + period : int or datetime + Treatment period for this group. + colors : dict + Dictionary with 'pre' and 'post' color values. + is_datetime : bool + Whether the x-axis represents datetime values. + jitter_value : float + Amount of jitter to apply to points. + Default is ``None``. + + Returns + ------- + matplotlib.axes.Axes + The updated axis object. 
+ """ + + # Plot reference lines + ax.axvline(x=period, color="red", linestyle=":", alpha=0.7) + ax.axhline(y=0, color="black", linestyle="--", alpha=0.5) + + # Split and jitter data + pre_treatment = add_jitter( + period_df[period_df["Pre-Treatment"]], + "Evaluation Period", + is_datetime=is_datetime, + jitter_value=jitter_value, + ) + post_treatment = add_jitter( + period_df[~period_df["Pre-Treatment"]], + "Evaluation Period", + is_datetime=is_datetime, + jitter_value=jitter_value, + ) + + # Plot pre-treatment points + if not pre_treatment.empty: + ax.scatter(pre_treatment["jittered_x"], pre_treatment["Estimate"], color=colors["pre"], alpha=0.8, s=30) + ax.errorbar( + pre_treatment["jittered_x"], + pre_treatment["Estimate"], + yerr=[ + pre_treatment["Estimate"] - pre_treatment["CI Lower"], + pre_treatment["CI Upper"] - pre_treatment["Estimate"], + ], + fmt="o", + capsize=3, + color=colors["pre"], + markersize=4, + markeredgewidth=1, + linewidth=1, + ) + + # Plot post-treatment points + if not post_treatment.empty: + ax.scatter(post_treatment["jittered_x"], post_treatment["Estimate"], color=colors["post"], alpha=0.8, s=30) + ax.errorbar( + post_treatment["jittered_x"], + post_treatment["Estimate"], + yerr=[ + post_treatment["Estimate"] - post_treatment["CI Lower"], + post_treatment["CI Upper"] - post_treatment["Estimate"], + ], + fmt="o", + capsize=3, + color=colors["post"], + markersize=4, + markeredgewidth=1, + linewidth=1, + ) + + # Format axes + if is_datetime: + period_str = np.datetime64(period, self._dml_data.datetime_unit) + else: + period_str = period + ax.set_title(f"First Treated: {period_str}") + ax.grid(True, alpha=0.3) + + return ax + + def _get_agg_weights(self, selected_gt_mask, aggregation): + """ + Calculate weights for aggregating treatment effects. + + Parameters + ---------- + selected_gt_mask : numpy.ndarray + Boolean mask indicating which group-time combinations to include + aggregation : str or dict + Method to aggregate treatment effects + + Returns + ------- + tuple + (weight_masks, agg_names, agg_weights) + """ + + if isinstance(aggregation, dict): + aggregation_dict = aggregation + + elif isinstance(aggregation, str): + valid_aggregations = ["group", "time", "eventstudy"] + if aggregation not in valid_aggregations: + raise ValueError(f"aggregation must be one of {valid_aggregations}. " f"{str(aggregation)} was passed.") + + if aggregation == "group": + # exclude pre-treatment combinations + selected_gt_mask = selected_gt_mask & self._post_treatment_mask + aggregation_dict = _compute_did_group_aggregation_weights( + gt_index=self.gt_index, + g_values=self.g_values, + d_values=self._dml_data.d, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Group" + elif aggregation == "time": + # exclude pre-treatment combinations + selected_gt_mask = selected_gt_mask & self._post_treatment_mask + aggregation_dict = _compute_did_time_aggregation_weights( + gt_index=self.gt_index, + g_values=self.g_values, + t_values=self.t_values, + d_values=self._dml_data.d, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Time" + elif aggregation == "eventstudy": + aggregation_dict = _compute_did_eventstudy_aggregation_weights( + gt_index=self.gt_index, + g_values=self.g_values, + t_values=self.t_values, + d_values=self._dml_data.d, + time_values=self._dml_data.t, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Event Study" + else: + raise TypeError( + "aggregation must be a string or dictionary. 
" f"{str(aggregation)} of type {type(aggregation)} was passed." + ) + + return aggregation_dict + + def _fit_model(self, i_gt, n_jobs_cv=None, store_predictions=True, store_models=False, external_predictions_dict=None): + + model = self.modellist[i_gt] + if external_predictions_dict is not None: + external_predictions = external_predictions_dict[self.gt_labels[i_gt]] + else: + external_predictions = None + model.fit( + n_jobs_cv=n_jobs_cv, + store_predictions=store_predictions, + store_models=store_models, + external_predictions=external_predictions, + ) + return model + + def _check_data(self, obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLPanelData): + raise TypeError( + "The data has to be a DoubleMLPanelData object. " + f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." + ) + if obj_dml_data.z_cols is not None: + raise NotImplementedError( + "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). " + "At the moment there are not DiD models with instruments implemented." + ) + _check_gt_values(obj_dml_data.g_values, obj_dml_data.t_values) + return + + def _validate_gt_combinations(self, gt_combinations): + """Validate all treatment-time combinations.""" + + if isinstance(gt_combinations, str): + gt_combinations = _construct_gt_combinations( + gt_combinations, self.g_values, self.t_values, self.never_treated_value, self.anticipation_periods + ) + + if not isinstance(gt_combinations, list): + raise TypeError( + "gt_combinations must be a list. " + f"{str(gt_combinations)} of type {type(gt_combinations)} was passed." + ) + + if len(gt_combinations) == 0: + raise ValueError("gt_combinations must not be empty.") + + if not all(isinstance(gt_combination, tuple) for gt_combination in gt_combinations): + raise TypeError("gt_combinations must be a list of tuples. At least one element is not a tuple.") + + if not all(len(gt_combination) == 3 for gt_combination in gt_combinations): + raise ValueError( + "gt_combinations must be a list of tuples with 3 elements. At least one tuple has not 3 elements." + ) + + for gt_combination in gt_combinations: + _check_gt_combination( + gt_combination, self.g_values, self.t_values, self.never_treated_value, self.anticipation_periods + ) + + return gt_combinations + + def _check_external_predictions(self, external_predictions): + expected_keys = self.gt_labels + if not isinstance(external_predictions, dict): + raise TypeError( + "external_predictions must be a dictionary. " + f"Object of type {type(external_predictions)} passed." + ) + + if not set(external_predictions.keys()).issubset(set(expected_keys)): + raise ValueError( + "external_predictions must be a subset of all gt_combinations. " + + f"Expected keys: {set(expected_keys)}. " + + f"Passed keys: {set(external_predictions.keys())}." + ) + + expected_learner_keys = ["ml_g0", "ml_g1", "ml_m"] + for key, value in external_predictions.items(): + if not isinstance(value, dict): + raise TypeError( + f"external_predictions[{key}] must be a dictionary. " + f"Object of type {type(value)} passed." + ) + if not set(value.keys()).issubset(set(expected_learner_keys)): + raise ValueError( + f"external_predictions[{key}] must be a subset of {set(expected_learner_keys)}. " + + f"Passed keys: {set(value.keys())}." 
+                )
+
+        return
+
+    def _rename_external_predictions(self, external_predictions):
+        d_col = self._dml_data.d_cols[0]
+        ext_pred_dict = {gt_combination: {d_col: {}} for gt_combination in self.gt_labels}
+        for gt_combination in self.gt_labels:
+            if "ml_g0" in external_predictions[gt_combination]:
+                ext_pred_dict[gt_combination][d_col]["ml_g0"] = external_predictions[gt_combination]["ml_g0"]
+            if "ml_g1" in external_predictions[gt_combination]:
+                ext_pred_dict[gt_combination][d_col]["ml_g1"] = external_predictions[gt_combination]["ml_g1"]
+            if "ml_m" in external_predictions[gt_combination]:
+                ext_pred_dict[gt_combination][d_col]["ml_m"] = external_predictions[gt_combination]["ml_m"]
+
+        return ext_pred_dict
+
+    def _calc_nuisance_loss(self):
+        nuisance_loss = {learner: np.full((self.n_rep, self.n_gt_atts), np.nan) for learner in self.modellist[0].params_names}
+        for i_model, model in enumerate(self.modellist):
+            for learner in self.modellist[0].params_names:
+                for i_rep in range(self.n_rep):
+                    nuisance_loss[learner][i_rep, i_model] = model.nuisance_loss[learner][i_rep].item()
+
+        return nuisance_loss
+
+    def _initialize_models(self):
+        modellist = [None] * self.n_gt_atts
+        kwargs = {
+            "obj_dml_data": self._dml_data,
+            "ml_g": self._learner["ml_g"],
+            "ml_m": self._learner["ml_m"],
+            "control_group": self.control_group,
+            "anticipation_periods": self.anticipation_periods,
+            "score": self.score,
+            "n_folds": self.n_folds,
+            "n_rep": self.n_rep,
+            "trimming_rule": self.trimming_rule,
+            "trimming_threshold": self.trimming_threshold,
+            "in_sample_normalization": self.in_sample_normalization,
+            "draw_sample_splitting": True,
+            "print_periods": self._print_periods,
+        }
+        for i_model, (g_value, t_value_pre, t_value_eval) in enumerate(self.gt_combinations):
+            # initialize models for all levels
+            model = DoubleMLDIDBinary(g_value=g_value, t_value_pre=t_value_pre, t_value_eval=t_value_eval, **kwargs)
+
+            modellist[i_model] = model
+
+        return modellist
+
+    def _create_ci_dataframe(self, level=0.95, joint=True):
+        """
+        Create a DataFrame with coefficient estimates and confidence intervals for treatment effects.
+
+        Parameters
+        ----------
+        level : float, default=0.95
+            Confidence level for intervals (between 0 and 1).
+        joint : bool, default=True
+            Whether to use joint confidence intervals. If True and bootstrapping hasn't been
+            performed yet, will automatically call bootstrap() with default parameters.
+
+        Returns
+        -------
+        pandas.DataFrame
+            DataFrame containing:
+            - 'First Treated': First treatment time for each group
+            - 'Pre-treatment Period': Pre-treatment time period
+            - 'Evaluation Period': Evaluation time period
+            - 'Estimate': Treatment effect estimates
+            - 'CI Lower': Lower bound of confidence intervals
+            - 'CI Upper': Upper bound of confidence intervals
+            - 'Pre-Treatment': Boolean indicating if evaluation period is before treatment
+
+        Notes
+        -----
+        If joint=True and bootstrapping hasn't been performed, this method will automatically
+        perform bootstrapping with default parameters and issue a warning.
+        """
+
+        if joint and self.framework.boot_t_stat is None:
+            self.bootstrap()
+            warnings.warn(
+                "Joint confidence intervals require bootstrapping which hasn't been performed yet. "
+                "Automatically applying '.bootstrap(method=\"normal\", n_rep_boot=500)' with default values. "
+                "For different bootstrap settings, call bootstrap() explicitly before plotting.",
+                UserWarning,
+            )
+
+        ci = self.confint(level=level, joint=joint)
+        df = pd.DataFrame(
+            {
+                "First Treated": [gt_combination[0] for gt_combination in self.gt_combinations],
+                "Pre-treatment Period": [gt_combination[1] for gt_combination in self.gt_combinations],
+                "Evaluation Period": [gt_combination[2] for gt_combination in self.gt_combinations],
+                "Estimate": self.framework.thetas,
+                "CI Lower": ci.iloc[:, 0],
+                "CI Upper": ci.iloc[:, 1],
+                "Pre-Treatment": [gt_combination[2] < gt_combination[0] for gt_combination in self.gt_combinations],
+            }
+        )
+
+        return df
diff --git a/doubleml/did/tests/_utils_did_manual.py b/doubleml/did/tests/_utils_did_manual.py
index e48c9042..e314c301 100644
--- a/doubleml/did/tests/_utils_did_manual.py
+++ b/doubleml/did/tests/_utils_did_manual.py
@@ -105,7 +105,7 @@ def fit_nuisance_did(
 
     p_hat_list = []
     for train_index, _ in smpls:
-        p_hat_list.append(np.mean(d[train_index]))
+        p_hat_list.append(np.mean(d))
 
     return g_hat0_list, g_hat1_list, m_hat_list, p_hat_list
diff --git a/doubleml/did/tests/conftest.py b/doubleml/did/tests/conftest.py
index 90e8394c..de528156 100644
--- a/doubleml/did/tests/conftest.py
+++ b/doubleml/did/tests/conftest.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from doubleml.datasets import make_did_SZ2020
+from doubleml.did.datasets import make_did_SZ2020
 
 
 @pytest.fixture(scope="session", params=[(500, 1), (1000, 1), (1000, 2)])
@@ -30,3 +30,17 @@ def generate_data_did_cs(request):
     data = make_did_SZ2020(n, dgp_type=dpg, cross_sectional_data=True, return_type="array")
 
     return data
+
+
+@pytest.fixture(scope="session", params=[(500, 1), (1000, 1), (1000, 2)])
+def generate_data_did_binary(request):
+    params = request.param
+    np.random.seed(1111)
+    # setting parameters
+    n = params[0]
+    dgp = params[1]
+
+    # generating data
+    data = make_did_SZ2020(n, dgp_type=dgp, return_type="DoubleMLPanelData")
+
+    return data
diff --git a/doubleml/did/tests/test_datasets.py b/doubleml/did/tests/test_datasets.py
new file mode 100644
index 00000000..0e323ec9
--- /dev/null
+++ b/doubleml/did/tests/test_datasets.py
@@ -0,0 +1,79 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from doubleml import DoubleMLData
+from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020
+
+msg_inv_return_type = "Invalid return_type."
+ + +@pytest.fixture(scope="function", params=[False, True]) +def cross_sectional(request): + return request.param + + +@pytest.fixture(scope="function", params=[1, 2, 3, 4, 5, 6]) +def dgp_type(request): + return request.param + + +@pytest.mark.ci +def test_make_did_SZ2020_return_types(cross_sectional, dgp_type): + np.random.seed(3141) + res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=DoubleMLData) + assert isinstance(res, DoubleMLData) + res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=pd.DataFrame) + assert isinstance(res, pd.DataFrame) + if cross_sectional: + x, y, d, t = make_did_SZ2020( + n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=np.ndarray + ) + assert isinstance(t, np.ndarray) + else: + x, y, d, _ = make_did_SZ2020( + n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=np.ndarray + ) + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type="matrix") + msg = "The dgp_type is not valid." + with pytest.raises(ValueError, match=msg): + _ = make_did_SZ2020(n_obs=100, dgp_type="5", cross_sectional_data=cross_sectional, return_type="matrix") + + +@pytest.fixture(scope="function", params=[True, False]) +def include_never_treated(request): + return request.param + + +@pytest.fixture(scope="function", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="function", params=[0, 2]) +def anticipation_periods(request): + return request.param + + +@pytest.mark.ci +def test_make_did_CS2021_return_types(dgp_type, include_never_treated, time_type, anticipation_periods): + np.random.seed(3141) + df = make_did_CS2021( + n_obs=100, + dgp_type=dgp_type, + include_never_treated=include_never_treated, + time_type=time_type, + anticipation_periods=anticipation_periods, + ) + assert isinstance(df, pd.DataFrame) + + +@pytest.mark.ci +def test_make_did_CS2021_exceptions(): + msg = r"time_type must be one of \('datetime', 'float'\). Got 2." 
+    with pytest.raises(ValueError, match=msg):
+        _ = make_did_CS2021(n_obs=100, time_type=2)
diff --git a/doubleml/did/tests/test_did_aggregation.py b/doubleml/did/tests/test_did_aggregation.py
new file mode 100644
index 00000000..cc3c4304
--- /dev/null
+++ b/doubleml/did/tests/test_did_aggregation.py
@@ -0,0 +1,98 @@
+import numpy as np
+import pytest
+
+from doubleml.did.did_aggregation import DoubleMLDIDAggregation
+from doubleml.double_ml_framework import DoubleMLFramework
+from doubleml.tests._utils import generate_dml_dict
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[2, 5])
+def n_base_frameworks(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def base_framework(n_rep):
+    # Create a consistent framework for all tests
+    n_obs = 10
+    n_thetas = 1
+
+    # Generate consistent scores with known effect
+    np.random.seed(42)
+    psi_a = np.ones(shape=(n_obs, n_thetas, n_rep))
+    psi_b = np.random.normal(size=(n_obs, n_thetas, n_rep))
+
+    doubleml_dict = generate_dml_dict(psi_a, psi_b)
+    return DoubleMLFramework(doubleml_dict)
+
+
+@pytest.fixture(scope="module", params=["ones", "random", "zeros", "mixed"])
+def weight_type(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1, 4, 5])
+def n_aggregations(request):
+    return request.param
+
+
+@pytest.fixture
+def weights(n_aggregations, n_base_frameworks, weight_type):
+    np.random.seed(42)
+
+    if weight_type == "ones":
+        aggregation_weights = np.ones(shape=(n_aggregations, n_base_frameworks))
+        overall_aggregation_weights = np.ones(shape=n_aggregations)
+    elif weight_type == "random":
+        aggregation_weights = np.random.rand(n_aggregations, n_base_frameworks)
+        overall_aggregation_weights = np.random.rand(n_aggregations)
+    elif weight_type == "zeros":
+        aggregation_weights = np.zeros(shape=(n_aggregations, n_base_frameworks))
+        overall_aggregation_weights = np.zeros(shape=n_aggregations)
+    else:  # mixed
+        aggregation_weights = np.ones(shape=(n_aggregations, n_base_frameworks))
+        aggregation_weights[::2] = 0.5  # Set every other row to 0.5
+        overall_aggregation_weights = np.ones(shape=n_aggregations)
+        overall_aggregation_weights[::2] = 0.5
+
+    return aggregation_weights, overall_aggregation_weights
+
+
+@pytest.mark.ci
+def test_multiple_equal_frameworks(base_framework, weights):
+    """Test that aggregating the same framework with different weights works correctly."""
+    agg_weights, overall_agg_weights = weights
+
+    n_aggregations = agg_weights.shape[0]
+    n_frameworks = agg_weights.shape[1]
+    # Create list of identical frameworks
+    frameworks = [base_framework] * n_frameworks
+
+    # Create aggregation
+    aggregation = DoubleMLDIDAggregation(
+        frameworks=frameworks, aggregation_weights=agg_weights, overall_aggregation_weights=overall_agg_weights
+    )
+
+    # Expected results
+    scaled_frameworks = [None] * n_aggregations
+    for i_agg in range(n_aggregations):
+        scaled_frameworks[i_agg] = sum(agg_weights[i_agg]) * base_framework
+
+        # Check individual aggregation results
+        np.testing.assert_allclose(aggregation.aggregated_frameworks.all_thetas[i_agg], scaled_frameworks[i_agg].all_thetas[0])
+        np.testing.assert_allclose(
+            aggregation.aggregated_frameworks.scaled_psi[:, i_agg, :], scaled_frameworks[i_agg].scaled_psi[:, 0, :]
+        )
+        # ses might differ due to 1/n and 1/n-1 scaling
+
+    # Check overall aggregation results
+    overall_weights = sum([overall_agg_weights[i] * sum(agg_weights[i]) for i in range(n_aggregations)])
+    overall_scaled_framework = overall_weights * base_framework
+
+    np.testing.assert_allclose(aggregation.overall_aggregated_framework.all_thetas, overall_scaled_framework.all_thetas)
+    np.testing.assert_allclose(aggregation.overall_aggregated_framework.scaled_psi, overall_scaled_framework.scaled_psi)
diff --git a/doubleml/did/tests/test_did_aggregation_exceptions.py b/doubleml/did/tests/test_did_aggregation_exceptions.py
new file mode 100644
index 00000000..0f895b5b
--- /dev/null
+++ b/doubleml/did/tests/test_did_aggregation_exceptions.py
@@ -0,0 +1,190 @@
+import numpy as np
+import pytest
+
+from doubleml.did.did_aggregation import DoubleMLDIDAggregation
+from doubleml.double_ml_framework import DoubleMLFramework
+from doubleml.tests._utils import generate_dml_dict
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1])
+def n_thetas(request):
+    return request.param
+
+
+@pytest.fixture
+def mock_framework(n_rep, n_thetas):
+    # Create a minimal mock of DoubleMLFramework
+    n_obs = 10
+    # generate score samples
+    psi_a = np.ones(shape=(n_obs, n_thetas, n_rep))
+    psi_b = np.random.normal(size=(n_obs, n_thetas, n_rep))
+    doubleml_dict = generate_dml_dict(psi_a, psi_b)
+    return DoubleMLFramework(doubleml_dict)
+
+
+@pytest.fixture
+def frameworks(mock_framework):
+    # Create a list of 3 frameworks
+    return [mock_framework] * 3
+
+
+@pytest.fixture
+def aggregation_weights():
+    # Create sample weights for 2 aggregations over 3 frameworks
+    return np.array([[0.5, 0.3, 0.2], [0.2, 0.5, 0.3]])
+
+
+@pytest.mark.ci
+def test_valid_initialization(frameworks, aggregation_weights):
+    # Test initialization with valid parameters
+    aggregation = DoubleMLDIDAggregation(
+        frameworks=frameworks,
+        aggregation_weights=aggregation_weights,
+        overall_aggregation_weights=np.array([0.6, 0.4]),
+        aggregation_names=["agg1", "agg2"],
+        aggregation_method_name="custom",
+        additional_information={"key": "value"},
+    )
+    assert isinstance(aggregation.base_frameworks, list)
+    assert isinstance(aggregation.aggregation_weights, np.ndarray)
+    assert isinstance(aggregation.additional_information, str)
+
+
+@pytest.mark.ci
+def test_invalid_frameworks(aggregation_weights):
+    # Test with invalid frameworks type
+    with pytest.raises(TypeError, match="The 'frameworks' must be a list of DoubleMLFramework objects"):
+        DoubleMLDIDAggregation(frameworks="invalid_frameworks", aggregation_weights=aggregation_weights)
+
+
+@pytest.mark.ci
+def test_invalid_framework_dim():
+    psi_a = np.ones(shape=(10, 2, 1))
+    psi_b = np.random.normal(size=(10, 2, 1))
+    doubleml_dict = generate_dml_dict(psi_a, psi_b)
+    framework = DoubleMLFramework(doubleml_dict)
+
+    # Test with invalid framework dimension
+    with pytest.raises(ValueError, match="All frameworks must be one-dimensional"):
+        DoubleMLDIDAggregation(frameworks=[framework, framework], aggregation_weights=np.array([[0.5, 0.5], [0.3, 0.7]]))
+
+
+@pytest.mark.ci
+def test_invalid_aggregation_weights(frameworks):
+    # Test with invalid aggregation_weights type
+    with pytest.raises(TypeError, match="'aggregation_weights' must be a numpy array"):
+        DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=[1, 2, 3])  # list instead of numpy array
+
+
+@pytest.mark.ci
+def test_invalid_aggregation_weights_ndim(frameworks):
+    # Test with 1D array instead of 2D
+    with pytest.raises(ValueError, match="'aggregation_weights' must be a 2-dimensional array"):
+        DoubleMLDIDAggregation(frameworks=frameworks,
aggregation_weights=np.array([0.5, 0.3, 0.2])) + + +@pytest.mark.ci +def test_invalid_aggregation_weights_shape(frameworks): + # Test with wrong number of columns + with pytest.raises( + ValueError, match="The number of rows in 'aggregation_weights' must be equal to the number of frameworks" + ): + DoubleMLDIDAggregation( + frameworks=frameworks, aggregation_weights=np.array([[0.5, 0.5], [0.3, 0.7]]) # Only 2 columns for 3 frameworks + ) + + +@pytest.mark.ci +def test_invalid_overall_aggregation_weights(frameworks, aggregation_weights): + # Test with invalid overall_aggregation_weights type + with pytest.raises(TypeError, match="'overall_aggregation_weights' must be a numpy array"): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + overall_aggregation_weights=[0.5, 0.5], # list instead of numpy array + ) + + +@pytest.mark.ci +def test_invalid_overall_weights_ndim(frameworks, aggregation_weights): + # Test with 2D array instead of 1D + with pytest.raises(ValueError, match="'overall_aggregation_weights' must be a 1-dimensional array"): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + overall_aggregation_weights=np.array([[0.5], [0.5]]), + ) + + +@pytest.mark.ci +def test_invalid_overall_weights_length(frameworks, aggregation_weights): + # Test with wrong length + with pytest.raises( + ValueError, match="'overall_aggregation_weights' must have the same length as the number of aggregated frameworks" + ): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + overall_aggregation_weights=np.array([0.5, 0.3, 0.2]), # 3 weights for 2 aggregations + ) + + +@pytest.mark.ci +def test_invalid_aggregation_names_type(frameworks, aggregation_weights): + # Test with non-list type + with pytest.raises(TypeError, match="'aggregation_names' must be a list of strings"): + DoubleMLDIDAggregation( + frameworks=frameworks, aggregation_weights=aggregation_weights, aggregation_names="invalid_names" + ) + + +@pytest.mark.ci +def test_invalid_aggregation_names_content(frameworks, aggregation_weights): + # Test with non-string elements + with pytest.raises(TypeError, match="'aggregation_names' must be a list of strings"): + DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights, aggregation_names=[1, 2]) + + +@pytest.mark.ci +def test_invalid_aggregation_names_length(frameworks, aggregation_weights): + # Test with wrong length + with pytest.raises(ValueError, match="'aggregation_names' must have the same length as the number of aggregations"): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + aggregation_names=["agg1"], # Only 1 name for 2 aggregations + ) + + +@pytest.mark.ci +def test_invalid_method_name_type(frameworks, aggregation_weights): + # Test with non-string type + with pytest.raises(TypeError, match="'aggregation_method_name' must be a string"): + DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights, aggregation_method_name=123) + + +@pytest.mark.ci +def test_invalid_additional_information(frameworks, aggregation_weights): + # Test with invalid additional_information type + with pytest.raises(TypeError, match="'additional_information' must be a dictionary"): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + additional_information=[1, 2, 3], # list instead of dict + ) + + +@pytest.mark.ci +def 
test_additional_parameters(frameworks, aggregation_weights): + # Test with invalid additional_parameters type + with pytest.raises(TypeError, match="'additional_parameters' must be a dictionary"): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + additional_parameters=[1, 2, 3], # list instead of dict + ) diff --git a/doubleml/did/tests/test_did_aggregation_plot.py b/doubleml/did/tests/test_did_aggregation_plot.py new file mode 100644 index 00000000..1079b144 --- /dev/null +++ b/doubleml/did/tests/test_did_aggregation_plot.py @@ -0,0 +1,192 @@ +import warnings + +import matplotlib.pyplot as plt +import numpy as np +import pytest +from matplotlib.axes import Axes +from matplotlib.figure import Figure + +from doubleml.did.did_aggregation import DoubleMLDIDAggregation +from doubleml.double_ml_framework import DoubleMLFramework +from doubleml.tests._utils import generate_dml_dict + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture +def mock_framework(n_rep): + # Create a minimal mock of DoubleMLFramework + n_obs = 10 + n_thetas = 1 + # generate score samples + psi_a = np.ones(shape=(n_obs, n_thetas, n_rep)) + psi_b = np.random.normal(size=(n_obs, n_thetas, n_rep)) + doubleml_dict = generate_dml_dict(psi_a, psi_b) + return DoubleMLFramework(doubleml_dict) + + +@pytest.fixture +def simple_aggregation(mock_framework): + """Create a simple DoubleMLDIDAggregation object for testing.""" + # Get two framework instances + fw1 = mock_framework + fw2 = mock_framework + + # Set treatment names (important for the test) + fw1.treatment_names = ["Treatment 1"] + fw2.treatment_names = ["Treatment 2"] + + # Weights for aggregation + agg_weights = np.array([[1.0, 0.0], [0.0, 1.0]]) + overall_weights = np.array([0.7, 0.3]) + + agg_obj = DoubleMLDIDAggregation( + frameworks=[fw1, fw2], + aggregation_weights=agg_weights, + overall_aggregation_weights=overall_weights, + aggregation_names=["Group A", "Group B"], + aggregation_method_name="Test Method", + additional_information={"Test Info": "Value"}, + additional_parameters={"aggregation_color_idx": [0, 1]}, + ) + + agg_obj.aggregated_frameworks.bootstrap(n_rep_boot=10) + return agg_obj + + +@pytest.mark.ci +def test_plot_effects_returns_fig_ax(simple_aggregation): + """Test that plot_effects returns figure and axes objects.""" + fig, ax = simple_aggregation.plot_effects() + + assert isinstance(fig, Figure) + assert isinstance(ax, Axes) + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_invalid_sort_by(simple_aggregation): + """Test that invalid sort_by values raise ValueError.""" + with pytest.raises(ValueError, match="Invalid sort_by value"): + simple_aggregation.plot_effects(sort_by="invalid") + + # These should not raise + for valid_value in ["name", "estimate", None]: + _ = simple_aggregation.plot_effects(sort_by=valid_value) + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_sorting(simple_aggregation): + """Test that sorting works correctly.""" + # Get the dataframe that would be created inside the method + df = simple_aggregation._create_ci_dataframe() + + # Test name sorting + _, ax = simple_aggregation.plot_effects(sort_by="name") + labels = [text.get_text() for text in ax.get_xticklabels()] + expected = sorted(df["Aggregation_Names"]) + assert labels == expected + + # Test estimate sorting + _, ax = simple_aggregation.plot_effects(sort_by="estimate") + labels = [text.get_text() for text in ax.get_xticklabels()] + 
expected = df.sort_values("Estimate", ascending=False)["Aggregation_Names"].tolist() + assert labels == expected + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_elements(simple_aggregation): + """Test that the plot contains expected elements.""" + _, ax = simple_aggregation.plot_effects(title="Test Title", y_label="Test Label") + + # Check title and y-label + assert ax.get_title() == "Test Title" + assert ax.get_ylabel() == "Test Label" + + # Check that we have the zero line + zero_lines = [line for line in ax.get_lines() if line.get_linestyle() == "--"] + assert len(zero_lines) == 1 + + # Check we have scatter points for estimates + assert len(ax.collections) > 0 + + # Check we have the correct number of x-ticks + assert len(ax.get_xticks()) == 2 # We have 2 groups in our fixture + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_custom_figsize(simple_aggregation): + """Test that figsize parameter works.""" + custom_figsize = (8, 4) + fig, _ = simple_aggregation.plot_effects(figsize=custom_figsize) + + # Convert to inches for comparison (matplotlib uses inches) + width, height = fig.get_size_inches() + assert (width, height) == custom_figsize + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_custom_colors(simple_aggregation): + """Test that color_palette parameter works.""" + # Custom color list + custom_colors = [(1, 0, 0), (0, 1, 0)] # Red and green + _, _ = simple_aggregation.plot_effects(color_palette=custom_colors) + plt.close("all") + + # Named palette + _, _ = simple_aggregation.plot_effects(color_palette="Set1") + plt.close("all") + + +@pytest.mark.ci +def test_joint_ci_bootstrap_warning(mock_framework): + """Test that requesting joint confidence intervals without bootstrapping issues a warning.""" + # Create a new aggregation object without bootstrapping + fw1 = mock_framework + fw2 = mock_framework + + # Set treatment names + fw1.treatment_names = ["Treatment 1"] + fw2.treatment_names = ["Treatment 2"] + + # Weights for aggregation + agg_weights = np.array([[1.0, 0.0], [0.0, 1.0]]) + overall_weights = np.array([0.7, 0.3]) + + # Create aggregation without bootstrapping + aggregation = DoubleMLDIDAggregation( + frameworks=[fw1, fw2], + aggregation_weights=agg_weights, + overall_aggregation_weights=overall_weights, + aggregation_names=["Group A", "Group B"], + additional_parameters={"aggregation_color_idx": [0, 1]}, + ) + + # Ensure no bootstrapping exists + aggregation.aggregated_frameworks._boot_t_stat = None + + # Check that a warning is raised with the expected message + with pytest.warns(UserWarning, match="Joint confidence intervals require bootstrapping"): + _ = aggregation.plot_effects(joint=True) + + # Verify that bootstrap was performed + assert aggregation.aggregated_frameworks.boot_t_stat is not None + + # No warning should be raised when plotting again + with warnings.catch_warnings(record=True) as recorded_warnings: + warnings.simplefilter("always") # Ensure all warnings are recorded + _ = aggregation.plot_effects(joint=True) + + assert len(recorded_warnings) == 0 + plt.close("all") diff --git a/doubleml/did/tests/test_did_aggregation_return_types.py b/doubleml/did/tests/test_did_aggregation_return_types.py new file mode 100644 index 00000000..e63eda70 --- /dev/null +++ b/doubleml/did/tests/test_did_aggregation_return_types.py @@ -0,0 +1,189 @@ +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import pytest +from matplotlib.axes import Axes +from matplotlib.figure import Figure + +from 
doubleml.did.did_aggregation import DoubleMLDIDAggregation +from doubleml.double_ml_framework import DoubleMLFramework +from doubleml.tests._utils import generate_dml_dict + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture +def mock_framework(n_rep): + # Create a minimal mock of DoubleMLFramework + n_obs = 10 + n_thetas = 1 + # generate score samples + psi_a = np.ones(shape=(n_obs, n_thetas, n_rep)) + psi_b = np.random.normal(size=(n_obs, n_thetas, n_rep)) + doubleml_dict = generate_dml_dict(psi_a, psi_b) + return DoubleMLFramework(doubleml_dict) + + +@pytest.fixture +def frameworks(mock_framework): + # Create a list of 3 frameworks + return [mock_framework] * 3 + + +@pytest.fixture +def aggregation_weights(): + # Create sample weights for 2 aggregations over 3 frameworks + return np.array([[0.5, 0.3, 0.2], [0.2, 0.5, 0.3]]) + + +@pytest.mark.ci +@pytest.mark.parametrize( + "property_name,expected_value", + [ + ("overall_aggregation_weights", lambda w: np.array([0.5, 0.5])), # Equal weights for 2 aggregations + ("aggregation_names", lambda w: ["Aggregation_0", "Aggregation_1"]), + ("aggregation_method_name", lambda w: "Custom"), + ("additional_information", lambda w: None), + ("additional_parameters", lambda w: None), + ], +) +def test_default_values(frameworks, aggregation_weights, property_name, expected_value): + # Test that default values are correctly set when not explicitly provided + aggregation = DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights) + + expected = expected_value(aggregation_weights) + actual = getattr(aggregation, property_name) + + if property_name == "overall_aggregation_weights": + np.testing.assert_array_equal(actual, expected) + else: + assert actual == expected + + +@pytest.mark.ci +def test_custom_aggregation_values(frameworks, aggregation_weights): + # Test all custom values are properly set when provided + custom_names = ["Custom1", "Custom2"] + custom_method = "MyMethod" + custom_overall_weights = np.array([0.7, 0.3]) + custom_info = {"info": "test"} + custom_params = {"param": 123} + + aggregation = DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + overall_aggregation_weights=custom_overall_weights, + aggregation_names=custom_names, + aggregation_method_name=custom_method, + additional_information=custom_info, + additional_parameters=custom_params, + ) + + assert aggregation.aggregation_names == custom_names + assert aggregation.aggregation_method_name == custom_method + np.testing.assert_array_equal(aggregation.overall_aggregation_weights, custom_overall_weights) + assert "info: test" in aggregation.additional_information + assert aggregation.additional_parameters == custom_params + + +@pytest.mark.ci +@pytest.mark.parametrize( + "property_name,expected_type", + [ + ("base_frameworks", list), + ("aggregated_frameworks", DoubleMLFramework), + ("overall_aggregated_framework", DoubleMLFramework), + ("aggregation_weights", np.ndarray), + ("overall_aggregation_weights", np.ndarray), + ("n_aggregations", int), + ("aggregation_names", list), + ("aggregation_method_name", str), + ("aggregated_summary", pd.DataFrame), + ("overall_summary", pd.DataFrame), + ], +) +def test_return_types(frameworks, aggregation_weights, property_name, expected_type): + # Test that properties return the expected types + aggregation = DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights) + + value = 
getattr(aggregation, property_name) + assert isinstance(value, expected_type) + + +@pytest.mark.ci +def test_additional_info_return_types(frameworks, aggregation_weights): + # Test additional_information and additional_parameters return types + + # Test when None + aggregation1 = DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights) + assert aggregation1.additional_information is None + assert aggregation1.additional_parameters is None + + # Test when provided + aggregation2 = DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + additional_information={"info": "value"}, + additional_parameters={"param": "value"}, + ) + assert isinstance(aggregation2.additional_information, str) + assert isinstance(aggregation2.additional_parameters, dict) + + +@pytest.mark.ci +def test_str_representation(frameworks, aggregation_weights): + # Test string representation without additional information + aggregation1 = DoubleMLDIDAggregation( + frameworks=frameworks, aggregation_weights=aggregation_weights, aggregation_method_name="TestMethod" + ) + str_output = str(aggregation1) + + # Check presence of all required sections + assert "================== DoubleMLDIDAggregation Object ==================" in str_output + assert "TestMethod Aggregation" in str_output + assert "------------------ Overall Aggregated Effects ------------------" in str_output + assert "------------------ Aggregated Effects ------------------" in str_output + assert "------------------ Additional Information ------------------" not in str_output + + # Test string representation with additional information + aggregation2 = DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + aggregation_method_name="TestMethod", + additional_information={"key": "value"}, + ) + str_output_with_info = str(aggregation2) + + # Check additional information section + assert "------------------ Additional Information ------------------" in str_output_with_info + assert "key: value" in str_output_with_info + + +@pytest.mark.ci +def test_plot_effects_return_type(frameworks, aggregation_weights): + """Test that plot_effects method returns matplotlib Figure and Axes objects.""" + aggregation = DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights) + aggregation.aggregated_frameworks.bootstrap(n_rep_boot=10) + + # Test basic call without parameters + fig, ax = aggregation.plot_effects() + assert isinstance(fig, Figure) + assert isinstance(ax, Axes) + plt.close(fig) + + # Test with parameters + fig, ax = aggregation.plot_effects( + level=0.9, + joint=False, + figsize=(10, 5), + sort_by="estimate", + color_palette="Set2", + title="Custom Title", + y_label="Custom Y-Label", + ) + assert isinstance(fig, Figure) + assert isinstance(ax, Axes) + plt.close(fig) diff --git a/doubleml/did/tests/test_did_binary_control_groups.py b/doubleml/did/tests/test_did_binary_control_groups.py new file mode 100644 index 00000000..b8406b15 --- /dev/null +++ b/doubleml/did/tests/test_did_binary_control_groups.py @@ -0,0 +1,31 @@ +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +df = dml.did.datasets.make_did_CS2021(n_obs=500, dgp_type=1, n_pre_treat_periods=2, n_periods=4, time_type="float") +dml_data = dml.data.DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + +args = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": 
LogisticRegression(),
+    "g_value": 2,
+    "t_value_pre": 0,
+    "t_value_eval": 1,
+    "score": "observational",
+    "n_rep": 1,
+}
+
+
+def test_control_groups_different():
+    dml_did_never_treated = dml.did.DoubleMLDIDBinary(control_group="never_treated", **args)
+    dml_did_not_yet_treated = dml.did.DoubleMLDIDBinary(control_group="not_yet_treated", **args)
+
+    assert dml_did_never_treated._n_subset != dml_did_not_yet_treated._n_subset
+    # same treatment group
+    assert dml_did_never_treated._n_treated_subset == dml_did_not_yet_treated._n_treated_subset
+
+    dml_did_never_treated.fit()
+    dml_did_not_yet_treated.fit()
+
+    assert dml_did_never_treated.coef != dml_did_not_yet_treated.coef
diff --git a/doubleml/did/tests/test_did_binary_exceptions.py b/doubleml/did/tests/test_did_binary_exceptions.py
new file mode 100644
index 00000000..c7aa2395
--- /dev/null
+++ b/doubleml/did/tests/test_did_binary_exceptions.py
@@ -0,0 +1,152 @@
+from unittest.mock import patch
+
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.linear_model import LinearRegression, LogisticRegression
+
+import doubleml as dml
+
+dml_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData")
+
+valid_arguments = {
+    "obj_dml_data": dml_data,
+    "ml_g": LinearRegression(),
+    "ml_m": LogisticRegression(),
+    "g_value": 1,
+    "t_value_pre": 0,
+    "t_value_eval": 1,
+    "score": "observational",
+    "n_rep": 1,
+    "draw_sample_splitting": True,
+}
+
+
+@pytest.mark.ci
+def test_input():
+    # control group
+    msg = r"The control group has to be one of \['never_treated', 'not_yet_treated'\]. 0 was passed."
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"control_group": 0}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+
+    # g value
+    msg = r"The value test is not in the set of treatment group values \[0 1\]."
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"g_value": "test"}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+
+    msg = r"The never treated group is not allowed as treatment group \(g_value=0\)."
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"g_value": 0}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+
+    msg = r"The never treated group is not allowed as treatment group \(g_value=0\)."
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"g_value": 0.0}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+
+    # t values
+    msg = r"The value test is not in the set of evaluation period values \[0 1\]."
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"t_value_pre": "test"}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"t_value_eval": "test"}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+
+    # in-sample normalization
+    msg = "in_sample_normalization indicator has to be boolean. Object of type <class 'str'> passed."
+ with pytest.raises(TypeError, match=msg): + invalid_arguments = {"in_sample_normalization": "test"} + _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments)) + + # ml_g classifier + msg = r"The ml_g learner LogisticRegression\(\) was identified as" + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"ml_g": LogisticRegression()} + _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_no_control_group_exception(): + msg = "No observations in the control group." + with pytest.raises(ValueError, match=msg): + invalid_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData") + invalid_data.data["d"] = 1.0 + invalid_arguments = {"obj_dml_data": invalid_data, "control_group": "not_yet_treated"} + _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_check_data_exceptions(): + """Test exception handling for _check_data method in DoubleMLDIDBinary""" + df = pd.DataFrame(np.random.normal(size=(10, 5)), columns=[f"Col_{i}" for i in range(5)]) + + # Test 1: Data has to be DoubleMLPanelData + invalid_data_types = [ + dml.data.DoubleMLData(df, y_col="Col_0", d_cols="Col_1"), + ] + + for invalid_data in invalid_data_types: + msg = r"For repeated outcomes the data must be of DoubleMLPanelData type\." + with pytest.raises(TypeError, match=msg): + _ = dml.did.DoubleMLDIDBinary( + obj_dml_data=invalid_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + # Test 2: Data cannot have instrumental variables + df_with_z = dml_data.data.copy() + dml_data_with_z = dml.data.DoubleMLPanelData( + df_with_z, y_col="y", d_cols="d", id_col="id", t_col="t", z_cols=["Z1"], x_cols=["Z2", "Z3", "Z4"] + ) + + msg = r"Incompatible data. Z1 have been set as instrumental variable\(s\)." + with pytest.raises(NotImplementedError, match=msg): + _ = dml.did.DoubleMLDIDBinary( + obj_dml_data=dml_data_with_z, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + # Test 3: Data must have exactly one treatment variable (using mock) + with patch.object(dml_data.__class__, "n_treat", property(lambda self: 2)): + msg = ( + "Incompatible data. To fit an DID model with DML exactly one variable needs to be specified as treatment variable." 
+ ) + with pytest.raises(ValueError, match=msg): + _ = dml.did.DoubleMLDIDBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + +@pytest.mark.ci +def test_benchmark_warning(): + """Test warning when sensitivity_benchmark is called with experimental score""" + args = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "g_value": 1, + "t_value_pre": 0, + "t_value_eval": 1, + "n_rep": 1, + } + # Create a DID model with experimental score + did_model = dml.did.DoubleMLDIDBinary(**args, score="experimental") + did_model.fit() + with pytest.warns(UserWarning, match="Sensitivity benchmarking for experimental score may not be meaningful"): + did_model.sensitivity_benchmark(["Z1", "Z2"]) diff --git a/doubleml/did/tests/test_did_binary_external_predictions.py b/doubleml/did/tests/test_did_binary_external_predictions.py new file mode 100644 index 00000000..ccc136d0 --- /dev/null +++ b/doubleml/did/tests/test_did_binary_external_predictions.py @@ -0,0 +1,163 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDBinary +from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020 +from doubleml.tests._utils import draw_smpls +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_fixture(did_score, n_rep): + n_obs = 500 + n_folds = 5 + + ext_predictions = {"d": {}} + dml_data = make_did_SZ2020(n_obs=n_obs, return_type="DoubleMLPanelData") + + kwargs = { + "obj_dml_data": dml_data, + "g_value": 1, + "t_value_pre": 0, + "t_value_eval": 1, + "score": did_score, + "n_rep": n_rep, + "draw_sample_splitting": False, + } + + dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + all_smpls = draw_smpls(n_obs, n_folds, n_rep=n_rep, groups=dml_did._g_panel) + dml_did.set_sample_splitting(all_smpls) + + np.random.seed(3141) + dml_did.fit(store_predictions=True) + + ext_predictions["d"]["ml_g0"] = dml_did.predictions["ml_g0"][:, :, 0] + ext_predictions["d"]["ml_g1"] = dml_did.predictions["ml_g1"][:, :, 0] + if did_score == "observational": + ext_predictions["d"]["ml_m"] = dml_did.predictions["ml_m"][:, :, 0] + + dml_did_ext = DoubleMLDIDBinary(ml_g=DMLDummyRegressor(), ml_m=DMLDummyClassifier(), **kwargs) + dml_did_ext.set_sample_splitting(all_smpls) + np.random.seed(3141) + dml_did_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } + + return res_dict + + +@pytest.mark.ci +def test_coef(doubleml_did_fixture): + assert math.isclose(doubleml_did_fixture["coef"], doubleml_did_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_se(doubleml_did_fixture): + assert math.isclose(doubleml_did_fixture["se"], doubleml_did_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def 
test_score(doubleml_did_fixture): + assert np.allclose(doubleml_did_fixture["score"], doubleml_did_fixture["score_ext"], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_nuisance_loss(doubleml_did_fixture): + for key, value in doubleml_did_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) + + +@pytest.fixture(scope="module") +def doubleml_did_panel_fixture(did_score, n_rep): + n_obs = 500 + n_folds = 5 + dgp = 1 + + ext_predictions = {"d": {}} + df = make_did_CS2021(n_obs=n_obs, dgp_type=dgp, time_type="float") + dml_panel_data = DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_panel_data, + "g_value": 2, + "t_value_pre": 0, + "t_value_eval": 1, + "score": did_score, + "n_rep": n_rep, + "draw_sample_splitting": False, + } + + dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + all_smpls = draw_smpls(n_obs=dml_did._n_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_panel) + dml_did.set_sample_splitting(all_smpls) + + np.random.seed(3141) + dml_did.fit(store_predictions=True) + + pred = dml_did.predictions + ext_predictions["d"]["ml_g0"] = pred["ml_g0"][:, :, 0] + ext_predictions["d"]["ml_g1"] = pred["ml_g1"][:, :, 0] + if did_score == "observational": + ext_predictions["d"]["ml_m"] = pred["ml_m"][:, :, 0] + dml_did_ext = DoubleMLDIDBinary(ml_g=DMLDummyRegressor(), ml_m=DMLDummyClassifier(), **kwargs) + dml_did_ext.set_sample_splitting(all_smpls) + np.random.seed(3141) + dml_did_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } + + return res_dict + + +@pytest.mark.ci +def test_panel_coef(doubleml_did_panel_fixture): + assert math.isclose(doubleml_did_panel_fixture["coef"], doubleml_did_panel_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_panel_se(doubleml_did_panel_fixture): + assert math.isclose(doubleml_did_panel_fixture["se"], doubleml_did_panel_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_panel_score(doubleml_did_panel_fixture): + assert np.allclose(doubleml_did_panel_fixture["score"], doubleml_did_panel_fixture["score_ext"], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_panel_nuisance_loss(doubleml_did_panel_fixture): + for key, value in doubleml_did_panel_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_panel_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) diff --git a/doubleml/did/tests/test_did_binary_placebo.py b/doubleml/did/tests/test_did_binary_placebo.py new file mode 100644 index 00000000..ab90030e --- /dev/null +++ b/doubleml/did/tests/test_did_binary_placebo.py @@ -0,0 +1,58 @@ +import numpy as np +import pytest +from lightgbm import LGBMClassifier, LGBMRegressor + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDBinary +from doubleml.did.datasets import make_did_CS2021 + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + 
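+# A minimal sketch of the placebo logic this module relies on: for a
+# (g_value, t_value_pre, t_value_eval) combination whose evaluation period lies
+# strictly before the treatment onset, the true ATT is zero by construction, so a
+# well-calibrated confidence interval should cover zero. The helper name below is
+# illustrative only (it is not part of the package API) and uses just the confint()
+# method exercised elsewhere in this diff.
+def _placebo_ci_covers_zero(fitted_dml_did, level=0.99):
+    # confint() returns a DataFrame with lower and upper bounds as its first two columns
+    ci = fitted_dml_did.confint(level=level)
+    return ci.iloc[0, 0] <= 0.0 <= ci.iloc[0, 1]
+
+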
+@pytest.fixture(scope="module") +def doubleml_did_fixture(did_score, n_rep): + n_obs = 500 + dgp = 5 # has to be experimental (for experimental score to be valid) + df = make_did_CS2021(n_obs=n_obs, dgp_type=dgp, n_pre_treat_periods=3) + dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_data, + "g_value": dml_data.g_values[0], + "t_value_pre": dml_data.t_values[0], + "t_value_eval": dml_data.t_values[1], + "ml_g": LGBMRegressor(verbose=-1), + "ml_m": LGBMClassifier(verbose=-1), + "score": did_score, + "n_rep": n_rep, + "n_folds": 5, + "draw_sample_splitting": True, + } + + dml_did = DoubleMLDIDBinary(**kwargs) + + np.random.seed(3141) + dml_did.fit() + ci = dml_did.confint(level=0.99) + + res_dict = { + "coef": dml_did.coef[0], + "ci_lower": ci.iloc[0, 0], + "ci_upper": ci.iloc[0, 1], + } + + return res_dict + + +@pytest.mark.ci +def test_zero(doubleml_did_fixture): + assert doubleml_did_fixture["ci_lower"] <= 0.0 + assert doubleml_did_fixture["ci_upper"] >= 0.0 diff --git a/doubleml/did/tests/test_did_binary_stdout.py b/doubleml/did/tests/test_did_binary_stdout.py new file mode 100644 index 00000000..04687fb9 --- /dev/null +++ b/doubleml/did/tests/test_did_binary_stdout.py @@ -0,0 +1,49 @@ +import io +from contextlib import redirect_stdout + +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +dml_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData") + + +@pytest.mark.ci +def test_print_periods(): + """Test that print_periods parameter correctly controls output printing.""" + + # Create test data + dml_data = dml.did.datasets.make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData") + + # Test 1: Default case (print_periods=False) - should not print anything + f = io.StringIO() + with redirect_stdout(f): + _ = dml.did.DoubleMLDIDBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + print_periods=False, # Default + ) + output_default = f.getvalue() + assert output_default.strip() == "", "Expected no output with print_periods=False" + + # Test 2: With print_periods=True - should print information + f = io.StringIO() + with redirect_stdout(f): + _ = dml.did.DoubleMLDIDBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + print_periods=True, + ) + output_print = f.getvalue() + assert "Evaluation of ATT(1, 1), with pre-treatment period 0" in output_print + assert "post-treatment: True" in output_print + assert "Control group: never_treated" in output_print diff --git a/doubleml/did/tests/test_did_binary_tune.py b/doubleml/did/tests/test_did_binary_tune.py new file mode 100644 index 00000000..a817223f --- /dev/null +++ b/doubleml/did/tests/test_did_binary_tune.py @@ -0,0 +1,213 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import LogisticRegression + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_did_manual import boot_did, fit_did, tune_nuisance_did + + +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_g(request): + return request.param + + +@pytest.fixture(scope="module", params=[LogisticRegression()]) +def learner_m(request): + return
request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def tune_on_folds(request): + return request.param + + +def get_par_grid(learner): + if learner.__class__ in [RandomForestRegressor]: + par_grid = {"n_estimators": [5, 10, 20]} + else: + assert learner.__class__ in [LogisticRegression] + par_grid = {"C": np.logspace(-4, 2, 10)} + return par_grid + + +@pytest.fixture(scope="module") +def dml_did_fixture(generate_data_did_binary, learner_g, learner_m, score, in_sample_normalization, tune_on_folds): + par_grid = {"ml_g": get_par_grid(learner_g), "ml_m": get_par_grid(learner_m)} + n_folds_tune = 4 + + boot_methods = ["normal"] + n_folds = 2 + n_rep_boot = 499 + + # collect data + dml_panel_data = generate_data_did_binary + df = dml_panel_data._data.sort_values(by=["id", "t"]) + df_panel = df.groupby("id").agg( + {"y": lambda x: x.iloc[1] - x.iloc[0], "d": "first", "Z1": "first", "Z2": "first", "Z3": "first", "Z4": "first"} + ) + + n_obs = df_panel.shape[0] + all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=df_panel["d"]) + obj_dml_data = dml.DoubleMLData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + # Set machine learning methods for m & g + ml_g = clone(learner_g) + ml_m = clone(learner_m) + + dml_args = { + "ml_g": ml_g, + "ml_m": ml_m, + "n_folds": n_folds, + "score": score, + "in_sample_normalization": in_sample_normalization, + "draw_sample_splitting": False, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDBinary( + dml_panel_data, + g_value=1, + t_value_pre=0, + t_value_eval=1, + **dml_args, + ) + + dml_did_obj = dml.DoubleMLDID( + obj_dml_data, + **dml_args, + ) + + # synchronize the sample splitting + dml_did_obj.set_sample_splitting(all_smpls=all_smpls) + dml_did_binary_obj.set_sample_splitting(all_smpls=all_smpls) + + # tune hyperparameters + np.random.seed(3141) + tune_res = dml_did_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False) + assert isinstance(tune_res, dml.DoubleMLDID) + np.random.seed(3141) + tune_res_binary = dml_did_binary_obj.tune( + par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False + ) + assert isinstance(tune_res_binary, dml.did.DoubleMLDIDBinary) + + dml_did_obj.fit() + dml_did_binary_obj.fit() + + # manual fit + y = df_panel["y"].values + d = df_panel["d"].values + x = df_panel[["Z1", "Z2", "Z3", "Z4"]].values + np.random.seed(3141) + smpls = all_smpls[0] + + if tune_on_folds: + g0_params, g1_params, m_params = tune_nuisance_did( + y, x, d, clone(learner_g), clone(learner_m), smpls, score, n_folds_tune, par_grid["ml_g"], par_grid["ml_m"] + ) + else: + xx = [(np.arange(len(y)), np.array([]))] + g0_params, g1_params, m_params = tune_nuisance_did( + y, x, d, clone(learner_g), clone(learner_m), xx, score, n_folds_tune, par_grid["ml_g"], par_grid["ml_m"] + ) + g0_params = g0_params * n_folds + if score == "experimental": + g1_params = g1_params * n_folds + m_params = None + else: + assert score == "observational" + g1_params = None + m_params = m_params * n_folds + + res_manual = fit_did( + y, + x, + d, + clone(learner_g), + clone(learner_m), + all_smpls, + score, + in_sample_normalization, + g0_params=g0_params, + g1_params=g1_params, + m_params=m_params, + ) + + res_dict = { + "coef": 
dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "coef_manual": res_manual["theta"], + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "se_manual": res_manual["se"], + "boot_methods": boot_methods, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_did( + y, + res_manual["thetas"], + res_manual["ses"], + res_manual["all_psi_a"], + res_manual["all_psi_b"], + all_smpls, + bootstrap, + n_rep_boot, + ) + + np.random.seed(3141) + dml_did_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + res_dict["boot_t_stat" + bootstrap] = dml_did_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_binary"] = dml_did_binary_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_manual"] = boot_t_stat.reshape(-1, 1, 1) + + return res_dict + + +@pytest.mark.ci +def test_dml_did_coef(dml_did_fixture): + assert math.isclose(dml_did_fixture["coef"][0], dml_did_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_did_fixture["coef_binary"][0], dml_did_fixture["coef"][0], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_did_se(dml_did_fixture): + assert math.isclose(dml_did_fixture["se"][0], dml_did_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_did_fixture["se_binary"][0], dml_did_fixture["se"][0], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_boot(dml_did_fixture): + for bootstrap in dml_did_fixture["boot_methods"]: + assert np.allclose( + dml_did_fixture["boot_t_stat" + bootstrap], + dml_did_fixture["boot_t_stat" + bootstrap + "_manual"], + rtol=1e-9, + atol=1e-4, + ) + + assert np.allclose( + dml_did_fixture["boot_t_stat" + bootstrap], + dml_did_fixture["boot_t_stat" + bootstrap + "_binary"], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_binary_vs_did_panel.py b/doubleml/did/tests/test_did_binary_vs_did_panel.py new file mode 100644 index 00000000..1eacdf6a --- /dev/null +++ b/doubleml/did/tests/test_did_binary_vs_did_panel.py @@ -0,0 +1,215 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.did.utils._did_utils import _get_id_positions + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_binary_vs_did_fixture(time_type, learner, score, in_sample_normalization, trimming_threshold): + n_obs = 500 + dpg = 1 + + boot_methods = ["normal"] + n_rep_boot = 50000 + + # collect data + df = 
make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "ml_g": clone(learner[0]), + "ml_m": clone(learner[1]), + "n_folds": 3, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": True, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDBinary( + dml_panel_data, + g_value=dml_panel_data.g_values[0], + t_value_pre=dml_panel_data.t_values[0], + t_value_eval=dml_panel_data.t_values[1], + **dml_args, + ) + dml_did_binary_obj.fit() + + df_wide = dml_did_binary_obj._panel_data_wide.copy() + dml_data = dml.data.DoubleMLData(df_wide, y_col="y_diff", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"]) + dml_did_obj = dml.DoubleMLDID( + dml_data, + **dml_args, + ) + + # use external predictions (sample splitting is hard to synchronize) + ext_predictions = {"G_indicator": {}} + ext_predictions["G_indicator"]["ml_g0"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g0"][:, :, 0], dml_did_binary_obj._id_positions + ) + ext_predictions["G_indicator"]["ml_g1"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g1"][:, :, 0], dml_did_binary_obj._id_positions + ) + if score == "observational": + ext_predictions["G_indicator"]["ml_m"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_m"][:, :, 0], dml_did_binary_obj._id_positions + ) + dml_did_obj.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "nuisance_loss": dml_did_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + "boot_methods": boot_methods, + "dml_did_binary_obj": dml_did_binary_obj, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + dml_did_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + # approximately same ci (bootstrap not identical due to size of score) + res_dict["boot_ci" + bootstrap] = dml_did_obj.confint(joint=True) + res_dict["boot_ci" + bootstrap + "_binary"] = dml_did_binary_obj.confint(joint=True) + + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + + dml_did_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params"] = dml_did_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["coef_binary"][0], dml_did_binary_vs_did_fixture["coef"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +@pytest.mark.ci +def test_ses(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["se_binary"][0], dml_did_binary_vs_did_fixture["se"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +@pytest.mark.ci +def test_boot(dml_did_binary_vs_did_fixture): + for bootstrap in dml_did_binary_vs_did_fixture["boot_methods"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["boot_ci" + bootstrap].values, + dml_did_binary_vs_did_fixture["boot_ci" + bootstrap + "_binary"].values, + atol=1e-2, + ) + + +@pytest.mark.ci +def 
test_nuisance_loss(dml_did_binary_vs_did_fixture): + assert ( + dml_did_binary_vs_did_fixture["nuisance_loss"].keys() == dml_did_binary_vs_did_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_binary_vs_did_fixture["nuisance_loss"].items(): + assert np.allclose(value, dml_did_binary_vs_did_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_binary_vs_did_fixture): + sensitivity_element_names = ["sigma2", "nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + for sensitivity_element in ["psi_sigma2", "psi_nu2", "riesz_rep"]: + dml_binary_obj = dml_did_binary_vs_did_fixture["dml_did_binary_obj"] + scaling = dml_binary_obj._n_subset / dml_binary_obj._dml_data.n_obs + binary_sensitivity_element = scaling * _get_id_positions( + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], dml_binary_obj._id_positions + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + binary_sensitivity_element, + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_binary_vs_did_fixture): + for key in ["theta", "se", "ci"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["lower"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["upper"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_binary_vs_did_two_period.py b/doubleml/did/tests/test_did_binary_vs_did_two_period.py new file mode 100644 index 00000000..0db2a752 --- /dev/null +++ b/doubleml/did/tests/test_did_binary_vs_did_two_period.py @@ -0,0 +1,264 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_did_manual import boot_did, fit_did, fit_sensitivity_elements_did + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_binary_vs_did_fixture(generate_data_did_binary, learner, score, in_sample_normalization, trimming_threshold): + boot_methods = 
["normal"] + n_folds = 2 + n_rep_boot = 499 + + # collect data + dml_panel_data = generate_data_did_binary + df = dml_panel_data._data.sort_values(by=["id", "t"]) + df_panel = df.groupby("id").agg( + {"y": lambda x: x.iloc[1] - x.iloc[0], "d": "first", "Z1": "first", "Z2": "first", "Z3": "first", "Z4": "first"} + ) + + n_obs = df_panel.shape[0] + all_smpls = draw_smpls(n_obs, n_folds) + obj_dml_data = dml.DoubleMLData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + # Set machine learning methods for m & g + ml_g = clone(learner[0]) + ml_m = clone(learner[1]) + + dml_args = { + "ml_g": ml_g, + "ml_m": ml_m, + "n_folds": n_folds, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": False, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDBinary( + dml_panel_data, + g_value=1, + t_value_pre=0, + t_value_eval=1, + **dml_args, + ) + + dml_did_obj = dml.DoubleMLDID( + obj_dml_data, + **dml_args, + ) + + # synchronize the sample splitting + dml_did_obj.set_sample_splitting(all_smpls=all_smpls) + dml_did_binary_obj.set_sample_splitting(all_smpls=all_smpls) + + dml_did_obj.fit() + dml_did_binary_obj.fit() + + # manual fit + y = df_panel["y"].values + d = df_panel["d"].values + x = df_panel[["Z1", "Z2", "Z3", "Z4"]].values + + np.random.seed(3141) + res_manual = fit_did( + y, + x, + d, + clone(learner[0]), + clone(learner[1]), + all_smpls, + score, + in_sample_normalization, + trimming_threshold=trimming_threshold, + ) + + res_dict = { + "coef": dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "coef_manual": res_manual["theta"], + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "se_manual": res_manual["se"], + "nuisance_loss": dml_did_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + "boot_methods": boot_methods, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_did( + y, + res_manual["thetas"], + res_manual["ses"], + res_manual["all_psi_a"], + res_manual["all_psi_b"], + all_smpls, + bootstrap, + n_rep_boot, + ) + + np.random.seed(3141) + dml_did_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + res_dict["boot_t_stat" + bootstrap] = dml_did_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_binary"] = dml_did_binary_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_manual"] = boot_t_stat.reshape(-1, 1, 1) + + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + res_dict["sensitivity_elements_manual"] = fit_sensitivity_elements_did( + y, + d, + all_coef=dml_did_obj.all_coef, + predictions=dml_did_obj.predictions, + score=score, + in_sample_normalization=in_sample_normalization, + n_rep=1, + ) + + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + + dml_did_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params"] = dml_did_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["coef"][0], 
dml_did_binary_vs_did_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4 + ) + assert math.isclose( + dml_did_binary_vs_did_fixture["coef_binary"][0], dml_did_binary_vs_did_fixture["coef"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +@pytest.mark.ci +def test_ses(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["se"][0], dml_did_binary_vs_did_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4 + ) + assert math.isclose( + dml_did_binary_vs_did_fixture["se_binary"][0], dml_did_binary_vs_did_fixture["se"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +@pytest.mark.ci +def test_boot(dml_did_binary_vs_did_fixture): + for bootstrap in dml_did_binary_vs_did_fixture["boot_methods"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["boot_t_stat" + bootstrap], + dml_did_binary_vs_did_fixture["boot_t_stat" + bootstrap + "_manual"], + atol=1e-4, + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["boot_t_stat" + bootstrap], + dml_did_binary_vs_did_fixture["boot_t_stat" + bootstrap + "_binary"], + atol=1e-4, + ) + + +@pytest.mark.ci +def test_nuisance_loss(dml_did_binary_vs_did_fixture): + assert ( + dml_did_binary_vs_did_fixture["nuisance_loss"].keys() == dml_did_binary_vs_did_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_binary_vs_did_fixture["nuisance_loss"].items(): + assert np.allclose(value, dml_did_binary_vs_did_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_binary_vs_did_fixture): + sensitivity_element_names = ["sigma2", "nu2", "psi_sigma2", "psi_nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + dml_did_binary_vs_did_fixture["sensitivity_elements_manual"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + for sensitivity_element in ["riesz_rep"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_binary_vs_did_fixture): + for key in ["theta", "se", "ci"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["lower"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["upper"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_cs_external_predictions.py b/doubleml/did/tests/test_did_cs_external_predictions.py index f4a47997..2b28ac8a 100644 --- a/doubleml/did/tests/test_did_cs_external_predictions.py +++ b/doubleml/did/tests/test_did_cs_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLDIDCS -from doubleml.datasets import make_did_SZ2020 +from doubleml.did.datasets import 
make_did_SZ2020 from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor from ...tests._utils import draw_smpls diff --git a/doubleml/did/tests/test_did_external_predictions.py b/doubleml/did/tests/test_did_external_predictions.py index 9027e7dc..7234be8e 100644 --- a/doubleml/did/tests/test_did_external_predictions.py +++ b/doubleml/did/tests/test_did_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLDID -from doubleml.datasets import make_did_SZ2020 +from doubleml.did.datasets import make_did_SZ2020 from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor from ...tests._utils import draw_smpls @@ -42,11 +42,36 @@ def doubleml_did_fixture(did_score, n_rep): np.random.seed(3141) dml_did_ext.fit(external_predictions=ext_predictions) - res_dict = {"coef_normal": dml_did.coef[0], "coef_ext": dml_did_ext.coef[0]} + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } return res_dict @pytest.mark.ci -def test_doubleml_did_coef(doubleml_did_fixture): - assert math.isclose(doubleml_did_fixture["coef_normal"], doubleml_did_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) +def test_coef(doubleml_did_fixture): + assert math.isclose(doubleml_did_fixture["coef"], doubleml_did_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_se(doubleml_did_fixture): + assert math.isclose(doubleml_did_fixture["se"], doubleml_did_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_score(doubleml_did_fixture): + assert np.allclose(doubleml_did_fixture["score"], doubleml_did_fixture["score_ext"], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_nuisance_loss(doubleml_did_fixture): + for key, value in doubleml_did_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) diff --git a/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py b/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py new file mode 100644 index 00000000..35512d8f --- /dev/null +++ b/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py @@ -0,0 +1 @@ +# TODO: For each aggregation method check if the manual weights equal the string aggregation method. 
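+# A possible sketch for this check: compare a string-based aggregation against a
+# manual one rebuilt from the underlying per-(g, t) frameworks. Two assumptions to
+# verify against the implementation: each entry of modellist exposes a framework
+# attribute, and aggregation_weights accepts a 2D array with one row per aggregated
+# effect (mirroring the DoubleMLDIDAggregation constructor usage earlier in this diff).
+import numpy as np
+
+from doubleml.did import DoubleMLDIDAggregation
+
+
+def _manual_weights_match_string_aggregation(dml_obj, manual_weights, aggregation="group"):
+    # string-based aggregation of a fitted DoubleMLDIDMulti object
+    agg_str = dml_obj.aggregate(aggregation=aggregation)
+    # manual aggregation built directly from the per-(g, t) effect frameworks
+    frameworks = [model.framework for model in dml_obj.modellist]
+    agg_manual = DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=manual_weights)
+    # the manual weights reproduce the string method iff the aggregated effects coincide
+    return np.allclose(agg_str.aggregated_frameworks.thetas, agg_manual.aggregated_frameworks.thetas)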
diff --git a/doubleml/did/tests/test_did_multi_aggregation_single_gt.py b/doubleml/did/tests/test_did_multi_aggregation_single_gt.py new file mode 100644 index 00000000..0f71d91b --- /dev/null +++ b/doubleml/did/tests/test_did_multi_aggregation_single_gt.py @@ -0,0 +1,112 @@ +import math + +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 + + +@pytest.fixture(scope="module", params=["group", "time", "eventstudy"]) +def aggregation(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_single_gt_aggregation(aggregation, time_type, learner, score, in_sample_normalization, trimming_threshold): + n_obs = 500 + dpg = 1 + + # collect data + df = make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "n_folds": 3, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": True, + } + gt_combination = [(dml_panel_data.g_values[0], dml_panel_data.t_values[0], dml_panel_data.t_values[3])] + dml_obj = dml.did.DoubleMLDIDMulti( + dml_panel_data, + ml_g=learner[0], + ml_m=learner[1], + gt_combinations=gt_combination, + **dml_args, + ) + dml_obj.fit() + + dml_obj_agg = dml_obj.aggregate(aggregation=aggregation) + + res_dict = { + "dml_obj": dml_obj, + "dml_obj_agg": dml_obj_agg, + } + + return res_dict + + +@pytest.mark.ci +def test_dml_single_gt_thetas(dml_single_gt_aggregation): + assert math.isclose( + dml_single_gt_aggregation["dml_obj"].coef[0], + dml_single_gt_aggregation["dml_obj_agg"].aggregated_frameworks.thetas[0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + assert math.isclose( + dml_single_gt_aggregation["dml_obj_agg"].aggregated_frameworks.thetas[0], + dml_single_gt_aggregation["dml_obj_agg"].overall_aggregated_framework.thetas[0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_dml_single_gt_ses(dml_single_gt_aggregation): + assert math.isclose( + dml_single_gt_aggregation["dml_obj"].se[0], + dml_single_gt_aggregation["dml_obj_agg"].aggregated_frameworks.ses[0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + assert math.isclose( + dml_single_gt_aggregation["dml_obj_agg"].aggregated_frameworks.ses[0], + dml_single_gt_aggregation["dml_obj_agg"].overall_aggregated_framework.ses[0], + rel_tol=1e-9, + abs_tol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_multi_aggregation_weight_index.py b/doubleml/did/tests/test_did_multi_aggregation_weight_index.py new file mode 100644 index 00000000..d001a4a8 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_aggregation_weight_index.py @@ -0,0 +1 @@ +# TODO: For each aggregation method check if the aggregated weights correspond to 
certain gt_combinations (group, time etc.) diff --git a/doubleml/did/tests/test_did_multi_exceptions.py b/doubleml/did/tests/test_did_multi_exceptions.py new file mode 100644 index 00000000..aead8e48 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_exceptions.py @@ -0,0 +1,239 @@ +from unittest.mock import patch + +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +df = dml.did.datasets.make_did_CS2021(n_obs=500, dgp_type=1, n_pre_treat_periods=0, n_periods=3, time_type="float") +dml_data = dml.data.DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) +# df_binary_outcome = df.copy() +# df_binary_outcome["y"] = (df_binary_outcome["y"] > df_binary_outcome["y"].median()).astype(int) +# dml_data_binary_outcome = dml.data.DoubleMLPanelData( +# df_binary_outcome, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +# ) + +valid_arguments = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "gt_combinations": [(1, 0, 1)], +} + + +@pytest.mark.ci +def test_input(): + # data + msg = r"The data has to be a DoubleMLPanelData object. 0 of type <class 'int'> was passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"obj_dml_data": 0} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + invalid_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", z_cols=["Z4"], x_cols=["Z1", "Z2", "Z3"] + ) + msg = r"Incompatible data. Z4 have been set as instrumental variable\(s\)." + with pytest.raises(NotImplementedError, match=msg): + invalid_arguments = {"obj_dml_data": invalid_data} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + # control group + msg = r"The control group has to be one of \['never_treated', 'not_yet_treated'\]. 0 was passed." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"control_group": 0} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + # propensity score adjustments + msg = "in_sample_normalization indicator has to be boolean. Object of type <class 'str'> passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"in_sample_normalization": "test"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + # score + msg = "Invalid score test. Valid score observational or experimental." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"score": "test"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + # trimming + msg = "Invalid trimming_rule discard. Valid trimming_rule truncate." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"trimming_rule": "discard"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "trimming_threshold has to be a float. Object of type <class 'str'> passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"trimming_threshold": "test"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "Invalid trimming_threshold 0.6. trimming_threshold has to be between 0 and 0.5."
+ with pytest.raises(ValueError, match=msg): + invalid_arguments = {"trimming_threshold": 0.6} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_exception_learners(): + msg = ( + r"The ml_g learner LogisticRegression\(\) was identified as classifier but " + + "the outcome variable is not binary with values 0 and 1." + ) + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"ml_g": LogisticRegression()} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = ( + 'A learner ml_m has been provided for score = "experimental" but will be ignored. ' + "A learner ml_m is not required for estimation." + ) + with pytest.warns(UserWarning, match=msg): + invalid_arguments = {"score": "experimental"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_exception_gt_combinations(): + msg = r"gt_combinations must be one of \['standard', 'all'\]. test was passed." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"gt_combinations": "test"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "gt_combinations must be a list. 1 of type <class 'int'> was passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"gt_combinations": 1} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "gt_combinations must not be empty." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"gt_combinations": []} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "gt_combinations must be a list of tuples. At least one element is not a tuple." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"gt_combinations": [1]} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "gt_combinations must be a list of tuples with 3 elements. At least one tuple has not 3 elements." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"gt_combinations": [(1, 0)]} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_exceptions_aggregate(): + dml_obj = dml.did.DoubleMLDIDMulti(**valid_arguments) + # test without fit() + msg = r"Apply fit\(\) before aggregate\(\)." + with pytest.raises(ValueError, match=msg): + dml_obj.aggregate() + + dml_obj.fit() + + # Test non-string input + msg = "aggregation must be a string or dictionary. 123 of type <class 'int'> was passed." + with pytest.raises(TypeError, match=msg): + dml_obj.aggregate(aggregation=123) + + # Test invalid string value + msg = "aggregation must be one of \\['group', 'time', 'eventstudy'\\]. invalid was passed."
+ with pytest.raises(ValueError, match=msg): + dml_obj.aggregate(aggregation="invalid") + + +@pytest.mark.ci +def test_check_external_predictions(): + # Create DID instance + model = dml.did.DoubleMLDIDMulti(**valid_arguments) + + # Test 1: Invalid type (not a dictionary) + invalid_pred = ["not a dict"] + with pytest.raises(TypeError, match="external_predictions must be a dictionary"): + model.fit(external_predictions=invalid_pred) + + # Test 2: Invalid keys in top-level dictionary + invalid_keys = {"invalid_key": {}} + with pytest.raises(ValueError, match="external_predictions must be a subset of all gt_combinations"): + model.fit(external_predictions=invalid_keys) + + # Test 3: Invalid type for nested prediction dictionary + invalid_nested = {model.gt_labels[0]: "not a dict"} + msg = r"external_predictions\[ATT\(1,0,1\)\] must be a dictionary\. Object of type <class 'str'> passed\." + with pytest.raises(TypeError, match=msg): + model.fit(external_predictions=invalid_nested) + + # Test 4: Invalid keys in nested prediction dictionary + invalid_learner = {model.gt_labels[0]: {"invalid_learner": None}} + with pytest.raises(ValueError, match="must be a subset of "): + model.fit(external_predictions=invalid_learner) + + # Test 5: Valid external predictions should not raise + valid_pred = {model.gt_labels[0]: {"ml_g0": None, "ml_g1": None, "ml_m": None}} + model._check_external_predictions(valid_pred) + + +@pytest.mark.ci +def test_exceptions_before_fit(): + """Test exception handling for confint() and p_adjust() methods when fit() hasn't been called.""" + dml_obj = dml.did.DoubleMLDIDMulti(**valid_arguments) + + msg = r"Apply fit\(\) before {}." + with pytest.raises(ValueError, match=msg.format("confint")): + dml_obj.confint() + + with pytest.raises(ValueError, match=msg.format("p_adjust")): + dml_obj.p_adjust() + + with pytest.raises(ValueError, match=msg.format("bootstrap")): + dml_obj.bootstrap() + + with pytest.raises(ValueError, match=msg.format("sensitivity_analysis")): + dml_obj.sensitivity_analysis() + + with pytest.raises(ValueError, match=msg.format("sensitivity_plot")): + dml_obj.sensitivity_plot() + + with pytest.raises(ValueError, match=msg.format("aggregate")): + dml_obj.aggregate() + + msg = r"Apply sensitivity_analysis\(\) before sensitivity_summary." + with pytest.raises(ValueError, match=msg): + _ = dml_obj.sensitivity_summary + + +@pytest.mark.ci +def test_exceptions_sensitivity_benchmark(): + """Test exception handling for sensitivity_benchmark() method.""" + dml_obj = dml.did.DoubleMLDIDMulti(**valid_arguments) + dml_obj.fit() + + # Test 1: sensitivity_elements is None + with patch.object(dml_obj.__class__, "sensitivity_elements", property(lambda self: None)): + msg = "Sensitivity analysis not yet implemented for" + with pytest.raises(NotImplementedError, match=msg): + dml_obj.sensitivity_benchmark(benchmarking_set=["Z1"]) + + # Test 2: benchmarking_set is not a list + invalid_types = [123, "string", {"dict": "value"}, (1, 2, 3)] + for invalid_type in invalid_types: + msg = "benchmarking_set must be a list." + with pytest.raises(TypeError, match=msg): + dml_obj.sensitivity_benchmark(benchmarking_set=invalid_type) + + # Test 3: benchmarking_set is an empty list + msg = "benchmarking_set must not be empty." + with pytest.raises(ValueError, match=msg): + dml_obj.sensitivity_benchmark(benchmarking_set=[]) + + # Test 4: benchmarking_set is not a subset of features + msg = ( + r"benchmarking_set must be a subset of features \['Z1', 'Z2', 'Z3', 'Z4'\]. 
\['Z5', 'NonExistentFeature'\] was passed." + ) + with pytest.raises(ValueError, match=msg): + dml_obj.sensitivity_benchmark(benchmarking_set=["Z5", "NonExistentFeature"]) + + # Test 5: fit_args is not None and not a dictionary + invalid_types = [123, "string", ["list"], (1, 2, 3)] + for invalid_type in invalid_types: + msg = "fit_args must be a dict." + with pytest.raises(TypeError, match=msg): + dml_obj.sensitivity_benchmark(benchmarking_set=["Z1"], fit_args=invalid_type) diff --git a/doubleml/did/tests/test_did_multi_external_predictions.py b/doubleml/did/tests/test_did_multi_external_predictions.py new file mode 100644 index 00000000..2e7003f9 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_external_predictions.py @@ -0,0 +1,102 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_m_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_g_ext(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_multi_ext_fixture(did_score, n_rep, set_ml_m_ext, set_ml_g_ext): + n_obs = 500 + n_folds = 5 + dgp = 1 + ml_g = LinearRegression() + ml_m = LogisticRegression(random_state=42) + + # collect data + df = make_did_CS2021(n_obs=n_obs, dgp_type=dgp, time_type="float") + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "obj_dml_data": dml_panel_data, + "gt_combinations": [(2, 0, 1)], + "score": did_score, + "n_rep": n_rep, + "n_folds": n_folds, + } + + np.random.seed(3141) + dml_obj = dml.did.DoubleMLDIDMulti( + ml_g=ml_g, + ml_m=ml_m, + **dml_args, + ) + np.random.seed(3141) + dml_obj.fit() + + ext_pred_dict = {gt_combination: {} for gt_combination in dml_obj.gt_labels} + if set_ml_m_ext and did_score == "observational": + for i_gt_combination, gt_label in enumerate(dml_obj.gt_labels): + ext_pred_dict[gt_label]["ml_m"] = dml_obj.modellist[i_gt_combination].predictions["ml_m"][:, :, 0] + ml_m_ext = DMLDummyClassifier() + else: + ml_m_ext = ml_m + + if set_ml_g_ext: + for i_gt_combination, gt_label in enumerate(dml_obj.gt_labels): + ext_pred_dict[gt_label]["ml_g0"] = dml_obj.modellist[i_gt_combination].predictions["ml_g0"][:, :, 0] + ext_pred_dict[gt_label]["ml_g1"] = dml_obj.modellist[i_gt_combination].predictions["ml_g1"][:, :, 0] + ml_g_ext = DMLDummyRegressor() + else: + ml_g_ext = ml_g + + np.random.seed(3141) + dml_obj_ext = dml.did.DoubleMLDIDMulti( + ml_g=ml_g_ext, + ml_m=ml_m_ext, + **dml_args, + ) + np.random.seed(3141) + dml_obj_ext.fit(external_predictions=ext_pred_dict) + + res_dict = { + "coef": dml_obj.coef[0], + "coef_ext": dml_obj_ext.coef[0], + "se": dml_obj.se[0], + "se_ext": dml_obj_ext.se[0], + } + + return res_dict + + +@pytest.mark.ci +def test_coef(doubleml_did_multi_ext_fixture): + assert math.isclose( + doubleml_did_multi_ext_fixture["coef"], doubleml_did_multi_ext_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3 + ) diff --git a/doubleml/did/tests/test_did_multi_placebo.py 
b/doubleml/did/tests/test_did_multi_placebo.py new file mode 100644 index 00000000..8f01d426 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_placebo.py @@ -0,0 +1,62 @@ +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDMulti +from doubleml.did.datasets import make_did_CS2021 + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_fixture(did_score, n_rep): + n_obs = 1000 + dgp = 5 # has to be experimental (for experimental score to be valid) + np.random.seed(42) + df = make_did_CS2021(n_obs=n_obs, dgp_type=dgp, n_pre_treat_periods=3, n_periods=5, time_type="float") + dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + # all placebo combinations + gt_combinations_group3 = [(3, 0, 1), (3, 0, 2), (3, 1, 2)] + gt_combinations_group4 = [(4, 0, 1), (4, 0, 2), (4, 0, 3), (4, 1, 2), (4, 1, 3), (4, 2, 3)] + gt_combinations = gt_combinations_group3 + gt_combinations_group4 + + kwargs = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "gt_combinations": gt_combinations, + "score": did_score, + "n_rep": n_rep, + "n_folds": 5, + "draw_sample_splitting": True, + } + + dml_did = DoubleMLDIDMulti(**kwargs) + + np.random.seed(3141) + dml_did.fit() + ci = dml_did.confint(level=0.95) + + res_dict = { + "coef": dml_did.coef[:], + "ci_lower": ci.iloc[:, 0], + "ci_upper": ci.iloc[:, 1], + } + + return res_dict + + +@pytest.mark.ci +def test_zero(doubleml_did_fixture): + assert all(doubleml_did_fixture["ci_lower"] <= 0.0) + assert all(doubleml_did_fixture["ci_upper"] >= 0.0) diff --git a/doubleml/did/tests/test_did_multi_plot.py b/doubleml/did/tests/test_did_multi_plot.py new file mode 100644 index 00000000..2eb15dcc --- /dev/null +++ b/doubleml/did/tests/test_did_multi_plot.py @@ -0,0 +1,175 @@ +import matplotlib.pyplot as plt +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDMulti +from doubleml.did.datasets import make_did_CS2021 + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_fixture(did_score, n_rep): + n_obs = 1000 + dgp = 5 # has to be experimental (for experimental score to be valid) + np.random.seed(42) + df = make_did_CS2021(n_obs=n_obs, dgp_type=dgp, n_pre_treat_periods=3, n_periods=5, time_type="float") + dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "gt_combinations": "all", + "score": did_score, + "n_rep": n_rep, + "n_folds": 2, + "draw_sample_splitting": True, + } + + dml_did = DoubleMLDIDMulti(**kwargs) + + np.random.seed(3141) + dml_did.fit() + + res_dict = { + "model": dml_did, + } + return res_dict + + +@pytest.mark.ci +def test_plot_bootstrap_warnings(doubleml_did_fixture): + msg = "Joint confidence
intervals require bootstrapping" + with pytest.warns(UserWarning, match=msg): + _ = doubleml_did_fixture["model"].plot_effects() + + +@pytest.mark.ci +def test_plot_effects_default(doubleml_did_fixture): + dml_obj = doubleml_did_fixture["model"] + fig, axes = dml_obj.plot_effects() + + assert isinstance(fig, plt.Figure) + assert isinstance(axes, list) + assert all(isinstance(ax, plt.Axes) for ax in axes) + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_confidence_level(doubleml_did_fixture): + """Test plot_effects with different confidence levels.""" + dml_obj = doubleml_did_fixture["model"] + + # Test with 90% confidence level + fig, _ = dml_obj.plot_effects(level=0.9) + assert isinstance(fig, plt.Figure) + + # assert figure is not equal to default value + fig_default, _ = dml_obj.plot_effects() + assert fig_default != fig + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_joint_ci(doubleml_did_fixture): + """Test plot_effects with different joint confidence interval settings.""" + dml_obj = doubleml_did_fixture["model"] + + # Test with joint=False + fig, _ = dml_obj.plot_effects(joint=False) + assert isinstance(fig, plt.Figure) + + # assert figure is not equal to default value + fig_default, _ = dml_obj.plot_effects() + assert fig_default != fig + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_figure_size(doubleml_did_fixture): + """Test plot_effects with custom figure size.""" + dml_obj = doubleml_did_fixture["model"] + + custom_figsize = (10, 5) + fig, _ = dml_obj.plot_effects(figsize=custom_figsize) + assert isinstance(fig, plt.Figure) + + # Check if figure size matches the specified size + width, height = fig.get_size_inches() + assert (width, height) == custom_figsize + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_color_palette(doubleml_did_fixture): + """Test plot_effects with different color palettes.""" + dml_obj = doubleml_did_fixture["model"] + + # Test with a different seaborn palette + fig, _ = dml_obj.plot_effects(color_palette="Set1") + assert isinstance(fig, plt.Figure) + + # Test with a custom color list + custom_colors = [(1, 0, 0), (0, 1, 0)] # Red and green + fig, _ = dml_obj.plot_effects(color_palette=custom_colors) + assert isinstance(fig, plt.Figure) + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_labels_and_title(doubleml_did_fixture): + """Test plot_effects with custom labels and title.""" + dml_obj = doubleml_did_fixture["model"] + + custom_title = "Custom Title for Test" + custom_ylabel = "Custom Y Label" + + fig, axes = dml_obj.plot_effects(title=custom_title, y_label=custom_ylabel) + assert isinstance(fig, plt.Figure) + + # Check if title is set correctly (title is on the figure level) + assert fig._suptitle.get_text() == custom_title + + # Check if y_label is set correctly (at least on the first axis) + assert axes[0].get_ylabel() == custom_ylabel + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_jitter(doubleml_did_fixture): + """Test plot_effects with custom jitter settings.""" + dml_obj = doubleml_did_fixture["model"] + + # Test with custom jitter value + fig, _ = dml_obj.plot_effects(jitter_value=0.2) + assert isinstance(fig, plt.Figure) + + # assert figure is not equal to default value + fig_default, _ = dml_obj.plot_effects() + assert fig_default != fig + + # Test with custom default_jitter + fig, _ = dml_obj.plot_effects(default_jitter=0.05) + assert isinstance(fig, plt.Figure) + + # assert figure is not equal to default value + fig_default, _ = 
dml_obj.plot_effects() + assert fig_default != fig + + plt.close("all") diff --git a/doubleml/did/tests/test_did_multi_return_types.py b/doubleml/did/tests/test_did_multi_return_types.py new file mode 100644 index 00000000..2e12ce10 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_return_types.py @@ -0,0 +1,193 @@ +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import plotly +import pytest +from matplotlib.axes import Axes +from matplotlib.figure import Figure +from sklearn.linear_model import Lasso, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDAggregation, DoubleMLDIDMulti +from doubleml.did.datasets import make_did_CS2021 +from doubleml.double_ml_framework import DoubleMLFramework + +# Test constants +N_OBS = 200 +N_REP = 1 +N_FOLDS = 3 +N_REP_BOOT = 314 + +dml_args = { + "n_rep": N_REP, + "n_folds": N_FOLDS, + "gt_combinations": "standard", +} + + +# create all datasets +np.random.seed(3141) +datasets = {} + +# panel data +df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float") +df_panel["y_binary"] = np.random.binomial(n=1, p=0.5, size=df_panel.shape[0]) +datasets["did_panel"] = DoubleMLPanelData( + df_panel, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) +datasets["did_panel_binary_outcome"] = DoubleMLPanelData( + df_panel, y_col="y_binary", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) + + +dml_objs = [ + (DoubleMLDIDMulti(datasets["did_panel"], ml_g=Lasso(), ml_m=LogisticRegression(), **dml_args), DoubleMLDIDMulti), + ( + DoubleMLDIDMulti( + datasets["did_panel_binary_outcome"], ml_g=LogisticRegression(), ml_m=LogisticRegression(), **dml_args + ), + DoubleMLDIDMulti, + ), +] + + +@pytest.mark.ci +@pytest.mark.parametrize("dml_obj, cls", dml_objs) +def test_panel_return_types(dml_obj, cls): + assert isinstance(dml_obj.__str__(), str) + assert isinstance(dml_obj.summary, pd.DataFrame) + # assert isinstance(dml_obj.draw_sample_splitting(), cls) # not implemented + assert isinstance(dml_obj.fit(), cls) + assert isinstance(dml_obj.__str__(), str) # called again after fit, now with numbers + assert isinstance(dml_obj.summary, pd.DataFrame) # called again after fit, now with numbers + assert isinstance(dml_obj.bootstrap(), cls) + + assert isinstance(dml_obj.confint(), pd.DataFrame) + assert isinstance(dml_obj.p_adjust(), pd.DataFrame) + + assert isinstance(dml_obj._dml_data.__str__(), str) + + # further return type tests + + +@pytest.fixture(params=dml_objs) +def fitted_dml_obj(request): + dml_obj, _ = request.param + dml_obj.fit() + dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) + return dml_obj + + +@pytest.mark.ci +def test_panel_property_types_and_shapes(fitted_dml_obj): + n_treat = len(fitted_dml_obj.gt_combinations) + dml_obj = fitted_dml_obj + + # check_basic_property_types_and_shapes + # check that the setting is still in line with the hard-coded values + assert dml_obj._dml_data.n_treat == 1 + assert dml_obj.n_gt_atts == n_treat + assert dml_obj.n_rep == N_REP + assert dml_obj.n_folds == N_FOLDS + assert dml_obj._dml_data.n_obs == N_OBS + assert dml_obj.n_rep_boot == N_REP_BOOT + + assert isinstance(dml_obj.all_coef, np.ndarray) + assert dml_obj.all_coef.shape == (n_treat, N_REP) + + assert isinstance(dml_obj.all_se, np.ndarray) + assert dml_obj.all_se.shape == (n_treat, N_REP) + + assert isinstance(dml_obj.boot_t_stat, np.ndarray) + assert dml_obj.boot_t_stat.shape == (N_REP_BOOT, 
n_treat, N_REP) + + assert isinstance(dml_obj.coef, np.ndarray) + assert dml_obj.coef.shape == (n_treat,) + + assert isinstance(dml_obj.se, np.ndarray) + assert dml_obj.se.shape == (n_treat,) + + assert isinstance(dml_obj.t_stat, np.ndarray) + assert dml_obj.t_stat.shape == (n_treat,) + + assert isinstance(dml_obj.framework.scaled_psi, np.ndarray) + assert dml_obj.framework.scaled_psi.shape == ( + N_OBS, + n_treat, + N_REP, + ) + + assert isinstance(dml_obj.framework, DoubleMLFramework) + assert isinstance(dml_obj.pval, np.ndarray) + assert dml_obj.pval.shape == (n_treat,) + + assert isinstance(dml_obj._dml_data.binary_treats, pd.Series) + assert len(dml_obj._dml_data.binary_treats) == 1 + + # check_basic_predictions_and_targets + expected_keys = ["ml_g0", "ml_g1", "ml_m"] + for key in expected_keys: + assert isinstance(dml_obj.nuisance_loss[key], np.ndarray) + assert dml_obj.nuisance_loss[key].shape == (N_REP, n_treat) + + +@pytest.mark.ci +def test_panel_sensitivity_return_types(fitted_dml_obj): + n_treat = len(fitted_dml_obj.gt_combinations) + benchmarking_set = [fitted_dml_obj._dml_data.x_cols[0]] + dml_obj = fitted_dml_obj + + assert isinstance(dml_obj.sensitivity_elements, dict) + for key in ["sigma2", "nu2", "max_bias"]: + assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray) + assert dml_obj.sensitivity_elements[key].shape == (1, n_treat, N_REP) + for key in ["psi_max_bias"]: + assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray) + assert dml_obj.sensitivity_elements[key].shape == (N_OBS, n_treat, N_REP) + + assert isinstance(dml_obj.sensitivity_summary, str) + dml_obj.sensitivity_analysis() + assert isinstance(dml_obj.sensitivity_summary, str) + assert isinstance(dml_obj.sensitivity_plot(), plotly.graph_objs._figure.Figure) + benchmarks = {"cf_y": [0.1, 0.2], "cf_d": [0.15, 0.2], "name": ["test1", "test2"]} + assert isinstance(dml_obj.sensitivity_plot(value="ci", benchmarks=benchmarks), plotly.graph_objs._figure.Figure) + + assert isinstance(dml_obj.framework._calc_sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95), dict) + assert isinstance( + dml_obj.framework._calc_robustness_value(null_hypothesis=0.0, level=0.95, rho=1.0, idx_treatment=0), tuple + ) + benchmark = dml_obj.sensitivity_benchmark(benchmarking_set=benchmarking_set) + assert isinstance(benchmark, pd.DataFrame) + + +@pytest.mark.ci +def test_panel_plot_effects(fitted_dml_obj): + fig, axes = fitted_dml_obj.plot_effects() + assert isinstance(fig, Figure) + + # list of axes objects + assert isinstance(axes, list) + for ax in axes: + assert isinstance(ax, Axes) + + plt.close(fig) + + +@pytest.fixture(scope="module", params=["eventstudy", "group", "time"]) +def aggregation(request): + return request.param + + +@pytest.mark.ci +def test_panel_agg_return_types(fitted_dml_obj, aggregation): + agg_obj = fitted_dml_obj.aggregate(aggregation=aggregation) + agg_obj.aggregated_frameworks.bootstrap(n_rep_boot=10) + + assert isinstance(agg_obj, DoubleMLDIDAggregation) + assert isinstance(agg_obj.__str__(), str) + + # test plotting + fig, ax = agg_obj.plot_effects() + assert isinstance(fig, Figure) + assert isinstance(ax, Axes) + plt.close(fig) diff --git a/doubleml/did/tests/test_did_multi_vs_binary.py b/doubleml/did/tests/test_did_multi_vs_binary.py new file mode 100644 index 00000000..40b877b2 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_vs_binary.py @@ -0,0 +1,206 @@ +import math + +import numpy as np +import pytest +from sklearn.ensemble import RandomForestClassifier, 
RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_binary_vs_did_multi_fixture(time_type, learner, score, in_sample_normalization, trimming_threshold): + n_obs = 500 + dpg = 1 + boot_methods = ["normal"] + n_rep_boot = 50000 + + # collect data + df = make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "n_folds": 3, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": True, + } + gt_combination = [(dml_panel_data.g_values[0], dml_panel_data.t_values[0], dml_panel_data.t_values[1])] + dml_did_multi_obj = dml.did.DoubleMLDIDMulti( + dml_panel_data, + ml_g=learner[0], + ml_m=learner[1], + gt_combinations=gt_combination, + **dml_args, + ) + dml_did_multi_obj.fit() + + treatment_col = dml_panel_data.d_cols[0] + ext_pred_dict = {treatment_col: {}} + ext_pred_dict[treatment_col]["ml_g0"] = dml_did_multi_obj.modellist[0].predictions["ml_g0"][:, :, 0] + ext_pred_dict[treatment_col]["ml_g1"] = dml_did_multi_obj.modellist[0].predictions["ml_g1"][:, :, 0] + if score == "observational": + ext_pred_dict[treatment_col]["ml_m"] = dml_did_multi_obj.modellist[0].predictions["ml_m"][:, :, 0] + + dml_did_binary_obj = dml.did.DoubleMLDIDBinary( + dml_panel_data, + g_value=gt_combination[0][0], + t_value_pre=gt_combination[0][1], + t_value_eval=gt_combination[0][2], + ml_g=DMLDummyRegressor(), + ml_m=DMLDummyClassifier(), + **dml_args, + ) + dml_did_binary_obj.fit(external_predictions=ext_pred_dict) + + res_dict = { + "coef_multi": dml_did_multi_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "se_multi": dml_did_multi_obj.se, + "se_binary": dml_did_binary_obj.se, + "boot_methods": boot_methods, + "nuisance_loss_multi": dml_did_multi_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + dml_did_multi_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + # approximately same ci (bootstrap not identical due to size of score) + res_dict["boot_ci" + bootstrap + "_multi"] = dml_did_multi_obj.confint(joint=True) + res_dict["boot_ci" + bootstrap + "_binary"] = dml_did_binary_obj.confint(joint=True) + + # sensitivity tests + res_dict["sensitivity_elements_multi"] 
= dml_did_multi_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.framework.sensitivity_elements + + dml_did_multi_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params_multi"] = dml_did_multi_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_binary_vs_did_multi_fixture): + assert math.isclose( + dml_did_binary_vs_did_multi_fixture["coef_binary"][0], + dml_did_binary_vs_did_multi_fixture["coef_multi"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_se(dml_did_binary_vs_did_multi_fixture): + assert math.isclose( + dml_did_binary_vs_did_multi_fixture["se_binary"][0], + dml_did_binary_vs_did_multi_fixture["se_multi"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_boot(dml_did_binary_vs_did_multi_fixture): + for bootstrap in dml_did_binary_vs_did_multi_fixture["boot_methods"]: + assert np.allclose( + dml_did_binary_vs_did_multi_fixture["boot_ci" + bootstrap + "_multi"].values, + dml_did_binary_vs_did_multi_fixture["boot_ci" + bootstrap + "_binary"].values, + atol=1e-2, + ) + + +@pytest.mark.ci +def test_nuisance_loss(dml_did_binary_vs_did_multi_fixture): + assert ( + dml_did_binary_vs_did_multi_fixture["nuisance_loss_multi"].keys() + == dml_did_binary_vs_did_multi_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_binary_vs_did_multi_fixture["nuisance_loss_multi"].items(): + assert np.allclose(value, dml_did_binary_vs_did_multi_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_binary_vs_did_multi_fixture): + elements_multi = dml_did_binary_vs_did_multi_fixture["sensitivity_elements_multi"] + elements_binary = dml_did_binary_vs_did_multi_fixture["sensitivity_elements_binary"] + sensitivity_element_names = ["max_bias", "psi_max_bias", "sigma2", "nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + elements_multi[sensitivity_element], + elements_binary[sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_binary_vs_did_multi_fixture): + multi_params = dml_did_binary_vs_did_multi_fixture["sensitivity_params_multi"] + binary_params = dml_did_binary_vs_did_multi_fixture["sensitivity_params_binary"] + for key in ["theta", "se", "ci"]: + assert np.allclose( + multi_params[key]["lower"], + binary_params[key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + multi_params[key]["upper"], + binary_params[key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + multi_params[key], + binary_params[key], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_model_defaults.py b/doubleml/did/tests/test_model_defaults.py new file mode 100644 index 00000000..f8c59e70 --- /dev/null +++ b/doubleml/did/tests/test_model_defaults.py @@ -0,0 +1,81 @@ +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did import DoubleMLDIDBinary, DoubleMLDIDMulti +from doubleml.utils._check_defaults import _check_basic_defaults_after_fit, _check_basic_defaults_before_fit, _fit_bootstrap + +df_panel = dml.did.datasets.make_did_CS2021(n_obs=500, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float") +dml_panel_data = dml.data.DoubleMLPanelData( + df_panel, 
y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) + +dml_did_multi_obj = DoubleMLDIDMulti(dml_panel_data, LinearRegression(), LogisticRegression(), [(2, 0, 1)]) +dml_did_binary_obj = DoubleMLDIDBinary( + dml_panel_data, g_value=2, t_value_pre=0, t_value_eval=1, ml_g=LinearRegression(), ml_m=LogisticRegression() +) + + +@pytest.mark.ci +def test_did_binary_defaults(): + _check_basic_defaults_before_fit(dml_did_binary_obj) + + # specific parameters + assert dml_did_binary_obj.control_group == "never_treated" + assert dml_did_binary_obj.anticipation_periods == 0 + + _fit_bootstrap(dml_did_binary_obj) + _check_basic_defaults_after_fit(dml_did_binary_obj) + + +@pytest.mark.ci +def test_did_multi_defaults(): + _check_basic_defaults_before_fit(dml_did_multi_obj) + + # coefs and se + assert dml_did_multi_obj.coef is None + assert dml_did_multi_obj.se is None + assert dml_did_multi_obj.all_coef is None + assert dml_did_multi_obj.all_se is None + assert dml_did_multi_obj.t_stat is None + assert dml_did_multi_obj.pval is None + + # specific parameters + assert dml_did_binary_obj.control_group == "never_treated" + assert dml_did_binary_obj.anticipation_periods == 0 + + _fit_bootstrap(dml_did_multi_obj) + _check_basic_defaults_after_fit(dml_did_multi_obj) + + +@pytest.mark.ci +def test_did_multi_str(): + # Test the string representation before fitting + dml_str = str(dml_did_multi_obj) + + # Check that all important sections are present + assert "================== DoubleMLDIDMulti Object ==================" in dml_str + assert "------------------ Data summary ------------------" in dml_str + assert "------------------ Score & algorithm ------------------" in dml_str + assert "------------------ Machine learner ------------------" in dml_str + assert "------------------ Resampling ------------------" in dml_str + assert "------------------ Fit summary ------------------" in dml_str + + # Check specific content before fitting + assert "Score function: observational" in dml_str + assert "No. folds: 5" in dml_str + assert "No. 
repeated sample splits: 1" in dml_str + assert "Learner ml_g:" in dml_str + assert "Learner ml_m:" in dml_str + + # Fit the model + dml_did_multi_obj_fit = dml_did_multi_obj.fit() + dml_str_after_fit = str(dml_did_multi_obj_fit) + + # Check that additional information is present after fitting + assert "ATT(2,0,1)" in dml_str_after_fit + assert "coef" in dml_str_after_fit + assert "std err" in dml_str_after_fit + assert "t" in dml_str_after_fit + assert "P>|t|" in dml_str_after_fit + assert "Out-of-sample Performance:" in dml_str_after_fit diff --git a/doubleml/did/tests/test_return_types.py b/doubleml/did/tests/test_return_types.py new file mode 100644 index 00000000..a59cec6c --- /dev/null +++ b/doubleml/did/tests/test_return_types.py @@ -0,0 +1,171 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import Lasso, LogisticRegression + +from doubleml.data import DoubleMLData, DoubleMLPanelData +from doubleml.did import DoubleMLDID, DoubleMLDIDBinary, DoubleMLDIDCS +from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020 +from doubleml.utils._check_return_types import ( + check_basic_predictions_and_targets, + check_basic_property_types_and_shapes, + check_basic_return_types, + check_sensitivity_return_types, +) + +# Test constants +N_OBS = 200 +N_TREAT = 1 +N_REP = 1 +N_FOLDS = 3 +N_REP_BOOT = 314 + +dml_args = { + "n_rep": N_REP, + "n_folds": N_FOLDS, +} + + +# create all datasets +np.random.seed(3141) +datasets = {} + +datasets["did"] = make_did_SZ2020(n_obs=N_OBS) +datasets["did_cs"] = make_did_SZ2020(n_obs=N_OBS, cross_sectional_data=True) + +# Binary outcome +(x, y, d, t) = make_did_SZ2020(n_obs=N_OBS, cross_sectional_data=True, return_type="array") +binary_outcome = np.random.binomial(n=1, p=0.5, size=N_OBS) + +datasets["did_binary_outcome"] = DoubleMLData.from_arrays(x, binary_outcome, d) +datasets["did_cs_binary_outcome"] = DoubleMLData.from_arrays(x, binary_outcome, d, t=t) + +dml_objs = [ + (DoubleMLDID(datasets["did"], Lasso(), LogisticRegression(), **dml_args), DoubleMLDID), + (DoubleMLDID(datasets["did_binary_outcome"], LogisticRegression(), LogisticRegression(), **dml_args), DoubleMLDID), + (DoubleMLDIDCS(datasets["did_cs"], Lasso(), LogisticRegression(), **dml_args), DoubleMLDIDCS), + (DoubleMLDIDCS(datasets["did_cs_binary_outcome"], LogisticRegression(), LogisticRegression(), **dml_args), DoubleMLDIDCS), +] + + +@pytest.mark.ci +@pytest.mark.parametrize("dml_obj, cls", dml_objs) +def test_return_types(dml_obj, cls): + check_basic_return_types(dml_obj, cls) + + # further return type tests + assert isinstance(dml_obj.get_params("ml_m"), dict) + + +@pytest.fixture(params=dml_objs) +def fitted_dml_obj(request): + dml_obj, _ = request.param + dml_obj.fit() + dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) + return dml_obj + + +@pytest.mark.ci +def test_property_types_and_shapes(fitted_dml_obj): + check_basic_property_types_and_shapes(fitted_dml_obj, N_OBS, N_TREAT, N_REP, N_FOLDS, N_REP_BOOT) + check_basic_predictions_and_targets(fitted_dml_obj, N_OBS, N_TREAT, N_REP) + + +@pytest.mark.ci +def test_sensitivity_return_types(fitted_dml_obj): + if fitted_dml_obj._sensitivity_implemented: + benchmarking_set = [fitted_dml_obj._dml_data.x_cols[0]] + check_sensitivity_return_types(fitted_dml_obj, N_OBS, N_REP, N_TREAT, benchmarking_set=benchmarking_set) + + +# panel data +df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float") +df_panel["y_binary"] = np.random.binomial(n=1, p=0.5, 
size=df_panel.shape[0]) +datasets["did_panel"] = DoubleMLPanelData( + df_panel, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) +datasets["did_panel_binary_outcome"] = DoubleMLPanelData( + df_panel, y_col="y_binary", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) + +dml_panel_binary_args = dml_args | { + "g_value": 2, + "t_value_pre": 0, + "t_value_eval": 1, +} + +dml_objs_panel = [ + ( + DoubleMLDIDBinary(datasets["did_panel"], ml_g=Lasso(), ml_m=LogisticRegression(), **dml_panel_binary_args), + DoubleMLDIDBinary, + ), + ( + DoubleMLDIDBinary( + datasets["did_panel_binary_outcome"], ml_g=LogisticRegression(), ml_m=LogisticRegression(), **dml_panel_binary_args + ), + DoubleMLDIDBinary, + ), +] + + +@pytest.mark.ci +@pytest.mark.parametrize("dml_obj, cls", dml_objs_panel) +def test_panel_return_types(dml_obj, cls): + check_basic_return_types(dml_obj, cls) + + # further return type tests + assert isinstance(dml_obj.get_params("ml_m"), dict) + + assert isinstance(dml_obj.g_value, (int, np.integer)) + assert isinstance(dml_obj.t_value_eval, (int, np.integer, float, np.floating)) + assert isinstance(dml_obj.t_value_pre, (int, np.integer, float, np.floating)) + assert isinstance(dml_obj.post_treatment, bool) + + # Test panel_data_wide property + assert isinstance(dml_obj.panel_data_wide, pd.DataFrame) + assert dml_obj.panel_data_wide.shape[0] <= N_OBS + assert "G_indicator" in dml_obj.panel_data_wide.columns + assert "C_indicator" in dml_obj.panel_data_wide.columns + assert "y_diff" in dml_obj.panel_data_wide.columns + + # Test id_positions property + assert isinstance(dml_obj.id_positions, np.ndarray) + assert dml_obj.id_positions.ndim == 1 + + # propensity score properties + assert isinstance(dml_obj.in_sample_normalization, bool) + assert isinstance(dml_obj.trimming_rule, str) + assert dml_obj.trimming_rule in ["truncate"] + assert isinstance(dml_obj.trimming_threshold, (float, np.floating)) + assert 0 <= dml_obj.trimming_threshold <= 0.5 + + # Test n_obs property + assert isinstance(dml_obj.n_obs, (int, np.integer)) + assert dml_obj.n_obs <= N_OBS + + # Test consistency between properties + if dml_obj.post_treatment: + assert dml_obj.g_value <= dml_obj.t_value_eval + else: + assert dml_obj.g_value > dml_obj.t_value_eval + + +@pytest.fixture(params=dml_objs_panel) +def fitted_panel_dml_obj(request): + dml_obj, _ = request.param + dml_obj.fit() + dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) + return dml_obj + + +@pytest.mark.ci +def test_panel_property_types_and_shapes(fitted_panel_dml_obj): + check_basic_property_types_and_shapes(fitted_panel_dml_obj, N_OBS, N_TREAT, N_REP, N_FOLDS, N_REP_BOOT) + check_basic_predictions_and_targets(fitted_panel_dml_obj, N_OBS, N_TREAT, N_REP) + + +@pytest.mark.ci +def test_panel_sensitivity_return_types(fitted_panel_dml_obj): + if fitted_panel_dml_obj._sensitivity_implemented: + benchmarking_set = [fitted_panel_dml_obj._dml_data.x_cols[0]] + check_sensitivity_return_types(fitted_panel_dml_obj, N_OBS, N_REP, N_TREAT, benchmarking_set=benchmarking_set) diff --git a/doubleml/did/utils/_aggregation.py b/doubleml/did/utils/_aggregation.py new file mode 100644 index 00000000..e0cd5b1a --- /dev/null +++ b/doubleml/did/utils/_aggregation.py @@ -0,0 +1,231 @@ +import numpy as np + + +def _check_did_aggregation_dict(aggregation_dict, gt_index): + if not isinstance(aggregation_dict, dict): + raise ValueError("aggregation must be a dictionary") + + # Validate and extract custom parameters + required_keys = 
{"weight_masks"} + if not all(key in aggregation_dict for key in required_keys): + raise ValueError(f"aggregation must contain all required keys: {required_keys}") + + # Check if weight_masks is a masked numpy array + weight_masks = aggregation_dict["weight_masks"] + if not isinstance(weight_masks, np.ma.MaskedArray): + raise ValueError("weight_masks must be a numpy masked array") + + # check if weight_masks has 4 dim + if weight_masks.ndim != 4: + raise ValueError("weight_masks must have 4 dimensions") + + # Check if weight_masks has the same first three dimensions as gt_index + if weight_masks.shape[:-1] != gt_index.shape: + raise ValueError( + f"weight_masks must have shape {gt_index.shape} + (n,) where n is the number of aggregations. " + f"Got shape {weight_masks.shape}" + ) + + n_aggregations = weight_masks.shape[-1] + # check if every weight_mask along last axis has the same mask as gt_index + for i in range(n_aggregations): + if not np.array_equal(weight_masks[..., i].mask, gt_index.mask): + raise ValueError("weight_masks must have the same mask as gt_index") + + return aggregation_dict + + +def _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask): + """ + Calculate weights for aggregating treatment effects by group. + + Parameters + ---------- + gt_index : numpy.ma.MaskedArray + Masked array containing group-time indices + g_values : array-like + Array of unique group values + d_values : array-like + Array of treatment values + selected_gt_mask : numpy.ndarray + Boolean mask indicating which group-time combinations to include + + Returns + ------- + dict + Dictionary containing: + - weight_masks: numpy.ma.MaskedArray with weights for each group + - agg_names: list of group names + - agg_weights: numpy.ndarray of aggregation weights + """ + selected_gt_indicies = np.where(selected_gt_mask) + selected_unique_g_indices = np.unique(selected_gt_indicies[0]) + n_agg_effects = len(selected_unique_g_indices) + + if n_agg_effects == 0: + raise ValueError("No valid groups found for aggregation.") + + agg_names = [None] * n_agg_effects + agg_weights = [np.nan] * n_agg_effects + + # Create a weight mask (0 weights) for each of the groups + weight_masks = np.ma.masked_array( + data=np.zeros((*gt_index.shape, n_agg_effects)), + mask=np.broadcast_to(gt_index.mask[..., np.newaxis], (*gt_index.shape, n_agg_effects)), + dtype=np.float64, + ) + + # Write weight masks + for idx_agg, g_idx in enumerate(selected_unique_g_indices): + # Set group name & weights + current_group = g_values[g_idx] + agg_names[idx_agg] = str(current_group) + agg_weights[idx_agg] = (d_values == current_group).mean() + + # Group weights_masks + group_gt_indicies = [(i, j, k) for i, j, k in zip(*selected_gt_indicies) if i == g_idx] + + weight = 1 / len(group_gt_indicies) + for i, j, k in group_gt_indicies: + weight_masks.data[i, j, k, idx_agg] = weight + + # Normalize weights + agg_weights = np.array(agg_weights) / sum(agg_weights) + + return {"weight_masks": weight_masks, "agg_names": agg_names, "agg_weights": agg_weights} + + +def _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask): + """ + Calculate weights for aggregating treatment effects over time periods. 
+
+    Parameters
+    ----------
+    gt_index : numpy.ma.MaskedArray
+        Masked array containing group-time indices
+    g_values : array-like
+        Array of unique group values
+    t_values : array-like
+        Array of unique time period values
+    d_values : array-like
+        Array of treatment values (g_values for each id)
+    selected_gt_mask : numpy.ndarray
+        Boolean mask indicating which group-time combinations to include
+
+    Returns
+    -------
+    dict
+        Dictionary containing:
+        - weight_masks: numpy.ma.MaskedArray with weights for each time period
+        - agg_names: list of time period names
+        - agg_weights: numpy.ndarray of aggregation weights
+    """
+    selected_gt_indicies = np.where(selected_gt_mask)
+    selected_unique_t_eval_indices = np.unique(selected_gt_indicies[2])
+    n_agg_effects = len(selected_unique_t_eval_indices)
+
+    if n_agg_effects == 0:
+        raise ValueError("No time periods found for aggregation.")
+
+    agg_names = [None] * n_agg_effects
+    # equal weight due to balanced panel
+    agg_weights = np.ones(n_agg_effects) / n_agg_effects
+
+    # Create a weight mask (0 weights) for each of the time periods
+    weight_masks = np.ma.masked_array(
+        data=np.zeros((*gt_index.shape, n_agg_effects)),
+        mask=np.broadcast_to(gt_index.mask[..., np.newaxis], (*gt_index.shape, n_agg_effects)),
+        dtype=np.float64,
+    )
+
+    group_weights = np.zeros(len(g_values))
+    selected_unique_g_indices = np.unique(selected_gt_indicies[0])
+    for g_idx in selected_unique_g_indices:
+        group_weights[g_idx] = (d_values == g_values[g_idx]).mean()  # (requires balanced panel)
+
+    # Write weight masks
+    for idx_agg, t_eval_idx in enumerate(selected_unique_t_eval_indices):
+        # Set time period name
+        current_time_period = t_values[t_eval_idx]
+        agg_names[idx_agg] = str(current_time_period)
+
+        # time weight masks
+        time_gt_indicies = [(i, j, k) for i, j, k in zip(*selected_gt_indicies) if k == t_eval_idx]
+
+        for i, j, k in time_gt_indicies:
+            weight_masks.data[i, j, k, idx_agg] = group_weights[i]
+
+        # normalize weights
+        weight_masks.data[..., idx_agg] = weight_masks.data[..., idx_agg] / np.sum(weight_masks.data[..., idx_agg])
+
+    return {"weight_masks": weight_masks, "agg_names": agg_names, "agg_weights": agg_weights}
+
+
+def _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask):
+    """
+    Calculate weights for aggregating treatment effects by event time, i.e. periods relative to treatment.
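+
+    Event time is defined relative to first treatment, e = t_eval - g. As a worked
+    example (constructed here for illustration): a group first treated in period
+    g = 2 that is evaluated at t_values = (1, 2, 3) contributes the event times
+    e = -1, 0, 1. Within each event time, the selected (g, t_pre, t_eval) cells are
+    weighted by the group shares (d_values == g).mean() and normalized; the overall
+    aggregation weights put uniform mass on the non-negative event times (e >= 0) only.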
+
+    Parameters
+    ----------
+    gt_index : numpy.ma.MaskedArray
+        Masked array containing group-time indices
+    g_values : array-like
+        Array of unique group values
+    t_values : array-like
+        Array of unique evaluation time values
+    d_values : array-like
+        Array of treatment values (g_values for each id)
+    time_values : array-like
+        Array of evaluation time values (t_values for each id)
+    selected_gt_mask : numpy.ndarray
+        Boolean mask indicating which group-time combinations to include
+
+    Returns
+    -------
+    dict
+        Dictionary containing:
+        - weight_masks: numpy.ma.MaskedArray with weights for each event time
+        - agg_names: list of event time names
+        - agg_weights: numpy.ndarray of aggregation weights
+    """
+    selected_gt_indicies = np.where(selected_gt_mask)
+    eventtime = time_values - d_values
+    e_values = np.unique(eventtime)
+    selected_unique_e_values = np.unique([t_values[k] - g_values[i] for i, _, k in zip(*selected_gt_indicies)])
+    assert np.all(np.isin(selected_unique_e_values, e_values))
+    n_agg_effects = len(selected_unique_e_values)
+
+    if n_agg_effects == 0:
+        raise ValueError("No time periods found for aggregation.")
+
+    agg_names = [None] * n_agg_effects
+    agg_weights = np.zeros(n_agg_effects)
+    agg_weights[selected_unique_e_values >= 0] = 1 / np.sum(selected_unique_e_values >= 0)
+
+    # Create a weight mask (0 weights) for each of the event times
+    weight_masks = np.ma.masked_array(
+        data=np.zeros((*gt_index.shape, n_agg_effects)),
+        mask=np.broadcast_to(gt_index.mask[..., np.newaxis], (*gt_index.shape, n_agg_effects)),
+        dtype=np.float64,
+    )
+
+    group_weights = np.zeros(len(g_values))
+    selected_unique_g_indices = np.unique(selected_gt_indicies[0])
+    for g_idx in selected_unique_g_indices:
+        group_weights[g_idx] = (d_values == g_values[g_idx]).mean()  # (requires balanced panel)
+
+    # Write weight masks
+    for idx_agg, e_val in enumerate(selected_unique_e_values):
+        # Set event time name
+        agg_names[idx_agg] = str(e_val)
+
+        # event time weight masks
+        eventtime_gt_indicies = [(i, j, k) for i, j, k in zip(*selected_gt_indicies) if t_values[k] - g_values[i] == e_val]
+
+        for i, j, k in eventtime_gt_indicies:
+            weight_masks.data[i, j, k, idx_agg] = group_weights[i]
+
+        # normalize weights
+        weight_masks.data[..., idx_agg] = weight_masks.data[..., idx_agg] / np.sum(weight_masks.data[..., idx_agg])
+
+    return {"weight_masks": weight_masks, "agg_names": agg_names, "agg_weights": agg_weights}
diff --git a/doubleml/did/utils/_did_utils.py b/doubleml/did/utils/_did_utils.py
new file mode 100644
index 00000000..bb69a1ef
--- /dev/null
+++ b/doubleml/did/utils/_did_utils.py
@@ -0,0 +1,246 @@
+import warnings
+from collections.abc import Iterable
+
+import numpy as np
+import pandas as pd
+
+expected_time_types = (int, float)
+
+
+def _convert_to_numpy_arrray(x, input_name, allow_nan=False):
+    if isinstance(x, np.ndarray):
+        if not x.ndim == 1:
+            raise ValueError(f"{input_name} must be a vector.
Number of dimensions is {x.ndim}.") + elif isinstance(x, (int, float)): + x = np.array([x]) + elif isinstance(x, Iterable): + if not all(isinstance(i, expected_time_types) for i in x): + raise TypeError(f"Invalid type for {input_name}: expected one of {expected_time_types}.") + x = np.array(x) + else: + raise TypeError(f"Invalid type for {input_name}.") + + if np.issubdtype(x.dtype, np.floating) and not allow_nan and (np.any(np.isnan(x)) or np.any(np.isinf(x))): + raise ValueError(f"{input_name} contains missing or infinite values.") + + if np.issubdtype(x.dtype, np.datetime64) and not allow_nan and np.any(np.isnat(x)): + raise ValueError(f"{input_name} contains missing values.") + + return x + + +def _get_never_treated_value(g_values): + never_treated_value = 0 + if np.issubdtype(g_values.dtype, np.floating): + never_treated_value = np.inf + elif np.issubdtype(g_values.dtype, np.datetime64): + never_treated_value = pd.NaT + return never_treated_value + + +def _is_never_treated(x, never_treated_value): + if not isinstance(x, np.ndarray): + x = np.array([x]) + + if never_treated_value is np.inf: + return np.isinf(x) + elif never_treated_value is pd.NaT: + return pd.isna(x) + else: + assert never_treated_value == 0 + return x == 0 + + +def _check_control_group(control_group): + valid_control_groups = ["never_treated", "not_yet_treated"] + if control_group not in valid_control_groups: + raise ValueError(f"The control group has to be one of {valid_control_groups}. " + f"{control_group} was passed.") + + return control_group + + +def _check_anticipation_periods(anticipation_periods): + if not isinstance(anticipation_periods, int): + raise TypeError("The anticipation periods must be an integer.") + if anticipation_periods < 0: + raise ValueError("The anticipation periods must be non-negative.") + + return anticipation_periods + + +def _check_gt_combination(gt_combination, g_values, t_values, never_treated_value, anticipation_periods): + g_value, t_value_pre, t_value_eval = gt_combination + if g_value not in g_values: + raise ValueError(f"The value {g_value} is not in the set of treatment group values {g_values}.") + if _is_never_treated(g_value, never_treated_value): + raise ValueError(f"The never treated group is not allowed as treatment group (g_value={never_treated_value}).") + if g_value not in t_values: + raise ValueError(f"The value {g_value} (group value) is not in the set of evaluation period values {t_values}.") + if t_value_pre not in t_values: + raise ValueError(f"The value {t_value_pre} is not in the set of evaluation period values {t_values}.") + if t_value_eval not in t_values: + raise ValueError(f"The value {t_value_eval} is not in the set of evaluation period values {t_values}.") + + if t_value_pre == t_value_eval: + raise ValueError(f"The pre-treatment and evaluation period must be different. Got {t_value_pre} for both.") + + if t_value_pre > t_value_eval: + raise ValueError( + "The pre-treatment period must be before the evaluation period. " + f"Got t_value_pre {t_value_pre} and t_value_eval {t_value_eval}." + ) + + # get t_value equal to g_value and adjust for anticipation periods + maximal_t_pre = t_values[max(np.where(t_values == g_value)[0] - anticipation_periods, 0)] + if t_value_pre >= maximal_t_pre: + warnings.warn( + "The treatment was assigned before the first pre-treatment period (including anticipation). " + f"Got t_value_pre {t_value_pre} and g_value {g_value} with {anticipation_periods} anticipation_periods." 
+        )
+
+
+def _check_gt_values(g_values, t_values):
+
+    g_values = _convert_to_numpy_arrray(g_values, "g_values", allow_nan=True)
+    t_values = _convert_to_numpy_arrray(t_values, "t_values", allow_nan=False)
+
+    expected_dtypes = (np.integer, np.floating, np.datetime64)
+    if not any(np.issubdtype(g_values.dtype, dt) for dt in expected_dtypes):
+        raise ValueError(f"Invalid data type for g_values: expected one of {expected_dtypes}.")
+    if not any(np.issubdtype(t_values.dtype, dt) for dt in expected_dtypes):
+        raise ValueError(f"Invalid data type for t_values: expected one of {expected_dtypes}.")
+
+    if np.issubdtype(g_values.dtype, np.datetime64) != np.issubdtype(t_values.dtype, np.datetime64):
+        raise ValueError(
+            "g_values and t_values must have the same data type. "
+            f"Got {g_values.dtype} for g_values and {t_values.dtype} for t_values."
+        )
+
+
+def _construct_gt_combinations(setting, g_values, t_values, never_treated_value, anticipation_periods):
+    """Construct treatment-time combinations for difference-in-differences analysis.
+
+    Parameters:
+        setting (str): Strategy for constructing combinations ('standard' or 'all')
+        g_values (array): Treatment group values, must be sorted
+        t_values (array): Time period values, must be sorted
+        never_treated_value: Value marking never treated units (e.g. 0, np.inf or pd.NaT)
+        anticipation_periods (int): Number of anticipation periods
+
+    Returns:
+        list: List of (g_val, t_pre, t_eval) tuples
+    """
+    valid_settings = ["standard", "all"]
+    if setting not in valid_settings:
+        raise ValueError(f"gt_combinations must be one of {valid_settings}. {setting} was passed.")
+
+    treatment_groups = g_values[~_is_never_treated(g_values, never_treated_value)]
+    if not np.all(np.diff(treatment_groups) > 0):
+        raise ValueError("g_values must be sorted in ascending order (Excluding never treated group).")
+    if not np.all(np.diff(t_values) > 0):
+        raise ValueError("t_values must be sorted in ascending order.")
+
+    gt_combinations = []
+    if setting == "standard":
+        for g_val in treatment_groups:
+            t_values_before_g = t_values[t_values < g_val]
+            if len(t_values_before_g) > anticipation_periods:
+                first_eval_index = anticipation_periods + 1  # first relevant evaluation period index
+                t_before_g = t_values_before_g[-first_eval_index]
+
+                # collect all evaluation periods
+                for i_t_eval, t_eval in enumerate(t_values[first_eval_index:]):
+                    t_previous = t_values[i_t_eval]  # refers to t-anticipation_periods-1
+                    t_pre = min(t_previous, t_before_g)  # if t_previous larger than g_val, use t_before_g
+                    gt_combinations.append((g_val, t_pre, t_eval))
+
+    if setting == "all":
+        for g_val in treatment_groups:
+            t_values_before_g = t_values[t_values < g_val]
+            if len(t_values_before_g) > anticipation_periods:
+                first_eval_index = anticipation_periods + 1  # first relevant evaluation period index
+                for t_eval in t_values[first_eval_index:]:
+                    # all t-values before g_val - anticipation_periods
+                    valid_t_pre_values = t_values[t_values <= min(g_val, t_eval)][:-first_eval_index]
+                    for t_pre in valid_t_pre_values:
+                        gt_combinations.append((g_val, t_pre, t_eval))
+
+    if len(gt_combinations) == 0:
+        raise ValueError(
+            "No valid group-time combinations found. "
+            "Please check the treatment group values and time period values (and anticipation)."
+        )
+
+    return gt_combinations
+
+
+def _construct_gt_index(gt_combinations, g_values, t_values):
+    """Construct a 3D array mapping group-time combinations to their indices.
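+
+    For illustration (a constructed example for this docstring, not taken from the
+    package tests; numpy is assumed to be imported as np)::
+
+        gt_index = _construct_gt_index([(2, 1, 2)], np.array([2]), np.array([1, 2]))
+        # gt_index.shape == (1, 2, 2); gt_index[0, 0, 1] == 0 (the position of the
+        # combination in gt_combinations) and is unmasked; every other entry is
+        # masked, with fill value -1.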
+
+    Parameters:
+        gt_combinations: List of tuples (g_val, t_pre, t_eval)
+        g_values: Array of group values
+        t_values: Array of time values
+
+    Returns:
+        3D numpy masked array where entry [i,j,k] contains the index of the combination
+        in gt_combinations if it exists, masked otherwise
+    """
+    gt_index = np.ma.masked_array(
+        data=np.full(shape=(len(g_values), len(t_values), len(t_values)), fill_value=-1, dtype=np.int64), mask=True
+    )
+    for i_gt_combination, (g_val, t_pre, t_eval) in enumerate(gt_combinations):
+        i_g = np.where(g_values == g_val)[0][0]
+        i_t_pre = np.where(t_values == t_pre)[0][0]
+        i_t_eval = np.where(t_values == t_eval)[0][0]
+        gt_index[i_g, i_t_pre, i_t_eval] = i_gt_combination
+        gt_index.mask[i_g, i_t_pre, i_t_eval] = False
+
+    return gt_index
+
+
+def _construct_post_treatment_mask(g_values, t_values):
+    """Constructs a mask indicating post-treatment periods for group-time combinations.
+
+    Creates a 3D boolean array where entry [i,j,k] is True if the evaluation time t_values[k]
+    is at or after the treatment time g_values[i], indicating a post-treatment period.
+
+    Parameters
+    ----------
+    g_values : numpy.ndarray
+        1D array of treatment group values (treatment times)
+    t_values : numpy.ndarray
+        1D array of time period values
+
+    Returns
+    -------
+    numpy.ndarray
+        3D boolean array of shape (len(g_values), len(t_values), len(t_values))
+        where True indicates post-treatment periods (t_eval >= g_val)
+
+    """
+    # Reshape arrays for broadcasting
+    g_vals = g_values[:, np.newaxis, np.newaxis]  # Shape: (G, 1, 1)
+    t_evals = t_values[np.newaxis, np.newaxis, :]  # Shape: (1, 1, T)
+    t_evals = np.broadcast_to(t_evals, (1, len(t_values), len(t_values)))  # Shape: (1, T, T)
+
+    # Broadcasting creates a mask of shape (G, T, T)
+    post_treatment_mask = t_evals >= g_vals
+    return post_treatment_mask
+
+
+def _set_id_positions(a, n_obs, id_positions, fill_value):
+    if a is not None:
+        new_a = np.full((n_obs, *a.shape[1:]), fill_value=fill_value)
+        new_a[id_positions] = a
+    else:
+        new_a = None
+
+    return new_a
+
+
+def _get_id_positions(a, id_positions):
+    if a is not None:
+        new_a = a[id_positions]
+    else:
+        new_a = None
+
+    return new_a
diff --git a/doubleml/did/utils/_plot.py b/doubleml/did/utils/_plot.py
new file mode 100644
index 00000000..9a3b3aab
--- /dev/null
+++ b/doubleml/did/utils/_plot.py
@@ -0,0 +1,45 @@
+import numpy as np
+import pandas as pd
+
+
+def add_jitter(data, x_col, is_datetime=None, jitter_value=None):
+    """
+    Adds jitter to duplicate x-values for better visibility.
+
+    Args:
+        data (DataFrame): The subset of the dataset to jitter.
+        x_col (str): Column name for x values.
+        is_datetime (bool): Whether the x-values are datetime objects. If None, will be detected.
+        jitter_value (float): Jitter amount; for datetime x-values the numeric value is
+            interpreted as seconds. Required whenever duplicate x-values are present.
+
+    Returns:
+        DataFrame with an additional 'jittered_x' column.
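+
+    Example (a minimal sketch; the frame below is made up for illustration)::
+
+        df = pd.DataFrame({"x": [1, 1, 2], "y": [10, 15, 20]})
+        out = add_jitter(df, "x", jitter_value=0.1)
+        # the two rows with x == 1 get jittered_x values 0.9 and 1.1,
+        # the unique x == 2 row keeps jittered_x == 2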
+ """ + if data.empty: + return data + + data = data.copy() + + # Auto-detect datetime if not specified + if is_datetime is None: + is_datetime = pd.api.types.is_datetime64_any_dtype(data[x_col]) + + # Initialize jittered_x with original values + data["jittered_x"] = data[x_col] + + for x_val in data[x_col].unique(): + mask = data[x_col] == x_val + count = mask.sum() + if count > 1: + # Create evenly spaced jitter values + if is_datetime: + jitters = [pd.Timedelta(seconds=float(j)) for j in np.linspace(-jitter_value, jitter_value, count)] + else: + jitters = np.linspace(-jitter_value, jitter_value, count) + + # Apply jitter to each duplicate point + data.loc[mask, "jitter_index"] = range(count) + for i, j in enumerate(jitters): + data.loc[mask & (data["jitter_index"] == i), "jittered_x"] = x_val + j + + return data diff --git a/doubleml/did/utils/tests/test_add_jitter.py b/doubleml/did/utils/tests/test_add_jitter.py new file mode 100644 index 00000000..c66cb8bd --- /dev/null +++ b/doubleml/did/utils/tests/test_add_jitter.py @@ -0,0 +1,130 @@ +from datetime import datetime, timedelta + +import pandas as pd +import pytest + +from doubleml.did.utils._plot import add_jitter + + +@pytest.fixture +def numeric_df_no_duplicates(): + """Create a DataFrame with numeric x values and no duplicates.""" + return pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [10, 20, 30, 40, 50]}) + + +@pytest.fixture +def numeric_df_with_duplicates(): + """Create a DataFrame with numeric x values and duplicates.""" + return pd.DataFrame({"x": [1, 1, 2, 2, 2, 3], "y": [10, 15, 20, 25, 30, 35]}) + + +@pytest.fixture +def datetime_df_with_duplicates(): + """Create a DataFrame with datetime x values and duplicates.""" + base_date = datetime(2023, 1, 1) + return pd.DataFrame( + { + "x": [ + base_date, + base_date, + base_date + timedelta(days=1), + base_date + timedelta(days=1), + base_date + timedelta(days=2), + ], + "y": [10, 15, 20, 25, 30], + } + ) + + +@pytest.mark.ci +def test_add_jitter_numeric_no_duplicates(numeric_df_no_duplicates): + """Test that no jitter is added when there are no duplicates.""" + result = add_jitter(numeric_df_no_duplicates, "x") + # No jitter should be added when there are no duplicates + pd.testing.assert_series_equal(result["jittered_x"], result["x"], check_names=False) + + +@pytest.mark.ci +def test_add_jitter_numeric_with_duplicates(numeric_df_with_duplicates): + """Test that jitter is added correctly to numeric values with duplicates.""" + result = add_jitter(numeric_df_with_duplicates, "x", jitter_value=0.1) + + # Check that all original x-values have jitter applied + for x_val in numeric_df_with_duplicates["x"].unique(): + mask = numeric_df_with_duplicates["x"] == x_val + count = mask.sum() + if count > 1: + jittered_x = result.loc[mask, "jittered_x"] + # Check that jittered values are different from original + assert not (jittered_x == x_val).all() + # Check that jittered values are symmetric around original + assert abs(jittered_x.mean() - x_val) < 1e-10 + + +@pytest.mark.ci +def test_add_jitter_datetime(datetime_df_with_duplicates): + """Test that jitter is added correctly to datetime values.""" + result = add_jitter(datetime_df_with_duplicates, "x", jitter_value=20) + + # Check that result contains jittered_x column with datetime type + assert pd.api.types.is_datetime64_dtype(result["jittered_x"]) + + # Check that duplicates have different jittered values + for x_val in datetime_df_with_duplicates["x"].unique(): + mask = datetime_df_with_duplicates["x"] == x_val + count = mask.sum() + if 
count > 1:
+            jittered_values = result.loc[mask, "jittered_x"].tolist()
+            # All jittered values should be unique
+            assert len(set(jittered_values)) == count
+
+
+@pytest.mark.ci
+def test_add_jitter_empty_df():
+    """Test behavior with empty DataFrame."""
+    empty_df = pd.DataFrame({"x": [], "y": []})
+    result = add_jitter(empty_df, "x")
+    assert result.empty
+
+
+@pytest.mark.ci
+def test_add_jitter_explicit_value(numeric_df_with_duplicates):
+    """Test with explicitly specified jitter value."""
+    explicit_jitter = 0.5
+    result = add_jitter(numeric_df_with_duplicates, "x", jitter_value=explicit_jitter)
+
+    # Check that maximum jitter is equal to or less than the specified value
+    for x_val in numeric_df_with_duplicates["x"].unique():
+        mask = numeric_df_with_duplicates["x"] == x_val
+        if mask.sum() > 1:
+            max_diff = (result.loc[mask, "jittered_x"] - x_val).abs().max()
+            assert max_diff <= explicit_jitter
+
+
+@pytest.mark.ci
+def test_add_jitter_single_unique_value():
+    """Test with DataFrame having only one unique x value."""
+    df = pd.DataFrame({"x": [5, 5, 5], "y": [1, 2, 3]})
+    result = add_jitter(df, "x", jitter_value=0.1)
+
+    # Check that jitter was applied
+    assert not (result["jittered_x"] == 5).all()
+
+    # Check that jittered values are centered around the original value
+    assert abs(result["jittered_x"].mean() - 5) < 1e-10
+
+
+@pytest.mark.ci
+def test_add_jitter_explicit_datetime_flag():
+    """Test with explicitly specified is_datetime flag."""
+    # Create DataFrame with string dates
+    df = pd.DataFrame({"x": ["2023-01-01", "2023-01-01", "2023-01-02"], "y": [10, 15, 20]})
+
+    # String x-values are not converted implicitly, so jittering the duplicates fails
+    with pytest.raises(TypeError):
+        _ = add_jitter(df, "x")
+
+    # Passing is_datetime=True does not help: the strings are still not converted
+    # to datetime implicitly, so the call also raises a TypeError
+    with pytest.raises(TypeError):
+        add_jitter(df, "x", is_datetime=True)
diff --git a/doubleml/did/utils/tests/test_check_did_aggregation.py b/doubleml/did/utils/tests/test_check_did_aggregation.py
new file mode 100644
index 00000000..a268c87f
--- /dev/null
+++ b/doubleml/did/utils/tests/test_check_did_aggregation.py
@@ -0,0 +1,90 @@
+import numpy as np
+import pytest
+
+from doubleml.did.utils._aggregation import _check_did_aggregation_dict
+
+
+@pytest.fixture
+def sample_gt_index():
+    """Create a sample gt_index for testing"""
+    return np.ma.array(
+        [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], mask=np.array([[[True, False], [False, True]], [[False, True], [True, False]]])
+    )
+
+
+@pytest.fixture
+def valid_weight_masks(sample_gt_index):
+    """Create valid weight masks for testing"""
+    return np.ma.array(
+        np.zeros((*sample_gt_index.shape, 2)),
+        mask=np.broadcast_to(sample_gt_index.mask[..., np.newaxis], (*sample_gt_index.shape, 2)),
+    )
+
+
+@pytest.mark.ci
+def test_valid_aggregation_dict(sample_gt_index, valid_weight_masks):
+    """Test a valid aggregation dictionary"""
+    valid_dict = {"weight_masks": valid_weight_masks, "agg_names": ["g1", "g2"], "agg_weights": np.array([0.5, 0.5])}
+    result = _check_did_aggregation_dict(valid_dict, sample_gt_index)
+    assert isinstance(result, dict)
+    assert "weight_masks" in result
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize(
+    "invalid_input,error_msg",
+    [
+        ("not_a_dict", "aggregation must be a dictionary"),
+        ({}, "aggregation must contain all required keys: {'weight_masks'}"),
+        ({"weight_masks": np.array([1, 2, 3])}, "weight_masks must be a numpy masked array"),
+    ],
+)
+def
test_invalid_input_types(sample_gt_index, invalid_input, error_msg): + """Test various invalid input types""" + with pytest.raises(ValueError, match=error_msg): + _check_did_aggregation_dict(invalid_input, sample_gt_index) + + +@pytest.mark.ci +def test_invalid_dimensions(sample_gt_index): + """Test weight_masks with wrong number of dimensions""" + wrong_dims = np.ma.array(np.zeros((sample_gt_index.shape)), mask=sample_gt_index.mask) # Only 3 dimensions + invalid_dict = {"weight_masks": wrong_dims} + with pytest.raises(ValueError, match="weight_masks must have 4 dimensions"): + _check_did_aggregation_dict(invalid_dict, sample_gt_index) + + +@pytest.mark.ci +def test_invalid_shape(sample_gt_index): + """Test weight_masks with wrong shape""" + wrong_shape = np.ma.array( + np.zeros((3, 3, 3, 2)), mask=np.zeros((3, 3, 3, 2), dtype=bool) # Wrong shape for first 3 dimensions + ) + invalid_dict = {"weight_masks": wrong_shape} + with pytest.raises(ValueError, match=r"weight_masks must have shape .* \+ \(n,\)"): + _check_did_aggregation_dict(invalid_dict, sample_gt_index) + + +@pytest.mark.ci +def test_invalid_mask_alignment(sample_gt_index): + """Test weight_masks with misaligned mask""" + wrong_mask = ~sample_gt_index.mask + weight_masks = np.ma.array( + np.zeros((*sample_gt_index.shape, 2)), mask=np.broadcast_to(wrong_mask[..., np.newaxis], (*sample_gt_index.shape, 2)) + ) + invalid_dict = {"weight_masks": weight_masks} + with pytest.raises(ValueError, match="weight_masks must have the same mask as gt_index"): + _check_did_aggregation_dict(invalid_dict, sample_gt_index) + + +@pytest.mark.ci +def test_multiple_weight_masks(sample_gt_index, valid_weight_masks): + """Test multiple weight masks with different masks""" + # Create a weight_masks array with multiple aggregations + weight_masks = np.ma.concatenate([valid_weight_masks, valid_weight_masks], axis=-1) + # Modify mask of last aggregation + weight_masks[..., -1].mask = ~weight_masks[..., -1].mask + + invalid_dict = {"weight_masks": weight_masks} + with pytest.raises(ValueError, match="weight_masks must have the same mask as gt_index"): + _check_did_aggregation_dict(invalid_dict, sample_gt_index) diff --git a/doubleml/did/utils/tests/test_did_eventstudy_aggregation.py b/doubleml/did/utils/tests/test_did_eventstudy_aggregation.py new file mode 100644 index 00000000..7cf556aa --- /dev/null +++ b/doubleml/did/utils/tests/test_did_eventstudy_aggregation.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest + +from doubleml.did.utils._aggregation import _compute_did_eventstudy_aggregation_weights + + +@pytest.mark.ci +def test_basic_functionality_eventstudy(): + # Setup basic test data + gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool)) + g_values = np.array([2, 3]) + t_values = np.array([1, 2, 3]) + d_values = np.array([2, 2, 2, 3, 3, 3]) + time_values = np.array([1, 2, 3, 1, 2, 3]) + selected_gt_mask = np.ones((2, 1, 3), dtype=bool) # 4 options + + result = _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask) + + assert isinstance(result, dict) + assert set(result.keys()) == {"weight_masks", "agg_names", "agg_weights"} + assert isinstance(result["weight_masks"], np.ma.MaskedArray) + assert result["weight_masks"].shape == (*gt_index.shape, 4) # 3 time periods + assert result["agg_names"] == ["-2", "-1", "0", "1"] + + +@pytest.mark.ci +def test_weight_computation_eventstudy(): + gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), 
mask=np.zeros((2, 3, 3), dtype=bool))
+    g_values = np.array([2, 3])
+    t_values = np.array([1, 2, 3])
+    d_values = np.array([2, 2, 2, 3, 3, 3])
+    time_values = np.array([1, 2, 3, 1, 2, 3])
+
+    # Select specific group-time combinations
+    selected_gt_mask = np.zeros((2, 3, 3), dtype=bool)
+    selected_gt_mask[:, :2, :2] = True  # Select first two time periods for all groups
+
+    result = _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask)
+
+    # Check if number of aggregations is 3
+    assert len(result["agg_names"]) == 3
+    assert result["agg_names"] == ["-2", "-1", "0"]
+
+    # Check weights sum to 1 over all event times
+    assert np.allclose(np.sum(result["agg_weights"]), 1.0)
+
+    # Check weight distribution within event times
+    for i in range(result["weight_masks"].shape[-1]):
+        time_weights = result["weight_masks"][..., i]
+        non_masked_values = time_weights.compressed()
+        if len(non_masked_values) > 0:
+            assert np.allclose(np.sum(non_masked_values), 1.0)
+
+            # Check if weights in the selected_gt_mask are equally distributed
+            non_zero = time_weights[selected_gt_mask] != 0
+            assert np.allclose(time_weights[selected_gt_mask].data[non_zero], 1 / sum(non_zero))
+
+
+@pytest.mark.ci
+def test_no_valid_eventstudy_periods():
+    gt_index = np.ma.MaskedArray(data=np.ones((2, 2, 2)), mask=np.zeros((2, 2, 2), dtype=bool))
+    g_values = np.array([1, 2])
+    t_values = np.array([1, 2, 3])
+    d_values = np.array([2, 2, 2, 3, 3, 3])
+    time_values = np.array([1, 2, 3, 1, 2, 3])
+    selected_gt_mask = np.zeros((2, 2, 2), dtype=bool)  # No time periods selected
+
+    with pytest.raises(ValueError, match="No time periods found for aggregation."):
+        _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask)
+
+
+@pytest.mark.ci
+def test_single_eventstudy_period():
+    gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool))
+    g_values = np.array([2, 3])
+    t_values = np.array([1, 2, 3])
+    d_values = np.array([2, 2, 2, 3, 3, 3])
+    time_values = np.array([1, 2, 3, 1, 2, 3])
+    selected_gt_mask = gt_index.mask.copy()  # Start from an all-False selection (no entries of gt_index are masked)
+    selected_gt_mask[1, 1, 2] = True  # Select a single time period
+
+    result = _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask)
+
+    assert len(result["agg_names"]) == 1
+    assert result["agg_names"] == ["0"]
+    assert result["weight_masks"].shape[-1] == 1
+    assert np.allclose(result["agg_weights"], [1.0])
+
+
+@pytest.mark.ci
+def test_masked_input_eventstudy():
+    # Create data with shape (2,4,4)
+    data = np.ones((2, 4, 4))
+    mask = np.zeros((2, 4, 4), dtype=bool)
+
+    # Mask some elements in different positions
+    mask[0, 0, 0] = True
+    mask[1, 2, 1] = True
+    mask[1, 1, 2] = True
+
+    gt_index = np.ma.MaskedArray(data=data, mask=mask)
+    g_values = np.array([2, 3])  # One value for each group
+    t_values = np.array([1, 2, 3, 4])  # One value for each time period
+    d_values = np.array([2, 2, 2, 2, 3, 3, 3, 3] * 4)  # Treatment values
+    time_values = np.array([1, 2, 3, 4] * 8)
+    selected_gt_mask = ~mask  # Select all non-masked elements
+
+    result = _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask)
+
+    # Check dimensions of output
+    assert result["weight_masks"].shape == (2, 4, 4, 5)  # Last dimension is number of event study periods
+
+    # Check if masks are maintained
+    for time_idx in range(5):
+        time_weights =
result["weight_masks"][..., time_idx] + assert np.array_equal(time_weights.mask, mask) + + # Check weight normalization + for time_idx in range(4): + weights = result["weight_masks"][..., time_idx].compressed() # Get non-masked weights + if len(weights) > 0: + assert np.isclose(weights.sum(), 1.0) # Weights should sum to 1 for each time period + + # Check agg_names + assert result["agg_names"] == ["-2", "-1", "0", "1", "2"] + + # Check agg_weights sum to 1 + assert np.isclose(sum(result["agg_weights"]), 1.0) diff --git a/doubleml/did/utils/tests/test_did_group_aggregation.py b/doubleml/did/utils/tests/test_did_group_aggregation.py new file mode 100644 index 00000000..7dbed7e7 --- /dev/null +++ b/doubleml/did/utils/tests/test_did_group_aggregation.py @@ -0,0 +1,113 @@ +import numpy as np +import pytest + +from doubleml.did.utils._aggregation import _compute_did_group_aggregation_weights + + +@pytest.mark.ci +def test_basic_functionality(): + # Setup basic test data + gt_index = np.ma.MaskedArray(data=np.ones((3, 2, 1)), mask=np.zeros((3, 2, 1), dtype=bool)) + g_values = np.array([1, 2, 3]) + d_values = np.array([1, 2, 1, 2, 1, 2]) + selected_gt_mask = np.ones((3, 2, 1), dtype=bool) + + result = _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask) + + assert isinstance(result, dict) + assert set(result.keys()) == {"weight_masks", "agg_names", "agg_weights"} + assert isinstance(result["weight_masks"], np.ma.MaskedArray) + assert result["weight_masks"].shape == (*gt_index.shape, 3) # 3 groups + + +@pytest.mark.ci +def test_weight_computation(): + gt_index = np.ma.MaskedArray(data=np.ones((3, 4, 4)), mask=np.zeros((3, 4, 4), dtype=bool)) + g_values = np.array([1, 2, 3]) + d_values = np.array([1, 2, 1, 2, 1, 1, 1, 1, 3, 3]) + + # select some group-time combinations + selected_gt_mask = gt_index.mask.copy() + selected_gt_mask[:2, :2, 0] = True + + result = _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask) + + # check if the number of aggregations is 2 (in this case, group 1 and group 2) + assert len(result["agg_names"]) == 2 + + # Check weights sum to 1 for each group + assert np.allclose(np.sum(result["agg_weights"]), 1.0) + + # Check weight distribution within groups + for i in range(result["weight_masks"].shape[-1]): + group_weights = result["weight_masks"][..., i] + if len(group_weights) > 0: + assert np.allclose(np.sum(group_weights.compressed()), 1.0) + + # check if weights in the selected_gt_mask are 0.5 + assert np.allclose(group_weights[i, ...][selected_gt_mask[i, ...]], 0.5) + + # check if the aggregation weights are [0.75, 0.25] + assert np.allclose(result["agg_weights"], np.array([0.75, 0.25])) + + +@pytest.mark.ci +def test_no_valid_groups(): + gt_index = np.ma.MaskedArray(data=np.ones((2, 2, 1)), mask=np.zeros((2, 2, 1), dtype=bool)) + g_values = np.array([1, 2]) + d_values = np.array([1, 2, 1, 2]) + selected_gt_mask = np.zeros((2, 2, 1), dtype=bool) # No groups selected + + with pytest.raises(ValueError, match="No valid groups found for aggregation."): + _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask) + + +@pytest.mark.ci +def test_single_group(): + gt_index = np.ma.MaskedArray(data=np.ones((1, 2, 1)), mask=np.zeros((1, 2, 1), dtype=bool)) + g_values = np.array([1]) + d_values = np.array([1, 1]) + selected_gt_mask = np.ones((1, 2, 1), dtype=bool) + + result = _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask) + + assert 
len(result["agg_names"]) == 1 + assert result["weight_masks"].shape[-1] == 1 + assert np.allclose(result["agg_weights"], [1.0]) + + +@pytest.mark.ci +def test_masked_input(): + # Create data with shape (3,4,4) + data = np.ones((3, 4, 4)) + mask = np.zeros((3, 4, 4), dtype=bool) + + # Mask some elements in different positions + mask[0, 0, 0] = True + mask[1, 2, 3] = True + mask[2, 1, 1] = True + + gt_index = np.ma.MaskedArray(data=data, mask=mask) + g_values = np.array([1, 2, 3]) # One value for each group + d_values = np.array([1, 2, 3] * 16) # Treatment values matching the data size + selected_gt_mask = ~mask # Select all masked elements + + result = _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask) + + # Check dimensions of output + assert result["weight_masks"].shape == (3, 4, 4, 3) # Last dimension is number of groups + + for group_idx in range(3): + group_weights = result["weight_masks"][..., group_idx] + assert np.array_equal(group_weights.mask, mask) + + # Check weight normalization + for group_idx in range(3): + weights = result["weight_masks"][..., group_idx].compressed() # Get non-masked weights + assert np.isclose(weights.sum(), 1.0) # Weights should sum to 1 for each group + + # Check agg_names + assert result["agg_names"] == ["1", "2", "3"] + + # Check agg_weights sum to 1 + assert np.isclose(sum(result["agg_weights"]), 1.0) diff --git a/doubleml/did/utils/tests/test_did_time_aggregation.py b/doubleml/did/utils/tests/test_did_time_aggregation.py new file mode 100644 index 00000000..8ea9e540 --- /dev/null +++ b/doubleml/did/utils/tests/test_did_time_aggregation.py @@ -0,0 +1,122 @@ +import numpy as np +import pytest + +from doubleml.did.utils._aggregation import _compute_did_time_aggregation_weights + + +@pytest.mark.ci +def test_basic_functionality_time(): + # Setup basic test data + gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool)) + g_values = np.array([2, 3]) + t_values = np.array([1, 2, 3]) + d_values = np.array([2, 2, 2, 3, 3, 3]) + selected_gt_mask = np.ones((2, 1, 3), dtype=bool) + + result = _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask) + + assert isinstance(result, dict) + assert set(result.keys()) == {"weight_masks", "agg_names", "agg_weights"} + assert isinstance(result["weight_masks"], np.ma.MaskedArray) + assert result["weight_masks"].shape == (*gt_index.shape, 3) # 3 time periods + assert result["agg_names"] == ["1", "2", "3"] + + +@pytest.mark.ci +def test_weight_computation_time(): + gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool)) + g_values = np.array([2, 3]) + t_values = np.array([1, 2, 3]) + d_values = np.array([2, 2, 2, 3, 3, 3]) + + # Select specific group-time combinations + selected_gt_mask = np.zeros((2, 3, 3), dtype=bool) + selected_gt_mask[:, :2, :2] = True # Select first two time periods for all groups + + result = _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask) + + # Check if number of aggregations is 2 (in this case, time periods 10 and 20) + assert len(result["agg_names"]) == 2 + assert result["agg_names"] == ["1", "2"] + + # Check weights sum to 1 for each time period + assert np.allclose(np.sum(result["agg_weights"]), 1.0) + + # Check weight distribution within time periods + for i in range(result["weight_masks"].shape[-1]): + time_weights = result["weight_masks"][..., i] + non_masked_values = time_weights.compressed() + if 
+
+
+@pytest.mark.ci
+def test_weight_computation_time():
+    gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool))
+    g_values = np.array([2, 3])
+    t_values = np.array([1, 2, 3])
+    d_values = np.array([2, 2, 2, 3, 3, 3])
+
+    # Select specific group-time combinations
+    selected_gt_mask = np.zeros((2, 3, 3), dtype=bool)
+    selected_gt_mask[:, :2, :2] = True  # Select first two time periods for all groups
+
+    result = _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask)
+
+    # Check if number of aggregations is 2 (in this case, time periods 1 and 2)
+    assert len(result["agg_names"]) == 2
+    assert result["agg_names"] == ["1", "2"]
+
+    # Check weights sum to 1 for each time period
+    assert np.allclose(np.sum(result["agg_weights"]), 1.0)
+
+    # Check weight distribution within time periods
+    for i in range(result["weight_masks"].shape[-1]):
+        time_weights = result["weight_masks"][..., i]
+        non_masked_values = time_weights.compressed()
+        if len(non_masked_values) > 0:
+            assert np.allclose(np.sum(non_masked_values), 1.0)
+
+            # Check if weights in the selected_gt_mask are 0.25
+            non_zero = time_weights[selected_gt_mask] != 0
+            assert np.allclose(time_weights[selected_gt_mask].data[non_zero], 0.25)
+
+
+@pytest.mark.ci
+def test_no_valid_time_periods():
+    gt_index = np.ma.MaskedArray(data=np.ones((2, 2, 2)), mask=np.zeros((2, 2, 2), dtype=bool))
+    g_values = np.array([1, 2])
+    t_values = np.array([10, 20])
+    d_values = np.array([1, 2, 1, 2])
+    selected_gt_mask = np.zeros((2, 2, 2), dtype=bool)  # No time periods selected
+
+    with pytest.raises(ValueError, match="No time periods found for aggregation."):
+        _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask)
+
+
+@pytest.mark.ci
+def test_single_time_period():
+    gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool))
+    g_values = np.array([2, 3])
+    t_values = np.array([1, 2, 3])
+    d_values = np.array([2, 2, 2, 3, 3, 3])
+    selected_gt_mask = np.ones((2, 1, 1), dtype=bool)
+
+    result = _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask)
+
+    assert len(result["agg_names"]) == 1
+    assert result["agg_names"] == ["1"]
+    assert result["weight_masks"].shape[-1] == 1
+    assert np.allclose(result["agg_weights"], [1.0])
+
+
+@pytest.mark.ci
+def test_masked_input_time():
+    # Create data with shape (2,4,4)
+    data = np.ones((2, 4, 4))
+    mask = np.zeros((2, 4, 4), dtype=bool)
+
+    # Mask some elements in different positions
+    mask[0, 0, 0] = True
+    mask[1, 2, 1] = True
+    mask[1, 1, 2] = True
+
+    gt_index = np.ma.MaskedArray(data=data, mask=mask)
+    g_values = np.array([2, 3])  # One value for each group
+    t_values = np.array([1, 2, 3, 4])  # One value for each time period
+    d_values = np.array([1, 2, 3, 4] * 6)  # Treatment values
+    selected_gt_mask = ~mask  # Select all non-masked elements
+
+    result = _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask)
+
+    # Check dimensions of output
+    assert result["weight_masks"].shape == (2, 4, 4, 4)  # Last dimension is number of time periods
+
+    # Check if masks are maintained
+    for time_idx in range(4):
+        time_weights = result["weight_masks"][..., time_idx]
+        assert np.array_equal(time_weights.mask, mask)
+
+    # Check weight normalization
+    for time_idx in range(4):
+        weights = result["weight_masks"][..., time_idx].compressed()  # Get non-masked weights
+        if len(weights) > 0:
+            assert np.isclose(weights.sum(), 1.0)  # Weights should sum to 1 for each time period
+
+    # Check agg_names
+    assert result["agg_names"] == ["1", "2", "3", "4"]
+
+    # Check agg_weights sum to 1
+    assert np.isclose(sum(result["agg_weights"]), 1.0)
diff --git a/doubleml/did/utils/tests/test_did_utils.py b/doubleml/did/utils/tests/test_did_utils.py
new file mode 100644
index 00000000..df9da7f2
--- /dev/null
+++ b/doubleml/did/utils/tests/test_did_utils.py
@@ -0,0 +1,431 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from doubleml.did.utils._did_utils import (
+    _check_anticipation_periods,
+    _check_control_group,
+    _check_gt_combination,
+    _check_gt_values,
+    _construct_gt_combinations,
+    _construct_gt_index,
+    _construct_post_treatment_mask,
+    _get_id_positions,
+    _get_never_treated_value,
+    _is_never_treated,
+    _set_id_positions,
+)
+
+
+@pytest.mark.ci
+def test_get_never_treated_value():
+    assert _get_never_treated_value(np.array([1, 2])) == 0
+    assert np.isinf(_get_never_treated_value(np.array([1.0, 2.0])))
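+    # numpy promotes the mixed int/float array below to float64, so the
+    # never-treated sentinel is np.inf there as well; datetime arrays use
+    # pd.NaT, while integer and plain string arrays fall back to 0.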
+    assert np.isinf(_get_never_treated_value(np.array([1.0, 2])))
+    assert _get_never_treated_value(np.array(["2024-01-01", "2024-01-02"], dtype="datetime64")) is pd.NaT
+    assert _get_never_treated_value(np.array(["2024-01-01", "2024-01-02"])) == 0
+
+
+@pytest.mark.ci
+def test_is_never_treated():
+    # check single values
+    arguments = (
+        (0, 0, True),
+        (1, 0, False),
+        (np.inf, np.inf, True),
+        (0, np.inf, False),
+        (np.nan, np.inf, False),
+        (pd.NaT, pd.NaT, True),
+        (0, pd.NaT, False),
+    )
+    for x, never_treated_value, expected in arguments:
+        assert _is_never_treated(x, never_treated_value) == expected
+
+    # check arrays
+    arguments = (
+        (np.array([0, 1]), 0, np.array([True, False])),
+        (np.array([0, 1]), np.inf, np.array([False, False])),
+        (np.array([0, 1]), pd.NaT, np.array([False, False])),
+        (np.array([0, np.inf]), 0, np.array([True, False])),
+        (np.array([0, np.inf]), np.inf, np.array([False, True])),
+        (np.array([0, pd.NaT]), 0, np.array([True, False])),
+        (np.array([0, pd.NaT]), pd.NaT, np.array([False, True])),
+    )
+    for x, never_treated_value, expected in arguments:
+        assert np.all(_is_never_treated(x, never_treated_value) == expected)
+
+
+@pytest.mark.ci
+def test_check_control_group():
+    with pytest.raises(ValueError, match="The control group has to be one of"):
+        _check_control_group("invalid_control_group")
+
+
+@pytest.mark.ci
+def test_check_anticipation_periods():
+    with pytest.raises(TypeError, match="The anticipation periods must be an integer."):
+        _check_anticipation_periods("invalid_type")
+    with pytest.raises(ValueError, match="The anticipation periods must be non-negative."):
+        _check_anticipation_periods(-1)
+
+    assert _check_anticipation_periods(0) == 0
+    assert _check_anticipation_periods(1) == 1
+
+
+@pytest.mark.ci
+def test_check_gt_combination():
+    valid_args = {
+        "gt_combination": (1, 0, 1),
+        "g_values": np.array([-1, 1, 2, np.inf]),
+        "t_values": np.array([0, 1, 2]),
+        "never_treated_value": np.inf,
+        "anticipation_periods": 0,
+    }
+    invalid_args = [
+        (
+            {"gt_combination": (3.0, 0, 1)},
+            ValueError,
+            r"The value 3.0 is not in the set of treatment group values \[-1. 1. 2. inf\].",
+        ),
+        ({"gt_combination": (1, 0, 3)}, ValueError, r"The value 3 is not in the set of evaluation period values \[0 1 2\]."),
+        ({"gt_combination": (1, 3, 1)}, ValueError, r"The value 3 is not in the set of evaluation period values \[0 1 2\]."),
+        (
+            {"gt_combination": (0, 0, 1), "g_values": np.array([1, 2, 0]), "never_treated_value": 0},
+            ValueError,
+            r"The never treated group is not allowed as treatment group \(g_value=0\).",
+        ),
+        (
+            {"gt_combination": (1, 1, 1)},
+            ValueError,
+            "The pre-treatment and evaluation period must be different. Got 1 for both.",
+        ),
+        (
+            {"gt_combination": (1, 1, 0)},
+            ValueError,
+            "The pre-treatment period must be before the evaluation period. Got t_value_pre 1 and t_value_eval 0.",
+        ),
+        (
+            {"gt_combination": (-1, 0, 1)},
+            ValueError,
+            r"The value -1 \(group value\) is not in the set of evaluation period values \[0 1 2\].",
+        ),
+    ]
+    for arg, error, msg in invalid_args:
+        with pytest.raises(error, match=msg):
+            _check_gt_combination(**(valid_args | arg))
+
+    msg = r"The treatment was assigned before the first pre-treatment period \(including anticipation\)."
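+    # Both warning cases below start treatment within the anticipation-adjusted
+    # pre-treatment window, i.e. g <= t_pre + anticipation_periods (a condition
+    # inferred from the two combinations exercised here).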
+    with pytest.warns(UserWarning, match=msg):
+        _check_gt_combination(**(valid_args | {"gt_combination": (1, 1, 2)}))
+    with pytest.warns(UserWarning, match=msg):
+        _check_gt_combination(**(valid_args | {"gt_combination": (1, 0, 1), "anticipation_periods": 1}))
+
+
+@pytest.mark.ci
+def test_input_check_gt_values():
+    valid_args = {
+        "g_values": np.array([1.0, 2.0]),
+        "t_values": np.array([0.0, 1.0, 2.0]),
+    }
+    invalid_args = [
+        ({"g_values": ["test"]}, TypeError, r"Invalid type for g_values: expected one of \(, \)."),
+        ({"t_values": ["test"]}, TypeError, r"Invalid type for t_values: expected one of \(, \)."),
+        ({"g_values": np.array([[1.0, 2.0]])}, ValueError, "g_values must be a vector. Number of dimensions is 2."),
+        ({"t_values": np.array([[0.0, 1.0, 2.0]])}, ValueError, "t_values must be a vector. Number of dimensions is 2."),
+        ({"g_values": None}, TypeError, "Invalid type for g_values."),
+        ({"t_values": None}, TypeError, "Invalid type for t_values."),
+        ({"t_values": np.array([0.0, 1.0, np.nan])}, ValueError, "t_values contains missing or infinite values."),
+        ({"t_values": np.array([0.0, 1.0, np.inf])}, ValueError, "t_values contains missing or infinite values."),
+        (
+            {"t_values": np.array(["2024-01-01", "2024-01-02", "NaT"], dtype="datetime64")},
+            ValueError,
+            "t_values contains missing values.",
+        ),
+        (
+            {"g_values": np.array(["test", "test"])},
+            ValueError,
+            (
+                "Invalid data type for g_values: expected one of "
+                r"\(, , \)."
+            ),
+        ),
+        (
+            {"t_values": np.array(["test", "test"])},
+            ValueError,
+            (
+                "Invalid data type for t_values: expected one of "
+                r"\(, , \)."
+            ),
+        ),
+        (
+            {"g_values": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64")},
+            ValueError,
+            r"g_values and t_values must have the same data type. Got datetime64\[D\] for g_values and float64 for t_values.",
+        ),
+    ]
+
+    for arg, error, msg in invalid_args:
+        with pytest.raises(error, match=msg):
+            _check_gt_values(**(valid_args | arg))
+
+
+@pytest.mark.ci
+def test_construct_gt_combinations():
+    msg = r"gt_combinations must be one of \['standard', 'all'\]. test was passed."
+    with pytest.raises(ValueError, match=msg):
+        _construct_gt_combinations(
+            setting="test",
+            g_values=np.array([2, 3]),
+            t_values=np.array([1, 2, 3, 4]),
+            never_treated_value=np.inf,
+            anticipation_periods=0,
+        )
+
+    msg = "g_values must be sorted in ascending order."
+    with pytest.raises(ValueError, match=msg):
+        _construct_gt_combinations(
+            setting="standard",
+            g_values=np.array([3, 2]),
+            t_values=np.array([1, 2, 3, 4]),
+            never_treated_value=np.inf,
+            anticipation_periods=0,
+        )
+
+    msg = "t_values must be sorted in ascending order."
+    with pytest.raises(ValueError, match=msg):
+        _construct_gt_combinations(
+            setting="standard",
+            g_values=np.array([1, 2]),
+            t_values=np.array([3, 2, 1]),
+            never_treated_value=np.inf,
+            anticipation_periods=0,
+        )
+
+    # too large anticipation periods (no valid combinations)
+    msg = (
+        "No valid group-time combinations found. "
+        r"Please check the treatment group values and time period values \(and anticipation\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _construct_gt_combinations(
+            setting="standard",
+            g_values=np.array([2, 3]),
+            t_values=np.array([0, 1, 2, 3]),
+            never_treated_value=np.inf,
+            anticipation_periods=3,
+        )
+
+    # Test standard setting
+    standard_combinations = _construct_gt_combinations(
+        setting="standard",
+        g_values=np.array([2, 3]),
+        t_values=np.array([0, 1, 2, 3]),
+        never_treated_value=np.inf,
+        anticipation_periods=0,
+    )
+    expected_standard = [
+        (2, 0, 1),  # g=2, pre=0 (min of t_previous=0 and t_before_g=0), eval=1
+        (2, 1, 2),  # g=2, pre=1 (min of t_previous=1 and t_before_g=1), eval=2
+        (2, 1, 3),  # g=2, pre=1 (min of t_previous=2 and t_before_g=1), eval=3
+        (3, 0, 1),  # g=3, pre=0 (min of t_previous=0 and t_before_g=0), eval=1
+        (3, 1, 2),  # g=3, pre=1 (min of t_previous=1 and t_before_g=1), eval=2
+        (3, 2, 3),  # g=3, pre=2 (min of t_previous=2 and t_before_g=2), eval=3
+    ]
+    assert standard_combinations == expected_standard
+
+    # Test all setting
+    all_combinations = _construct_gt_combinations(
+        setting="all",
+        g_values=np.array([2, 3]),
+        t_values=np.array([0, 1, 2, 3]),
+        never_treated_value=np.inf,
+        anticipation_periods=0,
+    )
+    expected_all = [
+        (2, 0, 1),  # g=2, all pre periods before t_eval=1
+        (2, 0, 2),  # g=2, all pre periods before t_eval=2
+        (2, 1, 2),
+        (2, 0, 3),  # g=2, all pre periods before t_eval=3
+        (2, 1, 3),
+        (3, 0, 1),  # g=3, all pre periods before t_eval=1
+        (3, 0, 2),  # g=3, all pre periods before t_eval=2
+        (3, 1, 2),
+        (3, 0, 3),  # g=3, all pre periods before t_eval=3
+        (3, 1, 3),
+        (3, 2, 3),
+    ]
+    assert all_combinations == expected_all
+
+    # Test standard setting with anticipation periods
+    standard_combinations_anticipation = _construct_gt_combinations(
+        setting="standard",
+        g_values=np.array([2, 3]),
+        t_values=np.array([0, 1, 2, 3]),
+        never_treated_value=np.inf,
+        anticipation_periods=2,
+    )
+    expected_standard_anticipation = [
+        (3, 0, 3),  # g=3, pre=0 (min of t_previous=0 and t_before_g=0), eval=3 with anticipation 2
+    ]
+    assert standard_combinations_anticipation == expected_standard_anticipation
+
+    # Test all setting with anticipation periods
+    all_combinations_anticipation = _construct_gt_combinations(
+        setting="all",
+        g_values=np.array([2, 3]),
+        t_values=np.array([0, 1, 2, 3]),
+        never_treated_value=np.inf,
+        anticipation_periods=2,
+    )
+    expected_all_anticipation = [
+        (3, 0, 3),  # g=3, all pre periods before t_eval=3 with anticipation 2
+    ]
+    assert all_combinations_anticipation == expected_all_anticipation
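+
+
+# Reading the expected lists above: "standard" keeps one combination per
+# evaluation period, with pre-period min(t_eval - 1, g - 1 - anticipation),
+# while "all" enumerates every admissible pre-period; combinations whose
+# pre-period would not precede (anticipated) treatment are dropped. This is a
+# summary inferred from the fixtures, not from documented behavior.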
+
+
+@pytest.mark.ci
+def test_construct_gt_index():
+    g_values = np.array([0, 2, 3])
+    t_values = np.array([1, 2, 3])
+    gt_combinations = [(2, 1, 2), (2, 1, 3), (3, 1, 2)]  # g_val, t_pre, t_eval
+    result = _construct_gt_index(gt_combinations, g_values, t_values)
+    # Check dimensions
+    assert result.shape == (3, 3, 3)
+
+    # Check valid entries
+    assert result[1, 0, 1] == 0  # First combination (2, 1, 2)
+    assert result[1, 0, 2] == 1  # Second combination (2, 1, 3)
+    assert result[2, 0, 1] == 2  # Third combination (3, 1, 2)
+    assert result.mask[1, 0, 1] == np.False_
+    assert result.mask[1, 0, 2] == np.False_
+    assert result.mask[2, 0, 1] == np.False_
+
+    # Check that other entries are masked and contain -1
+    assert result.mask[0, 0, 0] == np.True_
+    assert result.data[0, 0, 0] == -1
+
+    # Test case 2: Empty combinations
+    empty_result = _construct_gt_index([], g_values, t_values)
+    assert empty_result.shape == (3, 3, 3)
+    assert np.all(empty_result.mask)
+    assert np.all(empty_result.data == -1)
+
+    # Test case 3: Single combination
+    single_combination = [(2, 1, 2)]
+    single_result = _construct_gt_index(single_combination, g_values, t_values)
+    assert single_result[1, 0, 1] == 0
+    assert np.sum(~single_result.mask) == 1  # Only one unmasked entry
+
+    # Test case 4: Different dimensions
+    g_values_large = np.array([0, 1, 2, 3, 4])
+    t_values_large = np.array([1, 2, 3, 4])
+    large_result = _construct_gt_index(gt_combinations, g_values_large, t_values_large)
+    assert large_result.shape == (5, 4, 4)
+
+
+@pytest.mark.ci
+def test_construct_post_treatment_mask():
+    # Test case 1: Basic case with integer values
+    g_values = np.array([2, 3])
+    t_values = np.array([1, 2, 3])
+    result = _construct_post_treatment_mask(g_values, t_values)
+
+    # Expected mask pattern for g=2:
+    # t_eval=1: False (1 not >= 2)
+    # t_eval=2: True (2 >= 2)
+    # t_eval=3: True (3 >= 2)
+    expected_g2 = np.array([[False, True, True]] * len(t_values))
+    np.testing.assert_array_equal(result[0], expected_g2)
+
+    # Expected mask pattern for g=3:
+    # t_eval=1: False (1 not >= 3)
+    # t_eval=2: False (2 not >= 3)
+    # t_eval=3: True (3 >= 3)
+    expected_g3 = np.array([[False, False, True]] * len(t_values))
+    np.testing.assert_array_equal(result[1], expected_g3)
+
+    # Test case 2: Float values with non-integer treatment times
+    g_values = np.array([1.5, 2.5])
+    t_values = np.array([1.0, 2.0, 3.0])
+    result = _construct_post_treatment_mask(g_values, t_values)
+
+    expected_g1_5 = np.array([[False, True, True]] * len(t_values))
+    expected_g2_5 = np.array([[False, False, True]] * len(t_values))
+    np.testing.assert_array_equal(result[0], expected_g1_5)
+    np.testing.assert_array_equal(result[1], expected_g2_5)
+
+    # Test case 3: Single group
+    g_values = np.array([2])
+    t_values = np.array([1, 2, 3])
+    result = _construct_post_treatment_mask(g_values, t_values)
+    assert result.shape == (1, 3, 3)
+    np.testing.assert_array_equal(result[0], expected_g2)
+
+    # Test case 4: Single time period
+    g_values = np.array([1, 2])
+    t_values = np.array([3])
+    result = _construct_post_treatment_mask(g_values, t_values)
+    assert result.shape == (2, 1, 1)
+    np.testing.assert_array_equal(result, np.array([[[True]], [[True]]]))
+
+    # Test case 5: Datetime values
+    g_values = np.array(["2020-01-01", "2020-06-01"], dtype="datetime64[D]")
+    t_values = np.array(["2020-01-01", "2020-03-01", "2020-12-01"], dtype="datetime64[D]")
+    result = _construct_post_treatment_mask(g_values, t_values)
+
+    expected_g1 = np.array([[True, True, True]] * len(t_values))
+    expected_g2 = np.array([[False, False, True]] * len(t_values))
+    np.testing.assert_array_equal(result[0], expected_g1)
+    np.testing.assert_array_equal(result[1], expected_g2)
+
+
+@pytest.mark.ci
+def test_get_id_positions():
+    # Test case 1: Normal array with valid positions
+    a = np.array([1, 2, 3, 4, 5])
+    id_positions = np.array([0, 2, 4])
+    expected = np.array([1, 3, 5])
+    result = _get_id_positions(a, id_positions)
+    np.testing.assert_array_equal(result, expected)
+
+    # Test case 2: 2D array with valid positions
+    a_2d = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
+    id_positions = np.array([1, 3])
+    expected_2d = np.array([[3, 4], [7, 8]])
+    result_2d = _get_id_positions(a_2d, id_positions)
+    np.testing.assert_array_equal(result_2d, expected_2d)
+
+    # Test case 3: None input
+    a_none = None
+    id_positions = np.array([0, 1, 2])
+    result_none = _get_id_positions(a_none, id_positions)
+    assert result_none is None
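+
+
+# _set_id_positions below is the scatter counterpart of _get_id_positions: it
+# writes the rows of `a` back into an array of n_obs rows at id_positions and
+# fills every other position with fill_value, as the expected arrays show.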
+
+
+@pytest.mark.ci
+def test_set_id_positions():
+    # Test case 1: Basic 1D array
+    a = np.array([1, 2, 3])
+    n_obs = 5
+    id_positions = np.array([1, 3, 4])
+    fill_value = 0
+    expected = np.array([0, 1, 0, 2, 3])
+    result = _set_id_positions(a, n_obs, id_positions, fill_value)
+    np.testing.assert_array_equal(result, expected)
+
+    # Test case 2: 2D array
+    a_2d = np.array([[1, 2], [3, 4], [5, 6]])
+    n_obs = 5
+    id_positions = np.array([0, 2, 4])
+    fill_value = -1
+    expected_2d = np.array([[1, 2], [-1, -1], [3, 4], [-1, -1], [5, 6]])
+    result_2d = _set_id_positions(a_2d, n_obs, id_positions, fill_value)
+    np.testing.assert_array_equal(result_2d, expected_2d)
+
+    # Test case 3: None input
+    a_none = None
+    n_obs = 3
+    id_positions = np.array([0, 1])
+    fill_value = 0
+    result_none = _set_id_positions(a_none, n_obs, id_positions, fill_value)
+    assert result_none is None
diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index 1b88c8ee..1b6d3d09 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -7,13 +7,14 @@
 from scipy.stats import norm
 from sklearn.base import is_classifier, is_regressor
 
-from .double_ml_data import DoubleMLBaseData, DoubleMLClusterData
-from .double_ml_framework import DoubleMLFramework
-from .utils._checks import _check_external_predictions, _check_sample_splitting
-from .utils._estimation import _aggregate_coefs_and_ses, _rmse, _set_external_predictions, _var_est
-from .utils._sensitivity import _compute_sensitivity_bias
-from .utils.gain_statistics import gain_statistics
-from .utils.resampling import DoubleMLClusterResampling, DoubleMLResampling
+from doubleml.data import DoubleMLClusterData, DoubleMLPanelData
+from doubleml.data.base_data import DoubleMLBaseData
+from doubleml.double_ml_framework import DoubleMLFramework
+from doubleml.utils._checks import _check_external_predictions, _check_sample_splitting
+from doubleml.utils._estimation import _aggregate_coefs_and_ses, _rmse, _set_external_predictions, _var_est
+from doubleml.utils._sensitivity import _compute_sensitivity_bias
+from doubleml.utils.gain_statistics import gain_statistics
+from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling
 
 _implemented_data_backends = ["DoubleMLData", "DoubleMLClusterData"]
@@ -33,7 +34,12 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting):
         if obj_dml_data.n_cluster_vars > 2:
             raise NotImplementedError("Multi-way (n_ways > 2) clustering not yet implemented.")
         self._is_cluster_data = True
+        self._is_panel_data = False
+        if isinstance(obj_dml_data, DoubleMLPanelData):
+            self._is_panel_data = True
+
         self._dml_data = obj_dml_data
+        self._n_obs = self._dml_data.n_obs
 
         # initialize framework which is constructed after the fit method is called
         self._framework = None
@@ -170,6 +176,13 @@
         """
         return self._n_rep
 
+    @property
+    def n_obs(self):
+        """
+        The number of observations used for estimation.
+        """
+        return self._n_obs
+
     @property
     def n_rep_boot(self):
         """
@@ -1210,6 +1223,12 @@ def draw_sample_splitting(self):
         The samples are drawn according to the attributes
        ``n_folds`` and ``n_rep``.
 
+        Parameters
+        ----------
+        n_obs : int or None
+            The number of observations. If ``None``, the number of observations is set to the number of observations in
+            the data set.
+
         Returns
         -------
         self : object
@@ -1218,14 +1237,14 @@
             obj_dml_resampling = DoubleMLClusterResampling(
                 n_folds=self._n_folds_per_cluster,
                 n_rep=self.n_rep,
-                n_obs=self._dml_data.n_obs,
+                n_obs=self.n_obs,
                 n_cluster_vars=self._dml_data.n_cluster_vars,
                 cluster_vars=self._dml_data.cluster_vars,
             )
             self._smpls, self._smpls_cluster = obj_dml_resampling.split_samples()
         else:
             obj_dml_resampling = DoubleMLResampling(
-                n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._dml_data.n_obs, stratify=self._strata
+                n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self.n_obs, stratify=self._strata
             )
             self._smpls = obj_dml_resampling.split_samples()
@@ -1292,7 +1311,7 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):
         >>> dml_plr_obj.set_sample_splitting(smpls)
         """
         self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting(
-            all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data
+            all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self.n_obs
         )
 
         (
@@ -1623,6 +1642,15 @@ def sensitivity_benchmark(self, benchmarking_set, fit_args=None):
         Computes a benchmark for a given set of features.
         Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates.
 
+        Parameters
+        ----------
+        benchmarking_set : list
+            List of features to be used for benchmarking.
+
+        fit_args : dict, optional
+            Additional arguments for the fit method.
+            Default is None.
+
         Returns
         -------
         benchmark_results : pandas.DataFrame
diff --git a/doubleml/double_ml_framework.py b/doubleml/double_ml_framework.py
index 60786028..ea1ae9fa 100644
--- a/doubleml/double_ml_framework.py
+++ b/doubleml/double_ml_framework.py
@@ -307,7 +307,7 @@ def __add__(self, other):
             assert np.allclose(self._var_scaling_factors, other._var_scaling_factors)
             var_scaling_factors = self._var_scaling_factors
 
-            # compute standard errors
+            # compute standard errors (Uses factor 1/n for scaling!)
             sigma2_hat = np.divide(np.mean(np.square(scaled_psi), axis=0), var_scaling_factors.reshape(-1, 1))
             all_ses = np.sqrt(sigma2_hat)
             thetas, ses = _aggregate_coefs_and_ses(all_thetas, all_ses, var_scaling_factors)
diff --git a/doubleml/irm/__init__.py b/doubleml/irm/__init__.py
index a48cfe35..7579d6f8 100644
--- a/doubleml/irm/__init__.py
+++ b/doubleml/irm/__init__.py
@@ -2,4 +2,24 @@
 The :mod:`doubleml.irm` module implements double machine learning estimates based on interactive regression models.
""" -__all__ = [] +from .apo import DoubleMLAPO +from .apos import DoubleMLAPOS +from .cvar import DoubleMLCVAR +from .iivm import DoubleMLIIVM +from .irm import DoubleMLIRM +from .lpq import DoubleMLLPQ +from .pq import DoubleMLPQ +from .qte import DoubleMLQTE +from .ssm import DoubleMLSSM + +__all__ = [ + "DoubleMLIRM", + "DoubleMLAPO", + "DoubleMLAPOS", + "DoubleMLCVAR", + "DoubleMLIIVM", + "DoubleMLLPQ", + "DoubleMLPQ", + "DoubleMLQTE", + "DoubleMLSSM", +] diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index e9b160cb..8099342a 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -6,8 +6,8 @@ from joblib import Parallel, delayed from sklearn.base import clone +from doubleml.data import DoubleMLClusterData, DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLClusterData, DoubleMLData from doubleml.double_ml_framework import concat from doubleml.irm.apo import DoubleMLAPO from doubleml.utils._checks import _check_sample_splitting, _check_score, _check_trimming, _check_weights @@ -261,10 +261,10 @@ def smpls(self): """ if self._smpls is None: err_msg = ( - "Sample splitting not specified. Draw samples via .draw_sample splitting(). " + "Sample splitting not specified. Draw samples via .draw_sample_splitting(). " + "External samples not implemented yet." ) - raise ValueError(err_msg) + raise NotImplementedError(err_msg) return self._smpls @property diff --git a/doubleml/irm/cvar.py b/doubleml/irm/cvar.py index e77031e6..d2aeaced 100644 --- a/doubleml/irm/cvar.py +++ b/doubleml/irm/cvar.py @@ -3,8 +3,8 @@ from sklearn.model_selection import StratifiedKFold, train_test_split from sklearn.utils import check_X_y +from doubleml.data.base_data import DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLData from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import ( _check_contains_iv, diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py index d78d6f3e..3f252f2a 100644 --- a/doubleml/irm/iivm.py +++ b/doubleml/irm/iivm.py @@ -2,8 +2,8 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target +from doubleml.data.base_data import DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLData from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import ( _check_binary_predictions, diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py index 72db088e..9bf5ed35 100644 --- a/doubleml/irm/irm.py +++ b/doubleml/irm/irm.py @@ -5,8 +5,8 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target +from doubleml.data.base_data import DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLData from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import ( _check_binary_predictions, diff --git a/doubleml/irm/lpq.py b/doubleml/irm/lpq.py index 56a97969..c98e8fa2 100644 --- a/doubleml/irm/lpq.py +++ b/doubleml/irm/lpq.py @@ -4,8 +4,8 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target +from doubleml.data.base_data import DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLData from doubleml.double_ml_score_mixins import NonLinearScoreMixin from doubleml.utils._checks import _check_quantile, _check_score, _check_treatment, _check_trimming, _check_zero_one_treatment 
 from doubleml.utils._estimation import (
diff --git a/doubleml/irm/pq.py b/doubleml/irm/pq.py
index 4cdcd74c..f64dc471 100644
--- a/doubleml/irm/pq.py
+++ b/doubleml/irm/pq.py
@@ -3,8 +3,8 @@
 from sklearn.model_selection import StratifiedKFold, train_test_split
 from sklearn.utils import check_X_y
 
+from doubleml.data.base_data import DoubleMLData
 from doubleml.double_ml import DoubleML
-from doubleml.double_ml_data import DoubleMLData
 from doubleml.double_ml_score_mixins import NonLinearScoreMixin
 from doubleml.utils._checks import (
     _check_contains_iv,
diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py
index f05269ad..68b91a9a 100644
--- a/doubleml/irm/qte.py
+++ b/doubleml/irm/qte.py
@@ -3,7 +3,7 @@
 from joblib import Parallel, delayed
 from sklearn.base import clone
 
-from doubleml.double_ml_data import DoubleMLClusterData, DoubleMLData
+from doubleml.data import DoubleMLClusterData, DoubleMLData
 from doubleml.double_ml_framework import concat
 from doubleml.irm.cvar import DoubleMLCVAR
 from doubleml.irm.lpq import DoubleMLLPQ
diff --git a/doubleml/irm/ssm.py b/doubleml/irm/ssm.py
index 5a6458ca..c84b326d 100644
--- a/doubleml/irm/ssm.py
+++ b/doubleml/irm/ssm.py
@@ -6,8 +6,8 @@
 from sklearn.model_selection import train_test_split
 from sklearn.utils import check_X_y
 
+from doubleml.data.base_data import DoubleMLData
 from doubleml.double_ml import DoubleML
-from doubleml.double_ml_data import DoubleMLData
 from doubleml.double_ml_score_mixins import LinearScoreMixin
 from doubleml.utils._checks import _check_finite_predictions, _check_score, _check_trimming
 from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls_2d, _predict_zero_one_propensity
diff --git a/doubleml/irm/tests/_utils_apos_manual.py b/doubleml/irm/tests/_utils_apos_manual.py
index efc5eea1..88fc59c2 100644
--- a/doubleml/irm/tests/_utils_apos_manual.py
+++ b/doubleml/irm/tests/_utils_apos_manual.py
@@ -1,7 +1,7 @@
 import numpy as np
 from sklearn.base import clone
 
-from ...double_ml_data import DoubleMLData
+from ...data.base_data import DoubleMLData
 from ...tests._utils_boot import draw_weights
 from ..apo import DoubleMLAPO
diff --git a/doubleml/irm/tests/_utils_qte_manual.py b/doubleml/irm/tests/_utils_qte_manual.py
index 25de79cd..0e19e03e 100644
--- a/doubleml/irm/tests/_utils_qte_manual.py
+++ b/doubleml/irm/tests/_utils_qte_manual.py
@@ -1,7 +1,7 @@
 import numpy as np
 from sklearn.base import clone
 
-from ...double_ml_data import DoubleMLData
+from ...data.base_data import DoubleMLData
 from ...tests._utils_boot import draw_weights
 from ...utils._estimation import _default_kde
 from ..pq import DoubleMLPQ
diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py
index 8e9a0b8a..c309b7e2 100644
--- a/doubleml/irm/tests/test_apos_exceptions.py
+++ b/doubleml/irm/tests/test_apos_exceptions.py
@@ -86,8 +86,8 @@ def test_apos_exception_ipw_normalization():
 def test_apos_exception_properties_and_methods():
     # properties
     dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, draw_sample_splitting=False)
-    msg = r"Sample splitting not specified. Draw samples via .draw_sample splitting\(\). External samples not implemented yet."
-    with pytest.raises(ValueError, match=msg):
+    msg = r"Sample splitting not specified. Draw samples via .draw_sample_splitting\(\). External samples not implemented yet."
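+    # smpls now raises NotImplementedError instead of ValueError when sample
+    # splitting is missing (see the corresponding change in doubleml/irm/apos.py).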
+    with pytest.raises(NotImplementedError, match=msg):
         _ = dml_obj.smpls
 
     # methods
diff --git a/doubleml/irm/tests/test_qte_exceptions.py b/doubleml/irm/tests/test_qte_exceptions.py
index 32193c30..9f94f5d4 100644
--- a/doubleml/irm/tests/test_qte_exceptions.py
+++ b/doubleml/irm/tests/test_qte_exceptions.py
@@ -5,8 +5,8 @@
 from sklearn.linear_model import Lasso, LogisticRegression
 
 from doubleml import DoubleMLData, DoubleMLQTE
+from doubleml.data.base_data import DoubleMLBaseData
 from doubleml.datasets import make_irm_data
-from doubleml.double_ml_data import DoubleMLBaseData
 
 np.random.seed(42)
 n = 100
diff --git a/doubleml/irm/tests/test_ssm_exceptions.py b/doubleml/irm/tests/test_ssm_exceptions.py
index 1f5c6d46..6ff276e3 100644
--- a/doubleml/irm/tests/test_ssm_exceptions.py
+++ b/doubleml/irm/tests/test_ssm_exceptions.py
@@ -5,8 +5,8 @@
 from sklearn.linear_model import Lasso, LogisticRegression
 
 from doubleml import DoubleMLSSM
+from doubleml.data.base_data import DoubleMLBaseData
 from doubleml.datasets import make_ssm_data
-from doubleml.double_ml_data import DoubleMLBaseData
 
 np.random.seed(3141)
 n = 100
diff --git a/doubleml/plm/pliv.py b/doubleml/plm/pliv.py
index dc0fbd29..ba022688 100644
--- a/doubleml/plm/pliv.py
+++ b/doubleml/plm/pliv.py
@@ -6,8 +6,8 @@
 from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV
 from sklearn.utils import check_X_y
 
+from ..data.base_data import DoubleMLData
 from ..double_ml import DoubleML
-from ..double_ml_data import DoubleMLData
 from ..double_ml_score_mixins import LinearScoreMixin
 from ..utils._checks import _check_finite_predictions
 from ..utils._estimation import _dml_cv_predict, _dml_tune
diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py
index 1b45d865..a81bac48 100644
--- a/doubleml/plm/plr.py
+++ b/doubleml/plm/plr.py
@@ -5,8 +5,8 @@
 from sklearn.base import clone
 from sklearn.utils import check_X_y
 
+from ..data.base_data import DoubleMLData
 from ..double_ml import DoubleML
-from ..double_ml_data import DoubleMLData
 from ..double_ml_score_mixins import LinearScoreMixin
 from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_is_propensity, _check_score
 from ..utils._estimation import _dml_cv_predict, _dml_tune
diff --git a/doubleml/tests/_utils.py b/doubleml/tests/_utils.py
index eeeaab3d..c9d042d1 100644
--- a/doubleml/tests/_utils.py
+++ b/doubleml/tests/_utils.py
@@ -4,7 +4,7 @@
 from sklearn.base import clone
 from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
 
-from ..double_ml_data import DoubleMLBaseData
+from ..data.base_data import DoubleMLBaseData
 from ..utils._estimation import _aggregate_coefs_and_ses, _var_est
diff --git a/doubleml/tests/conftest.py b/doubleml/tests/conftest.py
index 248697b8..bf53d788 100644
--- a/doubleml/tests/conftest.py
+++ b/doubleml/tests/conftest.py
@@ -4,7 +4,7 @@
 from sklearn.datasets import make_classification, make_regression, make_spd_matrix
 
 from doubleml import DoubleMLData
-from doubleml.datasets import make_irm_data, make_pliv_CHS2015, make_plr_turrell2018
+from doubleml.datasets import make_pliv_CHS2015, make_plr_turrell2018
 
 
 def _g(x):
@@ -55,26 +55,6 @@ def generate_data1(request):
     return data
 
 
-@pytest.fixture(scope="session", params=[(500, 10), (1000, 20)])
-def generate_data_irm_w_missings(request):
-    n_p = request.param
-    np.random.seed(1111)
-    # setting parameters
-    n = n_p[0]
-    p = n_p[1]
-    theta = 0.5
-
-    # generating data
-    (x, y, d) = make_irm_data(n, p, theta, return_type="array")
-
-    # randomly set some entries to np.nan
-    ind = np.random.choice(np.arange(x.size), replace=False, size=int(x.size * 0.05))
-    x[np.unravel_index(ind, x.shape)] = np.nan
-    data = (x, y, d)
-
-    return data
-
-
 @pytest.fixture(scope="session", params=[(1000, 20)])
 def generate_data_iv(request):
     n_p = request.param
diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py
index 2f3ff80a..67f612e8 100644
--- a/doubleml/tests/test_datasets.py
+++ b/doubleml/tests/test_datasets.py
@@ -9,7 +9,6 @@
     fetch_bonus,
     make_confounded_irm_data,
     make_confounded_plr_data,
-    make_did_SZ2020,
     make_heterogeneous_data,
     make_iivm_data,
     make_irm_data,
@@ -165,42 +164,6 @@ def test_make_pliv_multiway_cluster_CKMS2021_return_types():
     _ = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="matrix")
 
 
-@pytest.fixture(scope="function", params=[False, True])
-def cross_sectional(request):
-    return request.param
-
-
-@pytest.fixture(scope="function", params=[1, 2, 3, 4, 5, 6])
-def dgp_type(request):
-    return request.param
-
-
-@pytest.mark.ci
-def test_make_did_SZ2020_return_types(cross_sectional, dgp_type):
-    np.random.seed(3141)
-    res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=DoubleMLData)
-    assert isinstance(res, DoubleMLData)
-    res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=pd.DataFrame)
-    assert isinstance(res, pd.DataFrame)
-    if cross_sectional:
-        x, y, d, t = make_did_SZ2020(
-            n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=np.ndarray
-        )
-        assert isinstance(t, np.ndarray)
-    else:
-        x, y, d, _ = make_did_SZ2020(
-            n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=np.ndarray
-        )
-    assert isinstance(x, np.ndarray)
-    assert isinstance(y, np.ndarray)
-    assert isinstance(d, np.ndarray)
-    with pytest.raises(ValueError, match=msg_inv_return_type):
-        _ = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type="matrix")
-    msg = "The dgp_type is not valid."
-    with pytest.raises(ValueError, match=msg):
-        _ = make_did_SZ2020(n_obs=100, dgp_type="5", cross_sectional_data=cross_sectional, return_type="matrix")
-
-
 @pytest.fixture(scope="function", params=[True, False])
 def linear(request):
     return request.param
diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py
index e5fa1924..a4655bb9 100644
--- a/doubleml/tests/test_exceptions.py
+++ b/doubleml/tests/test_exceptions.py
@@ -22,13 +22,13 @@
     DoubleMLQTE,
 )
 from doubleml.datasets import (
-    make_did_SZ2020,
     make_iivm_data,
     make_irm_data,
     make_pliv_CHS2015,
     make_pliv_multiway_cluster_CKMS2021,
     make_plr_CCDDHNR2018,
 )
+from doubleml.did.datasets import make_did_SZ2020
 
 from ._utils import DummyDataClass
diff --git a/doubleml/tests/test_model_defaults.py b/doubleml/tests/test_model_defaults.py
index 401827b1..f55a555c 100644
--- a/doubleml/tests/test_model_defaults.py
+++ b/doubleml/tests/test_model_defaults.py
@@ -5,13 +5,13 @@
 import doubleml as dml
 from doubleml.datasets import (
-    make_did_SZ2020,
     make_iivm_data,
     make_irm_data,
     make_pliv_CHS2015,
     make_plr_CCDDHNR2018,
     make_ssm_data,
 )
+from doubleml.did.datasets import make_did_SZ2020
 
 np.random.seed(3141)
 dml_data_plr = make_plr_CCDDHNR2018(n_obs=100)
diff --git a/doubleml/tests/test_return_types.py b/doubleml/tests/test_return_types.py
index 51c39c24..11ebd624 100644
--- a/doubleml/tests/test_return_types.py
+++ b/doubleml/tests/test_return_types.py
@@ -24,7 +24,6 @@
     DoubleMLSSM,
 )
 from doubleml.datasets import (
-    make_did_SZ2020,
     make_iivm_data,
     make_irm_data,
     make_pliv_CHS2015,
@@ -32,6 +31,7 @@
     make_plr_CCDDHNR2018,
     make_ssm_data,
 )
+from doubleml.did.datasets import make_did_SZ2020
 
 np.random.seed(3141)
 n_obs = 200
diff --git a/doubleml/utils/_aliases.py b/doubleml/utils/_aliases.py
new file mode 100644
index 00000000..e52a5818
--- /dev/null
+++ b/doubleml/utils/_aliases.py
@@ -0,0 +1,29 @@
+import numpy as np
+import pandas as pd
+
+from doubleml.data import DoubleMLClusterData, DoubleMLData
+
+_array_alias = ["array", "np.ndarray", "np.array", np.ndarray]
+_data_frame_alias = ["DataFrame", "pd.DataFrame", pd.DataFrame]
+_dml_data_alias = ["DoubleMLData", DoubleMLData]
+_dml_cluster_data_alias = ["DoubleMLClusterData", DoubleMLClusterData]
+
+
+def _get_array_alias():
+    """Returns the list of array aliases."""
+    return _array_alias
+
+
+def _get_data_frame_alias():
+    """Returns the list of data frame aliases."""
+    return _data_frame_alias
+
+
+def _get_dml_data_alias():
+    """Returns the list of DoubleMLData aliases."""
+    return _dml_data_alias
+
+
+def _get_dml_cluster_data_alias():
+    """Returns the list of DoubleMLClusterData aliases."""
+    return _dml_cluster_data_alias
diff --git a/doubleml/utils/_check_defaults.py b/doubleml/utils/_check_defaults.py
new file mode 100644
index 00000000..5f376000
--- /dev/null
+++ b/doubleml/utils/_check_defaults.py
@@ -0,0 +1,61 @@
+import numpy as np
+import pandas as pd
+
+from doubleml.double_ml import DoubleML
+
+
+def _check_basic_defaults_before_fit(dml_obj):
+    # general parameters
+    assert dml_obj.n_folds == 5
+    assert dml_obj.n_rep == 1
+    assert dml_obj.framework is None
+    pd.testing.assert_frame_equal(dml_obj.summary, pd.DataFrame(columns=["coef", "std err", "t", "P>|t|"]))
+
+    # bootstrap
+    assert dml_obj.boot_method is None
+    assert dml_obj.n_rep_boot is None
+    assert dml_obj.boot_t_stat is None
+
+    # sensitivity
+    assert dml_obj.sensitivity_params is None
+    assert dml_obj.sensitivity_elements is None
+
+
+def _fit_bootstrap(dml_obj):
+    dml_obj.fit()
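+    # bootstrap() is called with the package defaults here; the after-fit check
+    # below asserts these resolve to boot_method "normal" and n_rep_boot == 500.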
+    dml_obj.bootstrap()
+
+
+def _check_basic_defaults_after_fit(dml_obj):
+    # general parameters
+    assert dml_obj.n_folds == 5
+    assert dml_obj.n_rep == 1
+    assert dml_obj.framework is not None
+
+    # coefs and se
+    assert isinstance(dml_obj.coef, np.ndarray)
+    assert isinstance(dml_obj.se, np.ndarray)
+    assert isinstance(dml_obj.all_coef, np.ndarray)
+    assert isinstance(dml_obj.all_se, np.ndarray)
+    assert isinstance(dml_obj.t_stat, np.ndarray)
+    assert isinstance(dml_obj.pval, np.ndarray)
+
+    # bootstrap
+    assert dml_obj.boot_method == "normal"
+    assert dml_obj.n_rep_boot == 500
+    assert isinstance(dml_obj.boot_t_stat, np.ndarray)
+
+    # sensitivity
+    assert dml_obj.sensitivity_params is None
+    assert isinstance(dml_obj.sensitivity_elements, dict)
+
+    # fit method
+    if isinstance(dml_obj, DoubleML):
+        assert dml_obj.predictions is not None
+        assert dml_obj.models is None
+
+    # confint method
+    assert dml_obj.confint().equals(dml_obj.confint(joint=False, level=0.95))
+
+    # p_adjust method
+    assert dml_obj.p_adjust().equals(dml_obj.p_adjust(method="romano-wolf"))
diff --git a/doubleml/utils/_check_return_types.py b/doubleml/utils/_check_return_types.py
new file mode 100644
index 00000000..54462059
--- /dev/null
+++ b/doubleml/utils/_check_return_types.py
@@ -0,0 +1,153 @@
+import numpy as np
+import pandas as pd
+import plotly
+
+from doubleml import DoubleMLFramework
+from doubleml.data import DoubleMLClusterData
+from doubleml.double_ml_score_mixins import NonLinearScoreMixin
+
+
+def check_basic_return_types(dml_obj, cls):
+    # ToDo: A second test case with multiple treatment variables would be helpful
+    assert isinstance(dml_obj.__str__(), str)
+    assert isinstance(dml_obj.summary, pd.DataFrame)
+    assert isinstance(dml_obj.draw_sample_splitting(), cls)
+    if not dml_obj._is_cluster_data:
+        assert isinstance(dml_obj.set_sample_splitting(dml_obj.smpls), cls)
+    else:
+        assert isinstance(dml_obj._dml_data, DoubleMLClusterData)
+    assert isinstance(dml_obj.fit(), cls)
+    assert isinstance(dml_obj.__str__(), str)  # called again after fit, now with numbers
+    assert isinstance(dml_obj.summary, pd.DataFrame)  # called again after fit, now with numbers
+    if not dml_obj._is_cluster_data:
+        assert isinstance(dml_obj.bootstrap(), cls)
+    else:
+        assert isinstance(dml_obj._dml_data, DoubleMLClusterData)
+    assert isinstance(dml_obj.confint(), pd.DataFrame)
+    if not dml_obj._is_cluster_data:
+        assert isinstance(dml_obj.p_adjust(), pd.DataFrame)
+    else:
+        assert isinstance(dml_obj.p_adjust("bonferroni"), pd.DataFrame)
+    assert isinstance(dml_obj._dml_data.__str__(), str)
+
+
+def check_basic_property_types_and_shapes(dml_obj, n_obs, n_treat, n_rep, n_folds, n_rep_boot):
+    # not checked: learner, learner_names, params, params_names, score
+    # already checked: summary
+
+    # check that the setting is still in line with the hard-coded values
+    assert dml_obj._dml_data.n_treat == n_treat
+    assert dml_obj.n_rep == n_rep
+    assert dml_obj.n_folds == n_folds
+    assert dml_obj._dml_data.n_obs == n_obs
+    assert dml_obj.n_rep_boot == n_rep_boot
+
+    assert isinstance(dml_obj.all_coef, np.ndarray)
+    assert dml_obj.all_coef.shape == (n_treat, n_rep)
+
+    assert isinstance(dml_obj.all_se, np.ndarray)
+    assert dml_obj.all_se.shape == (n_treat, n_rep)
+
+    assert isinstance(dml_obj.boot_t_stat, np.ndarray)
+    assert dml_obj.boot_t_stat.shape == (n_rep_boot, n_treat, n_rep)
+
+    assert isinstance(dml_obj.coef, np.ndarray)
+    assert dml_obj.coef.shape == (n_treat,)
+
+    assert isinstance(dml_obj.psi, np.ndarray)
+    assert dml_obj.psi.shape == (
+        n_obs,
+        n_rep,
+        n_treat,
+    )
+
+    is_nonlinear = isinstance(dml_obj, NonLinearScoreMixin)
+    if is_nonlinear:
+        for score_element in dml_obj._score_element_names:
+            assert isinstance(dml_obj.psi_elements[score_element], np.ndarray)
+            assert dml_obj.psi_elements[score_element].shape == (
+                n_obs,
+                n_rep,
+                n_treat,
+            )
+    else:
+        assert isinstance(dml_obj.psi_elements["psi_a"], np.ndarray)
+        assert dml_obj.psi_elements["psi_a"].shape == (
+            n_obs,
+            n_rep,
+            n_treat,
+        )
+
+        assert isinstance(dml_obj.psi_elements["psi_b"], np.ndarray)
+        assert dml_obj.psi_elements["psi_b"].shape == (
+            n_obs,
+            n_rep,
+            n_treat,
+        )
+
+    assert isinstance(dml_obj.framework, DoubleMLFramework)
+    assert isinstance(dml_obj.pval, np.ndarray)
+    assert dml_obj.pval.shape == (n_treat,)
+
+    assert isinstance(dml_obj.se, np.ndarray)
+    assert dml_obj.se.shape == (n_treat,)
+
+    assert isinstance(dml_obj.t_stat, np.ndarray)
+    assert dml_obj.t_stat.shape == (n_treat,)
+
+    assert isinstance(dml_obj._dml_data.binary_treats, pd.Series)
+    assert len(dml_obj._dml_data.binary_treats) == n_treat
+
+    assert isinstance(dml_obj.smpls, list)
+    assert len(dml_obj.smpls) == n_rep
+    all_tuple = all([all([isinstance(tpl, tuple) for tpl in smpl]) for smpl in dml_obj.smpls])
+    assert all_tuple
+    all_pairs = all([all([len(tpl) == 2 for tpl in smpl]) for smpl in dml_obj.smpls])
+    assert all_pairs
+    n_folds_each_smpl = np.array([len(smpl) for smpl in dml_obj.smpls])
+    assert np.all(n_folds_each_smpl == n_folds_each_smpl[0])
+    assert n_folds_each_smpl[0] == n_folds
+
+    return
+
+
+def check_basic_predictions_and_targets(dml_obj, n_obs, n_treat, n_rep):
+
+    expected_keys = dml_obj.params_names
+    for key in expected_keys:
+        assert isinstance(dml_obj.predictions[key], np.ndarray)
+        assert dml_obj.predictions[key].shape == (n_obs, n_rep, n_treat)
+
+        assert isinstance(dml_obj.nuisance_targets[key], np.ndarray)
+        assert dml_obj.nuisance_targets[key].shape == (n_obs, n_rep, n_treat)
+
+        assert isinstance(dml_obj.nuisance_loss[key], np.ndarray)
+        assert dml_obj.nuisance_loss[key].shape == (n_rep, n_treat)
+
+    return
+
+
+def check_sensitivity_return_types(dml_obj, n_obs, n_rep, n_treat, benchmarking_set):
+    assert isinstance(dml_obj.sensitivity_elements, dict)
+    for key in ["sigma2", "nu2"]:
+        assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray)
+        assert dml_obj.sensitivity_elements[key].shape == (1, n_rep, n_treat)
+    for key in ["psi_sigma2", "psi_nu2", "riesz_rep"]:
+        assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray)
+        assert dml_obj.sensitivity_elements[key].shape == (n_obs, n_rep, n_treat)
+
+    assert isinstance(dml_obj.sensitivity_summary, str)
+    dml_obj.sensitivity_analysis()
+    assert isinstance(dml_obj.sensitivity_summary, str)
+    assert isinstance(dml_obj.sensitivity_plot(), plotly.graph_objs._figure.Figure)
+    benchmarks = {"cf_y": [0.1, 0.2], "cf_d": [0.15, 0.2], "name": ["test1", "test2"]}
+    assert isinstance(dml_obj.sensitivity_plot(value="ci", benchmarks=benchmarks), plotly.graph_objs._figure.Figure)
+
+    assert isinstance(dml_obj.framework._calc_sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95), dict)
+    assert isinstance(
+        dml_obj.framework._calc_robustness_value(null_hypothesis=0.0, level=0.95, rho=1.0, idx_treatment=0), tuple
+    )
+    benchmark = dml_obj.sensitivity_benchmark(benchmarking_set=benchmarking_set)
+    assert isinstance(benchmark, pd.DataFrame)
+
+    return
diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index 90833ded..db1fbf94 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -438,16 +438,17 @@ def _check_cluster_sample_splitting(all_smpls_cluster, dml_data, n_rep, n_folds)
     return smpls_cluster
 
 
-def _check_sample_splitting(all_smpls, all_smpls_cluster, dml_data, is_cluster_data):
+def _check_sample_splitting(all_smpls, all_smpls_cluster, dml_data, is_cluster_data, n_obs=None):
+    # default value for n_obs is None (different for e.g. DoubleMLPanelData)
+    if n_obs is None:
+        n_obs = dml_data.n_obs
     if isinstance(all_smpls, tuple):
         if not len(all_smpls) == 2:
             raise ValueError(
                 "Invalid partition provided. Tuple for train_ind and test_ind must consist of exactly two elements."
             )
-        all_smpls = _check_smpl_split_tpl(all_smpls, dml_data.n_obs)
-        if _check_is_partition([all_smpls], dml_data.n_obs) & _check_is_partition(
-            [(all_smpls[1], all_smpls[0])], dml_data.n_obs
-        ):
+        all_smpls = _check_smpl_split_tpl(all_smpls, n_obs)
+        if _check_is_partition([all_smpls], n_obs) & _check_is_partition([(all_smpls[1], all_smpls[0])], n_obs):
             n_rep = 1
             n_folds = 1
             smpls = [[all_smpls]]
@@ -465,14 +466,14 @@
                 "Invalid partition provided. All tuples for train_ind and test_ind must consist of exactly two elements."
             )
             n_rep = 1
-            all_smpls = _check_smpl_split(all_smpls, dml_data.n_obs)
-            if _check_is_partition(all_smpls, dml_data.n_obs):
-                if (len(all_smpls) == 1) & _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], dml_data.n_obs):
+            all_smpls = _check_smpl_split(all_smpls, n_obs)
+            if _check_is_partition(all_smpls, n_obs):
+                if (len(all_smpls) == 1) & _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], n_obs):
                     n_folds = 1
                     smpls = [all_smpls]
                 else:
                     n_folds = len(all_smpls)
-                    smpls = _check_all_smpls([all_smpls], dml_data.n_obs, check_intersect=True)
+                    smpls = _check_all_smpls([all_smpls], n_obs, check_intersect=True)
             else:
                 raise ValueError("Invalid partition provided. Tuples provided that don't form a partition.")
     else:
@@ -494,13 +495,13 @@
         n_folds_each_smpl = np.array([len(smpl) for smpl in all_smpls])
         if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]):
             raise ValueError("Invalid partition provided. Different number of folds for repeated sample splitting.")
-        all_smpls = _check_all_smpls(all_smpls, dml_data.n_obs)
-        smpls_are_partitions = [_check_is_partition(smpl, dml_data.n_obs) for smpl in all_smpls]
+        all_smpls = _check_all_smpls(all_smpls, n_obs)
+        smpls_are_partitions = [_check_is_partition(smpl, n_obs) for smpl in all_smpls]
 
         if all(smpls_are_partitions):
             n_rep = len(all_smpls)
             n_folds = int(n_folds_each_smpl[0])
-            smpls = _check_all_smpls(all_smpls, dml_data.n_obs, check_intersect=True)
+            smpls = _check_all_smpls(all_smpls, n_obs, check_intersect=True)
         else:
             raise ValueError("Invalid partition provided. At least one inner list does not form a partition.")
diff --git a/pyproject.toml b/pyproject.toml
index 339bd0a3..41f52706 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     "scikit-learn>=1.4.0",
     "statsmodels",
     "matplotlib",
+    "seaborn>=0.13",
     "plotly"
 ]
 classifiers = [