diff --git a/.gitignore b/.gitignore
index ba091906..306442b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,4 +30,4 @@ MANIFEST
 *.idea
 *.vscode
 .flake8
-.coverage
\ No newline at end of file
+.coverage
diff --git a/doubleml/__init__.py b/doubleml/__init__.py
index a86735c8..102ea995 100644
--- a/doubleml/__init__.py
+++ b/doubleml/__init__.py
@@ -1,8 +1,8 @@
 import importlib.metadata

+from .data import DoubleMLClusterData, DoubleMLData
 from .did.did import DoubleMLDID
 from .did.did_cs import DoubleMLDIDCS
-from .double_ml_data import DoubleMLClusterData, DoubleMLData
 from .double_ml_framework import DoubleMLFramework, concat
 from .irm.apo import DoubleMLAPO
 from .irm.apos import DoubleMLAPOS
diff --git a/doubleml/data/__init__.py b/doubleml/data/__init__.py
new file mode 100644
index 00000000..d8a920c6
--- /dev/null
+++ b/doubleml/data/__init__.py
@@ -0,0 +1,13 @@
+"""
+The :mod:`doubleml.data` module implements data classes for double machine learning.
+"""
+
+from .base_data import DoubleMLData
+from .cluster_data import DoubleMLClusterData
+from .panel_data import DoubleMLPanelData
+
+__all__ = [
+    "DoubleMLData",
+    "DoubleMLClusterData",
+    "DoubleMLPanelData",
+]
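Editor's note: with the data backends split into the new ``doubleml.data`` subpackage, the public import paths follow directly from the two ``__init__.py`` hunks above. A minimal sketch of the resulting imports (``DoubleMLData`` and ``DoubleMLClusterData`` stay re-exported from the package root; ``DoubleMLPanelData`` is exposed via ``doubleml.data``):

# Import paths implied by the __init__.py hunks in this diff.
from doubleml import DoubleMLClusterData, DoubleMLData
from doubleml.data import DoubleMLPanelData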
diff --git a/doubleml/double_ml_data.py b/doubleml/data/base_data.py
similarity index 57%
rename from doubleml/double_ml_data.py
rename to doubleml/data/base_data.py
index 3ebf2f76..318508e9 100644
--- a/doubleml/double_ml_data.py
+++ b/doubleml/data/base_data.py
@@ -7,8 +7,7 @@
 from sklearn.utils.multiclass import type_of_target
 from sklearn.utils.validation import check_array, check_consistent_length, column_or_1d

-from .utils._checks import _check_set
-from .utils._estimation import _assure_2d_array
+from doubleml.utils._estimation import _assure_2d_array


 class DoubleMLBaseData(ABC):
@@ -127,6 +126,14 @@ class DoubleMLData(DoubleMLBaseData):
         in the covariates ``x``.
         Default is ``True``.

+    force_all_d_finite : bool or str
+        Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``.
+        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+        Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the model used allows for missing
+        and / or infinite values in the treatment variables ``d`` (e.g. panel data models).
+        Default is ``True``.
+
     Examples
     --------
     >>> from doubleml import DoubleMLData
@@ -150,6 +157,7 @@ def __init__(
         s_col=None,
         use_other_treat_as_covariate=True,
         force_all_x_finite=True,
+        force_all_d_finite=True,
     ):
         DoubleMLBaseData.__init__(self, data)

@@ -159,9 +167,10 @@ def __init__(
         self.t_col = t_col
         self.s_col = s_col
         self.x_cols = x_cols
-        self._check_disjoint_sets_y_d_x_z_t_s()
+        self._check_disjoint_sets()
         self.use_other_treat_as_covariate = use_other_treat_as_covariate
         self.force_all_x_finite = force_all_x_finite
+        self.force_all_d_finite = force_all_d_finite
         self._binary_treats = self._check_binary_treats()
         self._binary_outcome = self._check_binary_outcome()
         self._set_y_z_t_s()
@@ -197,7 +206,18 @@ def _data_summary_str(self):
         return data_summary

     @classmethod
-    def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True):
+    def from_arrays(
+        cls,
+        x,
+        y,
+        d,
+        z=None,
+        t=None,
+        s=None,
+        use_other_treat_as_covariate=True,
+        force_all_x_finite=True,
+        force_all_d_finite=True,
+    ):
         """
         Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s.

@@ -237,6 +257,14 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria
             in the covariates ``x``.
             Default is ``True``.

+        force_all_d_finite : bool or str
+            Indicates whether to raise an error on infinite values and / or missings in the treatment variables ``d``.
+            Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+            allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+            Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the model used allows for missing
+            and / or infinite values in the treatment variables ``d`` (e.g. panel data models).
+            Default is ``True``.
+
         Examples
         --------
         >>> from doubleml import DoubleMLData
@@ -255,8 +283,19 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria
         elif not isinstance(force_all_x_finite, bool):
             raise TypeError("Invalid force_all_x_finite. " + "force_all_x_finite must be True, False or 'allow-nan'.")

+        if isinstance(force_all_d_finite, str):
+            if force_all_d_finite != "allow-nan":
+                raise ValueError(
+                    "Invalid force_all_d_finite "
+                    + force_all_d_finite
+                    + ". "
+                    + "force_all_d_finite must be True, False or 'allow-nan'."
+                )
+        elif not isinstance(force_all_d_finite, bool):
+            raise TypeError("Invalid force_all_d_finite. " + "force_all_d_finite must be True, False or 'allow-nan'.")
+
         x = check_array(x, ensure_2d=False, allow_nd=False, force_all_finite=force_all_x_finite)
-        d = check_array(d, ensure_2d=False, allow_nd=False)
+        d = check_array(d, ensure_2d=False, allow_nd=False, force_all_finite=force_all_d_finite)
         y = column_or_1d(y, warn=True)

         x = _assure_2d_array(x)
@@ -296,7 +335,7 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria

         x_cols = [f"X{i + 1}" for i in np.arange(x.shape[1])]

-        # basline version with features, outcome and treatments
+        # baseline version with features, outcome and treatments
         data = pd.DataFrame(np.column_stack((x, y, d)), columns=x_cols + [y_col] + d_cols)

         if z is not None:
@@ -309,7 +348,18 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria
         if s is not None:
             data[s_col] = s

-        return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite)
+        return cls(
+            data,
+            y_col,
+            d_cols,
+            x_cols,
+            z_cols,
+            t_col,
+            s_col,
+            use_other_treat_as_covariate,
+            force_all_x_finite,
+            force_all_d_finite,
+        )

     @property
     def x(self):
" + "force_all_d_finite must be True, False or 'allow-nan'.") + self._force_all_d_finite = value + if reset_value: + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) - if self.s_col is None: - self._s = None - else: - assert_all_finite(self.data.loc[:, self.s_col]) - self._s = self.data.loc[:, self.s_col] + def _set_y_z_t_s(self): + def _set_attr(col): + if col is None: + return None + assert_all_finite(self.data.loc[:, col]) + return self.data.loc[:, col] + + self._y = _set_attr(self.y_col) + self._z = _set_attr(self.z_cols) + self._t = _set_attr(self.t_col) + self._s = _set_attr(self.s_col) def set_x_d(self, treatment_var): """ @@ -655,19 +720,31 @@ def set_x_d(self, treatment_var): xd_list.remove(treatment_var) else: xd_list = self.x_cols - assert_all_finite(self.data.loc[:, treatment_var]) + if self.force_all_d_finite: + assert_all_finite(self.data.loc[:, self.d_cols], allow_nan=self.force_all_d_finite == "allow-nan") if self.force_all_x_finite: assert_all_finite(self.data.loc[:, xd_list], allow_nan=self.force_all_x_finite == "allow-nan") self._d = self.data.loc[:, treatment_var] self._X = self.data.loc[:, xd_list] + def _get_optional_col_sets(self): + # this function can be extended in inherited subclasses + z_cols_set = set(self.z_cols or []) + t_col_set = {self.t_col} if self.t_col else set() + s_col_set = {self.s_col} if self.s_col else set() + + return [z_cols_set, t_col_set, s_col_set] + def _check_binary_treats(self): is_binary = pd.Series(dtype=bool, index=self.d_cols) - for treatment_var in self.d_cols: - this_d = self.data.loc[:, treatment_var] - binary_treat = type_of_target(this_d) == "binary" - zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0) - is_binary[treatment_var] = binary_treat & zero_one_treat + if not self.force_all_d_finite: + is_binary[:] = False # if we allow infinite values, we cannot check for binary + else: + for treatment_var in self.d_cols: + this_d = self.data.loc[:, treatment_var] + binary_treat = type_of_target(this_d) == "binary" + zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0) + is_binary[treatment_var] = binary_treat & zero_one_treat return is_binary def _check_binary_outcome(self): @@ -677,11 +754,18 @@ def _check_binary_outcome(self): is_binary = binary_outcome & zero_one_outcome return is_binary + @staticmethod + def _check_disjoint(set1, set2, name1, arg1, name2, arg2): + """Helper method to check for disjoint sets.""" + if not set1.isdisjoint(set2): + raise ValueError(f"At least one variable/column is set as {name1} ({arg1}) and {name2} ({arg2}).") + def _check_disjoint_sets(self): # this function can be extended in inherited subclasses - self._check_disjoint_sets_y_d_x_z_t_s() + self._check_disjoint_sets_y_d_x() + self._check_disjoint_sets_z_t_s() - def _check_disjoint_sets_y_d_x_z_t_s(self): + def _check_disjoint_sets_y_d_x(self): y_col_set = {self.y_col} x_cols_set = set(self.x_cols) d_cols_set = set(self.d_cols) @@ -700,396 +784,31 @@ def _check_disjoint_sets_y_d_x_z_t_s(self): "(``x_cols``). Consider using parameter ``use_other_treat_as_covariate``." ) - if self.z_cols is not None: - z_cols_set = set(self.z_cols) - if not y_col_set.isdisjoint(z_cols_set): - raise ValueError( - f"{str(self.y_col)} cannot be set as outcome variable ``y_col`` and instrumental variable in ``z_cols``." - ) - if not d_cols_set.isdisjoint(z_cols_set): - raise ValueError( - "At least one variable/column is set as treatment variable (``d_cols``) and " - "instrumental variable in ``z_cols``." 
@@ -677,11 +754,18 @@ def _check_binary_outcome(self):
         is_binary = binary_outcome & zero_one_outcome
         return is_binary

+    @staticmethod
+    def _check_disjoint(set1, set2, name1, arg1, name2, arg2):
+        """Helper method to check for disjoint sets."""
+        if not set1.isdisjoint(set2):
+            raise ValueError(f"At least one variable/column is set as {name1} ({arg1}) and {name2} ({arg2}).")
+
     def _check_disjoint_sets(self):
         # this function can be extended in inherited subclasses
-        self._check_disjoint_sets_y_d_x_z_t_s()
+        self._check_disjoint_sets_y_d_x()
+        self._check_disjoint_sets_z_t_s()

-    def _check_disjoint_sets_y_d_x_z_t_s(self):
+    def _check_disjoint_sets_y_d_x(self):
         y_col_set = {self.y_col}
         x_cols_set = set(self.x_cols)
         d_cols_set = set(self.d_cols)
@@ -700,396 +784,31 @@
                 "(``x_cols``). Consider using parameter ``use_other_treat_as_covariate``."
             )

-        if self.z_cols is not None:
-            z_cols_set = set(self.z_cols)
-            if not y_col_set.isdisjoint(z_cols_set):
-                raise ValueError(
-                    f"{str(self.y_col)} cannot be set as outcome variable ``y_col`` and instrumental variable in ``z_cols``."
-                )
-            if not d_cols_set.isdisjoint(z_cols_set):
-                raise ValueError(
-                    "At least one variable/column is set as treatment variable (``d_cols``) and "
-                    "instrumental variable in ``z_cols``."
-                )
-            if not x_cols_set.isdisjoint(z_cols_set):
-                raise ValueError(
-                    "At least one variable/column is set as covariate (``x_cols``) and instrumental variable in ``z_cols``."
-                )
-
-        self._check_disjoint_sets_t_s()
-
-    def _check_disjoint_sets_t_s(self):
+    def _check_disjoint_sets_z_t_s(self):
         y_col_set = {self.y_col}
         x_cols_set = set(self.x_cols)
         d_cols_set = set(self.d_cols)

-        if self.t_col is not None:
-            t_col_set = {self.t_col}
-            if not t_col_set.isdisjoint(x_cols_set):
-                raise ValueError(f"{str(self.t_col)} cannot be set as time variable ``t_col`` and covariate in ``x_cols``.")
-            if not t_col_set.isdisjoint(d_cols_set):
-                raise ValueError(
-                    f"{str(self.t_col)} cannot be set as time variable ``t_col`` and treatment variable in ``d_cols``."
-                )
-            if not t_col_set.isdisjoint(y_col_set):
-                raise ValueError(f"{str(self.t_col)} cannot be set as time variable ``t_col`` and outcome variable ``y_col``.")
-            if self.z_cols is not None:
-                z_cols_set = set(self.z_cols)
-                if not t_col_set.isdisjoint(z_cols_set):
-                    raise ValueError(
-                        f"{str(self.t_col)} cannot be set as time variable ``t_col`` and instrumental variable in ``z_cols``."
-                    )
-
-        if self.s_col is not None:
-            s_col_set = {self.s_col}
-            if not s_col_set.isdisjoint(x_cols_set):
-                raise ValueError(
-                    f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and covariate in ``x_cols``."
-                )
-            if not s_col_set.isdisjoint(d_cols_set):
-                raise ValueError(
-                    f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and treatment "
-                    "variable in ``d_cols``."
-                )
-            if not s_col_set.isdisjoint(y_col_set):
-                raise ValueError(
-                    f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and outcome variable ``y_col``."
-                )
-            if self.z_cols is not None:
-                z_cols_set = set(self.z_cols)
-                if not s_col_set.isdisjoint(z_cols_set):
-                    raise ValueError(
-                        f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and "
-                        "instrumental variable in ``z_cols``."
-                    )
-            if self.t_col is not None:
-                t_col_set = {self.t_col}
-                if not s_col_set.isdisjoint(t_col_set):
-                    raise ValueError(
-                        f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and time "
-                        "variable ``t_col``."
-                    )
-
-
-class DoubleMLClusterData(DoubleMLData):
-    """Double machine learning data-backend for data with cluster variables.
-
-    :class:`DoubleMLClusterData` objects can be initialized from
-    :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s.
-
-    Parameters
-    ----------
-    data : :class:`pandas.DataFrame`
-        The data.
-
-    y_col : str
-        The outcome variable.
-
-    d_cols : str or list
-        The treatment variable(s).
-
-    cluster_cols : str or list
-        The cluster variable(s).
-
-    x_cols : None, str or list
-        The covariates.
-        If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor
-        treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates.
-        Default is ``None``.
-
-    z_cols : None, str or list
-        The instrumental variable(s).
-        Default is ``None``.
-
-    t_col : None or str
-        The time variable (only relevant/used for DiD Estimators).
-        Default is ``None``.
-
-    s_col : None or str
-        The score or selection variable (only relevant/used for RDD and SSM Estimatiors).
-        Default is ``None``.
-
-    use_other_treat_as_covariate : bool
-        Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
-        Default is ``True``.
-
-    force_all_x_finite : bool or str
-        Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
-        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
-        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
-        Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
-        for the nuisance functions are capable to provide valid predictions with missings and / or infinite values
-        in the covariates ``x``.
-        Default is ``True``.
-
-    Examples
-    --------
-    >>> from doubleml import DoubleMLClusterData
-    >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
-    >>> # initialization from pandas.DataFrame
-    >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame')
-    >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z')
-    >>> # initialization from np.ndarray
-    >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
-    >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
-    """
-
-    def __init__(
-        self,
-        data,
-        y_col,
-        d_cols,
-        cluster_cols,
-        x_cols=None,
-        z_cols=None,
-        t_col=None,
-        s_col=None,
-        use_other_treat_as_covariate=True,
-        force_all_x_finite=True,
-    ):
-        DoubleMLBaseData.__init__(self, data)
-
-        # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter
-        self.cluster_cols = cluster_cols
-        self._set_cluster_vars()
-        DoubleMLData.__init__(
-            self, data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite
-        )
-        self._check_disjoint_sets_cluster_cols()
-
-    def __str__(self):
-        data_summary = self._data_summary_str()
-        buf = io.StringIO()
-        self.data.info(verbose=False, buf=buf)
-        df_info = buf.getvalue()
-        res = (
-            "================== DoubleMLClusterData Object ==================\n"
-            + "\n------------------ Data summary      ------------------\n"
-            + data_summary
-            + "\n------------------ DataFrame info    ------------------\n"
-            + df_info
-        )
-        return res
-
-    def _data_summary_str(self):
-        data_summary = (
-            f"Outcome variable: {self.y_col}\n"
-            f"Treatment variable(s): {self.d_cols}\n"
-            f"Cluster variable(s): {self.cluster_cols}\n"
-            f"Covariates: {self.x_cols}\n"
-            f"Instrument variable(s): {self.z_cols}\n"
-        )
-        if self.t_col is not None:
-            data_summary += f"Time variable: {self.t_col}\n"
-        if self.s_col is not None:
-            data_summary += f"Score/Selection variable: {self.s_col}\n"
-
-        data_summary += f"No. Observations: {self.n_obs}\n"
-        return data_summary
-
-    @classmethod
-    def from_arrays(
-        cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True
-    ):
-        """
-        Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s.
-
-        Parameters
-        ----------
-        x : :class:`numpy.ndarray`
-            Array of covariates.
-
-        y : :class:`numpy.ndarray`
-            Array of the outcome variable.
-
-        d : :class:`numpy.ndarray`
-            Array of treatment variables.
-
-        cluster_vars : :class:`numpy.ndarray`
-            Array of cluster variables.
-
-        z : None or :class:`numpy.ndarray`
-            Array of instrumental variables.
-            Default is ``None``.
-
-        t : :class:`numpy.ndarray`
-            Array of the time variable (only relevant/used for DiD models).
-            Default is ``None``.
-
-        s : :class:`numpy.ndarray`
-            Array of the score or selection variable (only relevant/used for RDD or SSM models).
-            Default is ``None``.
-
-        use_other_treat_as_covariate : bool
-            Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
-            Default is ``True``.
-
-        force_all_x_finite : bool or str
-            Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
-            Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
-            allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
-            Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
-            for the nuisance functions are capable to provide valid predictions with missings and / or infinite values
-            in the covariates ``x``.
-            Default is ``True``.
-
-        Examples
-        --------
-        >>> from doubleml import DoubleMLClusterData
-        >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
-        >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
-        >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
-        """
-        dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite)
-        cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False)
-        cluster_vars = _assure_2d_array(cluster_vars)
-        if cluster_vars.shape[1] == 1:
-            cluster_cols = ["cluster_var"]
-        else:
-            cluster_cols = [f"cluster_var{i + 1}" for i in np.arange(cluster_vars.shape[1])]
-
-        data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1)
-
-        return cls(
-            data,
-            dml_data.y_col,
-            dml_data.d_cols,
-            cluster_cols,
-            dml_data.x_cols,
-            dml_data.z_cols,
-            dml_data.t_col,
-            dml_data.s_col,
-            dml_data.use_other_treat_as_covariate,
-            dml_data.force_all_x_finite,
-        )
-
-    @property
-    def cluster_cols(self):
-        """
-        The cluster variable(s).
-        """
-        return self._cluster_cols
-
-    @cluster_cols.setter
-    def cluster_cols(self, value):
-        reset_value = hasattr(self, "_cluster_cols")
-        if isinstance(value, str):
-            value = [value]
-        if not isinstance(value, list):
-            raise TypeError(
-                "The cluster variable(s) cluster_cols must be of str or list type. "
-                f"{str(value)} of type {str(type(value))} was passed."
-            )
-        if not len(set(value)) == len(value):
-            raise ValueError("Invalid cluster variable(s) cluster_cols: Contains duplicate values.")
-        if not set(value).issubset(set(self.all_variables)):
-            raise ValueError("Invalid cluster variable(s) cluster_cols. At least one cluster variable is no data column.")
-        self._cluster_cols = value
-        if reset_value:
-            self._check_disjoint_sets()
-            self._set_cluster_vars()
-
-    @property
-    def n_cluster_vars(self):
-        """
-        The number of cluster variables.
-        """
-        return len(self.cluster_cols)
-
-    @property
-    def cluster_vars(self):
-        """
-        Array of cluster variable(s).
-        """
-        return self._cluster_vars.values
-
-    @DoubleMLData.x_cols.setter
-    def x_cols(self, value):
-        if value is not None:
-            # this call might become much easier with https://github.com/python/cpython/pull/26194
-            super(self.__class__, self.__class__).x_cols.__set__(self, value)
-        else:
-            if self.s_col is None:
-                if (self.z_cols is not None) & (self.t_col is not None):
-                    y_d_z_t = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_z_t]
-                elif self.z_cols is not None:
-                    y_d_z = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_z]
-                elif self.t_col is not None:
-                    y_d_t = set.union({self.y_col}, set(self.d_cols), {self.t_col}, set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_t]
-                else:
-                    y_d = set.union({self.y_col}, set(self.d_cols), set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d]
-            else:
-                if (self.z_cols is not None) & (self.t_col is not None):
-                    y_d_z_t_s = set.union(
-                        {self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, {self.s_col}, set(self.cluster_cols)
-                    )
-                    x_cols = [col for col in self.data.columns if col not in y_d_z_t_s]
-                elif self.z_cols is not None:
-                    y_d_z_s = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.s_col}, set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_z_s]
-                elif self.t_col is not None:
-                    y_d_t_s = set.union({self.y_col}, set(self.d_cols), {self.t_col}, {self.s_col}, set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_t_s]
-                else:
-                    y_d_s = set.union({self.y_col}, set(self.d_cols), {self.s_col}, set(self.cluster_cols))
-                    x_cols = [col for col in self.data.columns if col not in y_d_s]
-            # this call might become much easier with https://github.com/python/cpython/pull/26194
-            super(self.__class__, self.__class__).x_cols.__set__(self, x_cols)
-
-    def _check_disjoint_sets(self):
-        # apply the standard checks from the DoubleMLData class
-        super(DoubleMLClusterData, self)._check_disjoint_sets()
-        self._check_disjoint_sets_cluster_cols()
-
-    def _check_disjoint_sets_cluster_cols(self):
-        # apply the standard checks from the DoubleMLData class
-        super(DoubleMLClusterData, self)._check_disjoint_sets()
-        # special checks for the additional cluster variables
-        cluster_cols_set = set(self.cluster_cols)
-        y_col_set = {self.y_col}
-        x_cols_set = set(self.x_cols)
-        d_cols_set = set(self.d_cols)
-        t_col_set = {self.t_col}
-        s_col_set = {self.s_col}
+        z_cols_set = set(self.z_cols or [])
+        t_col_set = {self.t_col} if self.t_col else set()
+        s_col_set = {self.s_col} if self.s_col else set()
+
+        instrument_checks_args = [
+            (y_col_set, "outcome variable", "``y_col``"),
+            (d_cols_set, "treatment variable", "``d_cols``"),
+            (x_cols_set, "covariate", "``x_cols``"),
+        ]
+        for set1, name, argument in instrument_checks_args:
+            self._check_disjoint(
+                set1=set1, name1=name, arg1=argument, set2=z_cols_set, name2="instrumental variable", arg2="``z_cols``"
+            )

+        time_check_args = instrument_checks_args + [(z_cols_set, "instrumental variable", "``z_cols``")]
+        for set1, name, argument in time_check_args:
+            self._check_disjoint(set1=set1, name1=name, arg1=argument, set2=t_col_set, name2="time variable", arg2="``t_col``")

-        if not y_col_set.isdisjoint(cluster_cols_set):
-            raise ValueError(
-                f"{str(self.y_col)} cannot be set as outcome variable ``y_col`` and cluster variable in ``cluster_cols``."
-            )
-        if not d_cols_set.isdisjoint(cluster_cols_set):
-            raise ValueError(
-                "At least one variable/column is set as treatment variable (``d_cols``) and "
-                "cluster variable in ``cluster_cols``."
-            )
+        score_check_args = time_check_args + [(t_col_set, "time variable", "``t_col``")]
+        for set1, name, argument in score_check_args:
+            self._check_disjoint(
+                set1=set1, name1=name, arg1=argument, set2=s_col_set, name2="score or selection variable", arg2="``s_col``"
+            )
-        # TODO: Is the following combination allowed, or not?
-        if not x_cols_set.isdisjoint(cluster_cols_set):
-            raise ValueError(
-                "At least one variable/column is set as covariate (``x_cols``) and cluster variable in ``cluster_cols``."
-            )
-        if self.z_cols is not None:
-            z_cols_set = set(self.z_cols)
-            if not z_cols_set.isdisjoint(cluster_cols_set):
-                raise ValueError(
-                    "At least one variable/column is set as instrumental variable (``z_cols``) and "
-                    "cluster variable in ``cluster_cols``."
-                )
-        if self.t_col is not None:
-            if not t_col_set.isdisjoint(cluster_cols_set):
-                raise ValueError(
-                    f"{str(self.t_col)} cannot be set as time variable ``t_col`` and cluster variable in ``cluster_cols``."
-                )
-        if self.s_col is not None:
-            if not s_col_set.isdisjoint(cluster_cols_set):
-                raise ValueError(
-                    f"{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and "
-                    "cluster variable in ``cluster_cols``."
-                )
-
-    def _set_cluster_vars(self):
-        assert_all_finite(self.data.loc[:, self.cluster_cols])
-        self._cluster_vars = self.data.loc[:, self.cluster_cols]
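Editor's note: a minimal sketch of the consolidated disjointness validation above, assuming the behaviour added in this diff. Reusing a column for two roles now goes through the single ``_check_disjoint`` helper instead of the old hand-written checks:

import numpy as np
import pandas as pd
from doubleml import DoubleMLData

df = pd.DataFrame(np.ones((5, 4)), columns=["y", "d", "x1", "x2"])
try:
    # "x1" is set both as covariate and as instrument
    DoubleMLData(df, y_col="y", d_cols="d", x_cols=["x1", "x2"], z_cols="x1")
except ValueError as err:
    print(err)  # raised by _check_disjoint via _check_disjoint_sets_z_t_s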
diff --git a/doubleml/data/cluster_data.py b/doubleml/data/cluster_data.py
new file mode 100644
index 00000000..658ab0cc
--- /dev/null
+++ b/doubleml/data/cluster_data.py
@@ -0,0 +1,289 @@
+import io
+
+import numpy as np
+import pandas as pd
+from sklearn.utils import assert_all_finite
+from sklearn.utils.validation import check_array
+
+from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData
+from doubleml.utils._estimation import _assure_2d_array
+
+
+class DoubleMLClusterData(DoubleMLData):
+    """Double machine learning data-backend for data with cluster variables.
+
+    :class:`DoubleMLClusterData` objects can be initialized from
+    :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s.
+
+    Parameters
+    ----------
+    data : :class:`pandas.DataFrame`
+        The data.
+
+    y_col : str
+        The outcome variable.
+
+    d_cols : str or list
+        The treatment variable(s).
+
+    cluster_cols : str or list
+        The cluster variable(s).
+
+    x_cols : None, str or list
+        The covariates.
+        If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor
+        treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates.
+        Default is ``None``.
+
+    z_cols : None, str or list
+        The instrumental variable(s).
+        Default is ``None``.
+
+    t_col : None or str
+        The time variable (only relevant/used for DiD Estimators).
+        Default is ``None``.
+
+    s_col : None or str
+        The score or selection variable (only relevant/used for RDD and SSM Estimators).
+        Default is ``None``.
+
+    use_other_treat_as_covariate : bool
+        Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
+        Default is ``True``.
+
+    force_all_x_finite : bool or str
+        Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
+        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+        Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
+        for the nuisance functions are capable of providing valid predictions with missings and / or infinite values
+        in the covariates ``x``.
+        Default is ``True``.
+
+    Examples
+    --------
+    >>> from doubleml import DoubleMLClusterData
+    >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
+    >>> # initialization from pandas.DataFrame
+    >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame')
+    >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z')
+    >>> # initialization from np.ndarray
+    >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
+    >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
+    """
+
+    def __init__(
+        self,
+        data,
+        y_col,
+        d_cols,
+        cluster_cols,
+        x_cols=None,
+        z_cols=None,
+        t_col=None,
+        s_col=None,
+        use_other_treat_as_covariate=True,
+        force_all_x_finite=True,
+    ):
+        DoubleMLBaseData.__init__(self, data)
+
+        # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter
+        self.cluster_cols = cluster_cols
+        self._set_cluster_vars()
+        DoubleMLData.__init__(
+            self, data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite
+        )
+        self._check_disjoint_sets_cluster_cols()
+
+    def __str__(self):
+        data_summary = self._data_summary_str()
+        buf = io.StringIO()
+        self.data.info(verbose=False, buf=buf)
+        df_info = buf.getvalue()
+        res = (
+            "================== DoubleMLClusterData Object ==================\n"
+            + "\n------------------ Data summary      ------------------\n"
+            + data_summary
+            + "\n------------------ DataFrame info    ------------------\n"
+            + df_info
+        )
+        return res
+
+    def _data_summary_str(self):
+        data_summary = (
+            f"Outcome variable: {self.y_col}\n"
+            f"Treatment variable(s): {self.d_cols}\n"
+            f"Cluster variable(s): {self.cluster_cols}\n"
+            f"Covariates: {self.x_cols}\n"
+            f"Instrument variable(s): {self.z_cols}\n"
+        )
+        if self.t_col is not None:
+            data_summary += f"Time variable: {self.t_col}\n"
+        if self.s_col is not None:
+            data_summary += f"Score/Selection variable: {self.s_col}\n"
+
+        data_summary += f"No. Observations: {self.n_obs}\n"
+        return data_summary
+
+    @classmethod
+    def from_arrays(
+        cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True
+    ):
+        """
+        Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s.
+
+        Parameters
+        ----------
+        x : :class:`numpy.ndarray`
+            Array of covariates.
+
+        y : :class:`numpy.ndarray`
+            Array of the outcome variable.
+
+        d : :class:`numpy.ndarray`
+            Array of treatment variables.
+
+        cluster_vars : :class:`numpy.ndarray`
+            Array of cluster variables.
+
+        z : None or :class:`numpy.ndarray`
+            Array of instrumental variables.
+            Default is ``None``.
+
+        t : None or :class:`numpy.ndarray`
+            Array of the time variable (only relevant/used for DiD models).
+            Default is ``None``.
+
+        s : None or :class:`numpy.ndarray`
+            Array of the score or selection variable (only relevant/used for RDD or SSM models).
+            Default is ``None``.
+
+        use_other_treat_as_covariate : bool
+            Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
+            Default is ``True``.
+
+        force_all_x_finite : bool or str
+            Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
+            Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+            allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+            Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
+            for the nuisance functions are capable of providing valid predictions with missings and / or infinite values
+            in the covariates ``x``.
+            Default is ``True``.
+
+        Examples
+        --------
+        >>> from doubleml import DoubleMLClusterData
+        >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021
+        >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array')
+        >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z)
+        """
+        dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite)
+        cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False)
+        cluster_vars = _assure_2d_array(cluster_vars)
+        if cluster_vars.shape[1] == 1:
+            cluster_cols = ["cluster_var"]
+        else:
+            cluster_cols = [f"cluster_var{i + 1}" for i in np.arange(cluster_vars.shape[1])]
+
+        data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1)
+
+        return cls(
+            data,
+            dml_data.y_col,
+            dml_data.d_cols,
+            cluster_cols,
+            dml_data.x_cols,
+            dml_data.z_cols,
+            dml_data.t_col,
+            dml_data.s_col,
+            dml_data.use_other_treat_as_covariate,
+            dml_data.force_all_x_finite,
+        )
+
+    @property
+    def cluster_cols(self):
+        """
+        The cluster variable(s).
+        """
+        return self._cluster_cols
+
+    @cluster_cols.setter
+    def cluster_cols(self, value):
+        reset_value = hasattr(self, "_cluster_cols")
+        if isinstance(value, str):
+            value = [value]
+        if not isinstance(value, list):
+            raise TypeError(
+                "The cluster variable(s) cluster_cols must be of str or list type. "
+                f"{str(value)} of type {str(type(value))} was passed."
+            )
+        if not len(set(value)) == len(value):
+            raise ValueError("Invalid cluster variable(s) cluster_cols: Contains duplicate values.")
+        if not set(value).issubset(set(self.all_variables)):
+            raise ValueError("Invalid cluster variable(s) cluster_cols. At least one cluster variable is no data column.")
+        self._cluster_cols = value
+        if reset_value:
+            self._check_disjoint_sets()
+            self._set_cluster_vars()
+
+    @property
+    def n_cluster_vars(self):
+        """
+        The number of cluster variables.
+        """
+        return len(self.cluster_cols)
+
+    @property
+    def cluster_vars(self):
+        """
+        Array of cluster variable(s).
+        """
+        return self._cluster_vars.values
+
+    def _get_optional_col_sets(self):
+        base_optional_col_sets = super()._get_optional_col_sets()
+        cluster_cols_set = set(self.cluster_cols)
+        return [cluster_cols_set] + base_optional_col_sets
+
+    def _check_disjoint_sets(self):
+        # apply the standard checks from the DoubleMLData class
+        super(DoubleMLClusterData, self)._check_disjoint_sets()
+        self._check_disjoint_sets_cluster_cols()
+
+    def _check_disjoint_sets_cluster_cols(self):
+        # apply the standard checks from the DoubleMLData class
+        super(DoubleMLClusterData, self)._check_disjoint_sets()
+
+        # special checks for the additional cluster variables
+        cluster_cols_set = set(self.cluster_cols)
+        y_col_set = {self.y_col}
+        x_cols_set = set(self.x_cols)
+        d_cols_set = set(self.d_cols)
+
+        z_cols_set = set(self.z_cols or [])
+        t_col_set = {self.t_col} if self.t_col else set()
+        s_col_set = {self.s_col} if self.s_col else set()
+
+        # TODO: X can not be used as cluster variable
+        cluster_checks_args = [
+            (y_col_set, "outcome variable", "``y_col``"),
+            (d_cols_set, "treatment variable", "``d_cols``"),
+            (x_cols_set, "covariate", "``x_cols``"),
+            (z_cols_set, "instrumental variable", "``z_cols``"),
+            (t_col_set, "time variable", "``t_col``"),
+            (s_col_set, "score or selection variable", "``s_col``"),
+        ]
+        for set1, name, argument in cluster_checks_args:
+            self._check_disjoint(
+                set1=set1,
+                name1=name,
+                arg1=argument,
+                set2=cluster_cols_set,
+                name2="cluster variable(s)",
+                arg2="``cluster_cols``",
+            )
+
+    def _set_cluster_vars(self):
+        assert_all_finite(self.data.loc[:, self.cluster_cols])
+        self._cluster_vars = self.data.loc[:, self.cluster_cols]
diff --git a/doubleml/data/panel_data.py b/doubleml/data/panel_data.py
new file mode 100644
index 00000000..f548ae6a
--- /dev/null
+++ b/doubleml/data/panel_data.py
@@ -0,0 +1,315 @@
+import io
+
+import numpy as np
+import pandas as pd
+from sklearn.utils import assert_all_finite
+
+from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData
+from doubleml.data.utils.panel_data_utils import _is_valid_datetime_unit
+
+
+class DoubleMLPanelData(DoubleMLData):
+    """Double machine learning data-backend for panel data in long format.
+
+    :class:`DoubleMLPanelData` objects can be initialized from
+    :class:`pandas.DataFrame` as well as :class:`numpy.ndarray` objects.
+
+    Parameters
+    ----------
+    data : :class:`pandas.DataFrame`
+        The data.
+
+    y_col : str
+        The outcome variable.
+
+    d_cols : str or list
+        The treatment variable(s) indicating the treatment groups in terms of the first time of treatment exposure.
+
+    t_col : str
+        The time variable indicating the time period.
+
+    id_col : str
+        Unique unit identifier.
+
+    x_cols : None, str or list
+        The covariates.
+        If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor
+        treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates.
+        Default is ``None``.
+
+    z_cols : None, str or list
+        The instrumental variable(s).
+        Default is ``None``.
+
+    use_other_treat_as_covariate : bool
+        Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates.
+        Default is ``True``.
+
+    force_all_x_finite : bool or str
+        Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``.
+        Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are
+        allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed).
+        Note that the choices ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used
+        for the nuisance functions are capable of providing valid predictions with missings and / or infinite values
+        in the covariates ``x``.
+        Default is ``True``.
+
+    datetime_unit : str
+        The unit of the time and treatment variable (if datetime type).
+
+    Examples
+    --------
+    >>> from doubleml.did.datasets import make_did_CS2021
+    >>> from doubleml import DoubleMLPanelData
+    >>> df = make_did_CS2021(n_obs=500)
+    >>> dml_data = DoubleMLPanelData(
+    ...     df,
+    ...     y_col="y",
+    ...     d_cols="d",
+    ...     id_col="id",
+    ...     t_col="t",
+    ...     x_cols=["Z1", "Z2", "Z3", "Z4"],
+    ...     datetime_unit="M"
+    ... )
+    """
+
+    def __init__(
+        self,
+        data,
+        y_col,
+        d_cols,
+        t_col,
+        id_col,
+        x_cols=None,
+        z_cols=None,
+        use_other_treat_as_covariate=True,
+        force_all_x_finite=True,
+        datetime_unit="M",
+    ):
+        DoubleMLBaseData.__init__(self, data)
+
+        # we need to set id_col (needs _data) before call to the super __init__ because of the x_cols setter
+        self.id_col = id_col
+        self._datetime_unit = _is_valid_datetime_unit(datetime_unit)
+        self._set_id_var()
+
+        DoubleMLData.__init__(
+            self,
+            data=data,
+            y_col=y_col,
+            d_cols=d_cols,
+            x_cols=x_cols,
+            z_cols=z_cols,
+            t_col=t_col,
+            s_col=None,
+            use_other_treat_as_covariate=use_other_treat_as_covariate,
+            force_all_x_finite=force_all_x_finite,
+            force_all_d_finite=False,
+        )
+        if self.n_treat != 1:
+            raise ValueError("Only one treatment column is allowed for panel data.")
+
+        self._check_disjoint_sets_id_col()
+
+        # initialize the unique values of g and t
+        self._g_values = np.sort(np.unique(self.d))  # unique values of g
+        self._t_values = np.sort(np.unique(self.t))  # unique values of t
+
+    def __str__(self):
+        data_summary = self._data_summary_str()
+        buf = io.StringIO()
+        self.data.info(verbose=False, buf=buf)
+        df_info = buf.getvalue()
+        res = (
+            "================== DoubleMLPanelData Object ==================\n"
+            + "\n------------------ Data summary      ------------------\n"
+            + data_summary
+            + "\n------------------ DataFrame info    ------------------\n"
+            + df_info
+        )
+        return res
+
+    def _data_summary_str(self):
+        data_summary = (
+            f"Outcome variable: {self.y_col}\n"
+            f"Treatment variable(s): {self.d_cols}\n"
+            f"Covariates: {self.x_cols}\n"
+            f"Instrument variable(s): {self.z_cols}\n"
+            f"Time variable: {self.t_col}\n"
+            f"Id variable: {self.id_col}\n"
+        )
+
+        data_summary += f"No. Observations: {self.n_obs}\n"
+        return data_summary
+
+    @classmethod
+    def from_arrays(cls, x, y, d, t, identifier, z=None, s=None, use_other_treat_as_covariate=True, force_all_x_finite=True):
+        # TODO: Implement initialization from arrays
+        raise NotImplementedError("from_arrays is not implemented for DoubleMLPanelData")
+
+    @property
+    def datetime_unit(self):
+        """
+        The unit of the time variable.
+        """
+        return self._datetime_unit
+
+    @property
+    def d(self):
+        """
+        Array of treatment variable;
+        Dynamic! Depends on the currently set treatment variable;
+        To get an array of all treatment variables (independent of the currently set treatment variable)
+        call ``obj.data[obj.d_cols].values``.
+        """
+        if pd.api.types.is_datetime64_any_dtype(self._d):
+            return self._d.values.astype(f"datetime64[{self.datetime_unit}]")
+        else:
+            return self._d.values
+
+    @property
+    def t(self):
+        """
+        Array of time variable.
+        """
+        if pd.api.types.is_datetime64_any_dtype(self._t):
+            return self._t.values.astype(f"datetime64[{self.datetime_unit}]")
+        else:
+            return self._t.values
+
+    @property
+    def id_col(self):
+        """
+        The id variable.
+        """
+        return self._id_col
+
+    @id_col.setter
+    def id_col(self, value):
+        reset_value = hasattr(self, "_id_col")
+        if not isinstance(value, str):
+            raise TypeError(
+                "The id variable id_col must be of str type. " f"{str(value)} of type {str(type(value))} was passed."
+            )
+        if value not in self.all_variables:
+            raise ValueError("Invalid id variable id_col. " f"{value} is no data column.")
+        self._id_col = value
+        if reset_value:
+            self._check_disjoint_sets()
+            self._set_id_var()
+
+    @property
+    def id_var(self):
+        """
+        Array of id variable.
+        """
+        return self._id_var.values
+
+    @property
+    def id_var_unique(self):
+        """
+        Unique values of id variable.
+        """
+        return self._id_var_unique
+
+    @property
+    def n_obs(self):
+        """
+        The number of observations. For panel data, the number of unique values for id_col.
+        """
+        return len(self._id_var_unique)
+
+    @property
+    def g_col(self):
+        """
+        The treatment variable indicating the time of treatment exposure.
+        """
+        return self._d_cols[0]
+
+    @DoubleMLData.d_cols.setter
+    def d_cols(self, value):
+        super(self.__class__, self.__class__).d_cols.__set__(self, value)
+        if hasattr(self, "_g_values"):
+            self._g_values = np.sort(np.unique(self.d))  # update unique values of g
+
+    @property
+    def g_values(self):
+        """
+        The unique values of the treatment variable (groups) ``d``.
+        """
+        return self._g_values
+
+    @property
+    def n_groups(self):
+        """
+        The number of groups.
+        """
+        return len(self.g_values)
+
+    @DoubleMLData.t_col.setter
+    def t_col(self, value):
+        if value is None:
+            raise TypeError("Invalid time variable t_col. Time variable required for panel data.")
+        super(self.__class__, self.__class__).t_col.__set__(self, value)
+        if hasattr(self, "_t_values"):
+            self._t_values = np.sort(np.unique(self.t))  # update unique values of t
+
+    @property
+    def t_values(self):
+        """
+        The unique values of the time variable ``t``.
+        """
+        return self._t_values
+
+    @property
+    def n_t_periods(self):
+        """
+        The number of time periods.
+        """
+        return len(self.t_values)
+
+    def _get_optional_col_sets(self):
+        base_optional_col_sets = super()._get_optional_col_sets()
+        id_col_set = {self.id_col}
+        return [id_col_set] + base_optional_col_sets
+
+    def _check_disjoint_sets(self):
+        # apply the standard checks from the DoubleMLData class
+        super(DoubleMLPanelData, self)._check_disjoint_sets()
+        self._check_disjoint_sets_id_col()
+
+    def _check_disjoint_sets_id_col(self):
+        # apply the standard checks from the DoubleMLData class
+        super(DoubleMLPanelData, self)._check_disjoint_sets()
+
+        # special checks for the additional id variable (and the time variable)
+        id_col_set = {self.id_col}
+        y_col_set = {self.y_col}
+        x_cols_set = set(self.x_cols)
+        d_cols_set = set(self.d_cols)
+
+        z_cols_set = set(self.z_cols or [])
+        t_col_set = {self.t_col}  # t_col is not None for panel data
+        # s_col not tested as not relevant for panel data
+
+        id_col_check_args = [
+            (y_col_set, "outcome variable", "``y_col``"),
+            (d_cols_set, "treatment variable", "``d_cols``"),
+            (x_cols_set, "covariate", "``x_cols``"),
+            (z_cols_set, "instrumental variable", "``z_cols``"),
+            (t_col_set, "time variable", "``t_col``"),
+        ]
+        for set1, name, argument in id_col_check_args:
+            self._check_disjoint(
+                set1=set1,
+                name1=name,
+                arg1=argument,
+                set2=id_col_set,
+                name2="identifier variable",
+                arg2="``id_col``",
+            )
+
+    def _set_id_var(self):
+        assert_all_finite(self.data.loc[:, self.id_col])
+        self._id_var = self.data.loc[:, self.id_col]
+        self._id_var_unique = np.unique(self._id_var.values)
diff --git a/doubleml/data/tests/__init__.py b/doubleml/data/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/doubleml/data/tests/conftest.py b/doubleml/data/tests/conftest.py
new file mode 100644
index 00000000..6960b58a
--- /dev/null
+++ b/doubleml/data/tests/conftest.py
@@ -0,0 +1,40 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from doubleml.datasets import make_irm_data, make_plr_turrell2018
+
+
+@pytest.fixture(scope="session", params=[(500, 10), (1000, 20), (1000, 100)])
+def generate_data1(request):
+    n_p = request.param
+    np.random.seed(1111)
+    # setting parameters
+    n = n_p[0]
+    p = n_p[1]
+    theta = 0.5
+
+    # generating data
+    data = make_plr_turrell2018(n, p, theta, return_type=pd.DataFrame)
+
+    return data
+
+
+@pytest.fixture(scope="session", params=[(500, 10), (1000, 20)])
+def generate_data_irm_w_missings(request):
+    n_p = request.param
+    np.random.seed(1111)
+    # setting parameters
+    n = n_p[0]
+    p = n_p[1]
+    theta = 0.5
+
+    # generating data
+    (x, y, d) = make_irm_data(n, p, theta, return_type="array")
+
+    # randomly set some entries to np.nan
+    ind = np.random.choice(np.arange(x.size), replace=False, size=int(x.size * 0.05))
+    x[np.unravel_index(ind, x.shape)] = np.nan
+    data = (x, y, d)
+
+    return data
diff --git a/doubleml/data/tests/test_cluster_data.py b/doubleml/data/tests/test_cluster_data.py
new file mode 100644
index 00000000..e95dfa03
--- /dev/null
+++ b/doubleml/data/tests/test_cluster_data.py
@@ -0,0 +1,230 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from doubleml import DoubleMLClusterData
+from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021, make_plr_CCDDHNR2018
+
+
+@pytest.mark.ci
+def test_obj_vs_from_arrays():
+    np.random.seed(3141)
+    dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
+    dml_data_from_array = DoubleMLClusterData.from_arrays(
+        dml_data.data[dml_data.x_cols],
+        dml_data.data[dml_data.y_col],
+        dml_data.data[dml_data.d_cols],
+        dml_data.data[dml_data.cluster_cols],
+        dml_data.data[dml_data.z_cols],
+    )
+    df = dml_data.data.copy()
+    df.rename(
+        columns={"cluster_var_i": "cluster_var1", "cluster_var_j": "cluster_var2", "Y": "y", "D": "d", "Z": "z"}, inplace=True
+    )
+    assert dml_data_from_array.data.equals(df)
+
+    # with a single cluster variable
+    dml_data_from_array = DoubleMLClusterData.from_arrays(
+        dml_data.data[dml_data.x_cols],
+        dml_data.data[dml_data.y_col],
+        dml_data.data[dml_data.d_cols],
+        dml_data.data[dml_data.cluster_cols[1]],
+        dml_data.data[dml_data.z_cols],
+    )
+    df = dml_data.data.copy().drop(columns="cluster_var_i")
+    df.rename(columns={"cluster_var_j": "cluster_var", "Y": "y", "D": "d", "Z": "z"}, inplace=True)
+    assert dml_data_from_array.data.equals(df)
+
+
+@pytest.mark.ci
+def test_x_cols_setter_defaults_w_cluster():
+    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "xx3", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1")
+    assert dml_data.x_cols == ["xx1", "xx2", "xx3"]
+    dml_data.x_cols = ["xx1", "xx3"]
+    assert dml_data.x_cols == ["xx1", "xx3"]
+    dml_data.x_cols = None
+    assert dml_data.x_cols == ["xx1", "xx2", "xx3"]
+
+    # with instrument
+    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "z", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="z")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # without instrument and with time
+    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # with instrument and with time
+    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # without instrument and with selection
+    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "ss", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", s_col="ss")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # with instrument and with selection
+    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "ss", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", s_col="ss")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # without instrument with time with selection
+    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "ss", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt", s_col="ss")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+    # with instrument with time with selection
+    df = pd.DataFrame(np.tile(np.arange(8), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "ss", "cluster1"])
+    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt", s_col="ss")
+    assert dml_data.x_cols == ["xx1", "xx2"]
+
+
+@pytest.mark.ci
+def test_cluster_cols_setter():
+    np.random.seed(3141)
+    dml_data = make_plr_CCDDHNR2018(n_obs=100)
+    df = dml_data.data.copy().iloc[:, :10]
+    df.columns = [f"X{i + 1}" for i in np.arange(7)] + ["y", "d1", "d2"]
+    dml_data = DoubleMLClusterData(
+        df, "y", ["d1", "d2"], cluster_cols=[f"X{i + 1}" for i in [5, 6]], x_cols=[f"X{i + 1}" for i in np.arange(5)]
+    )
+
+    cluster_vars = df[["X6", "X7"]].values
+    assert np.array_equal(dml_data.cluster_vars, cluster_vars)
+    assert dml_data.n_cluster_vars == 2
+
+    # check that after changing cluster_cols, the cluster_vars array gets updated
+    cluster_vars = df[["X7", "X6"]].values
+    dml_data.cluster_cols = ["X7", "X6"]
+    assert np.array_equal(dml_data.cluster_vars, cluster_vars)
+
+    msg = r"Invalid cluster variable\(s\) cluster_cols. At least one cluster variable is no data column."
+    with pytest.raises(ValueError, match=msg):
+        dml_data.cluster_cols = ["X6", "X13"]
+    with pytest.raises(ValueError, match=msg):
+        dml_data.cluster_cols = "X13"
+
+    msg = r"The cluster variable\(s\) cluster_cols must be of str or list type. " "5 of type <class 'int'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_data.cluster_cols = 5
+
+    # check single cluster variable
+    cluster_vars = df[["X7"]].values
+    dml_data.cluster_cols = "X7"
+    assert np.array_equal(dml_data.cluster_vars, cluster_vars)
+    assert dml_data.n_cluster_vars == 1
+
+
+@pytest.mark.ci
+def test_disjoint_sets():
+    np.random.seed(3141)
+    df = pd.DataFrame(np.tile(np.arange(6), (4, 1)), columns=["yy", "dd1", "xx1", "xx2", "zz", "tt"])
+
+    # cluster data
+    msg = (
+        r"At least one variable/column is set as outcome variable \(``y_col``\) "
+        r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="yy")
+    msg = (
+        r"At least one variable/column is set as treatment variable \(``d_cols``\) "
+        r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="dd1")
+    msg = (
+        r"At least one variable/column is set as covariate \(``x_cols``\) " r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="xx2")
+
+    msg = (
+        r"At least one variable/column is set as instrumental variable \(``z_cols``\) "
+        r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], z_cols=["xx2"], cluster_cols="xx2")
+
+    msg = (
+        r"At least one variable/column is set as time variable \(``t_col``\) "
+        r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], t_col="xx2", cluster_cols="xx2")
+
+    msg = (
+        r"At least one variable/column is set as score or selection variable \(``s_col``\) "
+        r"and cluster variable\(s\) \(``cluster_cols``\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], s_col="xx2", cluster_cols="xx2")
+
+
+@pytest.mark.ci
+def test_duplicates():
+    np.random.seed(3141)
+    dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
+
+    msg = r"Invalid cluster variable\(s\) cluster_cols: Contains duplicate values."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(dml_cluster_data.data, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2", "X3"])
+    with pytest.raises(ValueError, match=msg):
+        dml_cluster_data.cluster_cols = ["X3", "X2", "X3"]
+
+    msg = "Invalid pd.DataFrame: Contains duplicate column names."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLClusterData(
+            pd.DataFrame(np.zeros((100, 5)), columns=["y", "d", "X3", "X2", "y"]), y_col="y", d_cols=["d"], cluster_cols=["X2"]
+        )
+
+
+@pytest.mark.ci
+def test_dml_datatype():
+    data_array = np.zeros((100, 10))
+    with pytest.raises(TypeError):
+        _ = DoubleMLClusterData(data_array, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2"])
+
+
+@pytest.mark.ci
+def test_cluster_data_str():
+    np.random.seed(3141)
+    dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
+
+    # Convert the object to string
+    dml_str = str(dml_data)
+
+    # Check that all important sections are present in the string
+    assert "================== DoubleMLClusterData Object ==================" in dml_str
+    assert "------------------ Data summary      ------------------" in dml_str
+    assert "------------------ DataFrame info    ------------------" in dml_str
+
+    # Check that specific data attributes are correctly included
+    assert "Outcome variable: Y" in dml_str
+    assert "Treatment variable(s): ['D']" in dml_str
+    assert "Cluster variable(s): ['cluster_var_i', 'cluster_var_j']" in dml_str
+    assert "Covariates: " in dml_str
+    assert "Instrument variable(s): ['Z']" in dml_str
+    assert "No. Observations:" in dml_str
+
+    # Test with additional optional attributes
+    df = dml_data.data.copy()
+    df["time_var"] = 1
+    df["score_var"] = 0.5
+
+    dml_data_with_optional = DoubleMLClusterData(
+        data=df,
+        y_col="Y",
+        d_cols="D",
+        cluster_cols=["cluster_var_i", "cluster_var_j"],
+        z_cols="Z",
+        t_col="time_var",
+        s_col="score_var",
+    )
+
+    dml_str_optional = str(dml_data_with_optional)
+    assert "Time variable: time_var" in dml_str_optional
+    assert "Score/Selection variable: score_var" in dml_str_optional
diff --git a/doubleml/tests/test_dml_data.py b/doubleml/data/tests/test_dml_data.py
similarity index 73%
rename from doubleml/tests/test_dml_data.py
rename to doubleml/data/tests/test_dml_data.py
index d89e802a..7cf394b5 100644
--- a/doubleml/tests/test_dml_data.py
+++ b/doubleml/data/tests/test_dml_data.py
@@ -3,16 +3,15 @@
 import pytest
 from sklearn.linear_model import Lasso, LogisticRegression

-from doubleml import DoubleMLClusterData, DoubleMLData, DoubleMLDIDCS, DoubleMLPLR, DoubleMLSSM
+from doubleml import DoubleMLData, DoubleMLDIDCS, DoubleMLPLR, DoubleMLSSM
+from doubleml.data.base_data import DoubleMLBaseData
 from doubleml.datasets import (
     _make_pliv_data,
-    make_did_SZ2020,
     make_pliv_CHS2015,
-    make_pliv_multiway_cluster_CKMS2021,
     make_plr_CCDDHNR2018,
     make_ssm_data,
 )
-from doubleml.double_ml_data import DoubleMLBaseData
+from doubleml.did.datasets import make_did_SZ2020


 class DummyDataClass(DoubleMLBaseData):
@@ -123,32 +122,6 @@ def test_obj_vs_from_arrays():
     )
     assert np.array_equal(dml_data_from_array.data, dml_data.data)

-    dml_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10)
-    dml_data_from_array = DoubleMLClusterData.from_arrays(
-        dml_data.data[dml_data.x_cols],
-        dml_data.data[dml_data.y_col],
-        dml_data.data[dml_data.d_cols],
-        dml_data.data[dml_data.cluster_cols],
-        dml_data.data[dml_data.z_cols],
-    )
-    df = dml_data.data.copy()
-    df.rename(
-        columns={"cluster_var_i": "cluster_var1", "cluster_var_j": "cluster_var2", "Y": "y", "D": "d", "Z": "z"}, inplace=True
-    )
-    assert dml_data_from_array.data.equals(df)
-
-    # with a single cluster variable
-    dml_data_from_array = DoubleMLClusterData.from_arrays(
-        dml_data.data[dml_data.x_cols],
-        dml_data.data[dml_data.y_col],
-        dml_data.data[dml_data.d_cols],
-        dml_data.data[dml_data.cluster_cols[1]],
-        dml_data.data[dml_data.z_cols],
-    )
-    df = dml_data.data.copy().drop(columns="cluster_var_i")
-    df.rename(columns={"cluster_var_j": "cluster_var", "Y": "y", "D": "d", "Z": "z"}, inplace=True)
-    assert dml_data_from_array.data.equals(df)
-

 @pytest.mark.ci
 def test_add_vars_in_df():
@@ -249,52 +222,6 @@ def test_x_cols_setter_defaults():
     assert dml_data.x_cols == ["xx1", "xx2"]


-@pytest.mark.ci
-def test_x_cols_setter_defaults_w_cluster():
-    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "xx3", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1")
-    assert dml_data.x_cols == ["xx1", "xx2", "xx3"]
-    dml_data.x_cols = ["xx1", "xx3"]
-    assert dml_data.x_cols == ["xx1", "xx3"]
-    dml_data.x_cols = None
-    assert dml_data.x_cols == ["xx1", "xx2", "xx3"]
-
-    # with instrument
-    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "z", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="z")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # without instrument and with time
-    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # with instrument and with time
-    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # without instrument and with selection
-    df = pd.DataFrame(np.tile(np.arange(6), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "ss", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", s_col="ss")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # with instrument and with selection
-    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "ss", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", s_col="ss")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # without instrument with time with selection
-    df = pd.DataFrame(np.tile(np.arange(7), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "tt", "ss", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", t_col="tt", s_col="ss")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-    # with instrument with time with selection
-    df = pd.DataFrame(np.tile(np.arange(8), (6, 1)), columns=["yy", "dd", "xx1", "xx2", "zz", "tt", "ss", "cluster1"])
-    dml_data = DoubleMLClusterData(df, y_col="yy", d_cols="dd", cluster_cols="cluster1", z_cols="zz", t_col="tt", s_col="ss")
-    assert dml_data.x_cols == ["xx1", "xx2"]
-
-
 @pytest.mark.ci
 def test_x_cols_setter():
     np.random.seed(3141)
@@ -442,42 +369,6 @@ def test_s_col_setter():
     assert dml_data.s is None


-@pytest.mark.ci
-def test_cluster_cols_setter():
-    np.random.seed(3141)
-    dml_data = make_plr_CCDDHNR2018(n_obs=100)
-    df = dml_data.data.copy().iloc[:, :10]
- df.columns = [f"X{i + 1}" for i in np.arange(7)] + ["y", "d1", "d2"] - dml_data = DoubleMLClusterData( - df, "y", ["d1", "d2"], cluster_cols=[f"X{i + 1}" for i in [5, 6]], x_cols=[f"X{i + 1}" for i in np.arange(5)] - ) - - cluster_vars = df[["X6", "X7"]].values - assert np.array_equal(dml_data.cluster_vars, cluster_vars) - assert dml_data.n_cluster_vars == 2 - - # check that after changing cluster_cols, the cluster_vars array gets updated - cluster_vars = df[["X7", "X6"]].values - dml_data.cluster_cols = ["X7", "X6"] - assert np.array_equal(dml_data.cluster_vars, cluster_vars) - - msg = r"Invalid cluster variable\(s\) cluster_cols. At least one cluster variable is no data column." - with pytest.raises(ValueError, match=msg): - dml_data.cluster_cols = ["X6", "X13"] - with pytest.raises(ValueError, match=msg): - dml_data.cluster_cols = "X13" - - msg = r"The cluster variable\(s\) cluster_cols must be of str or list type. " "5 of type was passed." - with pytest.raises(TypeError, match=msg): - dml_data.cluster_cols = 5 - - # check single cluster variable - cluster_vars = df[["X7"]].values - dml_data.cluster_cols = "X7" - assert np.array_equal(dml_data.cluster_vars, cluster_vars) - assert dml_data.n_cluster_vars == 1 - - @pytest.mark.ci def test_y_col_setter(): np.random.seed(3141) @@ -556,79 +447,62 @@ def test_disjoint_sets(): msg = "yy cannot be set as outcome variable ``y_col`` and covariate in ``x_cols``" with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "yy", "xx2"]) - msg = "yy cannot be set as outcome variable ``y_col`` and instrumental variable in ``z_cols``" + + # instrumental variable + msg = r"At least one variable/column is set as outcome variable \(``y_col``\) and instrumental variable \(``z_cols``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="yy") - msg = ( - r"At least one variable/column is set as treatment variable \(``d_cols``\) and instrumental variable in " "``z_cols``." - ) + msg = r"At least one variable/column is set as treatment variable \(``d_cols``\) and instrumental variable \(``z_cols``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols=["dd1"]) - msg = r"At least one variable/column is set as covariate \(``x_cols``\) and instrumental variable in " "``z_cols``." + msg = r"At least one variable/column is set as covariate \(``x_cols``\) and instrumental variable \(``z_cols``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="xx2") - msg = "xx2 cannot be set as time variable ``t_col`` and covariate in ``x_cols``." + # time variable + msg = r"At least one variable/column is set as outcome variable \(``y_col``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="xx2") - msg = "dd1 cannot be set as time variable ``t_col`` and treatment variable in ``d_cols``." + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="yy") + msg = r"At least one variable/column is set as treatment variable \(``d_cols``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="dd1") - msg = "yy cannot be set as time variable ``t_col`` and outcome variable ``y_col``." 
+ msg = r"At least one variable/column is set as covariate \(``x_cols``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="yy") - msg = "zz cannot be set as time variable ``t_col`` and instrumental variable in ``z_cols``." + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="xx2") + msg = r"At least one variable/column is set as instrumental variable \(``z_cols``\) and time variable \(``t_col``\)." with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", t_col="zz") - msg = "xx2 cannot be set as score or selection variable ``s_col`` and covariate in ``x_cols``." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="xx2") - msg = "dd1 cannot be set as score or selection variable ``s_col`` and treatment variable in ``d_cols``." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="dd1") - msg = "yy cannot be set as score or selection variable ``s_col`` and outcome variable ``y_col``." + # score or selection variable + msg = ( + r"At least one variable/column is set as outcome variable \(``y_col``\) and score or selection variable \(``s_col``\)." + ) with pytest.raises(ValueError, match=msg): _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="yy") - msg = "zz cannot be set as score or selection variable ``s_col`` and instrumental variable in ``z_cols``." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", s_col="zz") - msg = "tt cannot be set as score or selection variable ``s_col`` and time variable ``t_col``." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", s_col="tt") - - # cluster data - msg = "yy cannot be set as outcome variable ``y_col`` and cluster variable in ``cluster_cols``" - with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="yy") msg = ( - r"At least one variable/column is set as treatment variable \(``d_cols``\) and cluster variable in " - "``cluster_cols``." + r"At least one variable/column is set as treatment variable \(``d_cols``\) " + r"and score or selection variable \(``s_col``\)." ) with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="dd1") - msg = r"At least one variable/column is set as covariate \(``x_cols``\) and cluster variable in " "``cluster_cols``." + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="dd1") + msg = r"At least one variable/column is set as covariate \(``x_cols``\) and score or selection variable \(``s_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], cluster_cols="xx2") + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], s_col="xx2") msg = ( - r"At least one variable/column is set as instrumental variable \(``z_cols``\) and cluster variable in " - "``cluster_cols``." + r"At least one variable/column is set as instrumental variable \(``z_cols``\) " + r"and score or selection variable \(``s_col``\)." 
) with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], z_cols=["xx2"], cluster_cols="xx2") - msg = "xx2 cannot be set as time variable ``t_col`` and cluster variable in ``cluster_cols``." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], t_col="xx2", cluster_cols="xx2") - msg = "xx2 cannot be set as score or selection variable ``s_col`` and cluster variable in ``cluster_cols``." + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], z_cols="zz", s_col="zz") + msg = r"At least one variable/column is set as time variable \(``t_col``\) and score or selection variable \(``s_col``\)." with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1"], s_col="xx2", cluster_cols="xx2") + _ = DoubleMLData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", s_col="tt") @pytest.mark.ci def test_duplicates(): np.random.seed(3141) dml_data = make_plr_CCDDHNR2018(n_obs=100) - dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N=10, M=10) msg = r"Invalid treatment variable\(s\) d_cols: Contains duplicate values." with pytest.raises(ValueError, match=msg): @@ -648,21 +522,11 @@ def test_duplicates(): with pytest.raises(ValueError, match=msg): dml_data.z_cols = ["X15", "X12", "X12", "X15"] - msg = r"Invalid cluster variable\(s\) cluster_cols: Contains duplicate values." - with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData(dml_cluster_data.data, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2", "X3"]) - with pytest.raises(ValueError, match=msg): - dml_cluster_data.cluster_cols = ["X3", "X2", "X3"] - msg = "Invalid pd.DataFrame: Contains duplicate column names." with pytest.raises(ValueError, match=msg): _ = DoubleMLData( pd.DataFrame(np.zeros((100, 5)), columns=["y", "d", "X3", "X2", "y"]), y_col="y", d_cols=["d"], x_cols=["X3", "X2"] ) - with pytest.raises(ValueError, match=msg): - _ = DoubleMLClusterData( - pd.DataFrame(np.zeros((100, 5)), columns=["y", "d", "X3", "X2", "y"]), y_col="y", d_cols=["d"], cluster_cols=["X2"] - ) @pytest.mark.ci @@ -672,8 +536,6 @@ def test_dml_datatype(): # f'{str(data_array)} of type {str(type(data_array))} was passed.') with pytest.raises(TypeError): _ = DoubleMLData(data_array, y_col="y", d_cols=["d"], x_cols=["X3", "X2"]) - with pytest.raises(TypeError): - _ = DoubleMLClusterData(data_array, y_col="y", d_cols=["d"], cluster_cols=["X3", "X2"]) @pytest.mark.ci @@ -724,3 +586,57 @@ def test_dml_data_w_missings(generate_data_irm_w_missings): assert dml_data.force_all_x_finite is False dml_data.force_all_x_finite = "allow-nan" assert dml_data.force_all_x_finite == "allow-nan" + + +def test_dml_data_w_missing_d(generate_data1): + data = generate_data1 + np.random.seed(3141) + x_cols = data.columns[data.columns.str.startswith("X")].tolist() + + pd_args = { + "data": data, + "y_col": "y", + "d_cols": ["d"], + "x_cols": x_cols, + } + dml_data = DoubleMLData(force_all_d_finite=True, **pd_args) + + data["d"] = np.nan + np_args = { + "x": data.loc[:, x_cols].values, + "y": data["y"].values, + "d": data["d"].values, + } + msg = r"Input contains NaN." 
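+ # the message above is sklearn's standard validation error; the finiteness check
+ # is presumably delegated to sklearn's check_array in the data backend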
+ with pytest.raises(ValueError, match=msg):
+ dml_data2 = DoubleMLData(force_all_d_finite=False, **pd_args)
+ dml_data2.force_all_d_finite = True
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLData.from_arrays(force_all_d_finite=True, **np_args)
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLData(force_all_d_finite=True, **pd_args)
+
+ data["d"] = np.inf
+ np_args = {
+ "x": data.loc[:, x_cols].values,
+ "y": data["y"].values,
+ "d": data["d"].values,
+ }
+ msg = r"Input contains infinity or a value too large for dtype\('float64'\)."
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLData.from_arrays(force_all_d_finite=True, **np_args)
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLData(force_all_d_finite=True, **pd_args)
+
+ msg = "Invalid force_all_d_finite. force_all_d_finite must be True, False or 'allow-nan'."
+ with pytest.raises(TypeError, match=msg):
+ _ = DoubleMLData(force_all_d_finite=1, **pd_args)
+ with pytest.raises(TypeError, match=msg):
+ _ = DoubleMLData.from_arrays(force_all_d_finite=1, **np_args)
+
+ data["d"] = 1.0
+ assert dml_data.force_all_d_finite is True
+ dml_data.force_all_d_finite = False
+ assert dml_data.force_all_d_finite is False
+ dml_data.force_all_d_finite = "allow-nan"
+ assert dml_data.force_all_d_finite == "allow-nan"
diff --git a/doubleml/data/tests/test_panel_data.py b/doubleml/data/tests/test_panel_data.py
new file mode 100644
index 00000000..2f2250ba
--- /dev/null
+++ b/doubleml/data/tests/test_panel_data.py
@@ -0,0 +1,177 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from doubleml.data import DoubleMLPanelData
+from doubleml.did.datasets import make_did_SZ2020
+
+
+@pytest.mark.ci
+def test_dml_datatype():
+ data_array = np.zeros((100, 10))
+ with pytest.raises(TypeError):
+ _ = DoubleMLPanelData(data_array, y_col="y", d_cols=["d"], t_col="t", id_col="id")
+
+
+@pytest.mark.ci
+def test_t_col_setter():
+ np.random.seed(3141)
+ df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data
+ df["t_new"] = 1.0
+ dml_data = DoubleMLPanelData(
+ data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)]
+ )
+
+ # check that after changing t_col, the t array gets updated
+ t_comp = dml_data.data["t_new"].values
+ dml_data.t_col = "t_new"
+ assert np.array_equal(dml_data.t, t_comp)
+ assert dml_data._t_values == np.unique(t_comp)
+ assert dml_data.n_t_periods == 1
+
+ msg = "Invalid time variable t_col. a13 is no data column."
+ with pytest.raises(ValueError, match=msg):
+ dml_data.t_col = "a13"
+
+ msg = r"The time variable t_col must be of str type \(or None\). " "5 of type <class 'int'> was passed."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.t_col = 5
+
+ msg = "Invalid time variable t_col. Time variable required for panel data."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.t_col = None
+
+
+@pytest.mark.ci
+def test_id_col_setter():
+ np.random.seed(3141)
+ df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data
+ df["id_new"] = 1.0
+ dml_data = DoubleMLPanelData(
+ data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)]
+ )
+
+ # check that after changing id_col, the id array etc. gets updated
+ id_comp = dml_data.data["id_new"].values
+ dml_data.id_col = "id_new"
+ assert np.array_equal(dml_data.id_var, id_comp)
+ assert dml_data._id_var_unique == np.unique(id_comp)
+ assert dml_data.n_obs == 1
+
+ msg = "Invalid id variable id_col. a13 is no data column."
+ with pytest.raises(ValueError, match=msg):
+ dml_data.id_col = "a13"
+
+ msg = "The id variable id_col must be of str type. " "5 of type <class 'int'> was passed."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.id_col = 5
+
+ msg = "The id variable id_col must be of str type. None of type <class 'NoneType'> was passed."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.id_col = None
+
+
+@pytest.mark.ci
+def test_d_col_setter():
+ np.random.seed(3141)
+ df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data
+ df["d_new"] = 1.0
+ dml_data = DoubleMLPanelData(
+ data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)]
+ )
+
+ # check that after changing d_cols, the d array etc. gets updated
+ d_comp = dml_data.data["d_new"].values
+ dml_data.d_cols = "d_new"
+ assert dml_data.d_cols == ["d_new"]
+ assert np.array_equal(dml_data.d, d_comp)
+ assert dml_data.g_col == "d_new"
+ assert dml_data._g_values == np.unique(d_comp)
+ assert dml_data.n_groups == 1
+
+ msg = r"Invalid treatment variable\(s\) d_cols. At least one treatment variable is no data column."
+ with pytest.raises(ValueError, match=msg):
+ dml_data.d_cols = "a13"
+
+ msg = r"The treatment variable\(s\) d_cols must be of str or list type. 5 of type <class 'int'> was passed."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.d_cols = 5
+
+ msg = r"The treatment variable\(s\) d_cols must be of str or list type. None of type <class 'NoneType'> was passed."
+ with pytest.raises(TypeError, match=msg):
+ dml_data.d_cols = None
+
+
+@pytest.mark.ci
+def test_disjoint_sets():
+ np.random.seed(3141)
+ df = pd.DataFrame(np.tile(np.arange(7), (4, 1)), columns=["yy", "dd1", "xx1", "xx2", "zz", "tt", "id"])
+
+ msg = r"At least one variable/column is set as outcome variable \(``y_col``\) " r"and identifier variable \(``id_col``\)."
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLPanelData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", id_col="yy")
+
+ msg = (
+ r"At least one variable/column is set as treatment variable \(``d_cols``\) " r"and identifier variable \(``id_col``\)."
+ )
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLPanelData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", id_col="dd1")
+
+ msg = r"At least one variable/column is set as covariate \(``x_cols``\) " r"and identifier variable \(``id_col``\)."
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLPanelData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", id_col="xx1")
+
+ msg = r"At least one variable/column is set as time variable \(``t_col``\) " r"and identifier variable \(``id_col``\)."
+ with pytest.raises(ValueError, match=msg):
+ _ = DoubleMLPanelData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", id_col="tt")
+
+ msg = (
+ r"At least one variable/column is set as instrumental variable \(``z_cols``\) "
+ r"and identifier variable \(``id_col``\)."
+ ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLPanelData(df, y_col="yy", d_cols=["dd1"], x_cols=["xx1", "xx2"], t_col="tt", z_cols=["zz"], id_col="zz") + + +@pytest.mark.ci +def test_panel_data_str(): + np.random.seed(3141) + df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data + dml_data = DoubleMLPanelData( + data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)] + ) + + # Convert the object to string + dml_str = str(dml_data) + + # Check that all important sections are present in the string + assert "================== DoubleMLPanelData Object ==================" in dml_str + assert "------------------ Data summary ------------------" in dml_str + assert "------------------ DataFrame info ------------------" in dml_str + + # Check that specific data attributes are correctly included + assert "Outcome variable: y" in dml_str + assert "Treatment variable(s): ['d']" in dml_str + assert "Covariates: ['Z1', 'Z2', 'Z3', 'Z4']" in dml_str + assert "Instrument variable(s): None" in dml_str + assert "Time variable: t" in dml_str + assert "Id variable: id" in dml_str + assert "No. Observations:" in dml_str + + +@pytest.mark.ci +def test_panel_data_properties(): + np.random.seed(3141) + df = make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData")._data + dml_data = DoubleMLPanelData( + data=df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=[f"Z{i + 1}" for i in np.arange(4)] + ) + + assert np.array_equal(dml_data.id_var, df["id"].values) + assert np.array_equal(dml_data.id_var_unique, np.unique(df["id"].values)) + assert dml_data.n_obs == len(np.unique(df["id"].values)) + assert dml_data.g_col == "d" + assert np.array_equal(dml_data.g_values, np.sort(np.unique(df["d"].values))) + assert dml_data.n_groups == len(np.unique(df["d"].values)) + assert np.array_equal(dml_data.t_values, np.sort(np.unique(df["t"].values))) + assert dml_data.n_t_periods == len(np.unique(df["t"].values)) diff --git a/doubleml/data/tests/test_panel_data_exceptions.py b/doubleml/data/tests/test_panel_data_exceptions.py new file mode 100644 index 00000000..fab648fe --- /dev/null +++ b/doubleml/data/tests/test_panel_data_exceptions.py @@ -0,0 +1,113 @@ +import numpy as np +import pandas as pd +import pytest + +from doubleml.data import DoubleMLPanelData + + +@pytest.fixture +def sample_data(): + n_ids = 3 + n_periods = 4 + + data = [] + for id_val in range(n_ids): + for t in range(n_periods): + data.append( + { + "id": f"ID_{id_val}", + "time": t, + "y": np.random.normal(), + "treatment": int(t >= 2), + "x1": np.random.normal(), + "x2": np.random.normal(), + "z": np.random.normal(), + } + ) + + return pd.DataFrame(data) + + +@pytest.mark.ci +def test_multiple_treatments_exception(sample_data): + # Test exception when more than one treatment column is provided + with pytest.raises(ValueError, match="Only one treatment column is allowed for panel data."): + # Create copy of data with an additional treatment column + data_multi = sample_data.copy() + data_multi["treatment2"] = np.random.binomial(1, 0.5, size=len(data_multi)) + DoubleMLPanelData(data=data_multi, y_col="y", d_cols=["treatment", "treatment2"], t_col="time", id_col="id") + + +@pytest.mark.ci +def test_id_col_type_exception(sample_data): + # Test exception when id_col is not a string + with pytest.raises(TypeError, match="The id variable id_col must be of str type."): + DoubleMLPanelData(data=sample_data, y_col="y", d_cols="treatment", t_col="time", id_col=123) # Should be a 
string + + +@pytest.mark.ci +def test_id_col_not_in_data(sample_data): + # Test exception when id_col doesn't exist in data + with pytest.raises(ValueError, match="Invalid id variable id_col. non_existent_id is no data column."): + DoubleMLPanelData(data=sample_data, y_col="y", d_cols="treatment", t_col="time", id_col="non_existent_id") + + +@pytest.mark.ci +def test_time_col_none_exception(sample_data): + # Test exception when t_col is None + with pytest.raises(TypeError, match="Invalid time variable t_col. Time variable required for panel data."): + DoubleMLPanelData( + data=sample_data, y_col="y", d_cols="treatment", t_col=None, id_col="id" # Should not be None for panel data + ) + + +@pytest.mark.ci +def test_overlapping_variables_exception(sample_data): + # Test exception when id_col overlaps with another variable + msg = r"At least one variable/column is set as outcome variable \(``y_col``\) and identifier variable \(``id_col``\)." + with pytest.raises(ValueError, match=msg): + DoubleMLPanelData( + data=sample_data, + y_col="id", # Using id as outcome variable + d_cols="treatment", + t_col="time", + id_col="id", # Same as y_col + ) + + # Test treatment variable overlapping + msg = r"At least one variable/column is set as treatment variable \(``d_cols``\) and identifier variable \(``id_col``\)." + with pytest.raises(ValueError, match=msg): + DoubleMLPanelData(data=sample_data, y_col="y", d_cols="id", t_col="time", id_col="id") # Using id as treatment + + # Test time variable overlapping + msg = r"At least one variable/column is set as time variable \(``t_col``\) and identifier variable \(``id_col``\)." + with pytest.raises(ValueError, match=msg): + DoubleMLPanelData(data=sample_data, y_col="y", d_cols="treatment", t_col="id", id_col="id") # Using id as time + + +@pytest.mark.ci +def test_from_arrays_not_implemented(): + # Test that from_arrays raises NotImplementedError + with pytest.raises(NotImplementedError, match="from_arrays is not implemented for DoubleMLPanelData"): + DoubleMLPanelData.from_arrays( + x=np.random.normal(size=(10, 2)), + y=np.random.normal(size=10), + d=np.random.binomial(1, 0.5, size=10), + t=np.arange(10), + identifier=np.arange(10), + ) + + +@pytest.mark.ci +def test_invalid_datetime_unit(sample_data): + with pytest.raises(ValueError, match="Invalid datetime unit."): + DoubleMLPanelData( + data=sample_data, y_col="y", d_cols="treatment", t_col="time", id_col="id", datetime_unit="invalid_unit" + ) + + +# test if no exception is raised +@pytest.mark.ci +def test_no_exception(sample_data): + DoubleMLPanelData(data=sample_data, y_col="y", d_cols="treatment", t_col="time", id_col="id") + assert True diff --git a/doubleml/data/utils/panel_data_utils.py b/doubleml/data/utils/panel_data_utils.py new file mode 100644 index 00000000..abd365eb --- /dev/null +++ b/doubleml/data/utils/panel_data_utils.py @@ -0,0 +1,8 @@ +valid_datetime_units = {"Y", "M", "D", "h", "m", "s", "ms", "us", "ns"} + + +def _is_valid_datetime_unit(unit): + if unit not in valid_datetime_units: + raise ValueError("Invalid datetime unit.") + else: + return unit diff --git a/doubleml/data/utils/tests/test_panel_data_utils.py b/doubleml/data/utils/tests/test_panel_data_utils.py new file mode 100644 index 00000000..e5201384 --- /dev/null +++ b/doubleml/data/utils/tests/test_panel_data_utils.py @@ -0,0 +1,32 @@ +import pytest + +from doubleml.data.utils.panel_data_utils import _is_valid_datetime_unit + + +@pytest.mark.ci +def test_is_valid_datetime_unit(): + # Test all valid units + for unit in ["Y", 
"M", "D", "h", "m", "s", "ms", "us", "ns"]: + assert _is_valid_datetime_unit(unit) == unit, f"Unit {unit} should be valid and return itself" + + # Test invalid units + invalid_units = ["", "minutes", "d", "H", "S", "MS", "y", "seconds", "days"] + for unit in invalid_units: + with pytest.raises(ValueError, match="Invalid datetime unit."): + _is_valid_datetime_unit(unit) + + # Test case sensitivity + assert _is_valid_datetime_unit("m") == "m" # minute is valid + assert _is_valid_datetime_unit("M") == "M" # month is valid + + with pytest.raises(ValueError, match="Invalid datetime unit."): + _is_valid_datetime_unit("d") # lowercase day is invalid + + assert _is_valid_datetime_unit("D") == "D" # uppercase day is valid + + # Test edge cases + with pytest.raises(ValueError, match="Invalid datetime unit."): + _is_valid_datetime_unit(None) + + with pytest.raises(ValueError, match="Invalid datetime unit."): + _is_valid_datetime_unit(123) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index a73d8216..0dcd33c7 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -7,12 +7,13 @@ from sklearn.datasets import make_spd_matrix from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures -from .double_ml_data import DoubleMLClusterData, DoubleMLData +from doubleml.data import DoubleMLClusterData, DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_cluster_data_alias, _get_dml_data_alias -_array_alias = ["array", "np.ndarray", "np.array", np.ndarray] -_data_frame_alias = ["DataFrame", "pd.DataFrame", pd.DataFrame] -_dml_data_alias = ["DoubleMLData", DoubleMLData] -_dml_cluster_data_alias = ["DoubleMLClusterData", DoubleMLClusterData] +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() +_dml_cluster_data_alias = _get_dml_cluster_data_alias() def fetch_401K(return_type="DoubleMLData", polynomial_features=False): @@ -856,197 +857,6 @@ def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0, return raise ValueError("Invalid return_type.") -def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type="DoubleMLData", **kwargs): - """ - Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020). - The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let - - .. math:: - - f_{reg}(W) &= 210 + 27.4 \\cdot W_1 +13.7 \\cdot (W_2 + W_3 + W_4), - - f_{ps}(W) &= 0.75 \\cdot (-W_1 + 0.5 \\cdot W_2 -0.25 \\cdot W_3 - 0.1 \\cdot W_4). - - - Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, - :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. - At first define - - .. 
math:: - - Y_0(0) &= f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_0, - - Y_1(d) &= 2 \\cdot f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_1(d), - - p(W_{ps}) &= \\frac{\\exp(f_{ps}(W_{ps}))}{1 + \\exp(f_{ps}(W_{ps}))}, - - D &= 1\\{p(W_{ps}) \\ge U\\}, - - where :math:`\\varepsilon_0, \\varepsilon_1(d), d=0, 1` are independent standard normal random variables, - :math:`U \\sim \\mathcal{U}[0, 1]` is a independent standard uniform - and :math:`\\nu(W_{reg}, D)\\sim \\mathcal{N}(D \\cdot f_{reg}(W_{reg}),1)`. - The different data generating processes are defined via - - .. math:: - - DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z - - DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X - - DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z - - DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X - - DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0 - - DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0, - - such that the last two settings correspond to an experimental setting with treatment probability - of :math:`P(D=1) = \\frac{1}{2}.` - For the panel data the outcome is already defined as the difference :math:`Y = Y_1(D) - Y_0(0)`. - For cross-sectional data the flag ``cross_sectional_data`` has to be set to ``True``. - Then the outcome will be defined to be - - .. math:: - - Y = T \\cdot Y_1(D) + (1-T) \\cdot Y_0(0), - - where :math:`T = 1\\{U_T\\le \\lambda_T \\}` with :math:`U_T\\sim \\mathcal{U}[0, 1]` and :math:`\\lambda_T=0.5`. - The true average treatment effect on the treated is zero for all data generating processes. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dgp_type : - The DGP to be used. Default value is ``1`` (integer). - cross_sectional_data : - Indicates whether the setting is uses cross-sectional or panel data. Default value is ``False``. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)`` - or ``(x, y, d, t)``. - **kwargs - Additional keyword arguments to set non-default values for the parameter - :math:`xi=0.75`, :math:`c=0.0` and :math:`\\lambda_T=0.5`. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. 
- """ - xi = kwargs.get("xi", 0.75) - c = kwargs.get("c", 0.0) - lambda_t = kwargs.get("lambda_t", 0.5) - - def f_reg(w): - res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) - return res - - def f_ps(w, xi): - res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - dim_x = 4 - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal( - np.zeros(dim_x), - cov_mat, - size=[ - n_obs, - ], - ) - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4)) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - epsilon_0 = np.random.normal(loc=0, scale=1, size=n_obs) - epsilon_1 = np.random.normal(loc=0, scale=1, size=[n_obs, 2]) - - if dgp_type == 1: - features_ps = z - features_reg = z - elif dgp_type == 2: - features_ps = x - features_reg = z - elif dgp_type == 3: - features_ps = z - features_reg = x - elif dgp_type == 4: - features_ps = x - features_reg = x - elif dgp_type == 5: - features_ps = None - features_reg = z - elif dgp_type == 6: - features_ps = None - features_reg = x - else: - raise ValueError("The dgp_type is not valid.") - - # treatment and propensities - is_experimental = (dgp_type == 5) or (dgp_type == 6) - if is_experimental: - # Set D to be experimental - p = 0.5 * np.ones(n_obs) - else: - p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) - u = np.random.uniform(low=0, high=1, size=n_obs) - d = 1.0 * (p >= u) - - # potential outcomes - nu = np.random.normal(loc=d * f_reg(features_reg), scale=1, size=n_obs) - y0 = f_reg(features_reg) + nu + epsilon_0 - y1_d0 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 0] - y1_d1 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 1] - y1 = d * y1_d1 + (1 - d) * y1_d0 - - if not cross_sectional_data: - y = y1 - y0 - - if return_type in _array_alias: - return z, y, d, None - elif return_type in _data_frame_alias + _dml_data_alias: - z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((z, y, d)), columns=z_cols + ["y", "d"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", z_cols) - else: - raise ValueError("Invalid return_type.") - - else: - u_t = np.random.uniform(low=0, high=1, size=n_obs) - t = 1.0 * (u_t <= lambda_t) - y = t * y1 + (1 - t) * y0 - - if return_type in _array_alias: - return z, y, d, t - elif return_type in _data_frame_alias + _dml_data_alias: - z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((z, y, d, t)), columns=z_cols + ["y", "d", "t"]) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, "y", "d", z_cols, t_col="t") - else: - raise ValueError("Invalid return_type.") - - def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs): """ Generates counfounded data from an interactive regression model. 
diff --git a/doubleml/did/__init__.py b/doubleml/did/__init__.py
index 594fa680..354ffaa5 100644
--- a/doubleml/did/__init__.py
+++ b/doubleml/did/__init__.py
@@ -3,9 +3,15 @@
 """
 from .did import DoubleMLDID
+from .did_aggregation import DoubleMLDIDAggregation
+from .did_binary import DoubleMLDIDBinary
 from .did_cs import DoubleMLDIDCS
+from .did_multi import DoubleMLDIDMulti
 
 __all__ = [
+ "DoubleMLDIDAggregation",
 "DoubleMLDID",
 "DoubleMLDIDCS",
+ "DoubleMLDIDBinary",
+ "DoubleMLDIDMulti",
 ]
diff --git a/doubleml/did/datasets/__init__.py b/doubleml/did/datasets/__init__.py
new file mode 100644
index 00000000..aaa5fc0a
--- /dev/null
+++ b/doubleml/did/datasets/__init__.py
@@ -0,0 +1,11 @@
+"""
+The :mod:`doubleml.did.datasets` module implements data generating processes for difference-in-differences.
+"""
+
+from .dgp_did_CS2021 import make_did_CS2021
+from .dgp_did_SZ2020 import make_did_SZ2020
+
+__all__ = [
+ "make_did_SZ2020",
+ "make_did_CS2021",
+]
diff --git a/doubleml/did/datasets/dgp_did_CS2021.py b/doubleml/did/datasets/dgp_did_CS2021.py
new file mode 100644
index 00000000..50336cdb
--- /dev/null
+++ b/doubleml/did/datasets/dgp_did_CS2021.py
@@ -0,0 +1,301 @@
+import numpy as np
+import pandas as pd
+
+from .dgp_did_SZ2020 import _generate_features, _select_features
+
+# Based on https://doi.org/10.1016/j.jeconom.2020.12.001 (see Appendix SC)
+# and https://d2cml-ai.github.io/csdid/examples/csdid_basic.html#Examples-with-simulated-data
+
+
+def _f_ps_groups(w, xi, n_groups):
+ # Create coefficient matrix: 4 features x n_groups
+ coef_vec = np.array([-1.0, 0.5, -0.25, -0.2])
+
+ # scale the coefficients by 1 - i_group/n_groups for each column
+ coef_matrix = np.array([coef_vec * (1.0 - (i_group / n_groups)) for i_group in range(n_groups)]).T
+
+ res = xi * (w @ coef_matrix)
+ return res
+
+
+def _f_reg_time(w, n_time_periods):
+ coef_vec = np.array([27.4, 13.7, 13.7, 13.7])
+
+ # scale the coefficients by the time period for each column
+ coef_matrix = np.array([coef_vec * (i_time / n_time_periods) for i_time in range(1, n_time_periods + 1)]).T
+
+ res = 210 + w @ coef_matrix
+ return res
+
+
+def make_did_CS2021(n_obs=1000, dgp_type=1, include_never_treated=True, time_type="datetime", **kwargs):
+ """
+ Generate synthetic panel data for difference-in-differences analysis based on Callaway and Sant'Anna (2021).
+
+ This function creates panel data with heterogeneous treatment effects across time periods and groups.
+ The data includes pre-treatment periods, multiple treatment groups that receive treatment at different times,
+ and optionally a never-treated group that serves as a control. The true average treatment effect on the
+ treated (ATT) has a heterogeneous structure dependent on covariates and exposure time.
+
+ The data generating process offers six variations (``dgp_type`` 1-6) that differ in how the regression features
+ and propensity score features are derived:
+
+ - DGP 1: Outcome and propensity score are linear (in Z)
+ - DGP 2: Outcome is linear, propensity score is nonlinear
+ - DGP 3: Outcome is nonlinear, propensity score is linear
+ - DGP 4: Outcome and propensity score are nonlinear
+ - DGP 5: Outcome is linear, propensity score is constant (experimental setting)
+ - DGP 6: Outcome is nonlinear, propensity score is constant (experimental setting)
+
+ Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries
+ :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix.
+ + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, + :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. + + For a feature vector :math:`W=(W_1, W_2, W_3, W_4)^T` (either X or Z based on ``dgp_type``), the core functions are: + + 1. Time-varying outcome regression function for each time period :math:`t`: + + .. math:: + + f_{reg,t}(W) = 210 + \\frac{t}{T} \\cdot (27.4 \\cdot W_1 + 13.7 \\cdot W_2 + 13.7 \\cdot W_3 + 13.7 \\cdot W_4) + + 2. Group-specific propensity function for each treatment group :math:`g`: + + .. math:: + + f_{ps,g}(W) = \\xi \\cdot \\left(1-\\frac{g}{G}\\right) \\cdot + (-W_1 + 0.5 \\cdot W_2 - 0.25 \\cdot W_3 - 0.2\\cdot W_4) + + where :math:`T` is the number of time periods, :math:`G` is the number of treatment groups, and :math:`\\xi` is a + scale parameter (default: 0.9). + + The panel data model is defined with the following components: + + 1. Time effects: :math:`\\delta_t = t` for time period :math:`t` + + 2. Individual effects: :math:`\\eta_i \\sim \\mathcal{N}(g_i, 1)` where :math:`g_i` is unit :math:`i`'s treatment group + + 3. Treatment effects: For a unit in treatment group :math:`g`, the effect in period :math:`t` is: + + .. math:: + + \\theta_{i,t,g} = \\max(t - t_g + 1, 0) + 0.1 \\cdot X_{i,1} \\cdot \\max(t - t_g + 1, 0) + + where :math:`t_g` is the first treatment period for group :math:`g`, :math:`X_{i,1}` is the first covariate for unit + :math:`i`, and :math:`\\max(t - t_g + 1, 0)` represents the exposure time (0 for pre-treatment periods). + + 4. Potential outcomes for unit :math:`i` in period :math:`t`: + + .. math:: + + Y_{i,t}(0) &= f_{reg,t}(W_{reg}) + \\delta_t + \\eta_i + \\varepsilon_{i,0,t} + + Y_{i,t}(1) &= Y_{i,t}(0) + \\theta_{i,t,g} + (\\varepsilon_{i,1,t} - \\varepsilon_{i,0,t}) + + where :math:`\\varepsilon_{i,0,t}, \\varepsilon_{i,1,t} \\sim \\mathcal{N}(0, 1)`. + + 5. Observed outcomes: + + .. math:: + + Y_{i,t} = Y_{i,t}(1) \\cdot 1\\{t \\geq t_g\\} + Y_{i,t}(0) \\cdot 1\\{t < t_g\\} + + 6. Treatment assignment: + + For non-experimental settings (DGP 1-4), the probability of being in treatment group :math:`g` is: + + .. math:: + + P(G_i = g) = \\frac{\\exp(f_{ps,g}(W_{ps}))}{\\sum_{g'} \\exp(f_{ps,g'}(W_{ps}))} + + For experimental settings (DGP 5-6), each treatment group (including never-treated) has equal probability: + + .. math:: + + P(G_i = g) = \\frac{1}{G} \\text{ for all } g + + The variables :math:`W_{reg}` and :math:`W_{ps}` are selected based on the DGP type: + + .. math:: + + DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z + + DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X + + DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z + + DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X + + DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0 + + DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0 + + where settings 5-6 correspond to experimental designs with equal probability across treatment groups. + + + Parameters + ---------- + n_obs : int, default=1000 + The number of observations to simulate. + + dgp_type : int, default=1 + The data generating process to be used (1-6). + + include_never_treated : bool, default=True + Whether to include units that are never treated. + + time_type : str, default="datetime" + Type of time variable. Either "datetime" or "float". + + **kwargs + Additional keyword arguments. 
Accepts the following parameters:
+
+ `c` (float, default=0.0):
+ Parameter for correlation structure in X.
+
+ `dim_x` (int, default=4):
+ Dimension of feature vectors.
+
+ `xi` (float, default=0.9):
+ Scale parameter for the propensity score function.
+
+ `n_periods` (int, default=5):
+ Number of time periods.
+
+ `anticipation_periods` (int, default=0):
+ Number of periods before treatment where anticipation effects occur.
+
+ `n_pre_treat_periods` (int, default=2):
+ Number of pre-treatment periods.
+
+ `start_date` (str, default="2025-01"):
+ Start date for datetime time variables.
+
+ Returns
+ -------
+ pandas.DataFrame
+ DataFrame containing the simulated panel data.
+
+ References
+ ----------
+ Callaway, B. and Sant’Anna, P. H. (2021),
+ Difference-in-Differences with multiple time periods. Journal of Econometrics, 225(2), 200-230.
+ doi:`10.1016/j.jeconom.2020.12.001 <https://doi.org/10.1016/j.jeconom.2020.12.001>`_.
+ """
+
+ c = kwargs.get("c", 0.0)
+ dim_x = kwargs.get("dim_x", 4)
+ xi = kwargs.get("xi", 0.9)
+ n_periods = kwargs.get("n_periods", 5)
+ anticipation_periods = kwargs.get("anticipation_periods", 0)
+ n_pre_treat_periods = kwargs.get("n_pre_treat_periods", 2)
+ start_date = kwargs.get("start_date", "2025-01")
+
+ if anticipation_periods > 0:
+ n_periods += anticipation_periods # increase number of periods
+
+ expected_time_types = ("datetime", "float")
+ if time_type not in expected_time_types:
+ raise ValueError(f"time_type must be one of {expected_time_types}. Got {time_type}.")
+
+ x, z = _generate_features(n_obs, c, dim_x=dim_x)
+ features_ps, features_reg = _select_features(dgp_type, x, z)
+
+ # generate possible time periods
+ if time_type == "datetime":
+ time_periods = np.array([np.datetime64(start_date) + np.timedelta64(i, "M") for i in range(n_periods)])
+ never_treated_value = np.datetime64("NaT")
+ else:
+ assert time_type == "float"
+ time_periods = np.arange(n_periods)
+ never_treated_value = np.inf
+ n_time_periods = len(time_periods)
+
+ # set treatment values for time periods greater than n_pre_treat_periods
+ treatment_values = time_periods[time_periods >= time_periods[n_pre_treat_periods]]
+ max_exposure = len(treatment_values) # exclude never treated
+ if include_never_treated:
+ treatment_values = np.append(treatment_values, never_treated_value)
+ n_treatment_groups = len(treatment_values)
+
+ # treatment assignment and propensities (shape (n_obs,))
+ is_experimental = (dgp_type == 5) or (dgp_type == 6)
+ if is_experimental:
+ # Set D to be experimental
+ p = np.ones(n_treatment_groups) / n_treatment_groups
+ d_index = np.random.choice(n_treatment_groups, size=n_obs, p=p)
+ else:
+ unnormalized_p = np.exp(_f_ps_groups(features_ps, xi, n_groups=n_treatment_groups))
+ p = unnormalized_p / unnormalized_p.sum(1, keepdims=True)
+ d_index = np.array([np.random.choice(n_treatment_groups, p=p_row) for p_row in p])
+
+ # fixed effects (shape (n_obs, n_time_periods))
+ time_effects = np.arange(n_time_periods)
+ delta_t = np.tile(time_effects, (n_obs, 1))
+ individual_effects = np.random.normal(loc=d_index, scale=1, size=(n_obs,))
+ eta_i = np.tile(individual_effects, (n_time_periods, 1)).T
+
+ # error terms (shape (n_obs, n_time_periods))
+ epsilon_0 = np.random.normal(loc=0, scale=1, size=(n_obs, n_time_periods))
+ epsilon_1 = np.random.normal(loc=0, scale=1, size=(n_obs, n_time_periods))
+
+ # regression function (shape (n_obs, n_time_periods))
+ f_reg = _f_reg_time(features_reg, n_time_periods)
+
+ # treatment effects (shape (n_obs, n_time_periods))
+ exposure_pre_period = np.zeros((n_obs,
n_pre_treat_periods)) + exposure_post_first_treatment = np.clip(np.arange(max_exposure) - d_index.reshape(-1, 1) + 1, a_min=0, a_max=None) + exposure_time = np.column_stack((exposure_pre_period, exposure_post_first_treatment)) + delta_e = exposure_time + + # add heterogeneity in treatment effects + heterogeneity_x = 0.1 * x[:, 0] + heterogeneity = heterogeneity_x.reshape(-1, 1) * exposure_time + delta_e += heterogeneity + + # potential outcomes (shape (n_obs, n_time_periods)) + y0 = f_reg + delta_t + eta_i + epsilon_0 + y1 = y0 + delta_e + (epsilon_1 - epsilon_0) + + # observed outcomes (shape (n_obs, n_time_periods)) + is_exposed = exposure_time > 0 + y = y1 * is_exposed + y0 * ~is_exposed + + # map treatment index to values + d = np.array([treatment_values[i] for i in d_index]) + d_matrix = np.tile(d, (n_time_periods, 1)).T + + # create matrices to flatten the data + id_matrix = np.tile(np.arange(n_obs), (n_time_periods, 1)).T + time_matrix = np.tile(time_periods, (n_obs, 1)) + + df = pd.DataFrame( + { + "id": id_matrix.flatten(), + "y": y.flatten(), + "y0": y0.flatten(), + "y1": y1.flatten(), + "d": d_matrix.flatten(), + "t": time_matrix.flatten(), + **{f"Z{i + 1}": z[:, i].repeat(n_time_periods) for i in range(dim_x)}, + } + ) + if anticipation_periods > 0: + # filter time periods + df = df[df["t"] >= time_periods[anticipation_periods]] + # filter treatment after anticipation periods + df = df[(df["d"] <= time_periods[-(anticipation_periods + 1)]) | pd.isna(df["d"])] + + # update time periods by subtracting time delta + if time_type == "datetime": + df = df[(df["d"] <= time_periods[-(anticipation_periods + 1)]) | pd.isna(df["d"])] + df["t"] = df["t"].apply(lambda x: x - pd.DateOffset(months=anticipation_periods)) + else: + assert time_type == "float" + df = df[(df["d"] <= time_periods[-(anticipation_periods + 1)]) | np.isinf(df["d"])] + df["t"] = df["t"] - anticipation_periods + + return df diff --git a/doubleml/did/datasets/dgp_did_SZ2020.py b/doubleml/did/datasets/dgp_did_SZ2020.py new file mode 100644 index 00000000..ccfd4a80 --- /dev/null +++ b/doubleml/did/datasets/dgp_did_SZ2020.py @@ -0,0 +1,238 @@ +import numpy as np +import pandas as pd +from scipy.linalg import toeplitz + +from ...data.base_data import DoubleMLData +from ...data.panel_data import DoubleMLPanelData +from ...utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def _generate_features(n_obs, c, dim_x=4): + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=n_obs) + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4)) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + return x, z + + +def _select_features(dgp_type, x, z): + if dgp_type == 1: + features_ps = z + features_reg = z + elif dgp_type == 2: + features_ps = x + features_reg = z + elif dgp_type == 3: + features_ps = z + features_reg = x + elif dgp_type == 4: + features_ps = x + features_reg = x + elif dgp_type == 5: + features_ps = None + features_reg = z + elif dgp_type == 6: + features_ps = None + features_reg = x + else: + raise ValueError("The dgp_type is not valid.") + return features_ps, features_reg 
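For orientation, a minimal sketch of how the helpers above map a DGP type to its feature sets, following the W_reg/W_ps table in the docstrings; the seed and sample size are illustrative assumptions:

    import numpy as np
    from doubleml.did.datasets.dgp_did_SZ2020 import _generate_features, _select_features

    np.random.seed(0)
    x, z = _generate_features(n_obs=5, c=0.0, dim_x=4)  # raw X and standardized nonlinear Z
    # DGP 2: the propensity score uses the raw features X, the outcome regression uses Z
    features_ps, features_reg = _select_features(2, x, z)
    assert features_ps is x and features_reg is z
    # DGP 5/6 (experimental settings): no propensity features are used
    assert _select_features(5, x, z)[0] is None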
+
+
+def _f_reg(w):
+ res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3])
+ return res
+
+
+def _f_ps(w, xi):
+ res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3])
+ return res
+
+
+def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type="DoubleMLData", **kwargs):
+ """
+ Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020).
+ The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let
+
+ .. math::
+
+ f_{reg}(W) &= 210 + 27.4 \\cdot W_1 +13.7 \\cdot (W_2 + W_3 + W_4),
+
+ f_{ps}(W) &= 0.75 \\cdot (-W_1 + 0.5 \\cdot W_2 -0.25 \\cdot W_3 - 0.1 \\cdot W_4).
+
+
+ Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries
+ :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix.
+ Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`,
+ where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`,
+ :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`.
+ At first define
+
+ .. math::
+
+ Y_0(0) &= f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_0,
+
+ Y_1(d) &= 2 \\cdot f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_1(d),
+
+ p(W_{ps}) &= \\frac{\\exp(f_{ps}(W_{ps}))}{1 + \\exp(f_{ps}(W_{ps}))},
+
+ D &= 1\\{p(W_{ps}) \\ge U\\},
+
+ where :math:`\\varepsilon_0, \\varepsilon_1(d), d=0, 1` are independent standard normal random variables,
+ :math:`U \\sim \\mathcal{U}[0, 1]` is an independent standard uniform
+ and :math:`\\nu(W_{reg}, D)\\sim \\mathcal{N}(D \\cdot f_{reg}(W_{reg}),1)`.
+ The different data generating processes are defined via
+
+ .. math::
+
+ DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z
+
+ DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X
+
+ DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z
+
+ DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X
+
+ DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0
+
+ DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0,
+
+ such that the last two settings correspond to an experimental setting with treatment probability
+ of :math:`P(D=1) = \\frac{1}{2}.`
+ For the panel data the outcome is already defined as the difference :math:`Y = Y_1(D) - Y_0(0)`.
+ For cross-sectional data the flag ``cross_sectional_data`` has to be set to ``True``.
+ Then the outcome will be defined to be
+
+ .. math::
+
+ Y = T \\cdot Y_1(D) + (1-T) \\cdot Y_0(0),
+
+ where :math:`T = 1\\{U_T\\le \\lambda_T \\}` with :math:`U_T\\sim \\mathcal{U}[0, 1]` and :math:`\\lambda_T=0.5`.
+ The true average treatment effect on the treated is zero for all data generating processes.
+
+ Parameters
+ ----------
+ n_obs :
+ The number of observations to simulate.
+ dgp_type :
+ The DGP to be used. Default value is ``1`` (integer).
+ cross_sectional_data :
+ Indicates whether the setting uses cross-sectional or panel data. Default value is ``False``.
+ return_type :
+ If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object.
+
+ If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``.
+
+ If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``
+ or ``(x, y, d, t)``.
+ **kwargs
+ Additional keyword arguments to set non-default values for the parameters
+ :math:`xi=0.75`, :math:`c=0.0` and :math:`\\lambda_T=0.5`.
+
+ References
+ ----------
+ Sant’Anna, P. H. and Zhao, J. (2020),
+ Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122.
+ doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
+ """
+ xi = kwargs.get("xi", 0.75)
+ c = kwargs.get("c", 0.0)
+ lambda_t = kwargs.get("lambda_t", 0.5)
+
+ dim_x = 4
+ x, z = _generate_features(n_obs, c, dim_x=dim_x)
+
+ # error terms
+ epsilon_0 = np.random.normal(loc=0, scale=1, size=n_obs)
+ epsilon_1 = np.random.normal(loc=0, scale=1, size=[n_obs, 2])
+
+ features_ps, features_reg = _select_features(dgp_type, x, z)
+
+ # treatment and propensities
+ is_experimental = (dgp_type == 5) or (dgp_type == 6)
+ if is_experimental:
+ # Set D to be experimental
+ p = 0.5 * np.ones(n_obs)
+ else:
+ p = np.exp(_f_ps(features_ps, xi)) / (1 + np.exp(_f_ps(features_ps, xi)))
+ u = np.random.uniform(low=0, high=1, size=n_obs)
+ d = 1.0 * (p >= u)
+
+ # potential outcomes
+ nu = np.random.normal(loc=d * _f_reg(features_reg), scale=1, size=n_obs)
+ y0 = _f_reg(features_reg) + nu + epsilon_0
+ y1_d0 = 2 * _f_reg(features_reg) + nu + epsilon_1[:, 0]
+ y1_d1 = 2 * _f_reg(features_reg) + nu + epsilon_1[:, 1]
+ y1 = d * y1_d1 + (1 - d) * y1_d0
+
+ if not cross_sectional_data:
+ y = y1 - y0
+
+ if return_type in _array_alias:
+ return z, y, d, None
+ elif return_type in _data_frame_alias + _dml_data_alias:
+ z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)]
+ data = pd.DataFrame(np.column_stack((z, y, d)), columns=z_cols + ["y", "d"])
+ if return_type in _data_frame_alias:
+ return data
+ else:
+ return DoubleMLData(data, "y", "d", z_cols)
+ elif return_type == "DoubleMLPanelData":
+ z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)]
+ df0 = (
+ pd.DataFrame(
+ {
+ "y": y0,
+ "d": d.astype(np.int32),
+ "t": np.zeros_like(y0, dtype=np.int32),
+ **{col: z[:, i] for i, col in enumerate(z_cols)},
+ }
+ )
+ .reset_index()
+ .rename(columns={"index": "id"})
+ )
+ df1 = (
+ pd.DataFrame(
+ {
+ "y": y1,
+ "d": d.astype(np.int32),
+ "t": np.ones_like(y0, dtype=np.int32),
+ **{col: z[:, i] for i, col in enumerate(z_cols)},
+ }
+ )
+ .reset_index()
+ .rename(columns={"index": "id"})
+ )
+ df = pd.concat([df0, df1], axis=0)
+
+ return DoubleMLPanelData(df, "y", "d", t_col="t", id_col="id", x_cols=z_cols)
+ else:
+ raise ValueError("Invalid return_type.")
+
+ else:
+ u_t = np.random.uniform(low=0, high=1, size=n_obs)
+ t = 1.0 * (u_t <= lambda_t)
+ y = t * y1 + (1 - t) * y0
+
+ if return_type in _array_alias:
+ return z, y, d, t
+ elif return_type in _data_frame_alias + _dml_data_alias:
+ z_cols = [f"Z{i + 1}" for i in np.arange(dim_x)]
+ data = pd.DataFrame(np.column_stack((z, y, d, t)), columns=z_cols + ["y", "d", "t"])
+ if return_type in _data_frame_alias:
+ return data
+ else:
+ return DoubleMLData(data, "y", "d", z_cols, t_col="t")
+ else:
+ raise ValueError("Invalid return_type.")
diff --git a/doubleml/did/did.py b/doubleml/did/did.py
index e71068f2..7a671993 100644
--- a/doubleml/did/did.py
+++ b/doubleml/did/did.py
@@ -4,8 +4,8 @@
 from sklearn.utils import check_X_y
 from sklearn.utils.multiclass import type_of_target
 
+from doubleml.data.base_data import DoubleMLData
 from doubleml.double_ml import DoubleML
-from doubleml.double_ml_data import DoubleMLData
 from doubleml.double_ml_score_mixins import LinearScoreMixin
 from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score, _check_trimming
 from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls
@@ -209,7 +209,9 @@ def _nuisance_est(self, smpls,
n_jobs_cv, external_predictions, return_models=Fa # nuisance g for d==0 if external_predictions["ml_g0"] is not None: - g_hat0 = {"preds": external_predictions["ml_g0"], "targets": None, "models": None} + ml_g0_targets = np.full_like(y, np.nan, dtype="float64") + ml_g0_targets[d == 0] = y[d == 0] + g_hat0 = {"preds": external_predictions["ml_g0"], "targets": ml_g0_targets, "models": None} else: g_hat0 = _dml_cv_predict( self._learner["ml_g"], @@ -229,7 +231,9 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa # nuisance g for d==1 if external_predictions["ml_g1"] is not None: - g_hat1 = {"preds": external_predictions["ml_g1"], "targets": None, "models": None} + ml_g1_targets = np.full_like(y, np.nan, dtype="float64") + ml_g1_targets[d == 1] = y[d == 1] + g_hat1 = {"preds": external_predictions["ml_g1"], "targets": ml_g1_targets, "models": None} else: g_hat1 = _dml_cv_predict( self._learner["ml_g"], @@ -252,7 +256,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa if self.score == "observational": # nuisance m if external_predictions["ml_m"] is not None: - m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None} + m_hat = {"preds": external_predictions["ml_m"], "targets": d, "models": None} else: m_hat = _dml_cv_predict( self._learner["ml_m"], @@ -269,10 +273,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa m_hat["preds"] = _trimm(m_hat["preds"], self.trimming_rule, self.trimming_threshold) # nuisance estimates of the uncond. treatment prob. - p_hat = np.full_like(d, np.nan, dtype="float64") - for train_index, test_index in smpls: - p_hat[test_index] = np.mean(d[train_index]) - + p_hat = np.full_like(d, d.mean(), dtype="float64") psi_a, psi_b = self._score_elements(y, d, g_hat0["preds"], g_hat1["preds"], m_hat["preds"], p_hat) psi_elements = {"psi_a": psi_a, "psi_b": psi_b} @@ -432,3 +433,31 @@ def _nuisance_tuning( res = {"params": params, "tune_res": tune_res} return res + + def sensitivity_benchmark(self, benchmarking_set, fit_args=None): + """ + Computes a benchmark for a given set of features. + Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates. + + Parameters + ---------- + benchmarking_set : list + List of features to be used for benchmarking. + + fit_args : dict, optional + Additional arguments for the fit method. + Default is None. + + Returns + ------- + benchmark_results : pandas.DataFrame + Benchmark results. + """ + if self.score == "experimental": + warnings.warn( + "Sensitivity benchmarking for experimental score may not be meaningful. " + "Consider using score='observational' for conditional treatment assignment.", + UserWarning, + ) + + return super().sensitivity_benchmark(benchmarking_set, fit_args) diff --git a/doubleml/did/did_aggregation.py b/doubleml/did/did_aggregation.py new file mode 100644 index 00000000..0e34aa37 --- /dev/null +++ b/doubleml/did/did_aggregation.py @@ -0,0 +1,391 @@ +import warnings +from functools import reduce +from operator import add + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns + +from doubleml.double_ml_framework import DoubleMLFramework, concat + + +class DoubleMLDIDAggregation: + """ + Class for aggregating multiple difference-in-differences (DID) frameworks. 
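# Editorial sketch: the did.py hunks above (i) attach evaluation targets to externally
# supplied nuisance predictions and (ii) replace the fold-wise estimate of the
# unconditional treatment probability with a single full-sample mean. In miniature:
import numpy as np

y = np.array([1.0, 2.0, 3.0, 4.0])
d = np.array([0.0, 1.0, 0.0, 1.0])
ml_g0_targets = np.full_like(y, np.nan, dtype="float64")
ml_g0_targets[d == 0] = y[d == 0]                    # targets only on the d == 0 subsample
p_hat = np.full_like(d, d.mean(), dtype="float64")   # constant vector of the treatment share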
+ + This class enables weighted aggregation of multiple DoubleMLFramework objects, allowing for + both multiple separate aggregations and an overall aggregation across them. It provides + methods for summarizing and visualizing aggregated treatment effects. + + Parameters + ---------- + frameworks : list + List of DoubleMLFramework objects to aggregate. Each framework must be one-dimensional + (n_thetas = 1). + + aggregation_weights : numpy.ndarray + 2D array of weights for aggregating frameworks. Shape should be (n_aggregations, n_frameworks), + where each row corresponds to a separate aggregation of the frameworks. + + overall_aggregation_weights : numpy.ndarray, optional + 1D array of weights for aggregating across the aggregated frameworks. Length should + equal the number of rows in aggregation_weights. If None, equal weights are used. + Default is None. + + aggregation_names : list of str, optional + Names for each aggregation. Length should equal the number of rows in aggregation_weights. + If None, default names like "Aggregation_0", "Aggregation_1", etc. are used. + Default is None. + + aggregation_method_name : str, optional + Name describing the aggregation method used. + Default is "Custom". + + additional_information : dict, optional + Dictionary containing additional information to display in the string representation. + Default is None. + + additional_parameters : dict, optional + Dictionary containing additional parameters used by the class methods. + For example, can contain 'aggregation_color_idx' for plot_effects(). + Default is None. + """ + + def __init__( + self, + frameworks, + aggregation_weights, + overall_aggregation_weights=None, + aggregation_names=None, + aggregation_method_name="Custom", + additional_information=None, + additional_parameters=None, + ): + self._base_frameworks = self._check_frameworks(frameworks) + + self._aggregation_weights, self._overall_aggregation_weights = self._check_weights( + aggregation_weights, overall_aggregation_weights + ) + self._n_aggregations = self.aggregation_weights.shape[0] + + self._aggregation_names, self._aggregation_method_name = self._check_names(aggregation_names, aggregation_method_name) + + if additional_information is not None and not isinstance(additional_information, dict): + raise TypeError("'additional_information' must be a dictionary (or None)") + self._additional_information = additional_information + if additional_parameters is not None and not isinstance(additional_parameters, dict): + raise TypeError("'additional_parameters' must be a dictionary (or None)") + self._additional_parameters = additional_parameters + + agg_frameworks = [None] * self._n_aggregations + for idx_agg in range(self._n_aggregations): + weights = self.aggregation_weights[idx_agg, :] + weighted_frameworks = [w * f for w, f in zip(weights, self.base_frameworks)] + agg_frameworks[idx_agg] = reduce(add, weighted_frameworks) + + self._aggregated_frameworks = concat(agg_frameworks) + self._aggregated_frameworks.treatment_names = self._aggregation_names + + # overall framework + overall_weighted_frameworks = [w * f for w, f in zip(self.overall_aggregation_weights, agg_frameworks)] + self._overall_aggregated_framework = reduce(add, overall_weighted_frameworks) + + def __str__(self): + class_name = self.__class__.__name__ + header = ( + f"================== {class_name} Object ==================\n" + f" {self.aggregation_method_name} Aggregation \n" + ) + overall_summary = self.overall_summary.to_string(index=False) + 
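# Editorial sketch of the aggregation performed in __init__ above: each row of
# aggregation_weights defines one weighted sum of the base frameworks, and the overall
# effect averages the aggregations. fw0, fw1, fw2 are hypothetical, already fitted,
# one-dimensional DoubleMLFramework objects.
import numpy as np
from doubleml.did.did_aggregation import DoubleMLDIDAggregation

weights = np.array([[0.5, 0.5, 0.0],   # "early" = mean of fw0 and fw1
                    [0.0, 0.0, 1.0]])  # "late"  = fw2 alone
agg = DoubleMLDIDAggregation(
    frameworks=[fw0, fw1, fw2],
    aggregation_weights=weights,
    aggregation_names=["early", "late"],
    aggregation_method_name="Group",
)
print(agg.overall_summary)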
aggregated_effects_summary = self.aggregated_summary.to_string(index=True) + + res = ( + header + + "\n------------------ Overall Aggregated Effects ------------------\n" + + overall_summary + + "\n------------------ Aggregated Effects ------------------\n" + + aggregated_effects_summary + ) + if self.additional_information is not None: + res += "\n------------------ Additional Information ------------------\n" + res += self.additional_information + + return res + + @property + def base_frameworks(self): + """Underlying frameworks""" + return self._base_frameworks + + @property + def aggregated_frameworks(self): + """Aggregated frameworks""" + return self._aggregated_frameworks + + @property + def overall_aggregated_framework(self): + """Overall aggregated framework""" + return self._overall_aggregated_framework + + @property + def aggregation_weights(self): + """Aggregation weights""" + return self._aggregation_weights + + @property + def overall_aggregation_weights(self): + """Overall aggregation weights""" + return self._overall_aggregation_weights + + @property + def n_aggregations(self): + """Number of aggregations""" + return self._n_aggregations + + @property + def aggregation_names(self): + """Aggregation names""" + return self._aggregation_names + + @property + def aggregation_method_name(self): + """Aggregation method name""" + return self._aggregation_method_name + + @property + def aggregated_summary(self): + """ + A summary for the aggregated effects. + """ + return self.aggregated_frameworks.summary + + @property + def overall_summary(self): + """ + A summary for the overall aggregated effect. + """ + return self.overall_aggregated_framework.summary + + @property + def additional_information(self): + """Additional information""" + if self._additional_information is None: + add_info = None + else: + add_info = str() + for key, value in self._additional_information.items(): + add_info += f"{key}: {value}\n" + return add_info + + @property + def additional_parameters(self): + """Additional parameters""" + return self._additional_parameters + + def plot_effects( + self, + level=0.95, + joint=True, + figsize=(12, 6), + sort_by=None, + color_palette="colorblind", + title="Aggregated Treatment Effects", + y_label="Effect", + ): + """ + Plot aggregated treatment effect estimates with confidence intervals. + + Parameters + ---------- + level : float + Confidence level for the intervals. + Default is ``0.95``. + joint : bool + Indicates whether joint confidence intervals are computed. + Default is ``True``. + figsize : tuple + Figure size as (width, height). + Default is ``(12, 6)``. + sort_by : str or None + How to sort the results - 'estimate', 'name', or None. + Default is ``None``. + color_palette : str or list + Seaborn color palette name or list of colors. + Default is ``"colorblind"``. + title : str + Title for the plot. + Default is ``"Aggregated Treatment Effects"``. + y_label : str + Label for y-axis. + Default is ``"Effect"``. + + Returns + ------- + fig : matplotlib.figure.Figure + The created figure object. + ax : matplotlib.axes.Axes + The axes object for further customization. + + Notes + ----- + If ``joint=True`` and bootstrapping hasn't been performed, this method will automatically + perform bootstrapping with default parameters and issue a warning. 
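# Editorial sketch: plotting the aggregated effects from the sketch above. With
# joint=True and no prior bootstrap, plot_effects() bootstraps automatically and warns.
fig, ax = agg.plot_effects(level=0.95, sort_by="estimate", title="Aggregated ATTs")
fig.savefig("aggregated_atts.png", dpi=150)  # optional: persist the figure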
+ """ + df = self._create_ci_dataframe(level=level, joint=joint) + + # Validate sorting column + valid_sort_options = {"estimate", "name", None} + if sort_by not in valid_sort_options: + raise ValueError(f"Invalid sort_by value. Choose from {valid_sort_options}.") + + # Sort data if requested + if sort_by == "estimate": + df = df.sort_values(by="Estimate", ascending=False) + elif sort_by == "name": + df = df.sort_values(by="Aggregation_Names", ascending=True) + + # Handle color palette + colors = sns.color_palette(color_palette) if isinstance(color_palette, str) else color_palette + selected_colors = [colors[idx] for idx in df["color_idx"]] + + # Create figure + fig, ax = plt.subplots(figsize=figsize) + + # Plot zero reference line + ax.axhline(y=0, color="black", linestyle="--", alpha=0.5, label="Zero effect") + + # Calculate asymmetric error bars + x_positions = np.arange(len(df)) + yerr = np.array([df["Estimate"] - df["CI_Lower"], df["CI_Upper"] - df["Estimate"]]) # lower error # upper error + + for i, (x, y, color) in enumerate(zip(x_positions, df["Estimate"], selected_colors)): + ax.errorbar( + x, + y, + yerr=[[yerr[0, i]], [yerr[1, i]]], + fmt="o", + capsize=4, + color=color, + ecolor=color, + markersize=8, + markeredgewidth=1.5, + linewidth=1.5, + ) + + # Set labels and title + ax.set_xticks(x_positions) + ax.set_xticklabels(df["Aggregation_Names"]) + ax.set_ylabel(y_label) + ax.set_title(title) + + ax.grid(axis="y", alpha=0.3) + plt.tight_layout() + + return fig, ax + + def _check_frameworks(self, frameworks): + msg = "The 'frameworks' must be a list of DoubleMLFramework objects" + is_list = isinstance(frameworks, list) + all_frameworks = all(isinstance(framework, DoubleMLFramework) for framework in frameworks) + if not is_list or not all_frameworks: + raise TypeError(msg) + + if not all(framework.n_thetas == 1 for framework in frameworks): + raise ValueError("All frameworks must be one-dimensional") + + return frameworks + + def _check_weights(self, aggregation_weights, overall_aggregation_weights): + + # aggregation weights + if not isinstance(aggregation_weights, np.ndarray): + raise TypeError("'aggregation_weights' must be a numpy array") + + if not aggregation_weights.ndim == 2: + raise ValueError("'aggregation_weights' must be a 2-dimensional array") + + if not aggregation_weights.shape[1] == len(self.base_frameworks): + raise ValueError("The number of rows in 'aggregation_weights' must be equal to the number of frameworks") + + n_aggregations = aggregation_weights.shape[0] + # overall aggregation weights + if overall_aggregation_weights is None: + overall_aggregation_weights = np.ones(n_aggregations) / n_aggregations + + if not isinstance(overall_aggregation_weights, np.ndarray): + raise TypeError("'overall_aggregation_weights' must be a numpy array") + if not overall_aggregation_weights.ndim == 1: + raise ValueError("'overall_aggregation_weights' must be a 1-dimensional array") + if not len(overall_aggregation_weights) == n_aggregations: + raise ValueError( + "'overall_aggregation_weights' must have the same length as the number of aggregated frameworks " + "(number of rows in 'aggregation_weights')." 
+ ) + + return aggregation_weights, overall_aggregation_weights + + def _check_names(self, aggregation_names, aggregation_method_name): + if aggregation_names is None: + aggregation_names = [f"Aggregation_{i}" for i in range(self.n_aggregations)] + + if not isinstance(aggregation_names, list): + raise TypeError("'aggregation_names' must be a list of strings") + + if not all(isinstance(name, str) for name in aggregation_names): + raise TypeError("'aggregation_names' must be a list of strings") + + if not len(aggregation_names) == self.n_aggregations: + raise ValueError("'aggregation_names' must have the same length as the number of aggregations") + + if not isinstance(aggregation_method_name, str): + raise TypeError("'aggregation_method_name' must be a string") + + return aggregation_names, aggregation_method_name + + def _create_ci_dataframe(self, level=0.95, joint=True): + """ + Create a DataFrame with coefficient estimates and confidence intervals. + + Parameters + ---------- + level : float, default=0.95 + Confidence level for intervals. + joint : bool, default=True + Whether to use joint confidence intervals. + + Returns + ------- + pandas.DataFrame + DataFrame containing: + - Aggregation names + - Coefficient estimates + - Lower and upper confidence interval bounds + - Color indices for plotting + """ + + if joint and self.aggregated_frameworks.boot_t_stat is None: + self.aggregated_frameworks.bootstrap() + warnings.warn( + "Joint confidence intervals require bootstrapping which hasn't been performed yet. " + "Automatically applying '.aggregated_frameworks.bootstrap(method=\"normal\", n_rep_boot=500)' " + "with default values. For different bootstrap settings, call bootstrap() explicitly before plotting.", + UserWarning, + ) + ci = self.aggregated_frameworks.confint(level=level, joint=joint) + + default_color_idx = [0] * self._n_aggregations + if self.additional_parameters is None: + color_idx = default_color_idx + else: + color_idx = self.additional_parameters.get("aggregation_color_idx", default_color_idx) + + df = pd.DataFrame( + { + "Aggregation_Names": self.aggregation_names, + "Estimate": self.aggregated_frameworks.thetas, + "CI_Lower": ci.iloc[:, 0], + "CI_Upper": ci.iloc[:, 1], + "color_idx": color_idx, + } + ) + return df diff --git a/doubleml/did/did_binary.py b/doubleml/did/did_binary.py new file mode 100644 index 00000000..e4d309db --- /dev/null +++ b/doubleml/did/did_binary.py @@ -0,0 +1,752 @@ +import warnings + +import numpy as np +from sklearn.utils import check_X_y + +from doubleml.data.panel_data import DoubleMLPanelData +from doubleml.did.utils._did_utils import ( + _check_anticipation_periods, + _check_control_group, + _check_gt_combination, + _check_gt_values, + _get_id_positions, + _get_never_treated_value, + _is_never_treated, + _set_id_positions, +) +from doubleml.double_ml import DoubleML +from doubleml.double_ml_score_mixins import LinearScoreMixin +from doubleml.utils._checks import ( + _check_bool, + _check_finite_predictions, + _check_is_propensity, + _check_score, + _check_trimming, +) +from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls +from doubleml.utils._propensity_score import _trimm + + +class DoubleMLDIDBinary(LinearScoreMixin, DoubleML): + """Double machine learning for difference-in-differences models with panel data (binary setting in terms of group and time + combinations). 
+ + Parameters + ---------- + obj_dml_data : :class:`DoubleMLPanelData` object + The :class:`DoubleMLPanelData` object providing the data and specifying the variables for the causal model. + + g_value : int + The value indicating the treatment group (first period with treatment). + + t_value_pre : int + The value indicating the baseline pre-treatment period. + + t_value_eval : int + The value indicating the period for evaluation. + + ml_g : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`g_0(d,X) = E[Y_1-Y_0|D=d, X]`. + For a binary outcome variable :math:`Y` (with values 0 and 1), a classifier implementing ``fit()`` and + ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` returns ``True``, + ``predict_proba()`` is used, otherwise ``predict()``. + + ml_m : classifier implementing ``fit()`` and ``predict_proba()`` + A machine learner implementing ``fit()`` and ``predict_proba()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance function :math:`m_0(X) = E[D=1|X]`. + Only relevant for ``score='observational'``. + + control_group : str + Specifies the control group. Either ``'never_treated'`` or ``'not_yet_treated'``. + Default is ``'never_treated'``. + + anticipation_periods : int + Number of anticipation periods. Default is ``0``. + + n_folds : int + Number of folds. + Default is ``5``. + + n_rep : int + Number of repetitions for the sample splitting. + Default is ``1``. + + score : str + A str (``'observational'`` or ``'experimental'``) specifying the score function. + The ``'experimental'`` score refers to an A/B setting, where the treatment is independent + from the pretreatment covariates. + Default is ``'observational'``. + + in_sample_normalization : bool + Indicates whether to use a slightly different normalization from Sant'Anna and Zhao (2020). + Default is ``True``. + + trimming_rule : str + A str (``'truncate'`` is the only choice) specifying the trimming approach. + Default is ``'truncate'``. + + trimming_threshold : float + The threshold used for trimming. + Default is ``1e-2``. + + draw_sample_splitting : bool + Indicates whether the sample splitting should be drawn during initialization of the object. + Default is ``True``. + + print_periods : bool + Indicates whether to print information about the evaluated periods. + Default is ``False``.
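# Editorial sketch: estimating a single group-time ATT cell with the class defined here.
# dml_panel_data and the (g, t_pre, t_eval) values are hypothetical and must match the
# group/time values present in the DoubleMLPanelData object.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from doubleml.did.did_binary import DoubleMLDIDBinary

dml_att = DoubleMLDIDBinary(
    obj_dml_data=dml_panel_data,
    g_value=2,
    t_value_pre=1,
    t_value_eval=2,
    ml_g=RandomForestRegressor(n_estimators=100),
    ml_m=RandomForestClassifier(n_estimators=100),
    score="observational",
)
print(dml_att.fit().summary)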
+ + """ + + def __init__( + self, + obj_dml_data, + g_value, + t_value_pre, + t_value_eval, + ml_g, + ml_m=None, + control_group="never_treated", + anticipation_periods=0, + n_folds=5, + n_rep=1, + score="observational", + in_sample_normalization=True, + trimming_rule="truncate", + trimming_threshold=1e-2, + draw_sample_splitting=True, + print_periods=False, + ): + + super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting=False) + + self._check_data(self._dml_data) + g_values = self._dml_data.g_values + t_values = self._dml_data.t_values + + _check_bool(print_periods, "print_periods") + self._print_periods = print_periods + + self._control_group = _check_control_group(control_group) + self._never_treated_value = _get_never_treated_value(g_values) + self._anticipation_periods = _check_anticipation_periods(anticipation_periods) + + _check_gt_combination( + (g_value, t_value_pre, t_value_eval), g_values, t_values, self.never_treated_value, self.anticipation_periods + ) + self._g_value = g_value + self._t_value_pre = t_value_pre + self._t_value_eval = t_value_eval + + # check if post_treatment evaluation + if g_value <= t_value_eval: + post_treatment = True + else: + post_treatment = False + + self._post_treatment = post_treatment + + if self._print_periods: + print( + f"Evaluation of ATT({g_value}, {t_value_eval}), with pre-treatment period {t_value_pre},\n" + + f"post-treatment: {post_treatment}. Control group: {control_group}.\n" + ) + + # Preprocess data + # Y1, Y0 might be needed if we want to support custom estimators and scores; currently only output y_diff + self._panel_data_wide = self._preprocess_data(self._g_value, self._t_value_pre, self._t_value_eval) + + # Handling id values to match pairwise evaluation & simultaneous inference + id_panel_data = self._panel_data_wide[self._dml_data.id_col].values + id_original = self._dml_data.id_var_unique + if not np.all(np.isin(id_panel_data, id_original)): + raise ValueError("The id values in the panel data are not a subset of the original id values.") + + # Find position of id_panel_data in original data + # These entries should be replaced by nuisance predictions, all others should be set to 0. + self._id_positions = np.searchsorted(id_original, id_panel_data) + + # Numeric values for positions of the entries in id_panel_data inside id_original + # np.nonzero(np.isin(id_original, id_panel_data)) + self._n_subset = self._panel_data_wide.shape[0] + self._n_obs = self._n_subset # Effective sample size used for resampling + self._n_treated_subset = self._panel_data_wide["G_indicator"].sum() + + # Save x and y for later ML estimation + self._x_panel = self._panel_data_wide.loc[:, self._dml_data.x_cols].values + self._y_panel = self._panel_data_wide.loc[:, "y_diff"].values + self._g_panel = self._panel_data_wide.loc[:, "G_indicator"].values + + valid_scores = ["observational", "experimental"] + _check_score(self.score, valid_scores, allow_callable=False) + + self._in_sample_normalization = in_sample_normalization + if not isinstance(self.in_sample_normalization, bool): + raise TypeError( + "in_sample_normalization indicator has to be boolean. " + + f"Object of type {str(type(self.in_sample_normalization))} passed." 
+ ) + + # set stratification for resampling + self._strata = self._panel_data_wide["G_indicator"] + if draw_sample_splitting: + self.draw_sample_splitting() + + # check learners + ml_g_is_classifier = self._check_learner(ml_g, "ml_g", regressor=True, classifier=True) + if self.score == "observational": + _ = self._check_learner(ml_m, "ml_m", regressor=False, classifier=True) + self._learner = {"ml_g": ml_g, "ml_m": ml_m} + else: + assert self.score == "experimental" + if ml_m is not None: + warnings.warn( + ( + 'A learner ml_m has been provided for score = "experimental" but will be ignored. ' + "A learner ml_m is not required for estimation." + ) + ) + self._learner = {"ml_g": ml_g} + + if ml_g_is_classifier: + if obj_dml_data.binary_outcome: + self._predict_method = {"ml_g": "predict_proba"} + else: + raise ValueError( + f"The ml_g learner {str(ml_g)} was identified as classifier " + "but the outcome variable is not binary with values 0 and 1." + ) + else: + self._predict_method = {"ml_g": "predict"} + + if "ml_m" in self._learner: + self._predict_method["ml_m"] = "predict_proba" + self._initialize_ml_nuisance_params() + + self._trimming_rule = trimming_rule + self._trimming_threshold = trimming_threshold + _check_trimming(self._trimming_rule, self._trimming_threshold) + + self._sensitivity_implemented = True + self._external_predictions_implemented = True + + def __str__(self): + class_name = self.__class__.__name__ + header = f"================== {class_name} Object ==================\n" + data_summary = self._dml_data._data_summary_str() + score_info = ( + f"Score function: {str(self.score)}\n" + f"Treatment group: {str(self.g_value)}\n" + f"Pre-treatment period: {str(self.t_value_pre)}\n" + f"Evaluation period: {str(self.t_value_eval)}\n" + f"Control group: {str(self.control_group)}\n" + f"Anticipation periods: {str(self.anticipation_periods)}\n" + f"Effective sample size: {str(self.n_obs)}\n" + ) + learner_info = "" + for key, value in self.learner.items(): + learner_info += f"Learner {key}: {str(value)}\n" + if self.nuisance_loss is not None: + learner_info += "Out-of-sample Performance:\n" + is_classifier = [value for value in self._is_classifier.values()] + is_regressor = [not value for value in is_classifier] + if any(is_regressor): + learner_info += "Regression:\n" + for learner in [key for key, value in self._is_classifier.items() if value is False]: + learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n" + if any(is_classifier): + learner_info += "Classification:\n" + for learner in [key for key, value in self._is_classifier.items() if value is True]: + learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n" + + if self._is_cluster_data: + resampling_info = ( + f"No. folds per cluster: {self._n_folds_per_cluster}\n" + f"No. folds: {self.n_folds}\n" + f"No. repeated sample splits: {self.n_rep}\n" + ) + else: + resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n"
+ fit_summary = str(self.summary) + res = ( + header + + "\n------------------ Data summary ------------------\n" + + data_summary + + "\n------------------ Score & algorithm ------------------\n" + + score_info + + "\n------------------ Machine learner ------------------\n" + + learner_info + + "\n------------------ Resampling ------------------\n" + + resampling_info + + "\n------------------ Fit summary ------------------\n" + + fit_summary + ) + return res + + @property + def g_value(self): + """ + The value indicating the treatment group (first period with treatment). + """ + return self._g_value + + @property + def t_value_eval(self): + """ + The value indicating the evaluation period. + """ + return self._t_value_eval + + @property + def t_value_pre(self): + """ + The value indicating the pre-treatment period. + """ + return self._t_value_pre + + @property + def never_treated_value(self): + """ + The value indicating that a unit was never treated. + """ + return self._never_treated_value + + @property + def post_treatment(self): + """ + Indicates whether the evaluation period is after the treatment period. + """ + return self._post_treatment + + @property + def control_group(self): + """ + The control group. + """ + return self._control_group + + @property + def anticipation_periods(self): + """ + The number of anticipation periods. + """ + return self._anticipation_periods + + @property + def panel_data_wide(self): + """ + The preprocessed panel data in wide format. + """ + return self._panel_data_wide + + @property + def id_positions(self): + """ + The positions of the id values in the original data. + """ + return self._id_positions + + @property + def in_sample_normalization(self): + """ + Indicates whether the in-sample normalization of weights is used. + """ + return self._in_sample_normalization + + @property + def trimming_rule(self): + """ + Specifies the used trimming rule. + """ + return self._trimming_rule + + @property + def trimming_threshold(self): + """ + Specifies the used trimming threshold. + """ + return self._trimming_threshold + + @property + def n_obs(self): + """ + The number of observations used for estimation. + """ + return self._n_subset + + def _initialize_ml_nuisance_params(self): + if self.score == "observational": + valid_learner = ["ml_g0", "ml_g1", "ml_m"] + else: + assert self.score == "experimental" + valid_learner = ["ml_g0", "ml_g1"] + self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in valid_learner} + + def _check_data(self, obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLPanelData): + raise TypeError( + "For repeated outcomes the data must be of DoubleMLPanelData type. " + f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." + ) + if obj_dml_data.z_cols is not None: + raise NotImplementedError( + "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). " + "At the moment there are no DiD models with instruments implemented." + ) + + one_treat = obj_dml_data.n_treat == 1 + if not one_treat: + raise ValueError( + "Incompatible data. " + "To fit a DID model with DML, " + "exactly one variable needs to be specified as treatment variable."
+ ) + _check_gt_values(obj_dml_data.g_values, obj_dml_data.t_values) + return + + def _preprocess_data(self, g_value, pre_t, eval_t): + data = self._dml_data.data + + y_col = self._dml_data.y_col + t_col = self._dml_data.t_col + id_col = self._dml_data.id_col + g_col = self._dml_data.g_col + + # relevant data subset + data_subset_indicator = data[t_col].isin([pre_t, eval_t]) + data_subset = data[data_subset_indicator].sort_values(by=[id_col, t_col]) + + # Construct G (treatment group) indicating treatment period in g + G_indicator = (data_subset[g_col] == g_value).astype(int) + + # Construct C (control group) indicating never treated or not yet treated + never_treated = _is_never_treated(data_subset[g_col], self.never_treated_value).reshape(-1) + if self.control_group == "never_treated": + C_indicator = never_treated.astype(int) + + elif self.control_group == "not_yet_treated": + # adjust max_g_value for anticipation periods + t_values = self._dml_data.t_values + max_g_value = t_values[min(np.where(t_values == eval_t)[0][0] + self.anticipation_periods, len(t_values) - 1)] + # not in G, just as an additional check + later_treated = (data_subset[g_col] > max_g_value) & (G_indicator == 0) + not_yet_treated = never_treated | later_treated + C_indicator = not_yet_treated.astype(int) + + if np.sum(C_indicator) == 0: + raise ValueError("No observations in the control group.") + + data_subset = data_subset.assign(C_indicator=C_indicator, G_indicator=G_indicator) + # reduce to relevant subset + data_subset = data_subset[(data_subset["G_indicator"] == 1) | (data_subset["C_indicator"] == 1)] + # check if G and C are disjoint + assert sum(G_indicator & C_indicator) == 0 + + # Alternatively, use .shift() (check if time ordering is correct) + # y_diff = this_data.groupby(id_col)[y_col].shift(-1) + y_diff = ( + data_subset[data_subset[t_col] == eval_t][y_col].values - data_subset[data_subset[t_col] == pre_t][y_col].values + ) + + # keep only covariate observations from the first period + # Data processing from long to wide format + select_cols = [id_col, "G_indicator", "C_indicator"] + self._dml_data.x_cols + first_period = data_subset[t_col].min() + wide_data = data_subset[select_cols][data_subset[t_col] == first_period] + wide_data = wide_data.assign(y_diff=y_diff) + + return wide_data + + def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + + # Here: d is a binary treatment indicator + x, y = check_X_y(self._x_panel, self._y_panel, force_all_finite=False) + x, d = check_X_y(x, self._g_panel, force_all_finite=False) + # nuisance g + # get train indices for d == 0 and d == 1 + smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d) + + # nuisance g for d==0 + if external_predictions["ml_g0"] is not None: + ml_g0_targets = np.full_like(y, np.nan, dtype="float64") + ml_g0_targets[d == 0] = y[d == 0] + ml_g0_pred = _get_id_positions(external_predictions["ml_g0"], self.id_positions) + g_hat0 = {"preds": ml_g0_pred, "targets": ml_g0_targets, "models": None} + else: + g_hat0 = _dml_cv_predict( + self._learner["ml_g"], + x, + y, + smpls=smpls_d0, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_g0"), + method=self._predict_method["ml_g"], + return_models=return_models, + ) + + _check_finite_predictions(g_hat0["preds"], self._learner["ml_g"], "ml_g", smpls) + # adjust target values to consider only compatible subsamples + g_hat0["targets"] = g_hat0["targets"].astype(float) + g_hat0["targets"][d == 1] = np.nan + + # nuisance g for d==1 + if external_predictions["ml_g1"] is not None: +
ml_g1_targets = np.full_like(y, np.nan, dtype="float64") + ml_g1_targets[d == 1] = y[d == 1] + ml_g1_pred = _get_id_positions(external_predictions["ml_g1"], self.id_positions) + g_hat1 = {"preds": ml_g1_pred, "targets": ml_g1_targets, "models": None} + else: + g_hat1 = _dml_cv_predict( + self._learner["ml_g"], + x, + y, + smpls=smpls_d1, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_g1"), + method=self._predict_method["ml_g"], + return_models=return_models, + ) + + _check_finite_predictions(g_hat1["preds"], self._learner["ml_g"], "ml_g", smpls) + # adjust target values to consider only compatible subsamples + g_hat1["targets"] = g_hat1["targets"].astype(float) + g_hat1["targets"][d == 0] = np.nan + + # only relevant for observational setting + m_hat = {"preds": None, "targets": None, "models": None} + if self.score == "observational": + # nuisance m + if external_predictions["ml_m"] is not None: + ml_m_pred = _get_id_positions(external_predictions["ml_m"], self.id_positions) + m_hat = {"preds": ml_m_pred, "targets": d, "models": None} + else: + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + ) + _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls) + _check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12) + m_hat["preds"] = _trimm(m_hat["preds"], self.trimming_rule, self.trimming_threshold) + + # nuisance estimates of the uncond. treatment prob. + p_hat = np.full_like(d, d.mean(), dtype="float64") + psi_a, psi_b = self._score_elements(y, d, g_hat0["preds"], g_hat1["preds"], m_hat["preds"], p_hat) + + extend_kwargs = { + "n_obs": self._dml_data.n_obs, + "id_positions": self.id_positions, + } + psi_elements = { + "psi_a": _set_id_positions(psi_a, fill_value=0.0, **extend_kwargs), + "psi_b": _set_id_positions(psi_b, fill_value=0.0, **extend_kwargs), + } + preds = { + "predictions": { + "ml_g0": _set_id_positions(g_hat0["preds"], fill_value=np.nan, **extend_kwargs), + "ml_g1": _set_id_positions(g_hat1["preds"], fill_value=np.nan, **extend_kwargs), + "ml_m": _set_id_positions(m_hat["preds"], fill_value=np.nan, **extend_kwargs), + }, + "targets": { + "ml_g0": _set_id_positions(g_hat0["targets"], fill_value=np.nan, **extend_kwargs), + "ml_g1": _set_id_positions(g_hat1["targets"], fill_value=np.nan, **extend_kwargs), + "ml_m": _set_id_positions(m_hat["targets"], fill_value=np.nan, **extend_kwargs), + }, + "models": {"ml_g0": g_hat0["models"], "ml_g1": g_hat1["models"], "ml_m": m_hat["models"]}, + } + + return psi_elements, preds + + def _score_elements(self, y, d, g_hat0, g_hat1, m_hat, p_hat): + # calc residuals + resid_d0 = y - g_hat0 + + if self.score == "observational": + if self.in_sample_normalization: + weight_psi_a = np.divide(d, np.mean(d)) + propensity_weight = np.multiply(1.0 - d, np.divide(m_hat, 1.0 - m_hat)) + weight_resid_d0 = np.divide(d, np.mean(d)) - np.divide(propensity_weight, np.mean(propensity_weight)) + else: + weight_psi_a = np.divide(d, p_hat) + weight_resid_d0 = np.divide(d - m_hat, np.multiply(p_hat, 1.0 - m_hat)) + + psi_b_1 = np.zeros_like(y) + + else: + assert self.score == "experimental" + if self.in_sample_normalization: + weight_psi_a = np.ones_like(y) + weight_g0 = np.divide(d, np.mean(d)) - 1.0 + weight_g1 = 1.0 - np.divide(d, np.mean(d)) + weight_resid_d0 = np.divide(d, np.mean(d)) - np.divide(1.0 - d, np.mean(1.0 - d)) + else: + weight_psi_a = 
np.ones_like(y) + weight_g0 = np.divide(d, p_hat) - 1.0 + weight_g1 = 1.0 - np.divide(d, p_hat) + weight_resid_d0 = np.divide(d - p_hat, np.multiply(p_hat, 1.0 - p_hat)) + + psi_b_1 = np.multiply(weight_g0, g_hat0) + np.multiply(weight_g1, g_hat1) + + # set score elements + psi_a = -1.0 * weight_psi_a + psi_b = psi_b_1 + np.multiply(weight_resid_d0, resid_d0) + + return psi_a, psi_b + + def _nuisance_tuning( + self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search + ): + x, y = check_X_y(self._x_panel, self._y_panel, force_all_finite=False) + x, d = check_X_y(x, self._g_panel, force_all_finite=False) + + # get train indices for d == 0 and d == 1 + smpls_d0, smpls_d1 = _get_cond_smpls(smpls, d) + + if scoring_methods is None: + scoring_methods = {"ml_g": None, "ml_m": None} + + train_inds = [train_index for (train_index, _) in smpls] + train_inds_d0 = [train_index for (train_index, _) in smpls_d0] + train_inds_d1 = [train_index for (train_index, _) in smpls_d1] + g0_tune_res = _dml_tune( + y, + x, + train_inds_d0, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + g1_tune_res = _dml_tune( + y, + x, + train_inds_d1, + self._learner["ml_g"], + param_grids["ml_g"], + scoring_methods["ml_g"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + + g0_best_params = [xx.best_params_ for xx in g0_tune_res] + g1_best_params = [xx.best_params_ for xx in g1_tune_res] + + if self.score == "observational": + m_tune_res = _dml_tune( + d, + x, + train_inds, + self._learner["ml_m"], + param_grids["ml_m"], + scoring_methods["ml_m"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + m_best_params = [xx.best_params_ for xx in m_tune_res] + params = {"ml_g0": g0_best_params, "ml_g1": g1_best_params, "ml_m": m_best_params} + tune_res = {"g0_tune": g0_tune_res, "g1_tune": g1_tune_res, "m_tune": m_tune_res} + else: + params = {"ml_g0": g0_best_params, "ml_g1": g1_best_params} + tune_res = {"g0_tune": g0_tune_res, "g1_tune": g1_tune_res} + + res = {"params": params, "tune_res": tune_res} + + return res + + def _sensitivity_element_est(self, preds): + y = self._y_panel + d = self._g_panel + + m_hat = _get_id_positions(preds["predictions"]["ml_m"], self.id_positions) + g_hat0 = _get_id_positions(preds["predictions"]["ml_g0"], self.id_positions) + g_hat1 = _get_id_positions(preds["predictions"]["ml_g1"], self.id_positions) + + g_hat = np.multiply(d, g_hat1) + np.multiply(1.0 - d, g_hat0) + sigma2_score_element = np.square(y - g_hat) + sigma2 = np.mean(sigma2_score_element) + psi_sigma2 = sigma2_score_element - sigma2 + + # calc m(W,alpha) and Riesz representer + p_hat = np.mean(d) + if self.score == "observational": + propensity_weight_d0 = np.divide(m_hat, 1.0 - m_hat) + if self.in_sample_normalization: + weight_d0 = np.multiply(1.0 - d, propensity_weight_d0) + mean_weight_d0 = np.mean(weight_d0) + + m_alpha = np.multiply( + np.divide(d, p_hat), np.divide(1.0, p_hat) + np.divide(propensity_weight_d0, mean_weight_d0) + ) + rr = np.divide(d, p_hat) - np.divide(weight_d0, mean_weight_d0) + else: + m_alpha = np.multiply(np.divide(d, np.square(p_hat)), (1.0 + propensity_weight_d0)) + rr = np.divide(d, p_hat) - np.multiply(np.divide(1.0 - d, p_hat), propensity_weight_d0) + else: + assert self.score == "experimental" + # the same with or without self-normalization + m_alpha = np.divide(1.0, p_hat) + np.divide(1.0, 1.0 - 
p_hat) + rr = np.divide(d, p_hat) - np.divide(1.0 - d, 1.0 - p_hat) + + nu2_score_element = np.multiply(2.0, m_alpha) - np.square(rr) + nu2 = np.mean(nu2_score_element) + psi_nu2 = nu2_score_element - nu2 + + extend_kwargs = { + "n_obs": self._dml_data.n_obs, + "id_positions": self.id_positions, + "fill_value": 0.0, + } + + # add scaling to make variance estimation consistent (sample size difference) + scaling = self._dml_data.n_obs / self._n_subset + element_dict = { + "sigma2": sigma2, + "nu2": nu2, + "psi_sigma2": scaling * _set_id_positions(psi_sigma2, **extend_kwargs), + "psi_nu2": scaling * _set_id_positions(psi_nu2, **extend_kwargs), + "riesz_rep": scaling * _set_id_positions(rr, **extend_kwargs), + } + return element_dict + + def sensitivity_benchmark(self, benchmarking_set, fit_args=None): + """ + Computes a benchmark for a given set of features. + Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates. + + Parameters + ---------- + benchmarking_set : list + List of features to be used for benchmarking. + + fit_args : dict, optional + Additional arguments for the fit method. + Default is None. + + Returns + ------- + benchmark_results : pandas.DataFrame + Benchmark results. + """ + if self.score == "experimental": + warnings.warn( + "Sensitivity benchmarking for experimental score may not be meaningful. " + "Consider using score='observational' for conditional treatment assignment.", + UserWarning, + ) + + return super().sensitivity_benchmark(benchmarking_set, fit_args) diff --git a/doubleml/did/did_cs.py b/doubleml/did/did_cs.py index a198bcea..ab2af5b9 100644 --- a/doubleml/did/did_cs.py +++ b/doubleml/did/did_cs.py @@ -4,8 +4,8 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target +from doubleml.data.base_data import DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLData from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score, _check_trimming from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls_2d @@ -664,3 +664,31 @@ def _nuisance_tuning( res = {"params": params, "tune_res": tune_res} return res + + def sensitivity_benchmark(self, benchmarking_set, fit_args=None): + """ + Computes a benchmark for a given set of features. + Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates. + + Parameters + ---------- + benchmarking_set : list + List of features to be used for benchmarking. + + fit_args : dict, optional + Additional arguments for the fit method. + Default is None. + + Returns + ------- + benchmark_results : pandas.DataFrame + Benchmark results. + """ + if self.score == "experimental": + warnings.warn( + "Sensitivity benchmarking for experimental score may not be meaningful. 
" + "Consider using score='observational' for conditional treatment assignment.", + UserWarning, + ) + + return super().sensitivity_benchmark(benchmarking_set, fit_args) diff --git a/doubleml/did/did_multi.py b/doubleml/did/did_multi.py new file mode 100644 index 00000000..0243cca5 --- /dev/null +++ b/doubleml/did/did_multi.py @@ -0,0 +1,1367 @@ +import copy +import warnings + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from joblib import Parallel, delayed +from matplotlib.lines import Line2D +from sklearn.base import clone + +from doubleml.data import DoubleMLPanelData +from doubleml.did.did_aggregation import DoubleMLDIDAggregation +from doubleml.did.did_binary import DoubleMLDIDBinary +from doubleml.did.utils._aggregation import ( + _check_did_aggregation_dict, + _compute_did_eventstudy_aggregation_weights, + _compute_did_group_aggregation_weights, + _compute_did_time_aggregation_weights, +) +from doubleml.did.utils._did_utils import ( + _check_anticipation_periods, + _check_control_group, + _check_gt_combination, + _check_gt_values, + _construct_gt_combinations, + _construct_gt_index, + _construct_post_treatment_mask, + _get_never_treated_value, +) +from doubleml.did.utils._plot import add_jitter +from doubleml.double_ml import DoubleML +from doubleml.double_ml_framework import concat +from doubleml.utils._checks import _check_score, _check_trimming +from doubleml.utils._descriptive import generate_summary +from doubleml.utils.gain_statistics import gain_statistics + + +class DoubleMLDIDMulti: + """Double machine learning for multi-period difference-in-differences models. + + Parameters + ---------- + obj_dml_data : :class:`DoubleMLPanelData` object + The :class:`DoubleMLPanelData` object providing the data and specifying the variables for the causal model. + + ml_g : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function + :math:`g_0(0,X) = E[Y_{t_\text{eval}}-Y_{t_\text{pre}}|X, C__{t_\text{eval} + \\delta} = 1]`. + For a binary outcome variable :math:`Y` (with values 0 and 1), a classifier implementing ``fit()`` and + ``predict_proba()`` can also be specified. + + ml_m : classifier implementing ``fit()`` and ``predict_proba()`` + A machine learner implementing ``fit()`` and ``predict_proba()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestClassifier`) for the nuisance function :math:`m_0(X) = E[D=1|X]`. + Only relevant for ``score='observational'``. Default is ``None``. + + gt_combinations : array-like + A list of tuples with the group-time combinations to be evaluated. + + control_group : str + Specifies the control group. Either ``'never_treated'`` or ``'not_yet_treated'``. + Default is ``'never_treated'``. + + anticipation_periods : int + Number of anticipation periods. Default is ``0``. + + n_folds : int + Number of folds for cross-fitting. + Default is ``5``. + + n_rep : int + Number of repetitions for the sample splitting. + Default is ``1``. + + score : str + A str (``'observational'`` or ``'experimental'``) specifying the score function. + The ``'experimental'`` scores refers to an A/B setting, where the treatment is independent + from the pretreatment covariates. + Default is ``'observational'``. + + in_sample_normalization : bool + Indicates whether to use in-sample normalization of weights. + Default is ``True``. 
+ + trimming_rule : str + A str (``'truncate'`` is the only choice) specifying the trimming approach. + Default is ``'truncate'``. + + trimming_threshold : float + The threshold used for trimming. + Default is ``1e-2``. + + draw_sample_splitting : bool + Indicates whether the sample splitting should be drawn during initialization. + Default is ``True``. + + print_periods : bool + Indicates whether to print information about the evaluated periods. + Default is ``False``. + + Examples + -------- + >>> import numpy as np + >>> import doubleml as dml + >>> from doubleml.did.datasets import make_did_CS2021 + >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier + >>> np.random.seed(42) + >>> df = make_did_CS2021(n_obs=500) + >>> dml_data = dml.data.DoubleMLPanelData( + ... df, + ... y_col="y", + ... d_cols="d", + ... id_col="id", + ... t_col="t", + ... x_cols=["Z1", "Z2", "Z3", "Z4"], + ... datetime_unit="M" + ... ) + >>> ml_g = RandomForestRegressor(n_estimators=100, max_depth=5) + >>> ml_m = RandomForestClassifier(n_estimators=100, max_depth=5) + >>> dml_did_obj = dml.did.DoubleMLDIDMulti( + ... obj_dml_data=dml_data, + ... ml_g=ml_g, + ... ml_m=ml_m, + ... gt_combinations="standard", + ... control_group="never_treated", + ... ) + >>> print(dml_did_obj.fit()) + """ + + def __init__( + self, + obj_dml_data, + ml_g, + ml_m=None, + gt_combinations="standard", + control_group="never_treated", + anticipation_periods=0, + n_folds=5, + n_rep=1, + score="observational", + in_sample_normalization=True, + trimming_rule="truncate", + trimming_threshold=1e-2, + draw_sample_splitting=True, + print_periods=False, + ): + + self._dml_data = obj_dml_data + self._is_cluster_data = False + self._is_panel_data = isinstance(obj_dml_data, DoubleMLPanelData) + self._check_data(self._dml_data) + self._g_values = self._dml_data.g_values + self._t_values = self._dml_data.t_values + self._print_periods = print_periods + + self._control_group = _check_control_group(control_group) + self._never_treated_value = _get_never_treated_value(self.g_values) + self._anticipation_periods = _check_anticipation_periods(anticipation_periods) + + self._gt_combinations = self._validate_gt_combinations(gt_combinations) + self._gt_index = _construct_gt_index(self.gt_combinations, self.g_values, self.t_values) + self._post_treatment_mask = _construct_post_treatment_mask(self.g_values, self.t_values) + self._gt_labels = [f"ATT({g},{t_pre},{t_eval})" for g, t_pre, t_eval in self.gt_combinations] + + self._in_sample_normalization = in_sample_normalization + if not isinstance(self.in_sample_normalization, bool): + raise TypeError( + "in_sample_normalization indicator has to be boolean. " + + f"Object of type {str(type(self.in_sample_normalization))} passed." 
+ ) + + self._n_folds = n_folds + self._n_rep = n_rep + + # check score + self._score = score + valid_scores = ["observational", "experimental"] + _check_score(self.score, valid_scores, allow_callable=False) + + # initialize framework which is constructed after the fit method is called + self._framework = None + + # initialize and check trimming + self._trimming_rule = trimming_rule + self._trimming_threshold = trimming_threshold + _check_trimming(self._trimming_rule, self._trimming_threshold) + + ml_g_is_classifier = DoubleML._check_learner(ml_g, "ml_g", regressor=True, classifier=True) + if self.score == "observational": + _ = DoubleML._check_learner(ml_m, "ml_m", regressor=False, classifier=True) + self._learner = {"ml_g": clone(ml_g), "ml_m": clone(ml_m)} + else: + assert self.score == "experimental" + if ml_m is not None: + warnings.warn( + ( + 'A learner ml_m has been provided for score = "experimental" but will be ignored. ' + "A learner ml_m is not required for estimation." + ) + ) + self._learner = {"ml_g": ml_g, "ml_m": None} + + if ml_g_is_classifier: + if obj_dml_data.binary_outcome: + self._predict_method = {"ml_g": "predict_proba", "ml_m": "predict_proba"} + else: + raise ValueError( + f"The ml_g learner {str(ml_g)} was identified as classifier " + "but the outcome variable is not binary with values 0 and 1." + ) + else: + self._predict_method = {"ml_g": "predict", "ml_m": "predict_proba"} + + # perform sample splitting + self._smpls = None + self._draw_sample_splitting = draw_sample_splitting + + # initialize all models if splits are known + self._modellist = self._initialize_models() + self._nuisance_loss = None + + def __str__(self): + class_name = self.__class__.__name__ + header = f"================== {class_name} Object ==================\n" + data_summary = self._dml_data._data_summary_str() + score_info = ( + f"Score function: {str(self.score)}\n" + f"Control group: {str(self.control_group)}\n" + f"Anticipation periods: {str(self.anticipation_periods)}\n" + ) + resampling_info = f"No. folds: {self.n_folds}\nNo. repeated sample splits: {self.n_rep}\n" + learner_info = "" + for key, value in self._learner.items(): + learner_info += f"Learner {key}: {str(value)}\n" + if self.nuisance_loss is not None: + learner_info += "Out-of-sample Performance:\n" + is_classifier = [value for value in self.modellist[0]._is_classifier.values()] + is_regressor = [not value for value in is_classifier] + if any(is_regressor): + learner_info += "Regression:\n" + for learner in [key for key, value in self.modellist[0]._is_classifier.items() if value is False]: + learner_info += f"Learner {learner} RMSE: {self.nuisance_loss[learner]}\n" + if any(is_classifier): + learner_info += "Classification:\n" + for learner in [key for key, value in self.modellist[0]._is_classifier.items() if value is True]: + learner_info += f"Learner {learner} Log Loss: {self.nuisance_loss[learner]}\n" + fit_summary = str(self.summary) + res = ( + header + + "\n------------------ Data summary ------------------\n" + + data_summary + + "\n------------------ Score & algorithm ------------------\n" + + score_info + + "\n------------------ Machine learner ------------------\n" + + learner_info + + "\n------------------ Resampling ------------------\n" + + resampling_info + + "\n------------------ Fit summary ------------------\n" + + fit_summary + ) + return res + + @property + def score(self): + """ + The score function. + """ + return self._score + + @property + def control_group(self): + """ + The control group. 
+ """ + return self._control_group + + @property + def anticipation_periods(self): + """ + The number of anticipation periods. + """ + return self._anticipation_periods + + @property + def gt_combinations(self): + """ + The combinations of g and t values. + """ + return self._gt_combinations + + @property + def gt_index(self): + """ + The index of the combinations of g and t values. + """ + return self._gt_index + + @property + def n_gt_atts(self): + """ + The number of evaluated combinations of the treatment variable and the period. + """ + return len(self.gt_combinations) + + @property + def gt_labels(self): + """ + The evaluated labels of the treatment effects 'ATT(g, t_pre, t_eval)' and the period. + """ + return self._gt_labels + + @property + def g_values(self): + """ + The values of the treatment variable. + """ + return self._g_values + + @property + def t_values(self): + """ + The values of the time periods. + """ + return self._t_values + + @property + def never_treated_value(self): + """ + The value indicating that a unit was never treated. + """ + return self._never_treated_value + + @property + def in_sample_normalization(self): + """ + Indicates whether the in sample normalization of weights are used. + """ + return self._in_sample_normalization + + @property + def trimming_rule(self): + """ + Specifies the used trimming rule. + """ + return self._trimming_rule + + @property + def trimming_threshold(self): + """ + Specifies the used trimming threshold. + """ + return self._trimming_threshold + + @property + def n_folds(self): + """ + Number of folds. + """ + return self._n_folds + + @property + def n_rep(self): + """ + Number of repetitions for the sample splitting. + """ + return self._n_rep + + @property + def n_rep_boot(self): + """ + The number of bootstrap replications. + """ + if self._framework is None: + n_rep_boot = None + else: + n_rep_boot = self._framework.n_rep_boot + return n_rep_boot + + @property + def boot_method(self): + """ + The method to construct the bootstrap replications. + """ + if self._framework is None: + method = None + else: + method = self._framework.boot_method + return method + + @property + def coef(self): + """ + Estimates for the causal parameter(s) after calling :meth:`fit` (shape (``n_gt_atts``,)). + """ + if self._framework is None: + coef = None + else: + coef = self.framework.thetas + return coef + + @property + def all_coef(self): + """ + Estimates of the causal parameter(s) for the ``n_rep`` different sample splits after calling :meth:`fit` + (shape (``n_gt_atts``, ``n_rep``)). + """ + if self._framework is None: + all_coef = None + else: + all_coef = self.framework.all_thetas + return all_coef + + @property + def se(self): + """ + Standard errors for the causal parameter(s) after calling :meth:`fit` (shape (``n_gt_atts``,)). + """ + if self._framework is None: + se = None + else: + se = self.framework.ses + return se + + @property + def all_se(self): + """ + Standard errors of the causal parameter(s) for the ``n_rep`` different sample splits after calling :meth:`fit` + (shape (``n_gt_atts``, ``n_rep``)). + """ + if self._framework is None: + all_se = None + else: + all_se = self.framework.all_ses + return all_se + + @property + def t_stat(self): + """ + t-statistics for the causal parameter(s) after calling :meth:`fit` (shape (``n_gt_atts``,)). 
+ """ + if self._framework is None: + t_stats = None + else: + t_stats = self.framework.t_stats + return t_stats + + @property + def pval(self): + """ + p-values for the causal parameter(s) (shape (``n_gt_atts``,)). + """ + if self._framework is None: + pvals = None + else: + pvals = self.framework.pvals + return pvals + + @property + def boot_t_stat(self): + """ + Bootstrapped t-statistics for the causal parameter(s) after calling :meth:`fit` and :meth:`bootstrap` + (shape (``n_rep_boot``, ``n_gt_atts``, ``n_rep``)). + """ + if self._framework is None: + boot_t_stat = None + else: + boot_t_stat = self._framework.boot_t_stat + return boot_t_stat + + @property + def nuisance_loss(self): + """ + The losses of the nuisance models (root-mean-squared-errors or logloss). + """ + return self._nuisance_loss + + @property + def framework(self): + """ + The corresponding :class:`doubleml.DoubleMLFramework` object. + """ + return self._framework + + @property + def modellist(self): + """ + The list of DoubleMLDIDBinary models. + """ + return self._modellist + + @property + def sensitivity_elements(self): + """ + Values of the sensitivity components after calling :meth:`fit`; + If available (e.g., PLR, IRM) a dictionary with entries ``sigma2``, ``nu2``, ``psi_sigma2``, ``psi_nu2`` + and ``riesz_rep``. + """ + if self._framework is None: + sensitivity_elements = None + else: + sensitivity_elements = self._framework.sensitivity_elements + return sensitivity_elements + + @property + def sensitivity_params(self): + """ + Values of the sensitivity parameters after calling :meth:`sesitivity_analysis`; + If available (e.g., PLR, IRM) a dictionary with entries ``theta``, ``se``, ``ci``, ``rv`` + and ``rva``. + """ + if self._framework is None: + sensitivity_params = None + else: + sensitivity_params = self._framework.sensitivity_params + return sensitivity_params + + @property + def summary(self): + """ + A summary for the estimated causal effect after calling :meth:`fit`. + """ + if self.framework is None: + col_names = ["coef", "std err", "t", "P>|t|"] + df_summary = pd.DataFrame(columns=col_names) + else: + ci = self.confint() + df_summary = generate_summary(self.coef, self.se, self.t_stat, self.pval, ci, self.gt_labels) + return df_summary + + @property + def sensitivity_summary(self): + """ + Returns a summary for the sensitivity analysis after calling :meth:`sensitivity_analysis`. + Returns + ------- + res : str + Summary for the sensitivity analysis. + """ + if self._framework is None: + raise ValueError("Apply sensitivity_analysis() before sensitivity_summary.") + else: + sensitivity_summary = self._framework.sensitivity_summary + return sensitivity_summary + + def fit(self, n_jobs_models=None, n_jobs_cv=None, store_predictions=True, store_models=False, external_predictions=None): + """ + Estimate DoubleMLDIDMulti models. + + Parameters + ---------- + n_jobs_models : None or int + The number of CPUs to use to fit the group-time ATTs. ``None`` means ``1``. + Default is ``None``. + + n_jobs_cv : None or int + The number of CPUs to use to fit the learners. ``None`` means ``1``. + Does not speed up computation for quantile models. + Default is ``None``. + + store_predictions : bool + Indicates whether the predictions for the nuisance functions should be stored in ``predictions``. + Default is ``True``. + + store_models : bool + Indicates whether the fitted models for the nuisance functions should be stored in ``models``. 
This allows + to analyze the fitted models or extract information like variable importance. + Default is ``False``. + + external_predictions : dict or None + A nested dictionary where the keys correspond the the treatment levels and can contain predictions according to + each treatment level. The values have to be dictionaries which can contain keys ``'ml_g0'``, ``'ml_g1'`` + and ``'ml_m'``. + Default is `None`. + + Returns + ------- + self : object + """ + + if external_predictions is not None: + self._check_external_predictions(external_predictions) + ext_pred_dict = self._rename_external_predictions(external_predictions) + else: + ext_pred_dict = None + + # parallel estimation of the models + parallel = Parallel(n_jobs=n_jobs_models, verbose=0, pre_dispatch="2*n_jobs") + fitted_models = parallel( + delayed(self._fit_model)(i_gt, n_jobs_cv, store_predictions, store_models, ext_pred_dict) + for i_gt in range(self.n_gt_atts) + ) + + # combine the estimates and scores + framework_list = [None] * self.n_gt_atts + + for i_gt in range(self.n_gt_atts): + self._modellist[i_gt] = fitted_models[i_gt] + framework_list[i_gt] = self._modellist[i_gt].framework + + # aggregate all frameworks + self._framework = concat(framework_list) + self._framework.treatment_names = self._gt_labels + + # store the nuisance losses + self._nuisance_loss = self._calc_nuisance_loss() + + return self + + def confint(self, joint=False, level=0.95): + """ + Confidence intervals for DoubleML models. + + Parameters + ---------- + joint : bool + Indicates whether joint confidence intervals are computed. + Default is ``False`` + level : float + The confidence level. + Default is ``0.95``. + + Returns + ------- + df_ci : pd.DataFrame + A data frame with the confidence interval(s). + """ + + if self.framework is None: + raise ValueError("Apply fit() before confint().") + + df_ci = self.framework.confint(joint=joint, level=level) + df_ci.set_index(pd.Index(self.gt_labels), inplace=True) + + return df_ci + + def p_adjust(self, method="romano-wolf"): + """ + Multiple testing adjustment for DoubleML models. + + Parameters + ---------- + method : str + A str (``'romano-wolf''``, ``'bonferroni'``, ``'holm'``, etc) specifying the adjustment method. + In addition to ``'romano-wolf''``, all methods implemented in + :py:func:`statsmodels.stats.multitest.multipletests` can be applied. + Default is ``'romano-wolf'``. + + Returns + ------- + p_val : pd.DataFrame + A data frame with adjusted p-values. + """ + + if self.framework is None: + raise ValueError("Apply fit() before p_adjust().") + + p_val, _ = self.framework.p_adjust(method=method) + p_val.set_index(pd.Index(self.gt_labels), inplace=True) + + return p_val + + def bootstrap(self, method="normal", n_rep_boot=500): + """ + Multiplier bootstrap for DoubleML models. + + Parameters + ---------- + method : str + A str (``'Bayes'``, ``'normal'`` or ``'wild'``) specifying the multiplier bootstrap method. + Default is ``'normal'`` + n_rep_boot : int + The number of bootstrap replications. + + Returns + ------- + self : object + """ + if self._framework is None: + raise ValueError("Apply fit() before bootstrap().") + self._framework.bootstrap(method=method, n_rep_boot=n_rep_boot) + + return self + + def sensitivity_analysis(self, cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95, null_hypothesis=0.0): + """ + Performs a sensitivity analysis to account for unobserved confounders. + The evaluated scenario is stored as a dictionary in the property ``sensitivity_params``. 
+
+        Parameters
+        ----------
+        cf_y : float
+            Percentage of the residual variation of the outcome explained by latent/confounding variables.
+            Default is ``0.03``.
+        cf_d : float
+            Percentage gains in the variation of the Riesz representer generated by latent/confounding variables.
+            Default is ``0.03``.
+        rho : float
+            The correlation between the differences in short and long representations in the main regression and
+            Riesz representer. Has to be in [-1,1]. The absolute value determines the adversarial strength of the
+            confounding (maximizes at 1.0).
+            Default is ``1.0``.
+        level : float
+            The confidence level.
+            Default is ``0.95``.
+        null_hypothesis : float or numpy.ndarray
+            Null hypothesis for the effect. Determines the robustness values.
+            If it is a single float, the same null hypothesis is used for all estimated parameters.
+            Otherwise, the array has to be of shape (n_coefs,).
+            Default is ``0.0``.
+
+        Returns
+        -------
+        self : object
+        """
+
+        if self._framework is None:
+            raise ValueError("Apply fit() before sensitivity_analysis().")
+        self._framework.sensitivity_analysis(cf_y=cf_y, cf_d=cf_d, rho=rho, level=level, null_hypothesis=null_hypothesis)
+
+        return self
+
+    def sensitivity_plot(
+        self,
+        idx_treatment=0,
+        value="theta",
+        rho=1.0,
+        level=0.95,
+        null_hypothesis=0.0,
+        include_scenario=True,
+        benchmarks=None,
+        fill=True,
+        grid_bounds=(0.15, 0.15),
+        grid_size=100,
+    ):
+        """
+        Contour plot of the sensitivity with respect to latent/confounding variables.
+
+        Parameters
+        ----------
+        idx_treatment : int
+            Index of the treatment to perform the sensitivity analysis.
+            Default is ``0``.
+        value : str
+            Determines which contours to plot. Valid values are ``'theta'`` (refers to the bounds)
+            and ``'ci'`` (refers to the bounds including statistical uncertainty).
+            Default is ``'theta'``.
+        rho : float
+            The correlation between the differences in short and long representations in the main regression and
+            Riesz representer. Has to be in [-1,1]. The absolute value determines the adversarial strength of the
+            confounding (maximizes at 1.0).
+            Default is ``1.0``.
+        level : float
+            The confidence level.
+            Default is ``0.95``.
+        null_hypothesis : float
+            Null hypothesis for the effect. Determines the direction of the contour lines.
+            Default is ``0.0``.
+        include_scenario : bool
+            Indicates whether to highlight the scenario from the call of :meth:`sensitivity_analysis`.
+            Default is ``True``.
+        benchmarks : dict or None
+            Dictionary of benchmarks to be included in the plot. The keys are ``cf_y``, ``cf_d`` and ``name``.
+            Default is ``None``.
+        fill : bool
+            Indicates whether to use a heatmap style or only contour lines.
+            Default is ``True``.
+        grid_bounds : tuple
+            Determines the evaluation bounds of the grid for ``cf_d`` and ``cf_y``. Has to contain two floats in [0, 1).
+            Default is ``(0.15, 0.15)``.
+        grid_size : int
+            Determines the number of evaluation points of the grid.
+            Default is ``100``.
+
+        Returns
+        -------
+        fig : object
+            Plotly figure of the sensitivity contours.
+        """
+        if self._framework is None:
+            raise ValueError("Apply fit() before sensitivity_plot().")
+        fig = self._framework.sensitivity_plot(
+            idx_treatment=idx_treatment,
+            value=value,
+            rho=rho,
+            level=level,
+            null_hypothesis=null_hypothesis,
+            include_scenario=include_scenario,
+            benchmarks=benchmarks,
+            fill=fill,
+            grid_bounds=grid_bounds,
+            grid_size=grid_size,
+        )
+
+        return fig
+
+    def sensitivity_benchmark(self, benchmarking_set, fit_args=None):
+        """
+        Computes a benchmark for a given set of features.
+        Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates.
+
+        Parameters
+        ----------
+        benchmarking_set : list
+            List of feature names used for benchmarking. Has to be a non-empty subset of the covariates ``x_cols``.
+        fit_args : dict or None
+            Additional keyword arguments passed to :meth:`fit` when refitting the short model.
+            Default is ``None``.
+
+        Returns
+        -------
+        benchmark_results : pandas.DataFrame
+            Benchmark results.
+        """
+        x_list_long = self._dml_data.x_cols
+
+        # input checks
+        if self.sensitivity_elements is None:
+            raise NotImplementedError(f"Sensitivity analysis not yet implemented for {self.__class__.__name__}.")
+        if not isinstance(benchmarking_set, list):
+            raise TypeError(
+                "benchmarking_set must be a list. " f"{str(benchmarking_set)} of type {type(benchmarking_set)} was passed."
+            )
+        if len(benchmarking_set) == 0:
+            raise ValueError("benchmarking_set must not be empty.")
+        if not set(benchmarking_set) <= set(x_list_long):
+            raise ValueError(
+                f"benchmarking_set must be a subset of features {str(self._dml_data.x_cols)}. "
+                f"{str(benchmarking_set)} was passed."
+            )
+        if fit_args is not None and not isinstance(fit_args, dict):
+            raise TypeError("fit_args must be a dict. " f"{str(fit_args)} of type {type(fit_args)} was passed.")
+
+        # refit short form of the model
+        x_list_short = [x for x in x_list_long if x not in benchmarking_set]
+        dml_short = copy.deepcopy(self)
+        dml_short._dml_data.x_cols = x_list_short
+        if fit_args is not None:
+            dml_short.fit(**fit_args)
+        else:
+            dml_short.fit()
+
+        benchmark_dict = gain_statistics(dml_long=self, dml_short=dml_short)
+        df_benchmark = pd.DataFrame(benchmark_dict, index=self.gt_labels)
+        return df_benchmark
+
+    def aggregate(self, aggregation="group"):
+        """
+        Aggregates treatment effects.
+
+        Parameters
+        ----------
+        aggregation : str or dict
+            Method to aggregate treatment effects or dictionary with aggregation weights (masked numpy array).
+            Has to be one of ``'group'``, ``'time'``, ``'eventstudy'`` or a masked numpy array.
+            Default is ``'group'``.
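+            If a dictionary is passed, it is expected to provide the aggregation weights directly;
+            see the helper ``_get_agg_weights`` and the checks in ``_check_did_aggregation_dict``
+            (such a dictionary carries entries like ``'weight_masks'``, ``'method'``, ``'agg_names'``
+            and ``'agg_weights'``).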
+
+        Returns
+        -------
+        DoubleMLDIDAggregation
+            Aggregated treatment effects.
+
+        """
+        if self.framework is None:
+            raise ValueError("Apply fit() before aggregate().")
+
+        # select all non-masked values
+        selected_gt_mask = ~self.gt_index.mask
+
+        # get aggregation weights
+        aggregation_dict = self._get_agg_weights(selected_gt_mask, aggregation)
+        aggregation_dict = _check_did_aggregation_dict(aggregation_dict, self.gt_index)
+        # set elements for readability
+        weight_masks = aggregation_dict["weight_masks"]
+
+        # ordered frameworks
+        all_frameworks = [self.modellist[idx].framework for idx in self.gt_index.compressed()]
+        # ordered weights
+        n_aggregations = weight_masks.shape[-1]
+        weight_list = [weight_masks[..., idx_agg].compressed() for idx_agg in range(n_aggregations)]
+        all_agg_weights = np.stack(weight_list, axis=0)
+
+        additional_info = {
+            "Score function": self.score,
+            "Control group": self.control_group,
+            "Anticipation periods": self.anticipation_periods,
+        }
+
+        additional_params = {
+            "gt_combinations": self.gt_combinations,
+            "gt_index": self.gt_index,
+            "weight_masks": weight_masks,
+        }
+
+        # set plotting colors for eventstudy
+        if aggregation_dict["method"] == "Event Study":
+            additional_params["aggregation_color_idx"] = [0 if "-" in name else 1 for name in aggregation_dict["agg_names"]]
+        else:
+            additional_params["aggregation_color_idx"] = [1] * n_aggregations
+
+        aggregation_args = {
+            "frameworks": all_frameworks,
+            "aggregation_weights": all_agg_weights,
+            "overall_aggregation_weights": aggregation_dict.get("agg_weights", None),
+            "aggregation_names": aggregation_dict.get("agg_names", None),
+            "aggregation_method_name": aggregation_dict["method"],
+            "additional_information": additional_info,
+            "additional_parameters": additional_params,
+        }
+
+        agg_obj = DoubleMLDIDAggregation(**aggregation_args)
+        return agg_obj
+
+    def plot_effects(
+        self,
+        level=0.95,
+        joint=True,
+        figsize=(12, 8),
+        color_palette="colorblind",
+        date_format=None,
+        y_label="Effect",
+        title="Estimated ATTs by Group",
+        jitter_value=None,
+        default_jitter=0.1,
+    ):
+        """
+        Plots coefficient estimates with confidence intervals over time, grouped by first treated period.
+
+        Parameters
+        ----------
+        level : float
+            The confidence level for the intervals.
+            Default is ``0.95``.
+        joint : bool
+            Indicates whether joint confidence intervals are computed.
+            Default is ``True``.
+        figsize : tuple
+            Figure size as (width, height).
+            Default is ``(12, 8)``.
+        color_palette : str
+            Name of seaborn color palette to use for distinguishing pre and post treatment effects.
+            Default is ``"colorblind"``.
+        date_format : str or None
+            Format string for date ticks if x-axis contains datetime values.
+            Default is ``None``.
+        y_label : str
+            Label for y-axis.
+            Default is ``"Effect"``.
+        title : str
+            Title for the entire plot.
+            Default is ``"Estimated ATTs by Group"``.
+        jitter_value : float or None
+            Amount of jitter to apply to points. If ``None``, the jitter is derived from the
+            spacing of the time periods and ``default_jitter``.
+            Default is ``None``.
+        default_jitter : float
+            Default amount of jitter to apply to points.
+            Default is ``0.1``.
+
+        Returns
+        -------
+        fig : matplotlib.figure.Figure
+            The created figure object.
+        axes : list
+            List of matplotlib axis objects for further customization.
+
+        Notes
+        -----
+        If ``joint=True`` and bootstrapping hasn't been performed, this method will automatically
+        perform bootstrapping with default parameters and issue a warning.
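+
+        Examples
+        --------
+        A minimal sketch of the intended workflow (the data generation and learners follow this
+        PR's tests; column names and the constructor call are assumptions, and estimates depend
+        on the simulated data):
+
+        >>> import doubleml as dml  # doctest: +SKIP
+        >>> from sklearn.linear_model import LinearRegression, LogisticRegression  # doctest: +SKIP
+        >>> df = dml.did.datasets.make_did_CS2021(n_obs=500, dgp_type=1, time_type="float")  # doctest: +SKIP
+        >>> dml_data = dml.data.DoubleMLPanelData(
+        ...     df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]
+        ... )  # doctest: +SKIP
+        >>> dml_obj = dml.did.DoubleMLDIDMulti(dml_data, ml_g=LinearRegression(), ml_m=LogisticRegression())  # doctest: +SKIP
+        >>> fig, axes = dml_obj.fit().plot_effects()  # doctest: +SKIP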
+ """ + if self.framework is None: + raise ValueError("Apply fit() before plot_effects().") + df = self._create_ci_dataframe(level=level, joint=joint) + + # Sort time periods and treatment groups + first_treated_periods = sorted(df["First Treated"].unique()) + n_periods = len(first_treated_periods) + + # Set up colors + colors = dict(zip(["pre", "post"], sns.color_palette(color_palette)[:2])) + + # Check if x-axis is datetime or convert to float + is_datetime = pd.api.types.is_datetime64_any_dtype(df["Evaluation Period"]) + if pd.api.types.is_integer_dtype(df["Evaluation Period"]): + df["Evaluation Period"] = df["Evaluation Period"].astype(float) + + # Create figure and subplots + fig = plt.figure(figsize=figsize) + gs = fig.add_gridspec(n_periods + 1, 1, height_ratios=[3] * n_periods + [0.5]) + axes = [fig.add_subplot(gs[i]) for i in range(n_periods)] + + # Auto-calculate jitter if not specified + if jitter_value is None: + all_values = self.t_values + if is_datetime: + jitter_value = (all_values[1] - all_values[0]).astype("timedelta64[s]").astype(int) * default_jitter + else: + jitter_value = (all_values[1] - all_values[0]) * default_jitter + + # Plot each treatment group + for idx, period in enumerate(first_treated_periods): + period_df = df[df["First Treated"] == period] + ax = axes[idx] + + self._plot_single_group(ax, period_df, period, colors, is_datetime, jitter_value) + + # Set axis labels + if idx == n_periods - 1: # Only bottom plot gets x label + ax.set_xlabel("Evaluation Period") + ax.set_ylabel(y_label) + + # Format date ticks if needed + if is_datetime and date_format: + ax.xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter(date_format)) + plt.setp(ax.xaxis.get_majorticklabels()) + + # Add legend + legend_ax = fig.add_subplot(gs[-1]) + legend_ax.axis("off") + legend_elements = [ + Line2D([0], [0], color="red", linestyle=":", alpha=0.7, label="Treatment start"), + Line2D([0], [0], color="black", linestyle="--", alpha=0.5, label="Zero effect"), + Line2D([0], [0], marker="o", color=colors["pre"], linestyle="None", label="Pre-treatment", markersize=5), + Line2D([0], [0], marker="o", color=colors["post"], linestyle="None", label="Post-treatment", markersize=5), + ] + legend_ax.legend(handles=legend_elements, loc="center", ncol=4, mode="expand", borderaxespad=0.0) + + # Set title and layout + plt.suptitle(title, y=1.02) + plt.tight_layout() + + return fig, axes + + def _plot_single_group(self, ax, period_df, period, colors, is_datetime, jitter_value): + """ + Plot estimates for a single treatment group on the given axis. + + Parameters + ---------- + ax : matplotlib.axes.Axes + Matplotlib axis to plot on. + period_df : pandas.DataFrame + DataFrame containing estimates for a specific time period. + period : int or datetime + Treatment period for this group. + colors : dict + Dictionary with 'pre' and 'post' color values. + is_datetime : bool + Whether the x-axis represents datetime values. + jitter_value : float + Amount of jitter to apply to points. + Default is ``None``. + + Returns + ------- + matplotlib.axes.Axes + The updated axis object. 
+ """ + + # Plot reference lines + ax.axvline(x=period, color="red", linestyle=":", alpha=0.7) + ax.axhline(y=0, color="black", linestyle="--", alpha=0.5) + + # Split and jitter data + pre_treatment = add_jitter( + period_df[period_df["Pre-Treatment"]], + "Evaluation Period", + is_datetime=is_datetime, + jitter_value=jitter_value, + ) + post_treatment = add_jitter( + period_df[~period_df["Pre-Treatment"]], + "Evaluation Period", + is_datetime=is_datetime, + jitter_value=jitter_value, + ) + + # Plot pre-treatment points + if not pre_treatment.empty: + ax.scatter(pre_treatment["jittered_x"], pre_treatment["Estimate"], color=colors["pre"], alpha=0.8, s=30) + ax.errorbar( + pre_treatment["jittered_x"], + pre_treatment["Estimate"], + yerr=[ + pre_treatment["Estimate"] - pre_treatment["CI Lower"], + pre_treatment["CI Upper"] - pre_treatment["Estimate"], + ], + fmt="o", + capsize=3, + color=colors["pre"], + markersize=4, + markeredgewidth=1, + linewidth=1, + ) + + # Plot post-treatment points + if not post_treatment.empty: + ax.scatter(post_treatment["jittered_x"], post_treatment["Estimate"], color=colors["post"], alpha=0.8, s=30) + ax.errorbar( + post_treatment["jittered_x"], + post_treatment["Estimate"], + yerr=[ + post_treatment["Estimate"] - post_treatment["CI Lower"], + post_treatment["CI Upper"] - post_treatment["Estimate"], + ], + fmt="o", + capsize=3, + color=colors["post"], + markersize=4, + markeredgewidth=1, + linewidth=1, + ) + + # Format axes + if is_datetime: + period_str = np.datetime64(period, self._dml_data.datetime_unit) + else: + period_str = period + ax.set_title(f"First Treated: {period_str}") + ax.grid(True, alpha=0.3) + + return ax + + def _get_agg_weights(self, selected_gt_mask, aggregation): + """ + Calculate weights for aggregating treatment effects. + + Parameters + ---------- + selected_gt_mask : numpy.ndarray + Boolean mask indicating which group-time combinations to include + aggregation : str or dict + Method to aggregate treatment effects + + Returns + ------- + tuple + (weight_masks, agg_names, agg_weights) + """ + + if isinstance(aggregation, dict): + aggregation_dict = aggregation + + elif isinstance(aggregation, str): + valid_aggregations = ["group", "time", "eventstudy"] + if aggregation not in valid_aggregations: + raise ValueError(f"aggregation must be one of {valid_aggregations}. " f"{str(aggregation)} was passed.") + + if aggregation == "group": + # exclude pre-treatment combinations + selected_gt_mask = selected_gt_mask & self._post_treatment_mask + aggregation_dict = _compute_did_group_aggregation_weights( + gt_index=self.gt_index, + g_values=self.g_values, + d_values=self._dml_data.d, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Group" + elif aggregation == "time": + # exclude pre-treatment combinations + selected_gt_mask = selected_gt_mask & self._post_treatment_mask + aggregation_dict = _compute_did_time_aggregation_weights( + gt_index=self.gt_index, + g_values=self.g_values, + t_values=self.t_values, + d_values=self._dml_data.d, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Time" + elif aggregation == "eventstudy": + aggregation_dict = _compute_did_eventstudy_aggregation_weights( + gt_index=self.gt_index, + g_values=self.g_values, + t_values=self.t_values, + d_values=self._dml_data.d, + time_values=self._dml_data.t, + selected_gt_mask=selected_gt_mask, + ) + aggregation_dict["method"] = "Event Study" + else: + raise TypeError( + "aggregation must be a string or dictionary. 
" f"{str(aggregation)} of type {type(aggregation)} was passed." + ) + + return aggregation_dict + + def _fit_model(self, i_gt, n_jobs_cv=None, store_predictions=True, store_models=False, external_predictions_dict=None): + + model = self.modellist[i_gt] + if external_predictions_dict is not None: + external_predictions = external_predictions_dict[self.gt_labels[i_gt]] + else: + external_predictions = None + model.fit( + n_jobs_cv=n_jobs_cv, + store_predictions=store_predictions, + store_models=store_models, + external_predictions=external_predictions, + ) + return model + + def _check_data(self, obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLPanelData): + raise TypeError( + "The data has to be a DoubleMLPanelData object. " + f"{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." + ) + if obj_dml_data.z_cols is not None: + raise NotImplementedError( + "Incompatible data. " + " and ".join(obj_dml_data.z_cols) + " have been set as instrumental variable(s). " + "At the moment there are not DiD models with instruments implemented." + ) + _check_gt_values(obj_dml_data.g_values, obj_dml_data.t_values) + return + + def _validate_gt_combinations(self, gt_combinations): + """Validate all treatment-time combinations.""" + + if isinstance(gt_combinations, str): + gt_combinations = _construct_gt_combinations( + gt_combinations, self.g_values, self.t_values, self.never_treated_value, self.anticipation_periods + ) + + if not isinstance(gt_combinations, list): + raise TypeError( + "gt_combinations must be a list. " + f"{str(gt_combinations)} of type {type(gt_combinations)} was passed." + ) + + if len(gt_combinations) == 0: + raise ValueError("gt_combinations must not be empty.") + + if not all(isinstance(gt_combination, tuple) for gt_combination in gt_combinations): + raise TypeError("gt_combinations must be a list of tuples. At least one element is not a tuple.") + + if not all(len(gt_combination) == 3 for gt_combination in gt_combinations): + raise ValueError( + "gt_combinations must be a list of tuples with 3 elements. At least one tuple has not 3 elements." + ) + + for gt_combination in gt_combinations: + _check_gt_combination( + gt_combination, self.g_values, self.t_values, self.never_treated_value, self.anticipation_periods + ) + + return gt_combinations + + def _check_external_predictions(self, external_predictions): + expected_keys = self.gt_labels + if not isinstance(external_predictions, dict): + raise TypeError( + "external_predictions must be a dictionary. " + f"Object of type {type(external_predictions)} passed." + ) + + if not set(external_predictions.keys()).issubset(set(expected_keys)): + raise ValueError( + "external_predictions must be a subset of all gt_combinations. " + + f"Expected keys: {set(expected_keys)}. " + + f"Passed keys: {set(external_predictions.keys())}." + ) + + expected_learner_keys = ["ml_g0", "ml_g1", "ml_m"] + for key, value in external_predictions.items(): + if not isinstance(value, dict): + raise TypeError( + f"external_predictions[{key}] must be a dictionary. " + f"Object of type {type(value)} passed." + ) + if not set(value.keys()).issubset(set(expected_learner_keys)): + raise ValueError( + f"external_predictions[{key}] must be a subset of {set(expected_learner_keys)}. " + + f"Passed keys: {set(value.keys())}." 
+                )
+
+        return
+
+    def _rename_external_predictions(self, external_predictions):
+        d_col = self._dml_data.d_cols[0]
+        ext_pred_dict = {gt_combination: {d_col: {}} for gt_combination in self.gt_labels}
+        for gt_combination in self.gt_labels:
+            if "ml_g0" in external_predictions[gt_combination]:
+                ext_pred_dict[gt_combination][d_col]["ml_g0"] = external_predictions[gt_combination]["ml_g0"]
+            if "ml_g1" in external_predictions[gt_combination]:
+                ext_pred_dict[gt_combination][d_col]["ml_g1"] = external_predictions[gt_combination]["ml_g1"]
+            if "ml_m" in external_predictions[gt_combination]:
+                ext_pred_dict[gt_combination][d_col]["ml_m"] = external_predictions[gt_combination]["ml_m"]
+
+        return ext_pred_dict
+
+    def _calc_nuisance_loss(self):
+        nuisance_loss = {learner: np.full((self.n_rep, self.n_gt_atts), np.nan) for learner in self.modellist[0].params_names}
+        for i_model, model in enumerate(self.modellist):
+            for learner in self.modellist[0].params_names:
+                for i_rep in range(self.n_rep):
+                    nuisance_loss[learner][i_rep, i_model] = model.nuisance_loss[learner][i_rep].item()
+
+        return nuisance_loss
+
+    def _initialize_models(self):
+        modellist = [None] * self.n_gt_atts
+        kwargs = {
+            "obj_dml_data": self._dml_data,
+            "ml_g": self._learner["ml_g"],
+            "ml_m": self._learner["ml_m"],
+            "control_group": self.control_group,
+            "anticipation_periods": self.anticipation_periods,
+            "score": self.score,
+            "n_folds": self.n_folds,
+            "n_rep": self.n_rep,
+            "trimming_rule": self.trimming_rule,
+            "trimming_threshold": self.trimming_threshold,
+            "in_sample_normalization": self.in_sample_normalization,
+            "draw_sample_splitting": True,
+            "print_periods": self._print_periods,
+        }
+        for i_model, (g_value, t_value_pre, t_value_eval) in enumerate(self.gt_combinations):
+            # initialize models for all levels
+            model = DoubleMLDIDBinary(g_value=g_value, t_value_pre=t_value_pre, t_value_eval=t_value_eval, **kwargs)
+
+            modellist[i_model] = model
+
+        return modellist
+
+    def _create_ci_dataframe(self, level=0.95, joint=True):
+        """
+        Create a DataFrame with coefficient estimates and confidence intervals for treatment effects.
+
+        Parameters
+        ----------
+        level : float, default=0.95
+            Confidence level for intervals (between 0 and 1).
+        joint : bool, default=True
+            Whether to use joint confidence intervals. If True and bootstrapping hasn't been
+            performed yet, will automatically call bootstrap() with default parameters.
+
+        Returns
+        -------
+        pandas.DataFrame
+            DataFrame containing:
+            - 'First Treated': First treatment time for each group
+            - 'Pre-treatment Period': Pre-treatment time period
+            - 'Evaluation Period': Evaluation time period
+            - 'Estimate': Treatment effect estimates
+            - 'CI Lower': Lower bound of confidence intervals
+            - 'CI Upper': Upper bound of confidence intervals
+            - 'Pre-Treatment': Boolean indicating if evaluation period is before treatment
+
+        Notes
+        -----
+        If joint=True and bootstrapping hasn't been performed, this method will automatically
+        perform bootstrapping with default parameters and issue a warning.
+        """
+
+        if joint and self.framework.boot_t_stat is None:
+            self.bootstrap()
+            warnings.warn(
+                "Joint confidence intervals require bootstrapping which hasn't been performed yet. "
+                "Automatically applying '.bootstrap(method=\"normal\", n_rep_boot=500)' with default values. "
+                "For different bootstrap settings, call bootstrap() explicitly before plotting.",
+                UserWarning,
+            )
+
+        ci = self.confint(level=level, joint=joint)
+        df = pd.DataFrame(
+            {
+                "First Treated": [gt_combination[0] for gt_combination in self.gt_combinations],
+                "Pre-treatment Period": [gt_combination[1] for gt_combination in self.gt_combinations],
+                "Evaluation Period": [gt_combination[2] for gt_combination in self.gt_combinations],
+                "Estimate": self.framework.thetas,
+                "CI Lower": ci.iloc[:, 0],
+                "CI Upper": ci.iloc[:, 1],
+                "Pre-Treatment": [gt_combination[2] < gt_combination[0] for gt_combination in self.gt_combinations],
+            }
+        )
+
+        return df
diff --git a/doubleml/did/tests/_utils_did_manual.py b/doubleml/did/tests/_utils_did_manual.py
index e48c9042..e314c301 100644
--- a/doubleml/did/tests/_utils_did_manual.py
+++ b/doubleml/did/tests/_utils_did_manual.py
@@ -105,7 +105,7 @@ def fit_nuisance_did(
 
     p_hat_list = []
     for train_index, _ in smpls:
-        p_hat_list.append(np.mean(d[train_index]))
+        p_hat_list.append(np.mean(d))
 
     return g_hat0_list, g_hat1_list, m_hat_list, p_hat_list
diff --git a/doubleml/did/tests/conftest.py b/doubleml/did/tests/conftest.py
index 90e8394c..de528156 100644
--- a/doubleml/did/tests/conftest.py
+++ b/doubleml/did/tests/conftest.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pytest
 
-from doubleml.datasets import make_did_SZ2020
+from doubleml.did.datasets import make_did_SZ2020
 
 
 @pytest.fixture(scope="session", params=[(500, 1), (1000, 1), (1000, 2)])
@@ -30,3 +30,17 @@ def generate_data_did_cs(request):
     data = make_did_SZ2020(n, dgp_type=dpg, cross_sectional_data=True, return_type="array")
 
     return data
+
+
+@pytest.fixture(scope="session", params=[(500, 1), (1000, 1), (1000, 2)])
+def generate_data_did_binary(request):
+    params = request.param
+    np.random.seed(1111)
+    # setting parameters
+    n = params[0]
+    dgp = params[1]
+
+    # generating data
+    data = make_did_SZ2020(n, dgp_type=dgp, return_type="DoubleMLPanelData")
+
+    return data
diff --git a/doubleml/did/tests/test_datasets.py b/doubleml/did/tests/test_datasets.py
new file mode 100644
index 00000000..0e323ec9
--- /dev/null
+++ b/doubleml/did/tests/test_datasets.py
@@ -0,0 +1,79 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from doubleml import DoubleMLData
+from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020
+
+msg_inv_return_type = "Invalid return_type."
+ + +@pytest.fixture(scope="function", params=[False, True]) +def cross_sectional(request): + return request.param + + +@pytest.fixture(scope="function", params=[1, 2, 3, 4, 5, 6]) +def dgp_type(request): + return request.param + + +@pytest.mark.ci +def test_make_did_SZ2020_return_types(cross_sectional, dgp_type): + np.random.seed(3141) + res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=DoubleMLData) + assert isinstance(res, DoubleMLData) + res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=pd.DataFrame) + assert isinstance(res, pd.DataFrame) + if cross_sectional: + x, y, d, t = make_did_SZ2020( + n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=np.ndarray + ) + assert isinstance(t, np.ndarray) + else: + x, y, d, _ = make_did_SZ2020( + n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=np.ndarray + ) + assert isinstance(x, np.ndarray) + assert isinstance(y, np.ndarray) + assert isinstance(d, np.ndarray) + with pytest.raises(ValueError, match=msg_inv_return_type): + _ = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type="matrix") + msg = "The dgp_type is not valid." + with pytest.raises(ValueError, match=msg): + _ = make_did_SZ2020(n_obs=100, dgp_type="5", cross_sectional_data=cross_sectional, return_type="matrix") + + +@pytest.fixture(scope="function", params=[True, False]) +def include_never_treated(request): + return request.param + + +@pytest.fixture(scope="function", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="function", params=[0, 2]) +def anticipation_periods(request): + return request.param + + +@pytest.mark.ci +def test_make_did_CS2021_return_types(dgp_type, include_never_treated, time_type, anticipation_periods): + np.random.seed(3141) + df = make_did_CS2021( + n_obs=100, + dgp_type=dgp_type, + include_never_treated=include_never_treated, + time_type=time_type, + anticipation_periods=anticipation_periods, + ) + assert isinstance(df, pd.DataFrame) + + +@pytest.mark.ci +def test_make_did_CS2021_exceptions(): + msg = r"time_type must be one of \('datetime', 'float'\). Got 2." 
+    with pytest.raises(ValueError, match=msg):
+        _ = make_did_CS2021(n_obs=100, time_type=2)
diff --git a/doubleml/did/tests/test_did_aggregation.py b/doubleml/did/tests/test_did_aggregation.py
new file mode 100644
index 00000000..cc3c4304
--- /dev/null
+++ b/doubleml/did/tests/test_did_aggregation.py
@@ -0,0 +1,98 @@
+import numpy as np
+import pytest
+
+from doubleml.did.did_aggregation import DoubleMLDIDAggregation
+from doubleml.double_ml_framework import DoubleMLFramework
+from doubleml.tests._utils import generate_dml_dict
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[2, 5])
+def n_base_frameworks(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def base_framework(n_rep):
+    # Create a consistent framework for all tests
+    n_obs = 10
+    n_thetas = 1
+
+    # Generate consistent scores with known effect
+    np.random.seed(42)
+    psi_a = np.ones(shape=(n_obs, n_thetas, n_rep))
+    psi_b = np.random.normal(size=(n_obs, n_thetas, n_rep))
+
+    doubleml_dict = generate_dml_dict(psi_a, psi_b)
+    return DoubleMLFramework(doubleml_dict)
+
+
+@pytest.fixture(scope="module", params=["ones", "random", "zeros", "mixed"])
+def weight_type(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1, 4, 5])
+def n_aggregations(request):
+    return request.param
+
+
+@pytest.fixture
+def weights(n_aggregations, n_base_frameworks, weight_type):
+    np.random.seed(42)
+
+    if weight_type == "ones":
+        aggregation_weights = np.ones(shape=(n_aggregations, n_base_frameworks))
+        overall_aggregation_weights = np.ones(shape=n_aggregations)
+    elif weight_type == "random":
+        aggregation_weights = np.random.rand(n_aggregations, n_base_frameworks)
+        overall_aggregation_weights = np.random.rand(n_aggregations)
+    elif weight_type == "zeros":
+        aggregation_weights = np.zeros(shape=(n_aggregations, n_base_frameworks))
+        overall_aggregation_weights = np.zeros(shape=n_aggregations)
+    else:  # mixed
+        aggregation_weights = np.ones(shape=(n_aggregations, n_base_frameworks))
+        aggregation_weights[::2] = 0.5  # Set every other row to 0.5
+        overall_aggregation_weights = np.ones(shape=n_aggregations)
+        overall_aggregation_weights[::2] = 0.5
+
+    return aggregation_weights, overall_aggregation_weights
+
+
+@pytest.mark.ci
+def test_multiple_equal_frameworks(base_framework, weights):
+    """Test that aggregating the same framework with different weights works correctly."""
+    agg_weights, overall_agg_weights = weights
+
+    n_aggregations = agg_weights.shape[0]
+    n_frameworks = agg_weights.shape[1]
+    # Create list of identical frameworks
+    frameworks = [base_framework] * n_frameworks
+
+    # Create aggregation
+    aggregation = DoubleMLDIDAggregation(
+        frameworks=frameworks, aggregation_weights=agg_weights, overall_aggregation_weights=overall_agg_weights
+    )
+
+    # Expected results
+    scaled_frameworks = [None] * n_aggregations
+    for i_agg in range(n_aggregations):
+        scaled_frameworks[i_agg] = sum(agg_weights[i_agg]) * base_framework
+
+        # Check individual aggregation results
+        np.testing.assert_allclose(aggregation.aggregated_frameworks.all_thetas[i_agg], scaled_frameworks[i_agg].all_thetas[0])
+        np.testing.assert_allclose(
+            aggregation.aggregated_frameworks.scaled_psi[:, i_agg, :], scaled_frameworks[i_agg].scaled_psi[:, 0, :]
+        )
+        # ses might differ due to 1/n and 1/n-1 scaling
+
+    # Check overall aggregation results
+    overall_weights = sum([overall_agg_weights[i] * sum(agg_weights[i]) for i in range(n_aggregations)])
+    overall_scaled_framework = overall_weights * base_framework
+
+    np.testing.assert_allclose(aggregation.overall_aggregated_framework.all_thetas, overall_scaled_framework.all_thetas)
+    np.testing.assert_allclose(aggregation.overall_aggregated_framework.scaled_psi, overall_scaled_framework.scaled_psi)
diff --git a/doubleml/did/tests/test_did_aggregation_exceptions.py b/doubleml/did/tests/test_did_aggregation_exceptions.py
new file mode 100644
index 00000000..0f895b5b
--- /dev/null
+++ b/doubleml/did/tests/test_did_aggregation_exceptions.py
@@ -0,0 +1,190 @@
+import numpy as np
+import pytest
+
+from doubleml.did.did_aggregation import DoubleMLDIDAggregation
+from doubleml.double_ml_framework import DoubleMLFramework
+from doubleml.tests._utils import generate_dml_dict
+
+
+@pytest.fixture(scope="module", params=[1, 3])
+def n_rep(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[1])
+def n_thetas(request):
+    return request.param
+
+
+@pytest.fixture
+def mock_framework(n_rep, n_thetas):
+    # Create a minimal mock of DoubleMLFramework
+    n_obs = 10
+    # generate score samples
+    psi_a = np.ones(shape=(n_obs, n_thetas, n_rep))
+    psi_b = np.random.normal(size=(n_obs, n_thetas, n_rep))
+    doubleml_dict = generate_dml_dict(psi_a, psi_b)
+    return DoubleMLFramework(doubleml_dict)
+
+
+@pytest.fixture
+def frameworks(mock_framework):
+    # Create a list of 3 frameworks
+    return [mock_framework] * 3
+
+
+@pytest.fixture
+def aggregation_weights():
+    # Create sample weights for 2 aggregations over 3 frameworks
+    return np.array([[0.5, 0.3, 0.2], [0.2, 0.5, 0.3]])
+
+
+@pytest.mark.ci
+def test_valid_initialization(frameworks, aggregation_weights):
+    # Test initialization with valid parameters
+    aggregation = DoubleMLDIDAggregation(
+        frameworks=frameworks,
+        aggregation_weights=aggregation_weights,
+        overall_aggregation_weights=np.array([0.6, 0.4]),
+        aggregation_names=["agg1", "agg2"],
+        aggregation_method_name="custom",
+        additional_information={"key": "value"},
+    )
+    assert isinstance(aggregation.base_frameworks, list)
+    assert isinstance(aggregation.aggregation_weights, np.ndarray)
+    assert isinstance(aggregation.additional_information, str)
+
+
+@pytest.mark.ci
+def test_invalid_frameworks(aggregation_weights):
+    # Test with invalid frameworks type
+    with pytest.raises(TypeError, match="The 'frameworks' must be a list of DoubleMLFramework objects"):
+        DoubleMLDIDAggregation(frameworks="invalid_frameworks", aggregation_weights=aggregation_weights)
+
+
+@pytest.mark.ci
+def test_invalid_framework_dim():
+    psi_a = np.ones(shape=(10, 2, 1))
+    psi_b = np.random.normal(size=(10, 2, 1))
+    doubleml_dict = generate_dml_dict(psi_a, psi_b)
+    framework = DoubleMLFramework(doubleml_dict)
+
+    # Test with invalid framework dimension
+    with pytest.raises(ValueError, match="All frameworks must be one-dimensional"):
+        DoubleMLDIDAggregation(frameworks=[framework, framework], aggregation_weights=np.array([[0.5, 0.5], [0.3, 0.7]]))
+
+
+@pytest.mark.ci
+def test_invalid_aggregation_weights(frameworks):
+    # Test with invalid aggregation_weights type
+    with pytest.raises(TypeError, match="'aggregation_weights' must be a numpy array"):
+        DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=[1, 2, 3])  # list instead of numpy array
+
+
+@pytest.mark.ci
+def test_invalid_aggregation_weights_ndim(frameworks):
+    # Test with 1D array instead of 2D
+    with pytest.raises(ValueError, match="'aggregation_weights' must be a 2-dimensional array"):
+        DoubleMLDIDAggregation(frameworks=frameworks,
aggregation_weights=np.array([0.5, 0.3, 0.2])) + + +@pytest.mark.ci +def test_invalid_aggregation_weights_shape(frameworks): + # Test with wrong number of columns + with pytest.raises( + ValueError, match="The number of rows in 'aggregation_weights' must be equal to the number of frameworks" + ): + DoubleMLDIDAggregation( + frameworks=frameworks, aggregation_weights=np.array([[0.5, 0.5], [0.3, 0.7]]) # Only 2 columns for 3 frameworks + ) + + +@pytest.mark.ci +def test_invalid_overall_aggregation_weights(frameworks, aggregation_weights): + # Test with invalid overall_aggregation_weights type + with pytest.raises(TypeError, match="'overall_aggregation_weights' must be a numpy array"): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + overall_aggregation_weights=[0.5, 0.5], # list instead of numpy array + ) + + +@pytest.mark.ci +def test_invalid_overall_weights_ndim(frameworks, aggregation_weights): + # Test with 2D array instead of 1D + with pytest.raises(ValueError, match="'overall_aggregation_weights' must be a 1-dimensional array"): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + overall_aggregation_weights=np.array([[0.5], [0.5]]), + ) + + +@pytest.mark.ci +def test_invalid_overall_weights_length(frameworks, aggregation_weights): + # Test with wrong length + with pytest.raises( + ValueError, match="'overall_aggregation_weights' must have the same length as the number of aggregated frameworks" + ): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + overall_aggregation_weights=np.array([0.5, 0.3, 0.2]), # 3 weights for 2 aggregations + ) + + +@pytest.mark.ci +def test_invalid_aggregation_names_type(frameworks, aggregation_weights): + # Test with non-list type + with pytest.raises(TypeError, match="'aggregation_names' must be a list of strings"): + DoubleMLDIDAggregation( + frameworks=frameworks, aggregation_weights=aggregation_weights, aggregation_names="invalid_names" + ) + + +@pytest.mark.ci +def test_invalid_aggregation_names_content(frameworks, aggregation_weights): + # Test with non-string elements + with pytest.raises(TypeError, match="'aggregation_names' must be a list of strings"): + DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights, aggregation_names=[1, 2]) + + +@pytest.mark.ci +def test_invalid_aggregation_names_length(frameworks, aggregation_weights): + # Test with wrong length + with pytest.raises(ValueError, match="'aggregation_names' must have the same length as the number of aggregations"): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + aggregation_names=["agg1"], # Only 1 name for 2 aggregations + ) + + +@pytest.mark.ci +def test_invalid_method_name_type(frameworks, aggregation_weights): + # Test with non-string type + with pytest.raises(TypeError, match="'aggregation_method_name' must be a string"): + DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights, aggregation_method_name=123) + + +@pytest.mark.ci +def test_invalid_additional_information(frameworks, aggregation_weights): + # Test with invalid additional_information type + with pytest.raises(TypeError, match="'additional_information' must be a dictionary"): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + additional_information=[1, 2, 3], # list instead of dict + ) + + +@pytest.mark.ci +def 
test_additional_parameters(frameworks, aggregation_weights): + # Test with invalid additional_parameters type + with pytest.raises(TypeError, match="'additional_parameters' must be a dictionary"): + DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + additional_parameters=[1, 2, 3], # list instead of dict + ) diff --git a/doubleml/did/tests/test_did_aggregation_plot.py b/doubleml/did/tests/test_did_aggregation_plot.py new file mode 100644 index 00000000..1079b144 --- /dev/null +++ b/doubleml/did/tests/test_did_aggregation_plot.py @@ -0,0 +1,192 @@ +import warnings + +import matplotlib.pyplot as plt +import numpy as np +import pytest +from matplotlib.axes import Axes +from matplotlib.figure import Figure + +from doubleml.did.did_aggregation import DoubleMLDIDAggregation +from doubleml.double_ml_framework import DoubleMLFramework +from doubleml.tests._utils import generate_dml_dict + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture +def mock_framework(n_rep): + # Create a minimal mock of DoubleMLFramework + n_obs = 10 + n_thetas = 1 + # generate score samples + psi_a = np.ones(shape=(n_obs, n_thetas, n_rep)) + psi_b = np.random.normal(size=(n_obs, n_thetas, n_rep)) + doubleml_dict = generate_dml_dict(psi_a, psi_b) + return DoubleMLFramework(doubleml_dict) + + +@pytest.fixture +def simple_aggregation(mock_framework): + """Create a simple DoubleMLDIDAggregation object for testing.""" + # Get two framework instances + fw1 = mock_framework + fw2 = mock_framework + + # Set treatment names (important for the test) + fw1.treatment_names = ["Treatment 1"] + fw2.treatment_names = ["Treatment 2"] + + # Weights for aggregation + agg_weights = np.array([[1.0, 0.0], [0.0, 1.0]]) + overall_weights = np.array([0.7, 0.3]) + + agg_obj = DoubleMLDIDAggregation( + frameworks=[fw1, fw2], + aggregation_weights=agg_weights, + overall_aggregation_weights=overall_weights, + aggregation_names=["Group A", "Group B"], + aggregation_method_name="Test Method", + additional_information={"Test Info": "Value"}, + additional_parameters={"aggregation_color_idx": [0, 1]}, + ) + + agg_obj.aggregated_frameworks.bootstrap(n_rep_boot=10) + return agg_obj + + +@pytest.mark.ci +def test_plot_effects_returns_fig_ax(simple_aggregation): + """Test that plot_effects returns figure and axes objects.""" + fig, ax = simple_aggregation.plot_effects() + + assert isinstance(fig, Figure) + assert isinstance(ax, Axes) + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_invalid_sort_by(simple_aggregation): + """Test that invalid sort_by values raise ValueError.""" + with pytest.raises(ValueError, match="Invalid sort_by value"): + simple_aggregation.plot_effects(sort_by="invalid") + + # These should not raise + for valid_value in ["name", "estimate", None]: + _ = simple_aggregation.plot_effects(sort_by=valid_value) + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_sorting(simple_aggregation): + """Test that sorting works correctly.""" + # Get the dataframe that would be created inside the method + df = simple_aggregation._create_ci_dataframe() + + # Test name sorting + _, ax = simple_aggregation.plot_effects(sort_by="name") + labels = [text.get_text() for text in ax.get_xticklabels()] + expected = sorted(df["Aggregation_Names"]) + assert labels == expected + + # Test estimate sorting + _, ax = simple_aggregation.plot_effects(sort_by="estimate") + labels = [text.get_text() for text in ax.get_xticklabels()] + 
expected = df.sort_values("Estimate", ascending=False)["Aggregation_Names"].tolist() + assert labels == expected + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_elements(simple_aggregation): + """Test that the plot contains expected elements.""" + _, ax = simple_aggregation.plot_effects(title="Test Title", y_label="Test Label") + + # Check title and y-label + assert ax.get_title() == "Test Title" + assert ax.get_ylabel() == "Test Label" + + # Check that we have the zero line + zero_lines = [line for line in ax.get_lines() if line.get_linestyle() == "--"] + assert len(zero_lines) == 1 + + # Check we have scatter points for estimates + assert len(ax.collections) > 0 + + # Check we have the correct number of x-ticks + assert len(ax.get_xticks()) == 2 # We have 2 groups in our fixture + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_custom_figsize(simple_aggregation): + """Test that figsize parameter works.""" + custom_figsize = (8, 4) + fig, _ = simple_aggregation.plot_effects(figsize=custom_figsize) + + # Convert to inches for comparison (matplotlib uses inches) + width, height = fig.get_size_inches() + assert (width, height) == custom_figsize + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_custom_colors(simple_aggregation): + """Test that color_palette parameter works.""" + # Custom color list + custom_colors = [(1, 0, 0), (0, 1, 0)] # Red and green + _, _ = simple_aggregation.plot_effects(color_palette=custom_colors) + plt.close("all") + + # Named palette + _, _ = simple_aggregation.plot_effects(color_palette="Set1") + plt.close("all") + + +@pytest.mark.ci +def test_joint_ci_bootstrap_warning(mock_framework): + """Test that requesting joint confidence intervals without bootstrapping issues a warning.""" + # Create a new aggregation object without bootstrapping + fw1 = mock_framework + fw2 = mock_framework + + # Set treatment names + fw1.treatment_names = ["Treatment 1"] + fw2.treatment_names = ["Treatment 2"] + + # Weights for aggregation + agg_weights = np.array([[1.0, 0.0], [0.0, 1.0]]) + overall_weights = np.array([0.7, 0.3]) + + # Create aggregation without bootstrapping + aggregation = DoubleMLDIDAggregation( + frameworks=[fw1, fw2], + aggregation_weights=agg_weights, + overall_aggregation_weights=overall_weights, + aggregation_names=["Group A", "Group B"], + additional_parameters={"aggregation_color_idx": [0, 1]}, + ) + + # Ensure no bootstrapping exists + aggregation.aggregated_frameworks._boot_t_stat = None + + # Check that a warning is raised with the expected message + with pytest.warns(UserWarning, match="Joint confidence intervals require bootstrapping"): + _ = aggregation.plot_effects(joint=True) + + # Verify that bootstrap was performed + assert aggregation.aggregated_frameworks.boot_t_stat is not None + + # No warning should be raised when plotting again + with warnings.catch_warnings(record=True) as recorded_warnings: + warnings.simplefilter("always") # Ensure all warnings are recorded + _ = aggregation.plot_effects(joint=True) + + assert len(recorded_warnings) == 0 + plt.close("all") diff --git a/doubleml/did/tests/test_did_aggregation_return_types.py b/doubleml/did/tests/test_did_aggregation_return_types.py new file mode 100644 index 00000000..e63eda70 --- /dev/null +++ b/doubleml/did/tests/test_did_aggregation_return_types.py @@ -0,0 +1,189 @@ +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import pytest +from matplotlib.axes import Axes +from matplotlib.figure import Figure + +from 
doubleml.did.did_aggregation import DoubleMLDIDAggregation +from doubleml.double_ml_framework import DoubleMLFramework +from doubleml.tests._utils import generate_dml_dict + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture +def mock_framework(n_rep): + # Create a minimal mock of DoubleMLFramework + n_obs = 10 + n_thetas = 1 + # generate score samples + psi_a = np.ones(shape=(n_obs, n_thetas, n_rep)) + psi_b = np.random.normal(size=(n_obs, n_thetas, n_rep)) + doubleml_dict = generate_dml_dict(psi_a, psi_b) + return DoubleMLFramework(doubleml_dict) + + +@pytest.fixture +def frameworks(mock_framework): + # Create a list of 3 frameworks + return [mock_framework] * 3 + + +@pytest.fixture +def aggregation_weights(): + # Create sample weights for 2 aggregations over 3 frameworks + return np.array([[0.5, 0.3, 0.2], [0.2, 0.5, 0.3]]) + + +@pytest.mark.ci +@pytest.mark.parametrize( + "property_name,expected_value", + [ + ("overall_aggregation_weights", lambda w: np.array([0.5, 0.5])), # Equal weights for 2 aggregations + ("aggregation_names", lambda w: ["Aggregation_0", "Aggregation_1"]), + ("aggregation_method_name", lambda w: "Custom"), + ("additional_information", lambda w: None), + ("additional_parameters", lambda w: None), + ], +) +def test_default_values(frameworks, aggregation_weights, property_name, expected_value): + # Test that default values are correctly set when not explicitly provided + aggregation = DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights) + + expected = expected_value(aggregation_weights) + actual = getattr(aggregation, property_name) + + if property_name == "overall_aggregation_weights": + np.testing.assert_array_equal(actual, expected) + else: + assert actual == expected + + +@pytest.mark.ci +def test_custom_aggregation_values(frameworks, aggregation_weights): + # Test all custom values are properly set when provided + custom_names = ["Custom1", "Custom2"] + custom_method = "MyMethod" + custom_overall_weights = np.array([0.7, 0.3]) + custom_info = {"info": "test"} + custom_params = {"param": 123} + + aggregation = DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + overall_aggregation_weights=custom_overall_weights, + aggregation_names=custom_names, + aggregation_method_name=custom_method, + additional_information=custom_info, + additional_parameters=custom_params, + ) + + assert aggregation.aggregation_names == custom_names + assert aggregation.aggregation_method_name == custom_method + np.testing.assert_array_equal(aggregation.overall_aggregation_weights, custom_overall_weights) + assert "info: test" in aggregation.additional_information + assert aggregation.additional_parameters == custom_params + + +@pytest.mark.ci +@pytest.mark.parametrize( + "property_name,expected_type", + [ + ("base_frameworks", list), + ("aggregated_frameworks", DoubleMLFramework), + ("overall_aggregated_framework", DoubleMLFramework), + ("aggregation_weights", np.ndarray), + ("overall_aggregation_weights", np.ndarray), + ("n_aggregations", int), + ("aggregation_names", list), + ("aggregation_method_name", str), + ("aggregated_summary", pd.DataFrame), + ("overall_summary", pd.DataFrame), + ], +) +def test_return_types(frameworks, aggregation_weights, property_name, expected_type): + # Test that properties return the expected types + aggregation = DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights) + + value = 
getattr(aggregation, property_name) + assert isinstance(value, expected_type) + + +@pytest.mark.ci +def test_additional_info_return_types(frameworks, aggregation_weights): + # Test additional_information and additional_parameters return types + + # Test when None + aggregation1 = DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights) + assert aggregation1.additional_information is None + assert aggregation1.additional_parameters is None + + # Test when provided + aggregation2 = DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + additional_information={"info": "value"}, + additional_parameters={"param": "value"}, + ) + assert isinstance(aggregation2.additional_information, str) + assert isinstance(aggregation2.additional_parameters, dict) + + +@pytest.mark.ci +def test_str_representation(frameworks, aggregation_weights): + # Test string representation without additional information + aggregation1 = DoubleMLDIDAggregation( + frameworks=frameworks, aggregation_weights=aggregation_weights, aggregation_method_name="TestMethod" + ) + str_output = str(aggregation1) + + # Check presence of all required sections + assert "================== DoubleMLDIDAggregation Object ==================" in str_output + assert "TestMethod Aggregation" in str_output + assert "------------------ Overall Aggregated Effects ------------------" in str_output + assert "------------------ Aggregated Effects ------------------" in str_output + assert "------------------ Additional Information ------------------" not in str_output + + # Test string representation with additional information + aggregation2 = DoubleMLDIDAggregation( + frameworks=frameworks, + aggregation_weights=aggregation_weights, + aggregation_method_name="TestMethod", + additional_information={"key": "value"}, + ) + str_output_with_info = str(aggregation2) + + # Check additional information section + assert "------------------ Additional Information ------------------" in str_output_with_info + assert "key: value" in str_output_with_info + + +@pytest.mark.ci +def test_plot_effects_return_type(frameworks, aggregation_weights): + """Test that plot_effects method returns matplotlib Figure and Axes objects.""" + aggregation = DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=aggregation_weights) + aggregation.aggregated_frameworks.bootstrap(n_rep_boot=10) + + # Test basic call without parameters + fig, ax = aggregation.plot_effects() + assert isinstance(fig, Figure) + assert isinstance(ax, Axes) + plt.close(fig) + + # Test with parameters + fig, ax = aggregation.plot_effects( + level=0.9, + joint=False, + figsize=(10, 5), + sort_by="estimate", + color_palette="Set2", + title="Custom Title", + y_label="Custom Y-Label", + ) + assert isinstance(fig, Figure) + assert isinstance(ax, Axes) + plt.close(fig) diff --git a/doubleml/did/tests/test_did_binary_control_groups.py b/doubleml/did/tests/test_did_binary_control_groups.py new file mode 100644 index 00000000..b8406b15 --- /dev/null +++ b/doubleml/did/tests/test_did_binary_control_groups.py @@ -0,0 +1,31 @@ +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +df = dml.did.datasets.make_did_CS2021(n_obs=500, dgp_type=1, n_pre_treat_periods=2, n_periods=4, time_type="float") +dml_data = dml.data.DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + +args = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": 
LogisticRegression(),
+    "g_value": 2,
+    "t_value_pre": 0,
+    "t_value_eval": 1,
+    "score": "observational",
+    "n_rep": 1,
+}
+
+
+def test_control_groups_different():
+    dml_did_never_treated = dml.did.DoubleMLDIDBinary(control_group="never_treated", **args)
+    dml_did_not_yet_treated = dml.did.DoubleMLDIDBinary(control_group="not_yet_treated", **args)
+
+    assert dml_did_never_treated._n_subset != dml_did_not_yet_treated._n_subset
+    # same treatment group
+    assert dml_did_never_treated._n_treated_subset == dml_did_not_yet_treated._n_treated_subset
+
+    dml_did_never_treated.fit()
+    dml_did_not_yet_treated.fit()
+
+    assert dml_did_never_treated.coef != dml_did_not_yet_treated.coef
diff --git a/doubleml/did/tests/test_did_binary_exceptions.py b/doubleml/did/tests/test_did_binary_exceptions.py
new file mode 100644
index 00000000..c7aa2395
--- /dev/null
+++ b/doubleml/did/tests/test_did_binary_exceptions.py
@@ -0,0 +1,152 @@
+from unittest.mock import patch
+
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.linear_model import LinearRegression, LogisticRegression
+
+import doubleml as dml
+
+dml_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData")
+
+valid_arguments = {
+    "obj_dml_data": dml_data,
+    "ml_g": LinearRegression(),
+    "ml_m": LogisticRegression(),
+    "g_value": 1,
+    "t_value_pre": 0,
+    "t_value_eval": 1,
+    "score": "observational",
+    "n_rep": 1,
+    "draw_sample_splitting": True,
+}
+
+
+@pytest.mark.ci
+def test_input():
+    # control group
+    msg = r"The control group has to be one of \['never_treated', 'not_yet_treated'\]. 0 was passed."
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"control_group": 0}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+
+    # g value
+    msg = r"The value test is not in the set of treatment group values \[0 1\]."
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"g_value": "test"}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+
+    msg = r"The never treated group is not allowed as treatment group \(g_value=0\)."
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"g_value": 0}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+
+    msg = r"The never treated group is not allowed as treatment group \(g_value=0\)."
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"g_value": 0.0}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+
+    # t values
+    msg = r"The value test is not in the set of evaluation period values \[0 1\]."
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"t_value_pre": "test"}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+    with pytest.raises(ValueError, match=msg):
+        invalid_arguments = {"t_value_eval": "test"}
+        _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments))
+
+    # in-sample normalization
+    msg = "in_sample_normalization indicator has to be boolean. Object of type <class 'str'> passed."
+ with pytest.raises(TypeError, match=msg): + invalid_arguments = {"in_sample_normalization": "test"} + _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments)) + + # ml_g classifier + msg = r"The ml_g learner LogisticRegression\(\) was identified as" + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"ml_g": LogisticRegression()} + _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_no_control_group_exception(): + msg = "No observations in the control group." + with pytest.raises(ValueError, match=msg): + invalid_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData") + invalid_data.data["d"] = 1.0 + invalid_arguments = {"obj_dml_data": invalid_data, "control_group": "not_yet_treated"} + _ = dml.did.DoubleMLDIDBinary(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_check_data_exceptions(): + """Test exception handling for _check_data method in DoubleMLDIDBinary""" + df = pd.DataFrame(np.random.normal(size=(10, 5)), columns=[f"Col_{i}" for i in range(5)]) + + # Test 1: Data has to be DoubleMLPanelData + invalid_data_types = [ + dml.data.DoubleMLData(df, y_col="Col_0", d_cols="Col_1"), + ] + + for invalid_data in invalid_data_types: + msg = r"For repeated outcomes the data must be of DoubleMLPanelData type\." + with pytest.raises(TypeError, match=msg): + _ = dml.did.DoubleMLDIDBinary( + obj_dml_data=invalid_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + # Test 2: Data cannot have instrumental variables + df_with_z = dml_data.data.copy() + dml_data_with_z = dml.data.DoubleMLPanelData( + df_with_z, y_col="y", d_cols="d", id_col="id", t_col="t", z_cols=["Z1"], x_cols=["Z2", "Z3", "Z4"] + ) + + msg = r"Incompatible data. Z1 have been set as instrumental variable\(s\)." + with pytest.raises(NotImplementedError, match=msg): + _ = dml.did.DoubleMLDIDBinary( + obj_dml_data=dml_data_with_z, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + # Test 3: Data must have exactly one treatment variable (using mock) + with patch.object(dml_data.__class__, "n_treat", property(lambda self: 2)): + msg = ( + "Incompatible data. To fit an DID model with DML exactly one variable needs to be specified as treatment variable." 
+ ) + with pytest.raises(ValueError, match=msg): + _ = dml.did.DoubleMLDIDBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + ) + + +@pytest.mark.ci +def test_benchmark_warning(): + """Test warning when sensitivity_benchmark is called with experimental score""" + args = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "g_value": 1, + "t_value_pre": 0, + "t_value_eval": 1, + "n_rep": 1, + } + # Create a DID model with experimental score + did_model = dml.did.DoubleMLDIDBinary(**args, score="experimental") + did_model.fit() + with pytest.warns(UserWarning, match="Sensitivity benchmarking for experimental score may not be meaningful"): + did_model.sensitivity_benchmark(["Z1", "Z2"]) diff --git a/doubleml/did/tests/test_did_binary_external_predictions.py b/doubleml/did/tests/test_did_binary_external_predictions.py new file mode 100644 index 00000000..ccc136d0 --- /dev/null +++ b/doubleml/did/tests/test_did_binary_external_predictions.py @@ -0,0 +1,163 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDBinary +from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020 +from doubleml.tests._utils import draw_smpls +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_fixture(did_score, n_rep): + n_obs = 500 + n_folds = 5 + + ext_predictions = {"d": {}} + dml_data = make_did_SZ2020(n_obs=n_obs, return_type="DoubleMLPanelData") + + kwargs = { + "obj_dml_data": dml_data, + "g_value": 1, + "t_value_pre": 0, + "t_value_eval": 1, + "score": did_score, + "n_rep": n_rep, + "draw_sample_splitting": False, + } + + dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + all_smpls = draw_smpls(n_obs, n_folds, n_rep=n_rep, groups=dml_did._g_panel) + dml_did.set_sample_splitting(all_smpls) + + np.random.seed(3141) + dml_did.fit(store_predictions=True) + + ext_predictions["d"]["ml_g0"] = dml_did.predictions["ml_g0"][:, :, 0] + ext_predictions["d"]["ml_g1"] = dml_did.predictions["ml_g1"][:, :, 0] + if did_score == "observational": + ext_predictions["d"]["ml_m"] = dml_did.predictions["ml_m"][:, :, 0] + + dml_did_ext = DoubleMLDIDBinary(ml_g=DMLDummyRegressor(), ml_m=DMLDummyClassifier(), **kwargs) + dml_did_ext.set_sample_splitting(all_smpls) + np.random.seed(3141) + dml_did_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } + + return res_dict + + +@pytest.mark.ci +def test_coef(doubleml_did_fixture): + assert math.isclose(doubleml_did_fixture["coef"], doubleml_did_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_se(doubleml_did_fixture): + assert math.isclose(doubleml_did_fixture["se"], doubleml_did_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def 
test_score(doubleml_did_fixture): + assert np.allclose(doubleml_did_fixture["score"], doubleml_did_fixture["score_ext"], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_nuisance_loss(doubleml_did_fixture): + for key, value in doubleml_did_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) + + +@pytest.fixture(scope="module") +def doubleml_did_panel_fixture(did_score, n_rep): + n_obs = 500 + n_folds = 5 + dgp = 1 + + ext_predictions = {"d": {}} + df = make_did_CS2021(n_obs=n_obs, dgp_type=dgp, time_type="float") + dml_panel_data = DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_panel_data, + "g_value": 2, + "t_value_pre": 0, + "t_value_eval": 1, + "score": did_score, + "n_rep": n_rep, + "draw_sample_splitting": False, + } + + dml_did = DoubleMLDIDBinary(ml_g=LinearRegression(), ml_m=LogisticRegression(), **kwargs) + all_smpls = draw_smpls(n_obs=dml_did._n_subset, n_folds=n_folds, n_rep=n_rep, groups=dml_did._g_panel) + dml_did.set_sample_splitting(all_smpls) + + np.random.seed(3141) + dml_did.fit(store_predictions=True) + + pred = dml_did.predictions + ext_predictions["d"]["ml_g0"] = pred["ml_g0"][:, :, 0] + ext_predictions["d"]["ml_g1"] = pred["ml_g1"][:, :, 0] + if did_score == "observational": + ext_predictions["d"]["ml_m"] = pred["ml_m"][:, :, 0] + dml_did_ext = DoubleMLDIDBinary(ml_g=DMLDummyRegressor(), ml_m=DMLDummyClassifier(), **kwargs) + dml_did_ext.set_sample_splitting(all_smpls) + np.random.seed(3141) + dml_did_ext.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } + + return res_dict + + +@pytest.mark.ci +def test_panel_coef(doubleml_did_panel_fixture): + assert math.isclose(doubleml_did_panel_fixture["coef"], doubleml_did_panel_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_panel_se(doubleml_did_panel_fixture): + assert math.isclose(doubleml_did_panel_fixture["se"], doubleml_did_panel_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_panel_score(doubleml_did_panel_fixture): + assert np.allclose(doubleml_did_panel_fixture["score"], doubleml_did_panel_fixture["score_ext"], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_panel_nuisance_loss(doubleml_did_panel_fixture): + for key, value in doubleml_did_panel_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_panel_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) diff --git a/doubleml/did/tests/test_did_binary_placebo.py b/doubleml/did/tests/test_did_binary_placebo.py new file mode 100644 index 00000000..ab90030e --- /dev/null +++ b/doubleml/did/tests/test_did_binary_placebo.py @@ -0,0 +1,58 @@ +import numpy as np +import pytest +from lightgbm import LGBMClassifier, LGBMRegressor + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDBinary +from doubleml.did.datasets import make_did_CS2021 + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + 
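+# A minimal sketch of the placebo logic this module relies on: for a
+# (g_value, t_value_pre, t_value_eval) combination whose evaluation period lies
+# strictly before the treatment onset, the true ATT is zero by construction, so a
+# well-calibrated confidence interval should cover zero. The helper name below is
+# illustrative only (it is not part of the package API) and uses just the confint()
+# method exercised elsewhere in this diff.
+def _placebo_ci_covers_zero(fitted_dml_did, level=0.99):
+    # confint() returns a DataFrame with lower and upper bounds as its first two columns
+    ci = fitted_dml_did.confint(level=level)
+    return ci.iloc[0, 0] <= 0.0 <= ci.iloc[0, 1]
+
+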
+@pytest.fixture(scope="module") +def doubleml_did_fixture(did_score, n_rep): + n_obs = 500 + dgp = 5 # has to be experimental (for experimental score to be valid) + df = make_did_CS2021(n_obs=n_obs, dgp_type=dgp, n_pre_treat_periods=3) + dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_data, + "g_value": dml_data.g_values[0], + "t_value_pre": dml_data.t_values[0], + "t_value_eval": dml_data.t_values[1], + "ml_g": LGBMRegressor(verbose=-1), + "ml_m": LGBMClassifier(verbose=-1), + "score": did_score, + "n_rep": n_rep, + "n_folds": 5, + "draw_sample_splitting": True, + } + + dml_did = DoubleMLDIDBinary(**kwargs) + + np.random.seed(3141) + dml_did.fit() + ci = dml_did.confint(level=0.99) + + res_dict = { + "coef": dml_did.coef[0], + "ci_lower": ci.iloc[0, 0], + "ci_upper": ci.iloc[0, 1], + } + + return res_dict + + +@pytest.mark.ci +def test_zero(doubleml_did_fixture): + assert doubleml_did_fixture["ci_lower"] <= 0.0 + assert doubleml_did_fixture["ci_upper"] >= 0.0 diff --git a/doubleml/did/tests/test_did_binary_stdout.py b/doubleml/did/tests/test_did_binary_stdout.py new file mode 100644 index 00000000..04687fb9 --- /dev/null +++ b/doubleml/did/tests/test_did_binary_stdout.py @@ -0,0 +1,49 @@ +import io +from contextlib import redirect_stdout + +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +dml_data = dml.did.datasets.make_did_SZ2020(n_obs=500, dgp_type=1, return_type="DoubleMLPanelData") + + +@pytest.mark.ci +def test_print_periods(): + """Test that print_periods parameter correctly controls output printing.""" + + # Create test data + dml_data = dml.did.datasets.make_did_SZ2020(n_obs=100, return_type="DoubleMLPanelData") + + # Test 1: Default case (print_periods=False) - should not print anything + f = io.StringIO() + with redirect_stdout(f): + _ = dml.did.DoubleMLDIDBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + print_periods=False, # Default + ) + output_default = f.getvalue() + assert output_default.strip() == "", "Expected no output with print_periods=False" + + # Test 2: With print_periods=True - should print information + f = io.StringIO() + with redirect_stdout(f): + _ = dml.did.DoubleMLDIDBinary( + obj_dml_data=dml_data, + ml_g=LinearRegression(), + ml_m=LogisticRegression(), + g_value=1, + t_value_pre=0, + t_value_eval=1, + print_periods=True, + ) + output_print = f.getvalue() + assert "Evaluation of ATT(1, 1), with pre-treatment period 0" in output_print + assert "post-treatment: True" in output_print + assert "Control group: never_treated" in output_print diff --git a/doubleml/did/tests/test_did_binary_tune.py b/doubleml/did/tests/test_did_binary_tune.py new file mode 100644 index 00000000..a817223f --- /dev/null +++ b/doubleml/did/tests/test_did_binary_tune.py @@ -0,0 +1,213 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import LogisticRegression + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_did_manual import boot_did, fit_did, tune_nuisance_did + + +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_g(request): + return request.param + + +@pytest.fixture(scope="module", params=[LogisticRegression()]) +def learner_m(request): + return
request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def tune_on_folds(request): + return request.param + + +def get_par_grid(learner): + if learner.__class__ in [RandomForestRegressor]: + par_grid = {"n_estimators": [5, 10, 20]} + else: + assert learner.__class__ in [LogisticRegression] + par_grid = {"C": np.logspace(-4, 2, 10)} + return par_grid + + +@pytest.fixture(scope="module") +def dml_did_fixture(generate_data_did_binary, learner_g, learner_m, score, in_sample_normalization, tune_on_folds): + par_grid = {"ml_g": get_par_grid(learner_g), "ml_m": get_par_grid(learner_m)} + n_folds_tune = 4 + + boot_methods = ["normal"] + n_folds = 2 + n_rep_boot = 499 + + # collect data + dml_panel_data = generate_data_did_binary + df = dml_panel_data._data.sort_values(by=["id", "t"]) + df_panel = df.groupby("id").agg( + {"y": lambda x: x.iloc[1] - x.iloc[0], "d": "first", "Z1": "first", "Z2": "first", "Z3": "first", "Z4": "first"} + ) + + n_obs = df_panel.shape[0] + all_smpls = draw_smpls(n_obs, n_folds, n_rep=1, groups=df_panel["d"]) + obj_dml_data = dml.DoubleMLData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + # Set machine learning methods for m & g + ml_g = clone(learner_g) + ml_m = clone(learner_m) + + dml_args = { + "ml_g": ml_g, + "ml_m": ml_m, + "n_folds": n_folds, + "score": score, + "in_sample_normalization": in_sample_normalization, + "draw_sample_splitting": False, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDBinary( + dml_panel_data, + g_value=1, + t_value_pre=0, + t_value_eval=1, + **dml_args, + ) + + dml_did_obj = dml.DoubleMLDID( + obj_dml_data, + **dml_args, + ) + + # synchronize the sample splitting + dml_did_obj.set_sample_splitting(all_smpls=all_smpls) + dml_did_binary_obj.set_sample_splitting(all_smpls=all_smpls) + + # tune hyperparameters + np.random.seed(3141) + tune_res = dml_did_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False) + assert isinstance(tune_res, dml.DoubleMLDID) + np.random.seed(3141) + tune_res_binary = dml_did_binary_obj.tune( + par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False + ) + assert isinstance(tune_res_binary, dml.did.DoubleMLDIDBinary) + + dml_did_obj.fit() + dml_did_binary_obj.fit() + + # manual fit + y = df_panel["y"].values + d = df_panel["d"].values + x = df_panel[["Z1", "Z2", "Z3", "Z4"]].values + np.random.seed(3141) + smpls = all_smpls[0] + + if tune_on_folds: + g0_params, g1_params, m_params = tune_nuisance_did( + y, x, d, clone(learner_g), clone(learner_m), smpls, score, n_folds_tune, par_grid["ml_g"], par_grid["ml_m"] + ) + else: + xx = [(np.arange(len(y)), np.array([]))] + g0_params, g1_params, m_params = tune_nuisance_did( + y, x, d, clone(learner_g), clone(learner_m), xx, score, n_folds_tune, par_grid["ml_g"], par_grid["ml_m"] + ) + g0_params = g0_params * n_folds + if score == "experimental": + g1_params = g1_params * n_folds + m_params = None + else: + assert score == "observational" + g1_params = None + m_params = m_params * n_folds + + res_manual = fit_did( + y, + x, + d, + clone(learner_g), + clone(learner_m), + all_smpls, + score, + in_sample_normalization, + g0_params=g0_params, + g1_params=g1_params, + m_params=m_params, + ) + + res_dict = { + "coef": 
dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "coef_manual": res_manual["theta"], + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "se_manual": res_manual["se"], + "boot_methods": boot_methods, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_did( + y, + res_manual["thetas"], + res_manual["ses"], + res_manual["all_psi_a"], + res_manual["all_psi_b"], + all_smpls, + bootstrap, + n_rep_boot, + ) + + np.random.seed(3141) + dml_did_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + res_dict["boot_t_stat" + bootstrap] = dml_did_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_binary"] = dml_did_binary_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_manual"] = boot_t_stat.reshape(-1, 1, 1) + + return res_dict + + +@pytest.mark.ci +def test_dml_did_coef(dml_did_fixture): + assert math.isclose(dml_did_fixture["coef"][0], dml_did_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_did_fixture["coef_binary"][0], dml_did_fixture["coef"][0], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_did_se(dml_did_fixture): + assert math.isclose(dml_did_fixture["se"][0], dml_did_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_did_fixture["se_binary"][0], dml_did_fixture["se"][0], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_boot(dml_did_fixture): + for bootstrap in dml_did_fixture["boot_methods"]: + assert np.allclose( + dml_did_fixture["boot_t_stat" + bootstrap], + dml_did_fixture["boot_t_stat" + bootstrap + "_manual"], + rtol=1e-9, + atol=1e-4, + ) + + assert np.allclose( + dml_did_fixture["boot_t_stat" + bootstrap], + dml_did_fixture["boot_t_stat" + bootstrap + "_binary"], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_binary_vs_did_panel.py b/doubleml/did/tests/test_did_binary_vs_did_panel.py new file mode 100644 index 00000000..1eacdf6a --- /dev/null +++ b/doubleml/did/tests/test_did_binary_vs_did_panel.py @@ -0,0 +1,215 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.did.utils._did_utils import _get_id_positions + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_binary_vs_did_fixture(time_type, learner, score, in_sample_normalization, trimming_threshold): + n_obs = 500 + dpg = 1 + + boot_methods = ["normal"] + n_rep_boot = 50000 + + # collect data + df = 
make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "ml_g": clone(learner[0]), + "ml_m": clone(learner[1]), + "n_folds": 3, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": True, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDBinary( + dml_panel_data, + g_value=dml_panel_data.g_values[0], + t_value_pre=dml_panel_data.t_values[0], + t_value_eval=dml_panel_data.t_values[1], + **dml_args, + ) + dml_did_binary_obj.fit() + + df_wide = dml_did_binary_obj._panel_data_wide.copy() + dml_data = dml.data.DoubleMLData(df_wide, y_col="y_diff", d_cols="G_indicator", x_cols=["Z1", "Z2", "Z3", "Z4"]) + dml_did_obj = dml.DoubleMLDID( + dml_data, + **dml_args, + ) + + # use external predictions (sample splitting is hard to synchronize) + ext_predictions = {"G_indicator": {}} + ext_predictions["G_indicator"]["ml_g0"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g0"][:, :, 0], dml_did_binary_obj._id_positions + ) + ext_predictions["G_indicator"]["ml_g1"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_g1"][:, :, 0], dml_did_binary_obj._id_positions + ) + if score == "observational": + ext_predictions["G_indicator"]["ml_m"] = _get_id_positions( + dml_did_binary_obj.predictions["ml_m"][:, :, 0], dml_did_binary_obj._id_positions + ) + dml_did_obj.fit(external_predictions=ext_predictions) + + res_dict = { + "coef": dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "nuisance_loss": dml_did_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + "boot_methods": boot_methods, + "dml_did_binary_obj": dml_did_binary_obj, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + dml_did_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + # approximately same ci (bootstrap not identical due to size of score) + res_dict["boot_ci" + bootstrap] = dml_did_obj.confint(joint=True) + res_dict["boot_ci" + bootstrap + "_binary"] = dml_did_binary_obj.confint(joint=True) + + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + + dml_did_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params"] = dml_did_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["coef_binary"][0], dml_did_binary_vs_did_fixture["coef"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +@pytest.mark.ci +def test_ses(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["se_binary"][0], dml_did_binary_vs_did_fixture["se"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +@pytest.mark.ci +def test_boot(dml_did_binary_vs_did_fixture): + for bootstrap in dml_did_binary_vs_did_fixture["boot_methods"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["boot_ci" + bootstrap].values, + dml_did_binary_vs_did_fixture["boot_ci" + bootstrap + "_binary"].values, + atol=1e-2, + ) + + +@pytest.mark.ci +def 
test_nuisance_loss(dml_did_binary_vs_did_fixture): + assert ( + dml_did_binary_vs_did_fixture["nuisance_loss"].keys() == dml_did_binary_vs_did_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_binary_vs_did_fixture["nuisance_loss"].items(): + assert np.allclose(value, dml_did_binary_vs_did_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_binary_vs_did_fixture): + sensitivity_element_names = ["sigma2", "nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + for sensitivity_element in ["psi_sigma2", "psi_nu2", "riesz_rep"]: + dml_binary_obj = dml_did_binary_vs_did_fixture["dml_did_binary_obj"] + scaling = dml_binary_obj._n_subset / dml_binary_obj._dml_data.n_obs + binary_sensitivity_element = scaling * _get_id_positions( + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], dml_binary_obj._id_positions + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + binary_sensitivity_element, + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_binary_vs_did_fixture): + for key in ["theta", "se", "ci"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["lower"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["upper"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_binary_vs_did_two_period.py b/doubleml/did/tests/test_did_binary_vs_did_two_period.py new file mode 100644 index 00000000..0db2a752 --- /dev/null +++ b/doubleml/did/tests/test_did_binary_vs_did_two_period.py @@ -0,0 +1,264 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_did_manual import boot_did, fit_did, fit_sensitivity_elements_did + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_binary_vs_did_fixture(generate_data_did_binary, learner, score, in_sample_normalization, trimming_threshold): + boot_methods = 
["normal"] + n_folds = 2 + n_rep_boot = 499 + + # collect data + dml_panel_data = generate_data_did_binary + df = dml_panel_data._data.sort_values(by=["id", "t"]) + df_panel = df.groupby("id").agg( + {"y": lambda x: x.iloc[1] - x.iloc[0], "d": "first", "Z1": "first", "Z2": "first", "Z3": "first", "Z4": "first"} + ) + + n_obs = df_panel.shape[0] + all_smpls = draw_smpls(n_obs, n_folds) + obj_dml_data = dml.DoubleMLData(df_panel, y_col="y", d_cols="d", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + # Set machine learning methods for m & g + ml_g = clone(learner[0]) + ml_m = clone(learner[1]) + + dml_args = { + "ml_g": ml_g, + "ml_m": ml_m, + "n_folds": n_folds, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": False, + } + + dml_did_binary_obj = dml.did.DoubleMLDIDBinary( + dml_panel_data, + g_value=1, + t_value_pre=0, + t_value_eval=1, + **dml_args, + ) + + dml_did_obj = dml.DoubleMLDID( + obj_dml_data, + **dml_args, + ) + + # synchronize the sample splitting + dml_did_obj.set_sample_splitting(all_smpls=all_smpls) + dml_did_binary_obj.set_sample_splitting(all_smpls=all_smpls) + + dml_did_obj.fit() + dml_did_binary_obj.fit() + + # manual fit + y = df_panel["y"].values + d = df_panel["d"].values + x = df_panel[["Z1", "Z2", "Z3", "Z4"]].values + + np.random.seed(3141) + res_manual = fit_did( + y, + x, + d, + clone(learner[0]), + clone(learner[1]), + all_smpls, + score, + in_sample_normalization, + trimming_threshold=trimming_threshold, + ) + + res_dict = { + "coef": dml_did_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "coef_manual": res_manual["theta"], + "se": dml_did_obj.se, + "se_binary": dml_did_binary_obj.se, + "se_manual": res_manual["se"], + "nuisance_loss": dml_did_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + "boot_methods": boot_methods, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_did( + y, + res_manual["thetas"], + res_manual["ses"], + res_manual["all_psi_a"], + res_manual["all_psi_b"], + all_smpls, + bootstrap, + n_rep_boot, + ) + + np.random.seed(3141) + dml_did_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + res_dict["boot_t_stat" + bootstrap] = dml_did_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_binary"] = dml_did_binary_obj.boot_t_stat + res_dict["boot_t_stat" + bootstrap + "_manual"] = boot_t_stat.reshape(-1, 1, 1) + + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + res_dict["sensitivity_elements_manual"] = fit_sensitivity_elements_did( + y, + d, + all_coef=dml_did_obj.all_coef, + predictions=dml_did_obj.predictions, + score=score, + in_sample_normalization=in_sample_normalization, + n_rep=1, + ) + + # sensitivity tests + res_dict["sensitivity_elements"] = dml_did_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.sensitivity_elements + + dml_did_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params"] = dml_did_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["coef"][0], 
dml_did_binary_vs_did_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4 + ) + assert math.isclose( + dml_did_binary_vs_did_fixture["coef_binary"][0], dml_did_binary_vs_did_fixture["coef"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +@pytest.mark.ci +def test_ses(dml_did_binary_vs_did_fixture): + assert math.isclose( + dml_did_binary_vs_did_fixture["se"][0], dml_did_binary_vs_did_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4 + ) + assert math.isclose( + dml_did_binary_vs_did_fixture["se_binary"][0], dml_did_binary_vs_did_fixture["se"][0], rel_tol=1e-9, abs_tol=1e-4 + ) + + +@pytest.mark.ci +def test_boot(dml_did_binary_vs_did_fixture): + for bootstrap in dml_did_binary_vs_did_fixture["boot_methods"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["boot_t_stat" + bootstrap], + dml_did_binary_vs_did_fixture["boot_t_stat" + bootstrap + "_manual"], + atol=1e-4, + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["boot_t_stat" + bootstrap], + dml_did_binary_vs_did_fixture["boot_t_stat" + bootstrap + "_binary"], + atol=1e-4, + ) + + +@pytest.mark.ci +def test_nuisance_loss(dml_did_binary_vs_did_fixture): + assert ( + dml_did_binary_vs_did_fixture["nuisance_loss"].keys() == dml_did_binary_vs_did_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_binary_vs_did_fixture["nuisance_loss"].items(): + assert np.allclose(value, dml_did_binary_vs_did_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_binary_vs_did_fixture): + sensitivity_element_names = ["sigma2", "nu2", "psi_sigma2", "psi_nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + dml_did_binary_vs_did_fixture["sensitivity_elements_manual"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + for sensitivity_element in ["riesz_rep"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_elements"][sensitivity_element], + dml_did_binary_vs_did_fixture["sensitivity_elements_binary"][sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_binary_vs_did_fixture): + for key in ["theta", "se", "ci"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["lower"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key]["upper"], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + dml_did_binary_vs_did_fixture["sensitivity_params"][key], + dml_did_binary_vs_did_fixture["sensitivity_params_binary"][key], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_cs_external_predictions.py b/doubleml/did/tests/test_did_cs_external_predictions.py index f4a47997..2b28ac8a 100644 --- a/doubleml/did/tests/test_did_cs_external_predictions.py +++ b/doubleml/did/tests/test_did_cs_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLDIDCS -from doubleml.datasets import make_did_SZ2020 +from doubleml.did.datasets import 
make_did_SZ2020 from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor from ...tests._utils import draw_smpls diff --git a/doubleml/did/tests/test_did_external_predictions.py b/doubleml/did/tests/test_did_external_predictions.py index 9027e7dc..7234be8e 100644 --- a/doubleml/did/tests/test_did_external_predictions.py +++ b/doubleml/did/tests/test_did_external_predictions.py @@ -5,7 +5,7 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from doubleml import DoubleMLDID -from doubleml.datasets import make_did_SZ2020 +from doubleml.did.datasets import make_did_SZ2020 from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor from ...tests._utils import draw_smpls @@ -42,11 +42,36 @@ def doubleml_did_fixture(did_score, n_rep): np.random.seed(3141) dml_did_ext.fit(external_predictions=ext_predictions) - res_dict = {"coef_normal": dml_did.coef[0], "coef_ext": dml_did_ext.coef[0]} + res_dict = { + "coef": dml_did.coef[0], + "coef_ext": dml_did_ext.coef[0], + "se": dml_did.se[0], + "se_ext": dml_did_ext.se[0], + "score": dml_did.psi, + "score_ext": dml_did_ext.psi, + "dml_did_nuisance_loss": dml_did.nuisance_loss, + "dml_did_ext_nuisance_loss": dml_did_ext.nuisance_loss, + } return res_dict @pytest.mark.ci -def test_doubleml_did_coef(doubleml_did_fixture): - assert math.isclose(doubleml_did_fixture["coef_normal"], doubleml_did_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) +def test_coef(doubleml_did_fixture): + assert math.isclose(doubleml_did_fixture["coef"], doubleml_did_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_se(doubleml_did_fixture): + assert math.isclose(doubleml_did_fixture["se"], doubleml_did_fixture["se_ext"], rel_tol=1e-9, abs_tol=1e-3) + + +@pytest.mark.ci +def test_score(doubleml_did_fixture): + assert np.allclose(doubleml_did_fixture["score"], doubleml_did_fixture["score_ext"], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_nuisance_loss(doubleml_did_fixture): + for key, value in doubleml_did_fixture["dml_did_nuisance_loss"].items(): + assert np.allclose(value, doubleml_did_fixture["dml_did_ext_nuisance_loss"][key], rtol=1e-9, atol=1e-3) diff --git a/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py b/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py new file mode 100644 index 00000000..35512d8f --- /dev/null +++ b/doubleml/did/tests/test_did_multi_aggregation_manual_weights.py @@ -0,0 +1 @@ +# TODO: For each aggregation method check if the manual weights equal the string aggregation method. 
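+# A possible sketch for this check: compare a string-based aggregation against a
+# manual one rebuilt from the underlying per-(g, t) frameworks. Two assumptions to
+# verify against the implementation: each entry of modellist exposes a framework
+# attribute, and aggregation_weights accepts a 2D array with one row per aggregated
+# effect (mirroring the DoubleMLDIDAggregation constructor usage earlier in this diff).
+import numpy as np
+
+from doubleml.did import DoubleMLDIDAggregation
+
+
+def _manual_weights_match_string_aggregation(dml_obj, manual_weights, aggregation="group"):
+    # string-based aggregation of a fitted DoubleMLDIDMulti object
+    agg_str = dml_obj.aggregate(aggregation=aggregation)
+    # manual aggregation built directly from the per-(g, t) effect frameworks
+    frameworks = [model.framework for model in dml_obj.modellist]
+    agg_manual = DoubleMLDIDAggregation(frameworks=frameworks, aggregation_weights=manual_weights)
+    # the manual weights reproduce the string method iff the aggregated effects coincide
+    return np.allclose(agg_str.aggregated_frameworks.thetas, agg_manual.aggregated_frameworks.thetas)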
diff --git a/doubleml/did/tests/test_did_multi_aggregation_single_gt.py b/doubleml/did/tests/test_did_multi_aggregation_single_gt.py new file mode 100644 index 00000000..0f71d91b --- /dev/null +++ b/doubleml/did/tests/test_did_multi_aggregation_single_gt.py @@ -0,0 +1,112 @@ +import math + +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 + + +@pytest.fixture(scope="module", params=["group", "time", "eventstudy"]) +def aggregation(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_single_gt_aggregation(aggregation, time_type, learner, score, in_sample_normalization, trimming_threshold): + n_obs = 500 + dpg = 1 + + # collect data + df = make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "n_folds": 3, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": True, + } + gt_combination = [(dml_panel_data.g_values[0], dml_panel_data.t_values[0], dml_panel_data.t_values[3])] + dml_obj = dml.did.DoubleMLDIDMulti( + dml_panel_data, + ml_g=learner[0], + ml_m=learner[1], + gt_combinations=gt_combination, + **dml_args, + ) + dml_obj.fit() + + dml_obj_agg = dml_obj.aggregate(aggregation=aggregation) + + res_dict = { + "dml_obj": dml_obj, + "dml_obj_agg": dml_obj_agg, + } + + return res_dict + + +@pytest.mark.ci +def test_dml_single_gt_thetas(dml_single_gt_aggregation): + assert math.isclose( + dml_single_gt_aggregation["dml_obj"].coef[0], + dml_single_gt_aggregation["dml_obj_agg"].aggregated_frameworks.thetas[0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + assert math.isclose( + dml_single_gt_aggregation["dml_obj_agg"].aggregated_frameworks.thetas[0], + dml_single_gt_aggregation["dml_obj_agg"].overall_aggregated_framework.thetas[0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_dml_single_gt_ses(dml_single_gt_aggregation): + assert math.isclose( + dml_single_gt_aggregation["dml_obj"].se[0], + dml_single_gt_aggregation["dml_obj_agg"].aggregated_frameworks.ses[0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + assert math.isclose( + dml_single_gt_aggregation["dml_obj_agg"].aggregated_frameworks.ses[0], + dml_single_gt_aggregation["dml_obj_agg"].overall_aggregated_framework.ses[0], + rel_tol=1e-9, + abs_tol=1e-4, + ) diff --git a/doubleml/did/tests/test_did_multi_aggregation_weight_index.py b/doubleml/did/tests/test_did_multi_aggregation_weight_index.py new file mode 100644 index 00000000..d001a4a8 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_aggregation_weight_index.py @@ -0,0 +1 @@ +# TODO: For each aggregation method check if the aggregated weights correspond to 
certain gt_combinations (group, time etc.) diff --git a/doubleml/did/tests/test_did_multi_exceptions.py b/doubleml/did/tests/test_did_multi_exceptions.py new file mode 100644 index 00000000..aead8e48 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_exceptions.py @@ -0,0 +1,239 @@ +from unittest.mock import patch + +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml + +df = dml.did.datasets.make_did_CS2021(n_obs=500, dgp_type=1, n_pre_treat_periods=0, n_periods=3, time_type="float") +dml_data = dml.data.DoubleMLPanelData(df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"]) +# df_binary_outcome = df.copy() +# df_binary_outcome["y"] = (df_binary_outcome["y"] > df_binary_outcome["y"].median()).astype(int) +# dml_data_binary_outcome = dml.data.DoubleMLPanelData( +# df_binary_outcome, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +# ) + +valid_arguments = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "gt_combinations": [(1, 0, 1)], +} + + +@pytest.mark.ci +def test_input(): + # data + msg = r"The data has to be a DoubleMLPanelData object. 0 of type <class 'int'> was passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"obj_dml_data": 0} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + invalid_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", z_cols=["Z4"], x_cols=["Z1", "Z2", "Z3"] + ) + msg = r"Incompatible data. Z4 have been set as instrumental variable\(s\)." + with pytest.raises(NotImplementedError, match=msg): + invalid_arguments = {"obj_dml_data": invalid_data} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + # control group + msg = r"The control group has to be one of \['never_treated', 'not_yet_treated'\]. 0 was passed." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"control_group": 0} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + # propensity score adjustments + msg = "in_sample_normalization indicator has to be boolean. Object of type <class 'str'> passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"in_sample_normalization": "test"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + # score + msg = "Invalid score test. Valid score observational or experimental." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"score": "test"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + # trimming + msg = "Invalid trimming_rule discard. Valid trimming_rule truncate." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"trimming_rule": "discard"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "trimming_threshold has to be a float. Object of type <class 'str'> passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"trimming_threshold": "test"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "Invalid trimming_threshold 0.6. trimming_threshold has to be between 0 and 0.5."
+ with pytest.raises(ValueError, match=msg): + invalid_arguments = {"trimming_threshold": 0.6} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_exception_learners(): + msg = ( + r"The ml_g learner LogisticRegression\(\) was identified as classifier but " + + "the outcome variable is not binary with values 0 and 1." + ) + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"ml_g": LogisticRegression()} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = ( + 'A learner ml_m has been provided for score = "experimental" but will be ignored. ' + "A learner ml_m is not required for estimation." + ) + with pytest.warns(UserWarning, match=msg): + invalid_arguments = {"score": "experimental"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_exception_gt_combinations(): + msg = r"gt_combinations must be one of \['standard', 'all'\]. test was passed." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"gt_combinations": "test"} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "gt_combinations must be a list. 1 of type <class 'int'> was passed." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"gt_combinations": 1} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "gt_combinations must not be empty." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"gt_combinations": []} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "gt_combinations must be a list of tuples. At least one element is not a tuple." + with pytest.raises(TypeError, match=msg): + invalid_arguments = {"gt_combinations": [1]} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + msg = "gt_combinations must be a list of tuples with 3 elements. At least one tuple has not 3 elements." + with pytest.raises(ValueError, match=msg): + invalid_arguments = {"gt_combinations": [(1, 0)]} + _ = dml.did.DoubleMLDIDMulti(**(valid_arguments | invalid_arguments)) + + +@pytest.mark.ci +def test_exceptions_aggregate(): + dml_obj = dml.did.DoubleMLDIDMulti(**valid_arguments) + # test without fit() + msg = r"Apply fit\(\) before aggregate\(\)." + with pytest.raises(ValueError, match=msg): + dml_obj.aggregate() + + dml_obj.fit() + + # Test non-string input + msg = "aggregation must be a string or dictionary. 123 of type <class 'int'> was passed." + with pytest.raises(TypeError, match=msg): + dml_obj.aggregate(aggregation=123) + + # Test invalid string value + msg = "aggregation must be one of \\['group', 'time', 'eventstudy'\\]. invalid was passed."
+ with pytest.raises(ValueError, match=msg): + dml_obj.aggregate(aggregation="invalid") + + +@pytest.mark.ci +def test_check_external_predictions(): + # Create DID instance + model = dml.did.DoubleMLDIDMulti(**valid_arguments) + + # Test 1: Invalid type (not a dictionary) + invalid_pred = ["not a dict"] + with pytest.raises(TypeError, match="external_predictions must be a dictionary"): + model.fit(external_predictions=invalid_pred) + + # Test 2: Invalid keys in top-level dictionary + invalid_keys = {"invalid_key": {}} + with pytest.raises(ValueError, match="external_predictions must be a subset of all gt_combinations"): + model.fit(external_predictions=invalid_keys) + + # Test 3: Invalid type for nested prediction dictionary + invalid_nested = {model.gt_labels[0]: "not a dict"} + msg = r"external_predictions\[ATT\(1,0,1\)\] must be a dictionary\. Object of type <class 'str'> passed\." + with pytest.raises(TypeError, match=msg): + model.fit(external_predictions=invalid_nested) + + # Test 4: Invalid keys in nested prediction dictionary + invalid_learner = {model.gt_labels[0]: {"invalid_learner": None}} + with pytest.raises(ValueError, match="must be a subset of "): + model.fit(external_predictions=invalid_learner) + + # Test 5: Valid external predictions should not raise + valid_pred = {model.gt_labels[0]: {"ml_g0": None, "ml_g1": None, "ml_m": None}} + model._check_external_predictions(valid_pred) + + +@pytest.mark.ci +def test_exceptions_before_fit(): + """Test exception handling for confint() and p_adjust() methods when fit() hasn't been called.""" + dml_obj = dml.did.DoubleMLDIDMulti(**valid_arguments) + + msg = r"Apply fit\(\) before {}." + with pytest.raises(ValueError, match=msg.format("confint")): + dml_obj.confint() + + with pytest.raises(ValueError, match=msg.format("p_adjust")): + dml_obj.p_adjust() + + with pytest.raises(ValueError, match=msg.format("bootstrap")): + dml_obj.bootstrap() + + with pytest.raises(ValueError, match=msg.format("sensitivity_analysis")): + dml_obj.sensitivity_analysis() + + with pytest.raises(ValueError, match=msg.format("sensitivity_plot")): + dml_obj.sensitivity_plot() + + with pytest.raises(ValueError, match=msg.format("aggregate")): + dml_obj.aggregate() + + msg = r"Apply sensitivity_analysis\(\) before sensitivity_summary." + with pytest.raises(ValueError, match=msg): + _ = dml_obj.sensitivity_summary + + +@pytest.mark.ci +def test_exceptions_sensitivity_benchmark(): + """Test exception handling for sensitivity_benchmark() method.""" + dml_obj = dml.did.DoubleMLDIDMulti(**valid_arguments) + dml_obj.fit() + + # Test 1: sensitivity_elements is None + with patch.object(dml_obj.__class__, "sensitivity_elements", property(lambda self: None)): + msg = "Sensitivity analysis not yet implemented for" + with pytest.raises(NotImplementedError, match=msg): + dml_obj.sensitivity_benchmark(benchmarking_set=["Z1"]) + + # Test 2: benchmarking_set is not a list + invalid_types = [123, "string", {"dict": "value"}, (1, 2, 3)] + for invalid_type in invalid_types: + msg = "benchmarking_set must be a list." + with pytest.raises(TypeError, match=msg): + dml_obj.sensitivity_benchmark(benchmarking_set=invalid_type) + + # Test 3: benchmarking_set is an empty list + msg = "benchmarking_set must not be empty." + with pytest.raises(ValueError, match=msg): + dml_obj.sensitivity_benchmark(benchmarking_set=[]) + + # Test 4: benchmarking_set is not a subset of features + msg = ( + r"benchmarking_set must be a subset of features \['Z1', 'Z2', 'Z3', 'Z4'\]. 
\['Z5', 'NonExistentFeature'\] was passed." + ) + with pytest.raises(ValueError, match=msg): + dml_obj.sensitivity_benchmark(benchmarking_set=["Z5", "NonExistentFeature"]) + + # Test 5: fit_args is not None and not a dictionary + invalid_types = [123, "string", ["list"], (1, 2, 3)] + for invalid_type in invalid_types: + msg = "fit_args must be a dict." + with pytest.raises(TypeError, match=msg): + dml_obj.sensitivity_benchmark(benchmarking_set=["Z1"], fit_args=invalid_type) diff --git a/doubleml/did/tests/test_did_multi_external_predictions.py b/doubleml/did/tests/test_did_multi_external_predictions.py new file mode 100644 index 00000000..2e7003f9 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_external_predictions.py @@ -0,0 +1,102 @@ +import math + +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_m_ext(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def set_ml_g_ext(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_multi_ext_fixture(did_score, n_rep, set_ml_m_ext, set_ml_g_ext): + n_obs = 500 + n_folds = 5 + dgp = 1 + ml_g = LinearRegression() + ml_m = LogisticRegression(random_state=42) + + # collect data + df = make_did_CS2021(n_obs=n_obs, dgp_type=dgp, time_type="float") + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "obj_dml_data": dml_panel_data, + "gt_combinations": [(2, 0, 1)], + "score": did_score, + "n_rep": n_rep, + "n_folds": n_folds, + } + + np.random.seed(3141) + dml_obj = dml.did.DoubleMLDIDMulti( + ml_g=ml_g, + ml_m=ml_m, + **dml_args, + ) + np.random.seed(3141) + dml_obj.fit() + + ext_pred_dict = {gt_combination: {} for gt_combination in dml_obj.gt_labels} + if set_ml_m_ext and did_score == "observational": + for i_gt_combination, gt_label in enumerate(dml_obj.gt_labels): + ext_pred_dict[gt_label]["ml_m"] = dml_obj.modellist[i_gt_combination].predictions["ml_m"][:, :, 0] + ml_m_ext = DMLDummyClassifier() + else: + ml_m_ext = ml_m + + if set_ml_g_ext: + for i_gt_combination, gt_label in enumerate(dml_obj.gt_labels): + ext_pred_dict[gt_label]["ml_g0"] = dml_obj.modellist[i_gt_combination].predictions["ml_g0"][:, :, 0] + ext_pred_dict[gt_label]["ml_g1"] = dml_obj.modellist[i_gt_combination].predictions["ml_g1"][:, :, 0] + ml_g_ext = DMLDummyRegressor() + else: + ml_g_ext = ml_g + + np.random.seed(3141) + dml_obj_ext = dml.did.DoubleMLDIDMulti( + ml_g=ml_g_ext, + ml_m=ml_m_ext, + **dml_args, + ) + np.random.seed(3141) + dml_obj_ext.fit(external_predictions=ext_pred_dict) + + res_dict = { + "coef": dml_obj.coef[0], + "coef_ext": dml_obj_ext.coef[0], + "se": dml_obj.se[0], + "se_ext": dml_obj_ext.se[0], + } + + return res_dict + + +@pytest.mark.ci +def test_coef(doubleml_did_multi_ext_fixture): + assert math.isclose( + doubleml_did_multi_ext_fixture["coef"], doubleml_did_multi_ext_fixture["coef_ext"], rel_tol=1e-9, abs_tol=1e-3 + ) diff --git a/doubleml/did/tests/test_did_multi_placebo.py 
b/doubleml/did/tests/test_did_multi_placebo.py new file mode 100644 index 00000000..8f01d426 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_placebo.py @@ -0,0 +1,62 @@ +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDMulti +from doubleml.did.datasets import make_did_CS2021 + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_fixture(did_score, n_rep): + n_obs = 1000 + dgp = 5 # has to be experimental (for experimental score to be valid) + np.random.seed(42) + df = make_did_CS2021(n_obs=n_obs, dgp_type=dgp, n_pre_treat_periods=3, n_periods=5, time_type="float") + dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + # all placebo combinations + gt_combinations_group3 = [(3, 0, 1), (3, 0, 2), (3, 1, 2)] + gt_combinations_group4 = [(4, 0, 1), (4, 0, 2), (4, 0, 3), (4, 1, 2), (4, 1, 3), (4, 2, 3)] + gt_combinations = gt_combinations_group3 + gt_combinations_group4 + + kwargs = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "gt_combinations": gt_combinations, + "score": did_score, + "n_rep": n_rep, + "n_folds": 5, + "draw_sample_splitting": True, + } + + dml_did = DoubleMLDIDMulti(**kwargs) + + np.random.seed(3141) + dml_did.fit() + ci = dml_did.confint(level=0.95) + + res_dict = { + "coef": dml_did.coef[:], + "ci_lower": ci.iloc[:, 0], + "ci_upper": ci.iloc[:, 1], + } + + return res_dict + + +@pytest.mark.ci +def test_zero(doubleml_did_fixture): + assert all(doubleml_did_fixture["ci_lower"] <= 0.0) + assert all(doubleml_did_fixture["ci_upper"] >= 0.0) diff --git a/doubleml/did/tests/test_did_multi_plot.py b/doubleml/did/tests/test_did_multi_plot.py new file mode 100644 index 00000000..2eb15dcc --- /dev/null +++ b/doubleml/did/tests/test_did_multi_plot.py @@ -0,0 +1,175 @@ +import matplotlib.pyplot as plt +import numpy as np +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDMulti +from doubleml.did.datasets import make_did_CS2021 + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def did_score(request): + return request.param + + +@pytest.fixture(scope="module", params=[1, 3]) +def n_rep(request): + return request.param + + +@pytest.fixture(scope="module") +def doubleml_did_fixture(did_score, n_rep): + n_obs = 1000 + dgp = 5 # has to be experimental (for experimental score to be valid) + np.random.seed(42) + df = make_did_CS2021(n_obs=n_obs, dgp_type=dgp, n_pre_treat_periods=3, n_periods=5, time_type="float") + dml_data = DoubleMLPanelData(df, y_col="y", d_cols="d", t_col="t", id_col="id", x_cols=["Z1", "Z2", "Z3", "Z4"]) + + kwargs = { + "obj_dml_data": dml_data, + "ml_g": LinearRegression(), + "ml_m": LogisticRegression(), + "gt_combinations": "all", + "score": did_score, + "n_rep": n_rep, + "n_folds": 2, + "draw_sample_splitting": True, + } + + dml_did = DoubleMLDIDMulti(**kwargs) + + np.random.seed(3141) + dml_did.fit() + + res_dict = { + "model": dml_did, + } + return res_dict + + +@pytest.mark.ci +def test_plot_bootstrap_warnings(doubleml_did_fixture): + msg = "Joint confidence
intervals require bootstrapping" + with pytest.warns(UserWarning, match=msg): + _ = doubleml_did_fixture["model"].plot_effects() + + +@pytest.mark.ci +def test_plot_effects_default(doubleml_did_fixture): + dml_obj = doubleml_did_fixture["model"] + fig, axes = dml_obj.plot_effects() + + assert isinstance(fig, plt.Figure) + assert isinstance(axes, list) + assert all(isinstance(ax, plt.Axes) for ax in axes) + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_confidence_level(doubleml_did_fixture): + """Test plot_effects with different confidence levels.""" + dml_obj = doubleml_did_fixture["model"] + + # Test with 90% confidence level + fig, _ = dml_obj.plot_effects(level=0.9) + assert isinstance(fig, plt.Figure) + + # assert figure is not equal to default value + fig_default, _ = dml_obj.plot_effects() + assert fig_default != fig + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_joint_ci(doubleml_did_fixture): + """Test plot_effects with different joint confidence interval settings.""" + dml_obj = doubleml_did_fixture["model"] + + # Test with joint=False + fig, _ = dml_obj.plot_effects(joint=False) + assert isinstance(fig, plt.Figure) + + # assert figure is not equal to default value + fig_default, _ = dml_obj.plot_effects() + assert fig_default != fig + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_figure_size(doubleml_did_fixture): + """Test plot_effects with custom figure size.""" + dml_obj = doubleml_did_fixture["model"] + + custom_figsize = (10, 5) + fig, _ = dml_obj.plot_effects(figsize=custom_figsize) + assert isinstance(fig, plt.Figure) + + # Check if figure size matches the specified size + width, height = fig.get_size_inches() + assert (width, height) == custom_figsize + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_color_palette(doubleml_did_fixture): + """Test plot_effects with different color palettes.""" + dml_obj = doubleml_did_fixture["model"] + + # Test with a different seaborn palette + fig, _ = dml_obj.plot_effects(color_palette="Set1") + assert isinstance(fig, plt.Figure) + + # Test with a custom color list + custom_colors = [(1, 0, 0), (0, 1, 0)] # Red and green + fig, _ = dml_obj.plot_effects(color_palette=custom_colors) + assert isinstance(fig, plt.Figure) + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_labels_and_title(doubleml_did_fixture): + """Test plot_effects with custom labels and title.""" + dml_obj = doubleml_did_fixture["model"] + + custom_title = "Custom Title for Test" + custom_ylabel = "Custom Y Label" + + fig, axes = dml_obj.plot_effects(title=custom_title, y_label=custom_ylabel) + assert isinstance(fig, plt.Figure) + + # Check if title is set correctly (title is on the figure level) + assert fig._suptitle.get_text() == custom_title + + # Check if y_label is set correctly (at least on the first axis) + assert axes[0].get_ylabel() == custom_ylabel + + plt.close("all") + + +@pytest.mark.ci +def test_plot_effects_jitter(doubleml_did_fixture): + """Test plot_effects with custom jitter settings.""" + dml_obj = doubleml_did_fixture["model"] + + # Test with custom jitter value + fig, _ = dml_obj.plot_effects(jitter_value=0.2) + assert isinstance(fig, plt.Figure) + + # assert figure is not equal to default value + fig_default, _ = dml_obj.plot_effects() + assert fig_default != fig + + # Test with custom default_jitter + fig, _ = dml_obj.plot_effects(default_jitter=0.05) + assert isinstance(fig, plt.Figure) + + # assert figure is not equal to default value + fig_default, _ = 
dml_obj.plot_effects() + assert fig_default != fig + + plt.close("all") diff --git a/doubleml/did/tests/test_did_multi_return_types.py b/doubleml/did/tests/test_did_multi_return_types.py new file mode 100644 index 00000000..2e12ce10 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_return_types.py @@ -0,0 +1,193 @@ +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import plotly +import pytest +from matplotlib.axes import Axes +from matplotlib.figure import Figure +from sklearn.linear_model import Lasso, LogisticRegression + +from doubleml.data import DoubleMLPanelData +from doubleml.did import DoubleMLDIDAggregation, DoubleMLDIDMulti +from doubleml.did.datasets import make_did_CS2021 +from doubleml.double_ml_framework import DoubleMLFramework + +# Test constants +N_OBS = 200 +N_REP = 1 +N_FOLDS = 3 +N_REP_BOOT = 314 + +dml_args = { + "n_rep": N_REP, + "n_folds": N_FOLDS, + "gt_combinations": "standard", +} + + +# create all datasets +np.random.seed(3141) +datasets = {} + +# panel data +df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float") +df_panel["y_binary"] = np.random.binomial(n=1, p=0.5, size=df_panel.shape[0]) +datasets["did_panel"] = DoubleMLPanelData( + df_panel, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) +datasets["did_panel_binary_outcome"] = DoubleMLPanelData( + df_panel, y_col="y_binary", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) + + +dml_objs = [ + (DoubleMLDIDMulti(datasets["did_panel"], ml_g=Lasso(), ml_m=LogisticRegression(), **dml_args), DoubleMLDIDMulti), + ( + DoubleMLDIDMulti( + datasets["did_panel_binary_outcome"], ml_g=LogisticRegression(), ml_m=LogisticRegression(), **dml_args + ), + DoubleMLDIDMulti, + ), +] + + +@pytest.mark.ci +@pytest.mark.parametrize("dml_obj, cls", dml_objs) +def test_panel_return_types(dml_obj, cls): + assert isinstance(dml_obj.__str__(), str) + assert isinstance(dml_obj.summary, pd.DataFrame) + # assert isinstance(dml_obj.draw_sample_splitting(), cls) # not implemented + assert isinstance(dml_obj.fit(), cls) + assert isinstance(dml_obj.__str__(), str) # called again after fit, now with numbers + assert isinstance(dml_obj.summary, pd.DataFrame) # called again after fit, now with numbers + assert isinstance(dml_obj.bootstrap(), cls) + + assert isinstance(dml_obj.confint(), pd.DataFrame) + assert isinstance(dml_obj.p_adjust(), pd.DataFrame) + + assert isinstance(dml_obj._dml_data.__str__(), str) + + # further return type tests + + +@pytest.fixture(params=dml_objs) +def fitted_dml_obj(request): + dml_obj, _ = request.param + dml_obj.fit() + dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) + return dml_obj + + +@pytest.mark.ci +def test_panel_property_types_and_shapes(fitted_dml_obj): + n_treat = len(fitted_dml_obj.gt_combinations) + dml_obj = fitted_dml_obj + + # check_basic_property_types_and_shapes + # check that the setting is still in line with the hard-coded values + assert dml_obj._dml_data.n_treat == 1 + assert dml_obj.n_gt_atts == n_treat + assert dml_obj.n_rep == N_REP + assert dml_obj.n_folds == N_FOLDS + assert dml_obj._dml_data.n_obs == N_OBS + assert dml_obj.n_rep_boot == N_REP_BOOT + + assert isinstance(dml_obj.all_coef, np.ndarray) + assert dml_obj.all_coef.shape == (n_treat, N_REP) + + assert isinstance(dml_obj.all_se, np.ndarray) + assert dml_obj.all_se.shape == (n_treat, N_REP) + + assert isinstance(dml_obj.boot_t_stat, np.ndarray) + assert dml_obj.boot_t_stat.shape == (N_REP_BOOT, 
n_treat, N_REP) + + assert isinstance(dml_obj.coef, np.ndarray) + assert dml_obj.coef.shape == (n_treat,) + + assert isinstance(dml_obj.se, np.ndarray) + assert dml_obj.se.shape == (n_treat,) + + assert isinstance(dml_obj.t_stat, np.ndarray) + assert dml_obj.t_stat.shape == (n_treat,) + + assert isinstance(dml_obj.framework.scaled_psi, np.ndarray) + assert dml_obj.framework.scaled_psi.shape == ( + N_OBS, + n_treat, + N_REP, + ) + + assert isinstance(dml_obj.framework, DoubleMLFramework) + assert isinstance(dml_obj.pval, np.ndarray) + assert dml_obj.pval.shape == (n_treat,) + + assert isinstance(dml_obj._dml_data.binary_treats, pd.Series) + assert len(dml_obj._dml_data.binary_treats) == 1 + + # check_basic_predictions_and_targets + expected_keys = ["ml_g0", "ml_g1", "ml_m"] + for key in expected_keys: + assert isinstance(dml_obj.nuisance_loss[key], np.ndarray) + assert dml_obj.nuisance_loss[key].shape == (N_REP, n_treat) + + +@pytest.mark.ci +def test_panel_sensitivity_return_types(fitted_dml_obj): + n_treat = len(fitted_dml_obj.gt_combinations) + benchmarking_set = [fitted_dml_obj._dml_data.x_cols[0]] + dml_obj = fitted_dml_obj + + assert isinstance(dml_obj.sensitivity_elements, dict) + for key in ["sigma2", "nu2", "max_bias"]: + assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray) + assert dml_obj.sensitivity_elements[key].shape == (1, n_treat, N_REP) + for key in ["psi_max_bias"]: + assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray) + assert dml_obj.sensitivity_elements[key].shape == (N_OBS, n_treat, N_REP) + + assert isinstance(dml_obj.sensitivity_summary, str) + dml_obj.sensitivity_analysis() + assert isinstance(dml_obj.sensitivity_summary, str) + assert isinstance(dml_obj.sensitivity_plot(), plotly.graph_objs._figure.Figure) + benchmarks = {"cf_y": [0.1, 0.2], "cf_d": [0.15, 0.2], "name": ["test1", "test2"]} + assert isinstance(dml_obj.sensitivity_plot(value="ci", benchmarks=benchmarks), plotly.graph_objs._figure.Figure) + + assert isinstance(dml_obj.framework._calc_sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95), dict) + assert isinstance( + dml_obj.framework._calc_robustness_value(null_hypothesis=0.0, level=0.95, rho=1.0, idx_treatment=0), tuple + ) + benchmark = dml_obj.sensitivity_benchmark(benchmarking_set=benchmarking_set) + assert isinstance(benchmark, pd.DataFrame) + + +@pytest.mark.ci +def test_panel_plot_effects(fitted_dml_obj): + fig, axes = fitted_dml_obj.plot_effects() + assert isinstance(fig, Figure) + + # list of axes objects + assert isinstance(axes, list) + for ax in axes: + assert isinstance(ax, Axes) + + plt.close(fig) + + +@pytest.fixture(scope="module", params=["eventstudy", "group", "time"]) +def aggregation(request): + return request.param + + +@pytest.mark.ci +def test_panel_agg_return_types(fitted_dml_obj, aggregation): + agg_obj = fitted_dml_obj.aggregate(aggregation=aggregation) + agg_obj.aggregated_frameworks.bootstrap(n_rep_boot=10) + + assert isinstance(agg_obj, DoubleMLDIDAggregation) + assert isinstance(agg_obj.__str__(), str) + + # test plotting + fig, ax = agg_obj.plot_effects() + assert isinstance(fig, Figure) + assert isinstance(ax, Axes) + plt.close(fig) diff --git a/doubleml/did/tests/test_did_multi_vs_binary.py b/doubleml/did/tests/test_did_multi_vs_binary.py new file mode 100644 index 00000000..40b877b2 --- /dev/null +++ b/doubleml/did/tests/test_did_multi_vs_binary.py @@ -0,0 +1,206 @@ +import math + +import numpy as np +import pytest +from sklearn.ensemble import RandomForestClassifier, 
RandomForestRegressor +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did.datasets import make_did_CS2021 +from doubleml.utils import DMLDummyClassifier, DMLDummyRegressor + + +@pytest.fixture( + scope="module", + params=[ + [LinearRegression(), LogisticRegression(solver="lbfgs", max_iter=250)], + [ + RandomForestRegressor(max_depth=5, n_estimators=10, random_state=42), + RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42), + ], + ], +) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["observational", "experimental"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def in_sample_normalization(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.1]) +def trimming_threshold(request): + return request.param + + +@pytest.fixture(scope="module", params=["datetime", "float"]) +def time_type(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_did_binary_vs_did_multi_fixture(time_type, learner, score, in_sample_normalization, trimming_threshold): + n_obs = 500 + dpg = 1 + boot_methods = ["normal"] + n_rep_boot = 50000 + + # collect data + df = make_did_CS2021(n_obs=n_obs, dgp_type=dpg, time_type=time_type) + dml_panel_data = dml.data.DoubleMLPanelData( + df, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] + ) + + dml_args = { + "n_folds": 3, + "score": score, + "in_sample_normalization": in_sample_normalization, + "trimming_threshold": trimming_threshold, + "draw_sample_splitting": True, + } + gt_combination = [(dml_panel_data.g_values[0], dml_panel_data.t_values[0], dml_panel_data.t_values[1])] + dml_did_multi_obj = dml.did.DoubleMLDIDMulti( + dml_panel_data, + ml_g=learner[0], + ml_m=learner[1], + gt_combinations=gt_combination, + **dml_args, + ) + dml_did_multi_obj.fit() + + treatment_col = dml_panel_data.d_cols[0] + ext_pred_dict = {treatment_col: {}} + ext_pred_dict[treatment_col]["ml_g0"] = dml_did_multi_obj.modellist[0].predictions["ml_g0"][:, :, 0] + ext_pred_dict[treatment_col]["ml_g1"] = dml_did_multi_obj.modellist[0].predictions["ml_g1"][:, :, 0] + if score == "observational": + ext_pred_dict[treatment_col]["ml_m"] = dml_did_multi_obj.modellist[0].predictions["ml_m"][:, :, 0] + + dml_did_binary_obj = dml.did.DoubleMLDIDBinary( + dml_panel_data, + g_value=gt_combination[0][0], + t_value_pre=gt_combination[0][1], + t_value_eval=gt_combination[0][2], + ml_g=DMLDummyRegressor(), + ml_m=DMLDummyClassifier(), + **dml_args, + ) + dml_did_binary_obj.fit(external_predictions=ext_pred_dict) + + res_dict = { + "coef_multi": dml_did_multi_obj.coef, + "coef_binary": dml_did_binary_obj.coef, + "se_multi": dml_did_multi_obj.se, + "se_binary": dml_did_binary_obj.se, + "boot_methods": boot_methods, + "nuisance_loss_multi": dml_did_multi_obj.nuisance_loss, + "nuisance_loss_binary": dml_did_binary_obj.nuisance_loss, + } + + for bootstrap in boot_methods: + np.random.seed(3141) + dml_did_multi_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_did_binary_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + + # approximately same ci (bootstrap not identical due to size of score) + res_dict["boot_ci" + bootstrap + "_multi"] = dml_did_multi_obj.confint(joint=True) + res_dict["boot_ci" + bootstrap + "_binary"] = dml_did_binary_obj.confint(joint=True) + + # sensitivity tests + res_dict["sensitivity_elements_multi"] 
= dml_did_multi_obj.sensitivity_elements + res_dict["sensitivity_elements_binary"] = dml_did_binary_obj.framework.sensitivity_elements + + dml_did_multi_obj.sensitivity_analysis() + dml_did_binary_obj.sensitivity_analysis() + + res_dict["sensitivity_params_multi"] = dml_did_multi_obj.sensitivity_params + res_dict["sensitivity_params_binary"] = dml_did_binary_obj.sensitivity_params + + return res_dict + + +@pytest.mark.ci +def test_coefs(dml_did_binary_vs_did_multi_fixture): + assert math.isclose( + dml_did_binary_vs_did_multi_fixture["coef_binary"][0], + dml_did_binary_vs_did_multi_fixture["coef_multi"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_se(dml_did_binary_vs_did_multi_fixture): + assert math.isclose( + dml_did_binary_vs_did_multi_fixture["se_binary"][0], + dml_did_binary_vs_did_multi_fixture["se_multi"][0], + rel_tol=1e-9, + abs_tol=1e-4, + ) + + +@pytest.mark.ci +def test_boot(dml_did_binary_vs_did_multi_fixture): + for bootstrap in dml_did_binary_vs_did_multi_fixture["boot_methods"]: + assert np.allclose( + dml_did_binary_vs_did_multi_fixture["boot_ci" + bootstrap + "_multi"].values, + dml_did_binary_vs_did_multi_fixture["boot_ci" + bootstrap + "_binary"].values, + atol=1e-2, + ) + + +@pytest.mark.ci +def test_nuisance_loss(dml_did_binary_vs_did_multi_fixture): + assert ( + dml_did_binary_vs_did_multi_fixture["nuisance_loss_multi"].keys() + == dml_did_binary_vs_did_multi_fixture["nuisance_loss_binary"].keys() + ) + for key, value in dml_did_binary_vs_did_multi_fixture["nuisance_loss_multi"].items(): + assert np.allclose(value, dml_did_binary_vs_did_multi_fixture["nuisance_loss_binary"][key], rtol=1e-9, atol=1e-3) + + +@pytest.mark.ci +def test_sensitivity_elements(dml_did_binary_vs_did_multi_fixture): + elements_multi = dml_did_binary_vs_did_multi_fixture["sensitivity_elements_multi"] + elements_binary = dml_did_binary_vs_did_multi_fixture["sensitivity_elements_binary"] + sensitivity_element_names = ["max_bias", "psi_max_bias", "sigma2", "nu2"] + for sensitivity_element in sensitivity_element_names: + assert np.allclose( + elements_multi[sensitivity_element], + elements_binary[sensitivity_element], + rtol=1e-9, + atol=1e-4, + ) + + +@pytest.mark.ci +def test_sensitivity_params(dml_did_binary_vs_did_multi_fixture): + multi_params = dml_did_binary_vs_did_multi_fixture["sensitivity_params_multi"] + binary_params = dml_did_binary_vs_did_multi_fixture["sensitivity_params_binary"] + for key in ["theta", "se", "ci"]: + assert np.allclose( + multi_params[key]["lower"], + binary_params[key]["lower"], + rtol=1e-9, + atol=1e-4, + ) + assert np.allclose( + multi_params[key]["upper"], + binary_params[key]["upper"], + rtol=1e-9, + atol=1e-4, + ) + + for key in ["rv", "rva"]: + assert np.allclose( + multi_params[key], + binary_params[key], + rtol=1e-9, + atol=1e-4, + ) diff --git a/doubleml/did/tests/test_model_defaults.py b/doubleml/did/tests/test_model_defaults.py new file mode 100644 index 00000000..f8c59e70 --- /dev/null +++ b/doubleml/did/tests/test_model_defaults.py @@ -0,0 +1,81 @@ +import pytest +from sklearn.linear_model import LinearRegression, LogisticRegression + +import doubleml as dml +from doubleml.did import DoubleMLDIDBinary, DoubleMLDIDMulti +from doubleml.utils._check_defaults import _check_basic_defaults_after_fit, _check_basic_defaults_before_fit, _fit_bootstrap + +df_panel = dml.did.datasets.make_did_CS2021(n_obs=500, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float") +dml_panel_data = dml.data.DoubleMLPanelData( + df_panel, 
y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) + +dml_did_multi_obj = DoubleMLDIDMulti(dml_panel_data, LinearRegression(), LogisticRegression(), [(2, 0, 1)]) +dml_did_binary_obj = DoubleMLDIDBinary( + dml_panel_data, g_value=2, t_value_pre=0, t_value_eval=1, ml_g=LinearRegression(), ml_m=LogisticRegression() +) + + +@pytest.mark.ci +def test_did_binary_defaults(): + _check_basic_defaults_before_fit(dml_did_binary_obj) + + # specific parameters + assert dml_did_binary_obj.control_group == "never_treated" + assert dml_did_binary_obj.anticipation_periods == 0 + + _fit_bootstrap(dml_did_binary_obj) + _check_basic_defaults_after_fit(dml_did_binary_obj) + + +@pytest.mark.ci +def test_did_multi_defaults(): + _check_basic_defaults_before_fit(dml_did_multi_obj) + + # coefs and se + assert dml_did_multi_obj.coef is None + assert dml_did_multi_obj.se is None + assert dml_did_multi_obj.all_coef is None + assert dml_did_multi_obj.all_se is None + assert dml_did_multi_obj.t_stat is None + assert dml_did_multi_obj.pval is None + + # specific parameters + assert dml_did_binary_obj.control_group == "never_treated" + assert dml_did_binary_obj.anticipation_periods == 0 + + _fit_bootstrap(dml_did_multi_obj) + _check_basic_defaults_after_fit(dml_did_multi_obj) + + +@pytest.mark.ci +def test_did_multi_str(): + # Test the string representation before fitting + dml_str = str(dml_did_multi_obj) + + # Check that all important sections are present + assert "================== DoubleMLDIDMulti Object ==================" in dml_str + assert "------------------ Data summary ------------------" in dml_str + assert "------------------ Score & algorithm ------------------" in dml_str + assert "------------------ Machine learner ------------------" in dml_str + assert "------------------ Resampling ------------------" in dml_str + assert "------------------ Fit summary ------------------" in dml_str + + # Check specific content before fitting + assert "Score function: observational" in dml_str + assert "No. folds: 5" in dml_str + assert "No. 
repeated sample splits: 1" in dml_str + assert "Learner ml_g:" in dml_str + assert "Learner ml_m:" in dml_str + + # Fit the model + dml_did_multi_obj_fit = dml_did_multi_obj.fit() + dml_str_after_fit = str(dml_did_multi_obj_fit) + + # Check that additional information is present after fitting + assert "ATT(2,0,1)" in dml_str_after_fit + assert "coef" in dml_str_after_fit + assert "std err" in dml_str_after_fit + assert "t" in dml_str_after_fit + assert "P>|t|" in dml_str_after_fit + assert "Out-of-sample Performance:" in dml_str_after_fit diff --git a/doubleml/did/tests/test_return_types.py b/doubleml/did/tests/test_return_types.py new file mode 100644 index 00000000..a59cec6c --- /dev/null +++ b/doubleml/did/tests/test_return_types.py @@ -0,0 +1,171 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.linear_model import Lasso, LogisticRegression + +from doubleml.data import DoubleMLData, DoubleMLPanelData +from doubleml.did import DoubleMLDID, DoubleMLDIDBinary, DoubleMLDIDCS +from doubleml.did.datasets import make_did_CS2021, make_did_SZ2020 +from doubleml.utils._check_return_types import ( + check_basic_predictions_and_targets, + check_basic_property_types_and_shapes, + check_basic_return_types, + check_sensitivity_return_types, +) + +# Test constants +N_OBS = 200 +N_TREAT = 1 +N_REP = 1 +N_FOLDS = 3 +N_REP_BOOT = 314 + +dml_args = { + "n_rep": N_REP, + "n_folds": N_FOLDS, +} + + +# create all datasets +np.random.seed(3141) +datasets = {} + +datasets["did"] = make_did_SZ2020(n_obs=N_OBS) +datasets["did_cs"] = make_did_SZ2020(n_obs=N_OBS, cross_sectional_data=True) + +# Binary outcome +(x, y, d, t) = make_did_SZ2020(n_obs=N_OBS, cross_sectional_data=True, return_type="array") +binary_outcome = np.random.binomial(n=1, p=0.5, size=N_OBS) + +datasets["did_binary_outcome"] = DoubleMLData.from_arrays(x, binary_outcome, d) +datasets["did_cs_binary_outcome"] = DoubleMLData.from_arrays(x, binary_outcome, d, t=t) + +dml_objs = [ + (DoubleMLDID(datasets["did"], Lasso(), LogisticRegression(), **dml_args), DoubleMLDID), + (DoubleMLDID(datasets["did_binary_outcome"], LogisticRegression(), LogisticRegression(), **dml_args), DoubleMLDID), + (DoubleMLDIDCS(datasets["did_cs"], Lasso(), LogisticRegression(), **dml_args), DoubleMLDIDCS), + (DoubleMLDIDCS(datasets["did_cs_binary_outcome"], LogisticRegression(), LogisticRegression(), **dml_args), DoubleMLDIDCS), +] + + +@pytest.mark.ci +@pytest.mark.parametrize("dml_obj, cls", dml_objs) +def test_return_types(dml_obj, cls): + check_basic_return_types(dml_obj, cls) + + # further return type tests + assert isinstance(dml_obj.get_params("ml_m"), dict) + + +@pytest.fixture(params=dml_objs) +def fitted_dml_obj(request): + dml_obj, _ = request.param + dml_obj.fit() + dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) + return dml_obj + + +@pytest.mark.ci +def test_property_types_and_shapes(fitted_dml_obj): + check_basic_property_types_and_shapes(fitted_dml_obj, N_OBS, N_TREAT, N_REP, N_FOLDS, N_REP_BOOT) + check_basic_predictions_and_targets(fitted_dml_obj, N_OBS, N_TREAT, N_REP) + + +@pytest.mark.ci +def test_sensitivity_return_types(fitted_dml_obj): + if fitted_dml_obj._sensitivity_implemented: + benchmarking_set = [fitted_dml_obj._dml_data.x_cols[0]] + check_sensitivity_return_types(fitted_dml_obj, N_OBS, N_REP, N_TREAT, benchmarking_set=benchmarking_set) + + +# panel data +df_panel = make_did_CS2021(n_obs=N_OBS, dgp_type=1, n_pre_treat_periods=2, n_periods=5, time_type="float") +df_panel["y_binary"] = np.random.binomial(n=1, p=0.5, 
size=df_panel.shape[0]) +datasets["did_panel"] = DoubleMLPanelData( + df_panel, y_col="y", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) +datasets["did_panel_binary_outcome"] = DoubleMLPanelData( + df_panel, y_col="y_binary", d_cols="d", id_col="id", t_col="t", x_cols=["Z1", "Z2", "Z3", "Z4"] +) + +dml_panel_binary_args = dml_args | { + "g_value": 2, + "t_value_pre": 0, + "t_value_eval": 1, +} + +dml_objs_panel = [ + ( + DoubleMLDIDBinary(datasets["did_panel"], ml_g=Lasso(), ml_m=LogisticRegression(), **dml_panel_binary_args), + DoubleMLDIDBinary, + ), + ( + DoubleMLDIDBinary( + datasets["did_panel_binary_outcome"], ml_g=LogisticRegression(), ml_m=LogisticRegression(), **dml_panel_binary_args + ), + DoubleMLDIDBinary, + ), +] + + +@pytest.mark.ci +@pytest.mark.parametrize("dml_obj, cls", dml_objs_panel) +def test_panel_return_types(dml_obj, cls): + check_basic_return_types(dml_obj, cls) + + # further return type tests + assert isinstance(dml_obj.get_params("ml_m"), dict) + + assert isinstance(dml_obj.g_value, (int, np.integer)) + assert isinstance(dml_obj.t_value_eval, (int, np.integer, float, np.floating)) + assert isinstance(dml_obj.t_value_pre, (int, np.integer, float, np.floating)) + assert isinstance(dml_obj.post_treatment, bool) + + # Test panel_data_wide property + assert isinstance(dml_obj.panel_data_wide, pd.DataFrame) + assert dml_obj.panel_data_wide.shape[0] <= N_OBS + assert "G_indicator" in dml_obj.panel_data_wide.columns + assert "C_indicator" in dml_obj.panel_data_wide.columns + assert "y_diff" in dml_obj.panel_data_wide.columns + + # Test id_positions property + assert isinstance(dml_obj.id_positions, np.ndarray) + assert dml_obj.id_positions.ndim == 1 + + # propensity score properties + assert isinstance(dml_obj.in_sample_normalization, bool) + assert isinstance(dml_obj.trimming_rule, str) + assert dml_obj.trimming_rule in ["truncate"] + assert isinstance(dml_obj.trimming_threshold, (float, np.floating)) + assert 0 <= dml_obj.trimming_threshold <= 0.5 + + # Test n_obs property + assert isinstance(dml_obj.n_obs, (int, np.integer)) + assert dml_obj.n_obs <= N_OBS + + # Test consistency between properties + if dml_obj.post_treatment: + assert dml_obj.g_value <= dml_obj.t_value_eval + else: + assert dml_obj.g_value > dml_obj.t_value_eval + + +@pytest.fixture(params=dml_objs_panel) +def fitted_panel_dml_obj(request): + dml_obj, _ = request.param + dml_obj.fit() + dml_obj.bootstrap(n_rep_boot=N_REP_BOOT) + return dml_obj + + +@pytest.mark.ci +def test_panel_property_types_and_shapes(fitted_panel_dml_obj): + check_basic_property_types_and_shapes(fitted_panel_dml_obj, N_OBS, N_TREAT, N_REP, N_FOLDS, N_REP_BOOT) + check_basic_predictions_and_targets(fitted_panel_dml_obj, N_OBS, N_TREAT, N_REP) + + +@pytest.mark.ci +def test_panel_sensitivity_return_types(fitted_panel_dml_obj): + if fitted_panel_dml_obj._sensitivity_implemented: + benchmarking_set = [fitted_panel_dml_obj._dml_data.x_cols[0]] + check_sensitivity_return_types(fitted_panel_dml_obj, N_OBS, N_REP, N_TREAT, benchmarking_set=benchmarking_set) diff --git a/doubleml/did/utils/_aggregation.py b/doubleml/did/utils/_aggregation.py new file mode 100644 index 00000000..e0cd5b1a --- /dev/null +++ b/doubleml/did/utils/_aggregation.py @@ -0,0 +1,231 @@ +import numpy as np + + +def _check_did_aggregation_dict(aggregation_dict, gt_index): + if not isinstance(aggregation_dict, dict): + raise ValueError("aggregation must be a dictionary") + + # Validate and extract custom parameters + required_keys = 
{"weight_masks"} + if not all(key in aggregation_dict for key in required_keys): + raise ValueError(f"aggregation must contain all required keys: {required_keys}") + + # Check if weight_masks is a masked numpy array + weight_masks = aggregation_dict["weight_masks"] + if not isinstance(weight_masks, np.ma.MaskedArray): + raise ValueError("weight_masks must be a numpy masked array") + + # check if weight_masks has 4 dim + if weight_masks.ndim != 4: + raise ValueError("weight_masks must have 4 dimensions") + + # Check if weight_masks has the same first three dimensions as gt_index + if weight_masks.shape[:-1] != gt_index.shape: + raise ValueError( + f"weight_masks must have shape {gt_index.shape} + (n,) where n is the number of aggregations. " + f"Got shape {weight_masks.shape}" + ) + + n_aggregations = weight_masks.shape[-1] + # check if every weight_mask along last axis has the same mask as gt_index + for i in range(n_aggregations): + if not np.array_equal(weight_masks[..., i].mask, gt_index.mask): + raise ValueError("weight_masks must have the same mask as gt_index") + + return aggregation_dict + + +def _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask): + """ + Calculate weights for aggregating treatment effects by group. + + Parameters + ---------- + gt_index : numpy.ma.MaskedArray + Masked array containing group-time indices + g_values : array-like + Array of unique group values + d_values : array-like + Array of treatment values + selected_gt_mask : numpy.ndarray + Boolean mask indicating which group-time combinations to include + + Returns + ------- + dict + Dictionary containing: + - weight_masks: numpy.ma.MaskedArray with weights for each group + - agg_names: list of group names + - agg_weights: numpy.ndarray of aggregation weights + """ + selected_gt_indicies = np.where(selected_gt_mask) + selected_unique_g_indices = np.unique(selected_gt_indicies[0]) + n_agg_effects = len(selected_unique_g_indices) + + if n_agg_effects == 0: + raise ValueError("No valid groups found for aggregation.") + + agg_names = [None] * n_agg_effects + agg_weights = [np.nan] * n_agg_effects + + # Create a weight mask (0 weights) for each of the groups + weight_masks = np.ma.masked_array( + data=np.zeros((*gt_index.shape, n_agg_effects)), + mask=np.broadcast_to(gt_index.mask[..., np.newaxis], (*gt_index.shape, n_agg_effects)), + dtype=np.float64, + ) + + # Write weight masks + for idx_agg, g_idx in enumerate(selected_unique_g_indices): + # Set group name & weights + current_group = g_values[g_idx] + agg_names[idx_agg] = str(current_group) + agg_weights[idx_agg] = (d_values == current_group).mean() + + # Group weights_masks + group_gt_indicies = [(i, j, k) for i, j, k in zip(*selected_gt_indicies) if i == g_idx] + + weight = 1 / len(group_gt_indicies) + for i, j, k in group_gt_indicies: + weight_masks.data[i, j, k, idx_agg] = weight + + # Normalize weights + agg_weights = np.array(agg_weights) / sum(agg_weights) + + return {"weight_masks": weight_masks, "agg_names": agg_names, "agg_weights": agg_weights} + + +def _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask): + """ + Calculate weights for aggregating treatment effects over time periods. 
+
+    Parameters
+    ----------
+    gt_index : numpy.ma.MaskedArray
+        Masked array containing group-time indices
+    g_values : array-like
+        Array of unique group values
+    t_values : array-like
+        Array of unique time period values
+    d_values : array-like
+        Array of treatment values (g_values for each id)
+    selected_gt_mask : numpy.ndarray
+        Boolean mask indicating which group-time combinations to include
+
+    Returns
+    -------
+    dict
+        Dictionary containing:
+        - weight_masks: numpy.ma.MaskedArray with weights for each time period
+        - agg_names: list of time period names
+        - agg_weights: numpy.ndarray of aggregation weights
+    """
+    selected_gt_indicies = np.where(selected_gt_mask)
+    selected_unique_t_eval_indices = np.unique(selected_gt_indicies[2])
+    n_agg_effects = len(selected_unique_t_eval_indices)
+
+    if n_agg_effects == 0:
+        raise ValueError("No time periods found for aggregation.")
+
+    agg_names = [None] * n_agg_effects
+    # equal weight due to balanced panel
+    agg_weights = np.ones(n_agg_effects) / n_agg_effects
+
+    # Create a weight mask (0 weights) for each of the time periods
+    weight_masks = np.ma.masked_array(
+        data=np.zeros((*gt_index.shape, n_agg_effects)),
+        mask=np.broadcast_to(gt_index.mask[..., np.newaxis], (*gt_index.shape, n_agg_effects)),
+        dtype=np.float64,
+    )
+
+    group_weights = np.zeros(len(g_values))
+    selected_unique_g_indices = np.unique(selected_gt_indicies[0])
+    for g_idx in selected_unique_g_indices:
+        group_weights[g_idx] = (d_values == g_values[g_idx]).mean()  # (requires balanced panel)
+
+    # Write weight masks
+    for idx_agg, t_eval_idx in enumerate(selected_unique_t_eval_indices):
+        # Set time period name
+        current_time_period = t_values[t_eval_idx]
+        agg_names[idx_agg] = str(current_time_period)
+
+        # time weight masks
+        time_gt_indicies = [(i, j, k) for i, j, k in zip(*selected_gt_indicies) if k == t_eval_idx]
+
+        for i, j, k in time_gt_indicies:
+            weight_masks.data[i, j, k, idx_agg] = group_weights[i]
+
+        # normalize weights
+        weight_masks.data[..., idx_agg] = weight_masks.data[..., idx_agg] / np.sum(weight_masks.data[..., idx_agg])
+
+    return {"weight_masks": weight_masks, "agg_names": agg_names, "agg_weights": agg_weights}
+
+
+def _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask):
+    """
+    Calculate weights for aggregating treatment effects by event time, i.e. periods relative to treatment.
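+
+    Event time is defined relative to first treatment, e = t_eval - g. As a worked
+    example (constructed here for illustration): a group first treated in period
+    g = 2 that is evaluated at t_values = (1, 2, 3) contributes the event times
+    e = -1, 0, 1. Within each event time, the selected (g, t_pre, t_eval) cells are
+    weighted by the group shares (d_values == g).mean() and normalized; the overall
+    aggregation weights put uniform mass on the non-negative event times (e >= 0) only.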
+
+    Parameters
+    ----------
+    gt_index : numpy.ma.MaskedArray
+        Masked array containing group-time indices
+    g_values : array-like
+        Array of unique group values
+    t_values : array-like
+        Array of unique evaluation time values
+    d_values : array-like
+        Array of treatment values (g_values for each id)
+    time_values : array-like
+        Array of evaluation time values (t_values for each id)
+    selected_gt_mask : numpy.ndarray
+        Boolean mask indicating which group-time combinations to include
+
+    Returns
+    -------
+    dict
+        Dictionary containing:
+        - weight_masks: numpy.ma.MaskedArray with weights for each event time
+        - agg_names: list of event time names
+        - agg_weights: numpy.ndarray of aggregation weights
+    """
+    selected_gt_indicies = np.where(selected_gt_mask)
+    eventtime = time_values - d_values
+    e_values = np.unique(eventtime)
+    selected_unique_e_values = np.unique([t_values[k] - g_values[i] for i, _, k in zip(*selected_gt_indicies)])
+    assert np.all(np.isin(selected_unique_e_values, e_values))
+    n_agg_effects = len(selected_unique_e_values)
+
+    if n_agg_effects == 0:
+        raise ValueError("No time periods found for aggregation.")
+
+    agg_names = [None] * n_agg_effects
+    agg_weights = np.zeros(n_agg_effects)
+    agg_weights[selected_unique_e_values >= 0] = 1 / np.sum(selected_unique_e_values >= 0)
+
+    # Create a weight mask (0 weights) for each of the event times
+    weight_masks = np.ma.masked_array(
+        data=np.zeros((*gt_index.shape, n_agg_effects)),
+        mask=np.broadcast_to(gt_index.mask[..., np.newaxis], (*gt_index.shape, n_agg_effects)),
+        dtype=np.float64,
+    )
+
+    group_weights = np.zeros(len(g_values))
+    selected_unique_g_indices = np.unique(selected_gt_indicies[0])
+    for g_idx in selected_unique_g_indices:
+        group_weights[g_idx] = (d_values == g_values[g_idx]).mean()  # (requires balanced panel)
+
+    # Write weight masks
+    for idx_agg, e_val in enumerate(selected_unique_e_values):
+        # Set event time name
+        agg_names[idx_agg] = str(e_val)
+
+        # event time weight masks
+        eventtime_gt_indicies = [(i, j, k) for i, j, k in zip(*selected_gt_indicies) if t_values[k] - g_values[i] == e_val]
+
+        for i, j, k in eventtime_gt_indicies:
+            weight_masks.data[i, j, k, idx_agg] = group_weights[i]
+
+        # normalize weights
+        weight_masks.data[..., idx_agg] = weight_masks.data[..., idx_agg] / np.sum(weight_masks.data[..., idx_agg])
+
+    return {"weight_masks": weight_masks, "agg_names": agg_names, "agg_weights": agg_weights}
diff --git a/doubleml/did/utils/_did_utils.py b/doubleml/did/utils/_did_utils.py
new file mode 100644
index 00000000..bb69a1ef
--- /dev/null
+++ b/doubleml/did/utils/_did_utils.py
@@ -0,0 +1,246 @@
+import warnings
+from collections.abc import Iterable
+
+import numpy as np
+import pandas as pd
+
+expected_time_types = (int, float)
+
+
+def _convert_to_numpy_arrray(x, input_name, allow_nan=False):
+    if isinstance(x, np.ndarray):
+        if not x.ndim == 1:
+            raise ValueError(f"{input_name} must be a vector.
Number of dimensions is {x.ndim}.") + elif isinstance(x, (int, float)): + x = np.array([x]) + elif isinstance(x, Iterable): + if not all(isinstance(i, expected_time_types) for i in x): + raise TypeError(f"Invalid type for {input_name}: expected one of {expected_time_types}.") + x = np.array(x) + else: + raise TypeError(f"Invalid type for {input_name}.") + + if np.issubdtype(x.dtype, np.floating) and not allow_nan and (np.any(np.isnan(x)) or np.any(np.isinf(x))): + raise ValueError(f"{input_name} contains missing or infinite values.") + + if np.issubdtype(x.dtype, np.datetime64) and not allow_nan and np.any(np.isnat(x)): + raise ValueError(f"{input_name} contains missing values.") + + return x + + +def _get_never_treated_value(g_values): + never_treated_value = 0 + if np.issubdtype(g_values.dtype, np.floating): + never_treated_value = np.inf + elif np.issubdtype(g_values.dtype, np.datetime64): + never_treated_value = pd.NaT + return never_treated_value + + +def _is_never_treated(x, never_treated_value): + if not isinstance(x, np.ndarray): + x = np.array([x]) + + if never_treated_value is np.inf: + return np.isinf(x) + elif never_treated_value is pd.NaT: + return pd.isna(x) + else: + assert never_treated_value == 0 + return x == 0 + + +def _check_control_group(control_group): + valid_control_groups = ["never_treated", "not_yet_treated"] + if control_group not in valid_control_groups: + raise ValueError(f"The control group has to be one of {valid_control_groups}. " + f"{control_group} was passed.") + + return control_group + + +def _check_anticipation_periods(anticipation_periods): + if not isinstance(anticipation_periods, int): + raise TypeError("The anticipation periods must be an integer.") + if anticipation_periods < 0: + raise ValueError("The anticipation periods must be non-negative.") + + return anticipation_periods + + +def _check_gt_combination(gt_combination, g_values, t_values, never_treated_value, anticipation_periods): + g_value, t_value_pre, t_value_eval = gt_combination + if g_value not in g_values: + raise ValueError(f"The value {g_value} is not in the set of treatment group values {g_values}.") + if _is_never_treated(g_value, never_treated_value): + raise ValueError(f"The never treated group is not allowed as treatment group (g_value={never_treated_value}).") + if g_value not in t_values: + raise ValueError(f"The value {g_value} (group value) is not in the set of evaluation period values {t_values}.") + if t_value_pre not in t_values: + raise ValueError(f"The value {t_value_pre} is not in the set of evaluation period values {t_values}.") + if t_value_eval not in t_values: + raise ValueError(f"The value {t_value_eval} is not in the set of evaluation period values {t_values}.") + + if t_value_pre == t_value_eval: + raise ValueError(f"The pre-treatment and evaluation period must be different. Got {t_value_pre} for both.") + + if t_value_pre > t_value_eval: + raise ValueError( + "The pre-treatment period must be before the evaluation period. " + f"Got t_value_pre {t_value_pre} and t_value_eval {t_value_eval}." + ) + + # get t_value equal to g_value and adjust for anticipation periods + maximal_t_pre = t_values[max(np.where(t_values == g_value)[0] - anticipation_periods, 0)] + if t_value_pre >= maximal_t_pre: + warnings.warn( + "The treatment was assigned before the first pre-treatment period (including anticipation). " + f"Got t_value_pre {t_value_pre} and g_value {g_value} with {anticipation_periods} anticipation_periods." 
+        )
+
+
+def _check_gt_values(g_values, t_values):
+
+    g_values = _convert_to_numpy_arrray(g_values, "g_values", allow_nan=True)
+    t_values = _convert_to_numpy_arrray(t_values, "t_values", allow_nan=False)
+
+    expected_dtypes = (np.integer, np.floating, np.datetime64)
+    if not any(np.issubdtype(g_values.dtype, dt) for dt in expected_dtypes):
+        raise ValueError(f"Invalid data type for g_values: expected one of {expected_dtypes}.")
+    if not any(np.issubdtype(t_values.dtype, dt) for dt in expected_dtypes):
+        raise ValueError(f"Invalid data type for t_values: expected one of {expected_dtypes}.")
+
+    if np.issubdtype(g_values.dtype, np.datetime64) != np.issubdtype(t_values.dtype, np.datetime64):
+        raise ValueError(
+            "g_values and t_values must have the same data type. "
+            f"Got {g_values.dtype} for g_values and {t_values.dtype} for t_values."
+        )
+
+
+def _construct_gt_combinations(setting, g_values, t_values, never_treated_value, anticipation_periods):
+    """Construct treatment-time combinations for difference-in-differences analysis.
+
+    Parameters:
+        setting (str): Strategy for constructing combinations ('standard' or 'all')
+        g_values (array): Treatment group values, must be sorted
+        t_values (array): Time period values, must be sorted
+        never_treated_value: Value marking never treated units (e.g. 0, np.inf or pd.NaT)
+        anticipation_periods (int): Number of anticipation periods
+
+    Returns:
+        list: List of (g_val, t_pre, t_eval) tuples
+    """
+    valid_settings = ["standard", "all"]
+    if setting not in valid_settings:
+        raise ValueError(f"gt_combinations must be one of {valid_settings}. {setting} was passed.")
+
+    treatment_groups = g_values[~_is_never_treated(g_values, never_treated_value)]
+    if not np.all(np.diff(treatment_groups) > 0):
+        raise ValueError("g_values must be sorted in ascending order (Excluding never treated group).")
+    if not np.all(np.diff(t_values) > 0):
+        raise ValueError("t_values must be sorted in ascending order.")
+
+    gt_combinations = []
+    if setting == "standard":
+        for g_val in treatment_groups:
+            t_values_before_g = t_values[t_values < g_val]
+            if len(t_values_before_g) > anticipation_periods:
+                first_eval_index = anticipation_periods + 1  # first relevant evaluation period index
+                t_before_g = t_values_before_g[-first_eval_index]
+
+                # collect all evaluation periods
+                for i_t_eval, t_eval in enumerate(t_values[first_eval_index:]):
+                    t_previous = t_values[i_t_eval]  # refers to t-anticipation_periods-1
+                    t_pre = min(t_previous, t_before_g)  # if t_previous larger than g_val, use t_before_g
+                    gt_combinations.append((g_val, t_pre, t_eval))
+
+    if setting == "all":
+        for g_val in treatment_groups:
+            t_values_before_g = t_values[t_values < g_val]
+            if len(t_values_before_g) > anticipation_periods:
+                first_eval_index = anticipation_periods + 1  # first relevant evaluation period index
+                for t_eval in t_values[first_eval_index:]:
+                    # all t-values before g_val - anticipation_periods
+                    valid_t_pre_values = t_values[t_values <= min(g_val, t_eval)][:-first_eval_index]
+                    for t_pre in valid_t_pre_values:
+                        gt_combinations.append((g_val, t_pre, t_eval))
+
+    if len(gt_combinations) == 0:
+        raise ValueError(
+            "No valid group-time combinations found. "
+            "Please check the treatment group values and time period values (and anticipation)."
+        )
+
+    return gt_combinations
+
+
+def _construct_gt_index(gt_combinations, g_values, t_values):
+    """Construct a 3D array mapping group-time combinations to their indices.
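+
+    For illustration (a constructed example for this docstring, not taken from the
+    package tests; numpy is assumed to be imported as np)::
+
+        gt_index = _construct_gt_index([(2, 1, 2)], np.array([2]), np.array([1, 2]))
+        # gt_index.shape == (1, 2, 2); gt_index[0, 0, 1] == 0 (the position of the
+        # combination in gt_combinations) and is unmasked; every other entry is
+        # masked, with fill value -1.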
+
+    Parameters:
+        gt_combinations: List of tuples (g_val, t_pre, t_eval)
+        g_values: Array of group values
+        t_values: Array of time values
+
+    Returns:
+        3D numpy masked array where entry [i,j,k] contains the index of the combination
+        in gt_combinations if it exists, masked otherwise
+    """
+    gt_index = np.ma.masked_array(
+        data=np.full(shape=(len(g_values), len(t_values), len(t_values)), fill_value=-1, dtype=np.int64), mask=True
+    )
+    for i_gt_combination, (g_val, t_pre, t_eval) in enumerate(gt_combinations):
+        i_g = np.where(g_values == g_val)[0][0]
+        i_t_pre = np.where(t_values == t_pre)[0][0]
+        i_t_eval = np.where(t_values == t_eval)[0][0]
+        gt_index[i_g, i_t_pre, i_t_eval] = i_gt_combination
+        gt_index.mask[i_g, i_t_pre, i_t_eval] = False
+
+    return gt_index
+
+
+def _construct_post_treatment_mask(g_values, t_values):
+    """Constructs a mask indicating post-treatment periods for group-time combinations.
+
+    Creates a 3D boolean array where entry [i,j,k] is True if the evaluation time t_values[k]
+    is at or after the treatment time g_values[i], indicating a post-treatment period.
+
+    Parameters
+    ----------
+    g_values : numpy.ndarray
+        1D array of treatment group values (treatment times)
+    t_values : numpy.ndarray
+        1D array of time period values
+
+    Returns
+    -------
+    numpy.ndarray
+        3D boolean array of shape (len(g_values), len(t_values), len(t_values))
+        where True indicates post-treatment periods (t_eval >= g_val)
+
+    """
+    # Reshape arrays for broadcasting
+    g_vals = g_values[:, np.newaxis, np.newaxis]  # Shape: (G, 1, 1)
+    t_evals = t_values[np.newaxis, np.newaxis, :]  # Shape: (1, 1, T)
+    t_evals = np.broadcast_to(t_evals, (1, len(t_values), len(t_values)))  # Shape: (1, T, T)
+
+    # Broadcasting creates a mask of shape (G, T, T)
+    post_treatment_mask = t_evals >= g_vals
+    return post_treatment_mask
+
+
+def _set_id_positions(a, n_obs, id_positions, fill_value):
+    if a is not None:
+        new_a = np.full((n_obs, *a.shape[1:]), fill_value=fill_value)
+        new_a[id_positions] = a
+    else:
+        new_a = None
+
+    return new_a
+
+
+def _get_id_positions(a, id_positions):
+    if a is not None:
+        new_a = a[id_positions]
+    else:
+        new_a = None
+
+    return new_a
diff --git a/doubleml/did/utils/_plot.py b/doubleml/did/utils/_plot.py
new file mode 100644
index 00000000..9a3b3aab
--- /dev/null
+++ b/doubleml/did/utils/_plot.py
@@ -0,0 +1,45 @@
+import numpy as np
+import pandas as pd
+
+
+def add_jitter(data, x_col, is_datetime=None, jitter_value=None):
+    """
+    Adds jitter to duplicate x-values for better visibility.
+
+    Args:
+        data (DataFrame): The subset of the dataset to jitter.
+        x_col (str): Column name for x values.
+        is_datetime (bool): Whether the x-values are datetime objects. If None, will be detected.
+        jitter_value (float): Jitter amount; for datetime x-values the numeric value is
+            interpreted as seconds. Required whenever duplicate x-values are present.
+
+    Returns:
+        DataFrame with an additional 'jittered_x' column.
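+
+    Example (a minimal sketch; the frame below is made up for illustration)::
+
+        df = pd.DataFrame({"x": [1, 1, 2], "y": [10, 15, 20]})
+        out = add_jitter(df, "x", jitter_value=0.1)
+        # the two rows with x == 1 get jittered_x values 0.9 and 1.1,
+        # the unique x == 2 row keeps jittered_x == 2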
+ """ + if data.empty: + return data + + data = data.copy() + + # Auto-detect datetime if not specified + if is_datetime is None: + is_datetime = pd.api.types.is_datetime64_any_dtype(data[x_col]) + + # Initialize jittered_x with original values + data["jittered_x"] = data[x_col] + + for x_val in data[x_col].unique(): + mask = data[x_col] == x_val + count = mask.sum() + if count > 1: + # Create evenly spaced jitter values + if is_datetime: + jitters = [pd.Timedelta(seconds=float(j)) for j in np.linspace(-jitter_value, jitter_value, count)] + else: + jitters = np.linspace(-jitter_value, jitter_value, count) + + # Apply jitter to each duplicate point + data.loc[mask, "jitter_index"] = range(count) + for i, j in enumerate(jitters): + data.loc[mask & (data["jitter_index"] == i), "jittered_x"] = x_val + j + + return data diff --git a/doubleml/did/utils/tests/test_add_jitter.py b/doubleml/did/utils/tests/test_add_jitter.py new file mode 100644 index 00000000..c66cb8bd --- /dev/null +++ b/doubleml/did/utils/tests/test_add_jitter.py @@ -0,0 +1,130 @@ +from datetime import datetime, timedelta + +import pandas as pd +import pytest + +from doubleml.did.utils._plot import add_jitter + + +@pytest.fixture +def numeric_df_no_duplicates(): + """Create a DataFrame with numeric x values and no duplicates.""" + return pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [10, 20, 30, 40, 50]}) + + +@pytest.fixture +def numeric_df_with_duplicates(): + """Create a DataFrame with numeric x values and duplicates.""" + return pd.DataFrame({"x": [1, 1, 2, 2, 2, 3], "y": [10, 15, 20, 25, 30, 35]}) + + +@pytest.fixture +def datetime_df_with_duplicates(): + """Create a DataFrame with datetime x values and duplicates.""" + base_date = datetime(2023, 1, 1) + return pd.DataFrame( + { + "x": [ + base_date, + base_date, + base_date + timedelta(days=1), + base_date + timedelta(days=1), + base_date + timedelta(days=2), + ], + "y": [10, 15, 20, 25, 30], + } + ) + + +@pytest.mark.ci +def test_add_jitter_numeric_no_duplicates(numeric_df_no_duplicates): + """Test that no jitter is added when there are no duplicates.""" + result = add_jitter(numeric_df_no_duplicates, "x") + # No jitter should be added when there are no duplicates + pd.testing.assert_series_equal(result["jittered_x"], result["x"], check_names=False) + + +@pytest.mark.ci +def test_add_jitter_numeric_with_duplicates(numeric_df_with_duplicates): + """Test that jitter is added correctly to numeric values with duplicates.""" + result = add_jitter(numeric_df_with_duplicates, "x", jitter_value=0.1) + + # Check that all original x-values have jitter applied + for x_val in numeric_df_with_duplicates["x"].unique(): + mask = numeric_df_with_duplicates["x"] == x_val + count = mask.sum() + if count > 1: + jittered_x = result.loc[mask, "jittered_x"] + # Check that jittered values are different from original + assert not (jittered_x == x_val).all() + # Check that jittered values are symmetric around original + assert abs(jittered_x.mean() - x_val) < 1e-10 + + +@pytest.mark.ci +def test_add_jitter_datetime(datetime_df_with_duplicates): + """Test that jitter is added correctly to datetime values.""" + result = add_jitter(datetime_df_with_duplicates, "x", jitter_value=20) + + # Check that result contains jittered_x column with datetime type + assert pd.api.types.is_datetime64_dtype(result["jittered_x"]) + + # Check that duplicates have different jittered values + for x_val in datetime_df_with_duplicates["x"].unique(): + mask = datetime_df_with_duplicates["x"] == x_val + count = mask.sum() + if 
count > 1:
+            jittered_values = result.loc[mask, "jittered_x"].tolist()
+            # All jittered values should be unique
+            assert len(set(jittered_values)) == count
+
+
+@pytest.mark.ci
+def test_add_jitter_empty_df():
+    """Test behavior with empty DataFrame."""
+    empty_df = pd.DataFrame({"x": [], "y": []})
+    result = add_jitter(empty_df, "x")
+    assert result.empty
+
+
+@pytest.mark.ci
+def test_add_jitter_explicit_value(numeric_df_with_duplicates):
+    """Test with explicitly specified jitter value."""
+    explicit_jitter = 0.5
+    result = add_jitter(numeric_df_with_duplicates, "x", jitter_value=explicit_jitter)
+
+    # Check that maximum jitter is equal to or less than the specified value
+    for x_val in numeric_df_with_duplicates["x"].unique():
+        mask = numeric_df_with_duplicates["x"] == x_val
+        if mask.sum() > 1:
+            max_diff = (result.loc[mask, "jittered_x"] - x_val).abs().max()
+            assert max_diff <= explicit_jitter
+
+
+@pytest.mark.ci
+def test_add_jitter_single_unique_value():
+    """Test with DataFrame having only one unique x value."""
+    df = pd.DataFrame({"x": [5, 5, 5], "y": [1, 2, 3]})
+    result = add_jitter(df, "x", jitter_value=0.1)
+
+    # Check that jitter was applied
+    assert not (result["jittered_x"] == 5).all()
+
+    # Check that jittered values are centered around the original value
+    assert abs(result["jittered_x"].mean() - 5) < 1e-10
+
+
+@pytest.mark.ci
+def test_add_jitter_explicit_datetime_flag():
+    """Test with explicitly specified is_datetime flag."""
+    # Create DataFrame with string dates
+    df = pd.DataFrame({"x": ["2023-01-01", "2023-01-01", "2023-01-02"], "y": [10, 15, 20]})
+
+    # String x-values are not converted implicitly, so jittering the duplicates fails
+    with pytest.raises(TypeError):
+        _ = add_jitter(df, "x")
+
+    # Passing is_datetime=True does not help: the strings are still not converted
+    # to datetime implicitly, so the call also raises a TypeError
+    with pytest.raises(TypeError):
+        add_jitter(df, "x", is_datetime=True)
diff --git a/doubleml/did/utils/tests/test_check_did_aggregation.py b/doubleml/did/utils/tests/test_check_did_aggregation.py
new file mode 100644
index 00000000..a268c87f
--- /dev/null
+++ b/doubleml/did/utils/tests/test_check_did_aggregation.py
@@ -0,0 +1,90 @@
+import numpy as np
+import pytest
+
+from doubleml.did.utils._aggregation import _check_did_aggregation_dict
+
+
+@pytest.fixture
+def sample_gt_index():
+    """Create a sample gt_index for testing"""
+    return np.ma.array(
+        [[[1, 2], [3, 4]], [[5, 6], [7, 8]]], mask=np.array([[[True, False], [False, True]], [[False, True], [True, False]]])
+    )
+
+
+@pytest.fixture
+def valid_weight_masks(sample_gt_index):
+    """Create valid weight masks for testing"""
+    return np.ma.array(
+        np.zeros((*sample_gt_index.shape, 2)),
+        mask=np.broadcast_to(sample_gt_index.mask[..., np.newaxis], (*sample_gt_index.shape, 2)),
+    )
+
+
+@pytest.mark.ci
+def test_valid_aggregation_dict(sample_gt_index, valid_weight_masks):
+    """Test a valid aggregation dictionary"""
+    valid_dict = {"weight_masks": valid_weight_masks, "agg_names": ["g1", "g2"], "agg_weights": np.array([0.5, 0.5])}
+    result = _check_did_aggregation_dict(valid_dict, sample_gt_index)
+    assert isinstance(result, dict)
+    assert "weight_masks" in result
+
+
+@pytest.mark.ci
+@pytest.mark.parametrize(
+    "invalid_input,error_msg",
+    [
+        ("not_a_dict", "aggregation must be a dictionary"),
+        ({}, "aggregation must contain all required keys: {'weight_masks'}"),
+        ({"weight_masks": np.array([1, 2, 3])}, "weight_masks must be a numpy masked array"),
+    ],
+)
+def
test_invalid_input_types(sample_gt_index, invalid_input, error_msg): + """Test various invalid input types""" + with pytest.raises(ValueError, match=error_msg): + _check_did_aggregation_dict(invalid_input, sample_gt_index) + + +@pytest.mark.ci +def test_invalid_dimensions(sample_gt_index): + """Test weight_masks with wrong number of dimensions""" + wrong_dims = np.ma.array(np.zeros((sample_gt_index.shape)), mask=sample_gt_index.mask) # Only 3 dimensions + invalid_dict = {"weight_masks": wrong_dims} + with pytest.raises(ValueError, match="weight_masks must have 4 dimensions"): + _check_did_aggregation_dict(invalid_dict, sample_gt_index) + + +@pytest.mark.ci +def test_invalid_shape(sample_gt_index): + """Test weight_masks with wrong shape""" + wrong_shape = np.ma.array( + np.zeros((3, 3, 3, 2)), mask=np.zeros((3, 3, 3, 2), dtype=bool) # Wrong shape for first 3 dimensions + ) + invalid_dict = {"weight_masks": wrong_shape} + with pytest.raises(ValueError, match=r"weight_masks must have shape .* \+ \(n,\)"): + _check_did_aggregation_dict(invalid_dict, sample_gt_index) + + +@pytest.mark.ci +def test_invalid_mask_alignment(sample_gt_index): + """Test weight_masks with misaligned mask""" + wrong_mask = ~sample_gt_index.mask + weight_masks = np.ma.array( + np.zeros((*sample_gt_index.shape, 2)), mask=np.broadcast_to(wrong_mask[..., np.newaxis], (*sample_gt_index.shape, 2)) + ) + invalid_dict = {"weight_masks": weight_masks} + with pytest.raises(ValueError, match="weight_masks must have the same mask as gt_index"): + _check_did_aggregation_dict(invalid_dict, sample_gt_index) + + +@pytest.mark.ci +def test_multiple_weight_masks(sample_gt_index, valid_weight_masks): + """Test multiple weight masks with different masks""" + # Create a weight_masks array with multiple aggregations + weight_masks = np.ma.concatenate([valid_weight_masks, valid_weight_masks], axis=-1) + # Modify mask of last aggregation + weight_masks[..., -1].mask = ~weight_masks[..., -1].mask + + invalid_dict = {"weight_masks": weight_masks} + with pytest.raises(ValueError, match="weight_masks must have the same mask as gt_index"): + _check_did_aggregation_dict(invalid_dict, sample_gt_index) diff --git a/doubleml/did/utils/tests/test_did_eventstudy_aggregation.py b/doubleml/did/utils/tests/test_did_eventstudy_aggregation.py new file mode 100644 index 00000000..7cf556aa --- /dev/null +++ b/doubleml/did/utils/tests/test_did_eventstudy_aggregation.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest + +from doubleml.did.utils._aggregation import _compute_did_eventstudy_aggregation_weights + + +@pytest.mark.ci +def test_basic_functionality_eventstudy(): + # Setup basic test data + gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool)) + g_values = np.array([2, 3]) + t_values = np.array([1, 2, 3]) + d_values = np.array([2, 2, 2, 3, 3, 3]) + time_values = np.array([1, 2, 3, 1, 2, 3]) + selected_gt_mask = np.ones((2, 1, 3), dtype=bool) # 4 options + + result = _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask) + + assert isinstance(result, dict) + assert set(result.keys()) == {"weight_masks", "agg_names", "agg_weights"} + assert isinstance(result["weight_masks"], np.ma.MaskedArray) + assert result["weight_masks"].shape == (*gt_index.shape, 4) # 3 time periods + assert result["agg_names"] == ["-2", "-1", "0", "1"] + + +@pytest.mark.ci +def test_weight_computation_eventstudy(): + gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), 
mask=np.zeros((2, 3, 3), dtype=bool))
+    g_values = np.array([2, 3])
+    t_values = np.array([1, 2, 3])
+    d_values = np.array([2, 2, 2, 3, 3, 3])
+    time_values = np.array([1, 2, 3, 1, 2, 3])
+
+    # Select specific group-time combinations
+    selected_gt_mask = np.zeros((2, 3, 3), dtype=bool)
+    selected_gt_mask[:, :2, :2] = True  # Select first two time periods for all groups
+
+    result = _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask)
+
+    # Check if number of aggregations is 3
+    assert len(result["agg_names"]) == 3
+    assert result["agg_names"] == ["-2", "-1", "0"]
+
+    # Check weights sum to 1 over all event times
+    assert np.allclose(np.sum(result["agg_weights"]), 1.0)
+
+    # Check weight distribution within event times
+    for i in range(result["weight_masks"].shape[-1]):
+        time_weights = result["weight_masks"][..., i]
+        non_masked_values = time_weights.compressed()
+        if len(non_masked_values) > 0:
+            assert np.allclose(np.sum(non_masked_values), 1.0)
+
+            # Check if weights in the selected_gt_mask are equally distributed
+            non_zero = time_weights[selected_gt_mask] != 0
+            assert np.allclose(time_weights[selected_gt_mask].data[non_zero], 1 / sum(non_zero))
+
+
+@pytest.mark.ci
+def test_no_valid_eventstudy_periods():
+    gt_index = np.ma.MaskedArray(data=np.ones((2, 2, 2)), mask=np.zeros((2, 2, 2), dtype=bool))
+    g_values = np.array([1, 2])
+    t_values = np.array([1, 2, 3])
+    d_values = np.array([2, 2, 2, 3, 3, 3])
+    time_values = np.array([1, 2, 3, 1, 2, 3])
+    selected_gt_mask = np.zeros((2, 2, 2), dtype=bool)  # No time periods selected
+
+    with pytest.raises(ValueError, match="No time periods found for aggregation."):
+        _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask)
+
+
+@pytest.mark.ci
+def test_single_eventstudy_period():
+    gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool))
+    g_values = np.array([2, 3])
+    t_values = np.array([1, 2, 3])
+    d_values = np.array([2, 2, 2, 3, 3, 3])
+    time_values = np.array([1, 2, 3, 1, 2, 3])
+    selected_gt_mask = gt_index.mask.copy()  # Start from an all-False selection (no entries of gt_index are masked)
+    selected_gt_mask[1, 1, 2] = True  # Select a single time period
+
+    result = _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask)
+
+    assert len(result["agg_names"]) == 1
+    assert result["agg_names"] == ["0"]
+    assert result["weight_masks"].shape[-1] == 1
+    assert np.allclose(result["agg_weights"], [1.0])
+
+
+@pytest.mark.ci
+def test_masked_input_eventstudy():
+    # Create data with shape (2,4,4)
+    data = np.ones((2, 4, 4))
+    mask = np.zeros((2, 4, 4), dtype=bool)
+
+    # Mask some elements in different positions
+    mask[0, 0, 0] = True
+    mask[1, 2, 1] = True
+    mask[1, 1, 2] = True
+
+    gt_index = np.ma.MaskedArray(data=data, mask=mask)
+    g_values = np.array([2, 3])  # One value for each group
+    t_values = np.array([1, 2, 3, 4])  # One value for each time period
+    d_values = np.array([2, 2, 2, 2, 3, 3, 3, 3] * 4)  # Treatment values
+    time_values = np.array([1, 2, 3, 4] * 8)
+    selected_gt_mask = ~mask  # Select all non-masked elements
+
+    result = _compute_did_eventstudy_aggregation_weights(gt_index, g_values, t_values, d_values, time_values, selected_gt_mask)
+
+    # Check dimensions of output
+    assert result["weight_masks"].shape == (2, 4, 4, 5)  # Last dimension is number of event study periods
+
+    # Check if masks are maintained
+    for time_idx in range(5):
+        time_weights =
result["weight_masks"][..., time_idx] + assert np.array_equal(time_weights.mask, mask) + + # Check weight normalization + for time_idx in range(4): + weights = result["weight_masks"][..., time_idx].compressed() # Get non-masked weights + if len(weights) > 0: + assert np.isclose(weights.sum(), 1.0) # Weights should sum to 1 for each time period + + # Check agg_names + assert result["agg_names"] == ["-2", "-1", "0", "1", "2"] + + # Check agg_weights sum to 1 + assert np.isclose(sum(result["agg_weights"]), 1.0) diff --git a/doubleml/did/utils/tests/test_did_group_aggregation.py b/doubleml/did/utils/tests/test_did_group_aggregation.py new file mode 100644 index 00000000..7dbed7e7 --- /dev/null +++ b/doubleml/did/utils/tests/test_did_group_aggregation.py @@ -0,0 +1,113 @@ +import numpy as np +import pytest + +from doubleml.did.utils._aggregation import _compute_did_group_aggregation_weights + + +@pytest.mark.ci +def test_basic_functionality(): + # Setup basic test data + gt_index = np.ma.MaskedArray(data=np.ones((3, 2, 1)), mask=np.zeros((3, 2, 1), dtype=bool)) + g_values = np.array([1, 2, 3]) + d_values = np.array([1, 2, 1, 2, 1, 2]) + selected_gt_mask = np.ones((3, 2, 1), dtype=bool) + + result = _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask) + + assert isinstance(result, dict) + assert set(result.keys()) == {"weight_masks", "agg_names", "agg_weights"} + assert isinstance(result["weight_masks"], np.ma.MaskedArray) + assert result["weight_masks"].shape == (*gt_index.shape, 3) # 3 groups + + +@pytest.mark.ci +def test_weight_computation(): + gt_index = np.ma.MaskedArray(data=np.ones((3, 4, 4)), mask=np.zeros((3, 4, 4), dtype=bool)) + g_values = np.array([1, 2, 3]) + d_values = np.array([1, 2, 1, 2, 1, 1, 1, 1, 3, 3]) + + # select some group-time combinations + selected_gt_mask = gt_index.mask.copy() + selected_gt_mask[:2, :2, 0] = True + + result = _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask) + + # check if the number of aggregations is 2 (in this case, group 1 and group 2) + assert len(result["agg_names"]) == 2 + + # Check weights sum to 1 for each group + assert np.allclose(np.sum(result["agg_weights"]), 1.0) + + # Check weight distribution within groups + for i in range(result["weight_masks"].shape[-1]): + group_weights = result["weight_masks"][..., i] + if len(group_weights) > 0: + assert np.allclose(np.sum(group_weights.compressed()), 1.0) + + # check if weights in the selected_gt_mask are 0.5 + assert np.allclose(group_weights[i, ...][selected_gt_mask[i, ...]], 0.5) + + # check if the aggregation weights are [0.75, 0.25] + assert np.allclose(result["agg_weights"], np.array([0.75, 0.25])) + + +@pytest.mark.ci +def test_no_valid_groups(): + gt_index = np.ma.MaskedArray(data=np.ones((2, 2, 1)), mask=np.zeros((2, 2, 1), dtype=bool)) + g_values = np.array([1, 2]) + d_values = np.array([1, 2, 1, 2]) + selected_gt_mask = np.zeros((2, 2, 1), dtype=bool) # No groups selected + + with pytest.raises(ValueError, match="No valid groups found for aggregation."): + _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask) + + +@pytest.mark.ci +def test_single_group(): + gt_index = np.ma.MaskedArray(data=np.ones((1, 2, 1)), mask=np.zeros((1, 2, 1), dtype=bool)) + g_values = np.array([1]) + d_values = np.array([1, 1]) + selected_gt_mask = np.ones((1, 2, 1), dtype=bool) + + result = _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask) + + assert 
len(result["agg_names"]) == 1 + assert result["weight_masks"].shape[-1] == 1 + assert np.allclose(result["agg_weights"], [1.0]) + + +@pytest.mark.ci +def test_masked_input(): + # Create data with shape (3,4,4) + data = np.ones((3, 4, 4)) + mask = np.zeros((3, 4, 4), dtype=bool) + + # Mask some elements in different positions + mask[0, 0, 0] = True + mask[1, 2, 3] = True + mask[2, 1, 1] = True + + gt_index = np.ma.MaskedArray(data=data, mask=mask) + g_values = np.array([1, 2, 3]) # One value for each group + d_values = np.array([1, 2, 3] * 16) # Treatment values matching the data size + selected_gt_mask = ~mask # Select all masked elements + + result = _compute_did_group_aggregation_weights(gt_index, g_values, d_values, selected_gt_mask) + + # Check dimensions of output + assert result["weight_masks"].shape == (3, 4, 4, 3) # Last dimension is number of groups + + for group_idx in range(3): + group_weights = result["weight_masks"][..., group_idx] + assert np.array_equal(group_weights.mask, mask) + + # Check weight normalization + for group_idx in range(3): + weights = result["weight_masks"][..., group_idx].compressed() # Get non-masked weights + assert np.isclose(weights.sum(), 1.0) # Weights should sum to 1 for each group + + # Check agg_names + assert result["agg_names"] == ["1", "2", "3"] + + # Check agg_weights sum to 1 + assert np.isclose(sum(result["agg_weights"]), 1.0) diff --git a/doubleml/did/utils/tests/test_did_time_aggregation.py b/doubleml/did/utils/tests/test_did_time_aggregation.py new file mode 100644 index 00000000..8ea9e540 --- /dev/null +++ b/doubleml/did/utils/tests/test_did_time_aggregation.py @@ -0,0 +1,122 @@ +import numpy as np +import pytest + +from doubleml.did.utils._aggregation import _compute_did_time_aggregation_weights + + +@pytest.mark.ci +def test_basic_functionality_time(): + # Setup basic test data + gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool)) + g_values = np.array([2, 3]) + t_values = np.array([1, 2, 3]) + d_values = np.array([2, 2, 2, 3, 3, 3]) + selected_gt_mask = np.ones((2, 1, 3), dtype=bool) + + result = _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask) + + assert isinstance(result, dict) + assert set(result.keys()) == {"weight_masks", "agg_names", "agg_weights"} + assert isinstance(result["weight_masks"], np.ma.MaskedArray) + assert result["weight_masks"].shape == (*gt_index.shape, 3) # 3 time periods + assert result["agg_names"] == ["1", "2", "3"] + + +@pytest.mark.ci +def test_weight_computation_time(): + gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool)) + g_values = np.array([2, 3]) + t_values = np.array([1, 2, 3]) + d_values = np.array([2, 2, 2, 3, 3, 3]) + + # Select specific group-time combinations + selected_gt_mask = np.zeros((2, 3, 3), dtype=bool) + selected_gt_mask[:, :2, :2] = True # Select first two time periods for all groups + + result = _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask) + + # Check if number of aggregations is 2 (in this case, time periods 10 and 20) + assert len(result["agg_names"]) == 2 + assert result["agg_names"] == ["1", "2"] + + # Check weights sum to 1 for each time period + assert np.allclose(np.sum(result["agg_weights"]), 1.0) + + # Check weight distribution within time periods + for i in range(result["weight_masks"].shape[-1]): + time_weights = result["weight_masks"][..., i] + non_masked_values = time_weights.compressed() + if 
+
+
+@pytest.mark.ci
+def test_weight_computation_time():
+    gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool))
+    g_values = np.array([2, 3])
+    t_values = np.array([1, 2, 3])
+    d_values = np.array([2, 2, 2, 3, 3, 3])
+
+    # Select specific group-time combinations
+    selected_gt_mask = np.zeros((2, 3, 3), dtype=bool)
+    selected_gt_mask[:, :2, :2] = True  # Select first two time periods for all groups
+
+    result = _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask)
+
+    # Check if number of aggregations is 2 (in this case, time periods 1 and 2)
+    assert len(result["agg_names"]) == 2
+    assert result["agg_names"] == ["1", "2"]
+
+    # Check weights sum to 1 for each time period
+    assert np.allclose(np.sum(result["agg_weights"]), 1.0)
+
+    # Check weight distribution within time periods
+    for i in range(result["weight_masks"].shape[-1]):
+        time_weights = result["weight_masks"][..., i]
+        non_masked_values = time_weights.compressed()
+        if len(non_masked_values) > 0:
+            assert np.allclose(np.sum(non_masked_values), 1.0)
+
+            # Check if weights in the selected_gt_mask are 0.25
+            non_zero = time_weights[selected_gt_mask] != 0
+            assert np.allclose(time_weights[selected_gt_mask].data[non_zero], 0.25)
+
+
+@pytest.mark.ci
+def test_no_valid_time_periods():
+    gt_index = np.ma.MaskedArray(data=np.ones((2, 2, 2)), mask=np.zeros((2, 2, 2), dtype=bool))
+    g_values = np.array([1, 2])
+    t_values = np.array([10, 20])
+    d_values = np.array([1, 2, 1, 2])
+    selected_gt_mask = np.zeros((2, 2, 2), dtype=bool)  # No time periods selected
+
+    with pytest.raises(ValueError, match="No time periods found for aggregation."):
+        _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask)
+
+
+@pytest.mark.ci
+def test_single_time_period():
+    gt_index = np.ma.MaskedArray(data=np.ones((2, 3, 3)), mask=np.zeros((2, 3, 3), dtype=bool))
+    g_values = np.array([2, 3])
+    t_values = np.array([1, 2, 3])
+    d_values = np.array([2, 2, 2, 3, 3, 3])
+    selected_gt_mask = np.ones((2, 1, 1), dtype=bool)
+
+    result = _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask)
+
+    assert len(result["agg_names"]) == 1
+    assert result["agg_names"] == ["1"]
+    assert result["weight_masks"].shape[-1] == 1
+    assert np.allclose(result["agg_weights"], [1.0])
+
+
+@pytest.mark.ci
+def test_masked_input_time():
+    # Create data with shape (2,4,4)
+    data = np.ones((2, 4, 4))
+    mask = np.zeros((2, 4, 4), dtype=bool)
+
+    # Mask some elements in different positions
+    mask[0, 0, 0] = True
+    mask[1, 2, 1] = True
+    mask[1, 1, 2] = True
+
+    gt_index = np.ma.MaskedArray(data=data, mask=mask)
+    g_values = np.array([2, 3])  # One value for each group
+    t_values = np.array([1, 2, 3, 4])  # One value for each time period
+    d_values = np.array([1, 2, 3, 4] * 6)  # Treatment values
+    selected_gt_mask = ~mask  # Select all non-masked elements
+
+    result = _compute_did_time_aggregation_weights(gt_index, g_values, t_values, d_values, selected_gt_mask)
+
+    # Check dimensions of output
+    assert result["weight_masks"].shape == (2, 4, 4, 4)  # Last dimension is number of time periods
+
+    # Check if masks are maintained
+    for time_idx in range(4):
+        time_weights = result["weight_masks"][..., time_idx]
+        assert np.array_equal(time_weights.mask, mask)
+
+    # Check weight normalization
+    for time_idx in range(4):
+        weights = result["weight_masks"][..., time_idx].compressed()  # Get non-masked weights
+        if len(weights) > 0:
+            assert np.isclose(weights.sum(), 1.0)  # Weights should sum to 1 for each time period
+
+    # Check agg_names
+    assert result["agg_names"] == ["1", "2", "3", "4"]
+
+    # Check agg_weights sum to 1
+    assert np.isclose(sum(result["agg_weights"]), 1.0)
diff --git a/doubleml/did/utils/tests/test_did_utils.py b/doubleml/did/utils/tests/test_did_utils.py
new file mode 100644
index 00000000..df9da7f2
--- /dev/null
+++ b/doubleml/did/utils/tests/test_did_utils.py
@@ -0,0 +1,431 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from doubleml.did.utils._did_utils import (
+    _check_anticipation_periods,
+    _check_control_group,
+    _check_gt_combination,
+    _check_gt_values,
+    _construct_gt_combinations,
+    _construct_gt_index,
+    _construct_post_treatment_mask,
+    _get_id_positions,
+    _get_never_treated_value,
+    _is_never_treated,
+    _set_id_positions,
+)
+
+
+@pytest.mark.ci
+def test_get_never_treated_value():
+    assert _get_never_treated_value(np.array([1, 2])) == 0
+    assert np.isinf(_get_never_treated_value(np.array([1.0, 2.0])))
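+    # numpy promotes the mixed int/float array below to float64, so the
+    # never-treated sentinel is np.inf there as well; datetime arrays use
+    # pd.NaT, while integer and plain string arrays fall back to 0.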
+    assert np.isinf(_get_never_treated_value(np.array([1.0, 2])))
+    assert _get_never_treated_value(np.array(["2024-01-01", "2024-01-02"], dtype="datetime64")) is pd.NaT
+    assert _get_never_treated_value(np.array(["2024-01-01", "2024-01-02"])) == 0
+
+
+@pytest.mark.ci
+def test_is_never_treated():
+    # check single values
+    arguments = (
+        (0, 0, True),
+        (1, 0, False),
+        (np.inf, np.inf, True),
+        (0, np.inf, False),
+        (np.nan, np.inf, False),
+        (pd.NaT, pd.NaT, True),
+        (0, pd.NaT, False),
+    )
+    for x, never_treated_value, expected in arguments:
+        assert _is_never_treated(x, never_treated_value) == expected
+
+    # check arrays
+    arguments = (
+        (np.array([0, 1]), 0, np.array([True, False])),
+        (np.array([0, 1]), np.inf, np.array([False, False])),
+        (np.array([0, 1]), pd.NaT, np.array([False, False])),
+        (np.array([0, np.inf]), 0, np.array([True, False])),
+        (np.array([0, np.inf]), np.inf, np.array([False, True])),
+        (np.array([0, pd.NaT]), 0, np.array([True, False])),
+        (np.array([0, pd.NaT]), pd.NaT, np.array([False, True])),
+    )
+    for x, never_treated_value, expected in arguments:
+        assert np.all(_is_never_treated(x, never_treated_value) == expected)
+
+
+@pytest.mark.ci
+def test_check_control_group():
+    with pytest.raises(ValueError, match="The control group has to be one of"):
+        _check_control_group("invalid_control_group")
+
+
+@pytest.mark.ci
+def test_check_anticipation_periods():
+    with pytest.raises(TypeError, match="The anticipation periods must be an integer."):
+        _check_anticipation_periods("invalid_type")
+    with pytest.raises(ValueError, match="The anticipation periods must be non-negative."):
+        _check_anticipation_periods(-1)
+
+    assert _check_anticipation_periods(0) == 0
+    assert _check_anticipation_periods(1) == 1
+
+
+@pytest.mark.ci
+def test_check_gt_combination():
+    valid_args = {
+        "gt_combination": (1, 0, 1),
+        "g_values": np.array([-1, 1, 2, np.inf]),
+        "t_values": np.array([0, 1, 2]),
+        "never_treated_value": np.inf,
+        "anticipation_periods": 0,
+    }
+    invalid_args = [
+        (
+            {"gt_combination": (3.0, 0, 1)},
+            ValueError,
+            r"The value 3.0 is not in the set of treatment group values \[-1. 1. 2. inf\].",
+        ),
+        ({"gt_combination": (1, 0, 3)}, ValueError, r"The value 3 is not in the set of evaluation period values \[0 1 2\]."),
+        ({"gt_combination": (1, 3, 1)}, ValueError, r"The value 3 is not in the set of evaluation period values \[0 1 2\]."),
+        (
+            {"gt_combination": (0, 0, 1), "g_values": np.array([1, 2, 0]), "never_treated_value": 0},
+            ValueError,
+            r"The never treated group is not allowed as treatment group \(g_value=0\).",
+        ),
+        (
+            {"gt_combination": (1, 1, 1)},
+            ValueError,
+            "The pre-treatment and evaluation period must be different. Got 1 for both.",
+        ),
+        (
+            {"gt_combination": (1, 1, 0)},
+            ValueError,
+            "The pre-treatment period must be before the evaluation period. Got t_value_pre 1 and t_value_eval 0.",
+        ),
+        (
+            {"gt_combination": (-1, 0, 1)},
+            ValueError,
+            r"The value -1 \(group value\) is not in the set of evaluation period values \[0 1 2\].",
+        ),
+    ]
+    for arg, error, msg in invalid_args:
+        with pytest.raises(error, match=msg):
+            _check_gt_combination(**(valid_args | arg))
+
+    msg = r"The treatment was assigned before the first pre-treatment period \(including anticipation\)."
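+    # Both warning cases below start treatment within the anticipation-adjusted
+    # pre-treatment window, i.e. g <= t_pre + anticipation_periods (a condition
+    # inferred from the two combinations exercised here).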
+    with pytest.warns(UserWarning, match=msg):
+        _check_gt_combination(**(valid_args | {"gt_combination": (1, 1, 2)}))
+    with pytest.warns(UserWarning, match=msg):
+        _check_gt_combination(**(valid_args | {"gt_combination": (1, 0, 1), "anticipation_periods": 1}))
+
+
+@pytest.mark.ci
+def test_input_check_gt_values():
+    valid_args = {
+        "g_values": np.array([1.0, 2.0]),
+        "t_values": np.array([0.0, 1.0, 2.0]),
+    }
+    invalid_args = [
+        ({"g_values": ["test"]}, TypeError, r"Invalid type for g_values: expected one of \(, \)."),
+        ({"t_values": ["test"]}, TypeError, r"Invalid type for t_values: expected one of \(, \)."),
+        ({"g_values": np.array([[1.0, 2.0]])}, ValueError, "g_values must be a vector. Number of dimensions is 2."),
+        ({"t_values": np.array([[0.0, 1.0, 2.0]])}, ValueError, "t_values must be a vector. Number of dimensions is 2."),
+        ({"g_values": None}, TypeError, "Invalid type for g_values."),
+        ({"t_values": None}, TypeError, "Invalid type for t_values."),
+        ({"t_values": np.array([0.0, 1.0, np.nan])}, ValueError, "t_values contains missing or infinite values."),
+        ({"t_values": np.array([0.0, 1.0, np.inf])}, ValueError, "t_values contains missing or infinite values."),
+        (
+            {"t_values": np.array(["2024-01-01", "2024-01-02", "NaT"], dtype="datetime64")},
+            ValueError,
+            "t_values contains missing values.",
+        ),
+        (
+            {"g_values": np.array(["test", "test"])},
+            ValueError,
+            (
+                "Invalid data type for g_values: expected one of "
+                r"\(, , \)."
+            ),
+        ),
+        (
+            {"t_values": np.array(["test", "test"])},
+            ValueError,
+            (
+                "Invalid data type for t_values: expected one of "
+                r"\(, , \)."
+            ),
+        ),
+        (
+            {"g_values": np.array(["2024-01-01", "2024-01-02"], dtype="datetime64")},
+            ValueError,
+            r"g_values and t_values must have the same data type. Got datetime64\[D\] for g_values and float64 for t_values.",
+        ),
+    ]
+
+    for arg, error, msg in invalid_args:
+        with pytest.raises(error, match=msg):
+            _check_gt_values(**(valid_args | arg))
+
+
+@pytest.mark.ci
+def test_construct_gt_combinations():
+    msg = r"gt_combinations must be one of \['standard', 'all'\]. test was passed."
+    with pytest.raises(ValueError, match=msg):
+        _construct_gt_combinations(
+            setting="test",
+            g_values=np.array([2, 3]),
+            t_values=np.array([1, 2, 3, 4]),
+            never_treated_value=np.inf,
+            anticipation_periods=0,
+        )
+
+    msg = "g_values must be sorted in ascending order."
+    with pytest.raises(ValueError, match=msg):
+        _construct_gt_combinations(
+            setting="standard",
+            g_values=np.array([3, 2]),
+            t_values=np.array([1, 2, 3, 4]),
+            never_treated_value=np.inf,
+            anticipation_periods=0,
+        )
+
+    msg = "t_values must be sorted in ascending order."
+    with pytest.raises(ValueError, match=msg):
+        _construct_gt_combinations(
+            setting="standard",
+            g_values=np.array([1, 2]),
+            t_values=np.array([3, 2, 1]),
+            never_treated_value=np.inf,
+            anticipation_periods=0,
+        )
+
+    # too large anticipation periods (no valid combinations)
+    msg = (
+        "No valid group-time combinations found. "
+        r"Please check the treatment group values and time period values \(and anticipation\)."
+    )
+    with pytest.raises(ValueError, match=msg):
+        _construct_gt_combinations(
+            setting="standard",
+            g_values=np.array([2, 3]),
+            t_values=np.array([0, 1, 2, 3]),
+            never_treated_value=np.inf,
+            anticipation_periods=3,
+        )
+
+    # Test standard setting
+    standard_combinations = _construct_gt_combinations(
+        setting="standard",
+        g_values=np.array([2, 3]),
+        t_values=np.array([0, 1, 2, 3]),
+        never_treated_value=np.inf,
+        anticipation_periods=0,
+    )
+    expected_standard = [
+        (2, 0, 1),  # g=2, pre=0 (min of t_previous=0 and t_before_g=0), eval=1
+        (2, 1, 2),  # g=2, pre=1 (min of t_previous=1 and t_before_g=1), eval=2
+        (2, 1, 3),  # g=2, pre=1 (min of t_previous=2 and t_before_g=1), eval=3
+        (3, 0, 1),  # g=3, pre=0 (min of t_previous=0 and t_before_g=0), eval=1
+        (3, 1, 2),  # g=3, pre=1 (min of t_previous=1 and t_before_g=1), eval=2
+        (3, 2, 3),  # g=3, pre=2 (min of t_previous=2 and t_before_g=2), eval=3
+    ]
+    assert standard_combinations == expected_standard
+
+    # Test all setting
+    all_combinations = _construct_gt_combinations(
+        setting="all",
+        g_values=np.array([2, 3]),
+        t_values=np.array([0, 1, 2, 3]),
+        never_treated_value=np.inf,
+        anticipation_periods=0,
+    )
+    expected_all = [
+        (2, 0, 1),  # g=2, all pre periods before t_eval=1
+        (2, 0, 2),  # g=2, all pre periods before t_eval=2
+        (2, 1, 2),
+        (2, 0, 3),  # g=2, all pre periods before t_eval=3
+        (2, 1, 3),
+        (3, 0, 1),  # g=3, all pre periods before t_eval=1
+        (3, 0, 2),  # g=3, all pre periods before t_eval=2
+        (3, 1, 2),
+        (3, 0, 3),  # g=3, all pre periods before t_eval=3
+        (3, 1, 3),
+        (3, 2, 3),
+    ]
+    assert all_combinations == expected_all
+
+    # Test standard setting with anticipation periods
+    standard_combinations_anticipation = _construct_gt_combinations(
+        setting="standard",
+        g_values=np.array([2, 3]),
+        t_values=np.array([0, 1, 2, 3]),
+        never_treated_value=np.inf,
+        anticipation_periods=2,
+    )
+    expected_standard_anticipation = [
+        (3, 0, 3),  # g=3, pre=0 (min of t_previous=0 and t_before_g=0), eval=3 with anticipation 2
+    ]
+    assert standard_combinations_anticipation == expected_standard_anticipation
+
+    # Test all setting with anticipation periods
+    all_combinations_anticipation = _construct_gt_combinations(
+        setting="all",
+        g_values=np.array([2, 3]),
+        t_values=np.array([0, 1, 2, 3]),
+        never_treated_value=np.inf,
+        anticipation_periods=2,
+    )
+    expected_all_anticipation = [
+        (3, 0, 3),  # g=3, all pre periods before t_eval=3 with anticipation 2
+    ]
+    assert all_combinations_anticipation == expected_all_anticipation
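+
+
+# Reading the expected lists above: "standard" keeps one combination per
+# evaluation period, with pre-period min(t_eval - 1, g - 1 - anticipation),
+# while "all" enumerates every admissible pre-period; combinations whose
+# pre-period would not precede (anticipated) treatment are dropped. This is a
+# summary inferred from the fixtures, not from documented behavior.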
+
+
+@pytest.mark.ci
+def test_construct_gt_index():
+    g_values = np.array([0, 2, 3])
+    t_values = np.array([1, 2, 3])
+    gt_combinations = [(2, 1, 2), (2, 1, 3), (3, 1, 2)]  # g_val, t_pre, t_eval
+    result = _construct_gt_index(gt_combinations, g_values, t_values)
+    # Check dimensions
+    assert result.shape == (3, 3, 3)
+
+    # Check valid entries
+    assert result[1, 0, 1] == 0  # First combination (2, 1, 2)
+    assert result[1, 0, 2] == 1  # Second combination (2, 1, 3)
+    assert result[2, 0, 1] == 2  # Third combination (3, 1, 2)
+    assert result.mask[1, 0, 1] == np.False_
+    assert result.mask[1, 0, 2] == np.False_
+    assert result.mask[2, 0, 1] == np.False_
+
+    # Check that other entries are masked and contain -1
+    assert result.mask[0, 0, 0] == np.True_
+    assert result.data[0, 0, 0] == -1
+
+    # Test case 2: Empty combinations
+    empty_result = _construct_gt_index([], g_values, t_values)
+    assert empty_result.shape == (3, 3, 3)
+    assert np.all(empty_result.mask)
+    assert np.all(empty_result.data == -1)
+
+    # Test case 3: Single combination
+    single_combination = [(2, 1, 2)]
+    single_result = _construct_gt_index(single_combination, g_values, t_values)
+    assert single_result[1, 0, 1] == 0
+    assert np.sum(~single_result.mask) == 1  # Only one unmasked entry
+
+    # Test case 4: Different dimensions
+    g_values_large = np.array([0, 1, 2, 3, 4])
+    t_values_large = np.array([1, 2, 3, 4])
+    large_result = _construct_gt_index(gt_combinations, g_values_large, t_values_large)
+    assert large_result.shape == (5, 4, 4)
+
+
+@pytest.mark.ci
+def test_construct_post_treatment_mask():
+    # Test case 1: Basic case with integer values
+    g_values = np.array([2, 3])
+    t_values = np.array([1, 2, 3])
+    result = _construct_post_treatment_mask(g_values, t_values)
+
+    # Expected mask pattern for g=2:
+    # t_eval=1: False (1 not >= 2)
+    # t_eval=2: True (2 >= 2)
+    # t_eval=3: True (3 >= 2)
+    expected_g2 = np.array([[False, True, True]] * len(t_values))
+    np.testing.assert_array_equal(result[0], expected_g2)
+
+    # Expected mask pattern for g=3:
+    # t_eval=1: False (1 not >= 3)
+    # t_eval=2: False (2 not >= 3)
+    # t_eval=3: True (3 >= 3)
+    expected_g3 = np.array([[False, False, True]] * len(t_values))
+    np.testing.assert_array_equal(result[1], expected_g3)
+
+    # Test case 2: Float values with non-integer treatment times
+    g_values = np.array([1.5, 2.5])
+    t_values = np.array([1.0, 2.0, 3.0])
+    result = _construct_post_treatment_mask(g_values, t_values)
+
+    expected_g1_5 = np.array([[False, True, True]] * len(t_values))
+    expected_g2_5 = np.array([[False, False, True]] * len(t_values))
+    np.testing.assert_array_equal(result[0], expected_g1_5)
+    np.testing.assert_array_equal(result[1], expected_g2_5)
+
+    # Test case 3: Single group
+    g_values = np.array([2])
+    t_values = np.array([1, 2, 3])
+    result = _construct_post_treatment_mask(g_values, t_values)
+    assert result.shape == (1, 3, 3)
+    np.testing.assert_array_equal(result[0], expected_g2)
+
+    # Test case 4: Single time period
+    g_values = np.array([1, 2])
+    t_values = np.array([3])
+    result = _construct_post_treatment_mask(g_values, t_values)
+    assert result.shape == (2, 1, 1)
+    np.testing.assert_array_equal(result, np.array([[[True]], [[True]]]))
+
+    # Test case 5: Datetime values
+    g_values = np.array(["2020-01-01", "2020-06-01"], dtype="datetime64[D]")
+    t_values = np.array(["2020-01-01", "2020-03-01", "2020-12-01"], dtype="datetime64[D]")
+    result = _construct_post_treatment_mask(g_values, t_values)
+
+    expected_g1 = np.array([[True, True, True]] * len(t_values))
+    expected_g2 = np.array([[False, False, True]] * len(t_values))
+    np.testing.assert_array_equal(result[0], expected_g1)
+    np.testing.assert_array_equal(result[1], expected_g2)
+
+
+@pytest.mark.ci
+def test_get_id_positions():
+    # Test case 1: Normal array with valid positions
+    a = np.array([1, 2, 3, 4, 5])
+    id_positions = np.array([0, 2, 4])
+    expected = np.array([1, 3, 5])
+    result = _get_id_positions(a, id_positions)
+    np.testing.assert_array_equal(result, expected)
+
+    # Test case 2: 2D array with valid positions
+    a_2d = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
+    id_positions = np.array([1, 3])
+    expected_2d = np.array([[3, 4], [7, 8]])
+    result_2d = _get_id_positions(a_2d, id_positions)
+    np.testing.assert_array_equal(result_2d, expected_2d)
+
+    # Test case 3: None input
+    a_none = None
+    id_positions = np.array([0, 1, 2])
+    result_none = _get_id_positions(a_none, id_positions)
+    assert result_none is None
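+
+
+# _set_id_positions below is the scatter counterpart of _get_id_positions: it
+# writes the rows of `a` back into an array of n_obs rows at id_positions and
+# fills every other position with fill_value, as the expected arrays show.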
+
+
+@pytest.mark.ci
+def test_set_id_positions():
+    # Test case 1: Basic 1D array
+    a = np.array([1, 2, 3])
+    n_obs = 5
+    id_positions = np.array([1, 3, 4])
+    fill_value = 0
+    expected = np.array([0, 1, 0, 2, 3])
+    result = _set_id_positions(a, n_obs, id_positions, fill_value)
+    np.testing.assert_array_equal(result, expected)
+
+    # Test case 2: 2D array
+    a_2d = np.array([[1, 2], [3, 4], [5, 6]])
+    n_obs = 5
+    id_positions = np.array([0, 2, 4])
+    fill_value = -1
+    expected_2d = np.array([[1, 2], [-1, -1], [3, 4], [-1, -1], [5, 6]])
+    result_2d = _set_id_positions(a_2d, n_obs, id_positions, fill_value)
+    np.testing.assert_array_equal(result_2d, expected_2d)
+
+    # Test case 3: None input
+    a_none = None
+    n_obs = 3
+    id_positions = np.array([0, 1])
+    fill_value = 0
+    result_none = _set_id_positions(a_none, n_obs, id_positions, fill_value)
+    assert result_none is None
diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py
index 1b88c8ee..1b6d3d09 100644
--- a/doubleml/double_ml.py
+++ b/doubleml/double_ml.py
@@ -7,13 +7,14 @@
 from scipy.stats import norm
 from sklearn.base import is_classifier, is_regressor
 
-from .double_ml_data import DoubleMLBaseData, DoubleMLClusterData
-from .double_ml_framework import DoubleMLFramework
-from .utils._checks import _check_external_predictions, _check_sample_splitting
-from .utils._estimation import _aggregate_coefs_and_ses, _rmse, _set_external_predictions, _var_est
-from .utils._sensitivity import _compute_sensitivity_bias
-from .utils.gain_statistics import gain_statistics
-from .utils.resampling import DoubleMLClusterResampling, DoubleMLResampling
+from doubleml.data import DoubleMLClusterData, DoubleMLPanelData
+from doubleml.data.base_data import DoubleMLBaseData
+from doubleml.double_ml_framework import DoubleMLFramework
+from doubleml.utils._checks import _check_external_predictions, _check_sample_splitting
+from doubleml.utils._estimation import _aggregate_coefs_and_ses, _rmse, _set_external_predictions, _var_est
+from doubleml.utils._sensitivity import _compute_sensitivity_bias
+from doubleml.utils.gain_statistics import gain_statistics
+from doubleml.utils.resampling import DoubleMLClusterResampling, DoubleMLResampling
 
 _implemented_data_backends = ["DoubleMLData", "DoubleMLClusterData"]
@@ -33,7 +34,12 @@ def __init__(self, obj_dml_data, n_folds, n_rep, score, draw_sample_splitting):
         if obj_dml_data.n_cluster_vars > 2:
             raise NotImplementedError("Multi-way (n_ways > 2) clustering not yet implemented.")
         self._is_cluster_data = True
+        self._is_panel_data = False
+        if isinstance(obj_dml_data, DoubleMLPanelData):
+            self._is_panel_data = True
+
         self._dml_data = obj_dml_data
+        self._n_obs = self._dml_data.n_obs
 
         # initialize framework which is constructed after the fit method is called
         self._framework = None
@@ -170,6 +176,13 @@
         """
         return self._n_rep
 
+    @property
+    def n_obs(self):
+        """
+        The number of observations used for estimation.
+        """
+        return self._n_obs
+
     @property
     def n_rep_boot(self):
         """
@@ -1210,6 +1223,12 @@ def draw_sample_splitting(self):
         The samples are drawn according to the attributes
        ``n_folds`` and ``n_rep``.
 
+        Parameters
+        ----------
+        n_obs : int or None
+            The number of observations. If ``None``, the number of observations is set to the number of observations in
+            the data set.
+
         Returns
         -------
         self : object
@@ -1218,14 +1237,14 @@
             obj_dml_resampling = DoubleMLClusterResampling(
                 n_folds=self._n_folds_per_cluster,
                 n_rep=self.n_rep,
-                n_obs=self._dml_data.n_obs,
+                n_obs=self.n_obs,
                 n_cluster_vars=self._dml_data.n_cluster_vars,
                 cluster_vars=self._dml_data.cluster_vars,
             )
             self._smpls, self._smpls_cluster = obj_dml_resampling.split_samples()
         else:
             obj_dml_resampling = DoubleMLResampling(
-                n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self._dml_data.n_obs, stratify=self._strata
+                n_folds=self.n_folds, n_rep=self.n_rep, n_obs=self.n_obs, stratify=self._strata
             )
             self._smpls = obj_dml_resampling.split_samples()
@@ -1292,7 +1311,7 @@ def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):
         >>> dml_plr_obj.set_sample_splitting(smpls)
         """
         self._smpls, self._smpls_cluster, self._n_rep, self._n_folds = _check_sample_splitting(
-            all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data
+            all_smpls, all_smpls_cluster, self._dml_data, self._is_cluster_data, n_obs=self.n_obs
         )
 
         (
@@ -1623,6 +1642,15 @@ def sensitivity_benchmark(self, benchmarking_set, fit_args=None):
         Computes a benchmark for a given set of features.
         Returns a DataFrame containing the corresponding values for cf_y, cf_d, rho and the change in estimates.
 
+        Parameters
+        ----------
+        benchmarking_set : list
+            List of features to be used for benchmarking.
+
+        fit_args : dict, optional
+            Additional arguments for the fit method.
+            Default is None.
+
         Returns
         -------
         benchmark_results : pandas.DataFrame
diff --git a/doubleml/double_ml_framework.py b/doubleml/double_ml_framework.py
index 60786028..ea1ae9fa 100644
--- a/doubleml/double_ml_framework.py
+++ b/doubleml/double_ml_framework.py
@@ -307,7 +307,7 @@ def __add__(self, other):
             assert np.allclose(self._var_scaling_factors, other._var_scaling_factors)
             var_scaling_factors = self._var_scaling_factors
 
-            # compute standard errors
+            # compute standard errors (Uses factor 1/n for scaling!)
             sigma2_hat = np.divide(np.mean(np.square(scaled_psi), axis=0), var_scaling_factors.reshape(-1, 1))
             all_ses = np.sqrt(sigma2_hat)
             thetas, ses = _aggregate_coefs_and_ses(all_thetas, all_ses, var_scaling_factors)
diff --git a/doubleml/irm/__init__.py b/doubleml/irm/__init__.py
index a48cfe35..7579d6f8 100644
--- a/doubleml/irm/__init__.py
+++ b/doubleml/irm/__init__.py
@@ -2,4 +2,24 @@
 The :mod:`doubleml.irm` module implements double machine learning estimates based on interactive regression models.
""" -__all__ = [] +from .apo import DoubleMLAPO +from .apos import DoubleMLAPOS +from .cvar import DoubleMLCVAR +from .iivm import DoubleMLIIVM +from .irm import DoubleMLIRM +from .lpq import DoubleMLLPQ +from .pq import DoubleMLPQ +from .qte import DoubleMLQTE +from .ssm import DoubleMLSSM + +__all__ = [ + "DoubleMLIRM", + "DoubleMLAPO", + "DoubleMLAPOS", + "DoubleMLCVAR", + "DoubleMLIIVM", + "DoubleMLLPQ", + "DoubleMLPQ", + "DoubleMLQTE", + "DoubleMLSSM", +] diff --git a/doubleml/irm/apos.py b/doubleml/irm/apos.py index e9b160cb..8099342a 100644 --- a/doubleml/irm/apos.py +++ b/doubleml/irm/apos.py @@ -6,8 +6,8 @@ from joblib import Parallel, delayed from sklearn.base import clone +from doubleml.data import DoubleMLClusterData, DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLClusterData, DoubleMLData from doubleml.double_ml_framework import concat from doubleml.irm.apo import DoubleMLAPO from doubleml.utils._checks import _check_sample_splitting, _check_score, _check_trimming, _check_weights @@ -261,10 +261,10 @@ def smpls(self): """ if self._smpls is None: err_msg = ( - "Sample splitting not specified. Draw samples via .draw_sample splitting(). " + "Sample splitting not specified. Draw samples via .draw_sample_splitting(). " + "External samples not implemented yet." ) - raise ValueError(err_msg) + raise NotImplementedError(err_msg) return self._smpls @property diff --git a/doubleml/irm/cvar.py b/doubleml/irm/cvar.py index e77031e6..d2aeaced 100644 --- a/doubleml/irm/cvar.py +++ b/doubleml/irm/cvar.py @@ -3,8 +3,8 @@ from sklearn.model_selection import StratifiedKFold, train_test_split from sklearn.utils import check_X_y +from doubleml.data.base_data import DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLData from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import ( _check_contains_iv, diff --git a/doubleml/irm/iivm.py b/doubleml/irm/iivm.py index d78d6f3e..3f252f2a 100644 --- a/doubleml/irm/iivm.py +++ b/doubleml/irm/iivm.py @@ -2,8 +2,8 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target +from doubleml.data.base_data import DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLData from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import ( _check_binary_predictions, diff --git a/doubleml/irm/irm.py b/doubleml/irm/irm.py index 72db088e..9bf5ed35 100644 --- a/doubleml/irm/irm.py +++ b/doubleml/irm/irm.py @@ -5,8 +5,8 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target +from doubleml.data.base_data import DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLData from doubleml.double_ml_score_mixins import LinearScoreMixin from doubleml.utils._checks import ( _check_binary_predictions, diff --git a/doubleml/irm/lpq.py b/doubleml/irm/lpq.py index 56a97969..c98e8fa2 100644 --- a/doubleml/irm/lpq.py +++ b/doubleml/irm/lpq.py @@ -4,8 +4,8 @@ from sklearn.utils import check_X_y from sklearn.utils.multiclass import type_of_target +from doubleml.data.base_data import DoubleMLData from doubleml.double_ml import DoubleML -from doubleml.double_ml_data import DoubleMLData from doubleml.double_ml_score_mixins import NonLinearScoreMixin from doubleml.utils._checks import _check_quantile, _check_score, _check_treatment, _check_trimming, _check_zero_one_treatment 
 from doubleml.utils._estimation import (
diff --git a/doubleml/irm/pq.py b/doubleml/irm/pq.py
index 4cdcd74c..f64dc471 100644
--- a/doubleml/irm/pq.py
+++ b/doubleml/irm/pq.py
@@ -3,8 +3,8 @@
 from sklearn.model_selection import StratifiedKFold, train_test_split
 from sklearn.utils import check_X_y
 
+from doubleml.data.base_data import DoubleMLData
 from doubleml.double_ml import DoubleML
-from doubleml.double_ml_data import DoubleMLData
 from doubleml.double_ml_score_mixins import NonLinearScoreMixin
 from doubleml.utils._checks import (
     _check_contains_iv,
diff --git a/doubleml/irm/qte.py b/doubleml/irm/qte.py
index f05269ad..68b91a9a 100644
--- a/doubleml/irm/qte.py
+++ b/doubleml/irm/qte.py
@@ -3,7 +3,7 @@
 from joblib import Parallel, delayed
 from sklearn.base import clone
 
-from doubleml.double_ml_data import DoubleMLClusterData, DoubleMLData
+from doubleml.data import DoubleMLClusterData, DoubleMLData
 from doubleml.double_ml_framework import concat
 from doubleml.irm.cvar import DoubleMLCVAR
 from doubleml.irm.lpq import DoubleMLLPQ
diff --git a/doubleml/irm/ssm.py b/doubleml/irm/ssm.py
index 5a6458ca..c84b326d 100644
--- a/doubleml/irm/ssm.py
+++ b/doubleml/irm/ssm.py
@@ -6,8 +6,8 @@
 from sklearn.model_selection import train_test_split
 from sklearn.utils import check_X_y
 
+from doubleml.data.base_data import DoubleMLData
 from doubleml.double_ml import DoubleML
-from doubleml.double_ml_data import DoubleMLData
 from doubleml.double_ml_score_mixins import LinearScoreMixin
 from doubleml.utils._checks import _check_finite_predictions, _check_score, _check_trimming
 from doubleml.utils._estimation import _dml_cv_predict, _dml_tune, _get_cond_smpls_2d, _predict_zero_one_propensity
diff --git a/doubleml/irm/tests/_utils_apos_manual.py b/doubleml/irm/tests/_utils_apos_manual.py
index efc5eea1..88fc59c2 100644
--- a/doubleml/irm/tests/_utils_apos_manual.py
+++ b/doubleml/irm/tests/_utils_apos_manual.py
@@ -1,7 +1,7 @@
 import numpy as np
 from sklearn.base import clone
 
-from ...double_ml_data import DoubleMLData
+from ...data.base_data import DoubleMLData
 from ...tests._utils_boot import draw_weights
 from ..apo import DoubleMLAPO
diff --git a/doubleml/irm/tests/_utils_qte_manual.py b/doubleml/irm/tests/_utils_qte_manual.py
index 25de79cd..0e19e03e 100644
--- a/doubleml/irm/tests/_utils_qte_manual.py
+++ b/doubleml/irm/tests/_utils_qte_manual.py
@@ -1,7 +1,7 @@
 import numpy as np
 from sklearn.base import clone
 
-from ...double_ml_data import DoubleMLData
+from ...data.base_data import DoubleMLData
 from ...tests._utils_boot import draw_weights
 from ...utils._estimation import _default_kde
 from ..pq import DoubleMLPQ
diff --git a/doubleml/irm/tests/test_apos_exceptions.py b/doubleml/irm/tests/test_apos_exceptions.py
index 8e9a0b8a..c309b7e2 100644
--- a/doubleml/irm/tests/test_apos_exceptions.py
+++ b/doubleml/irm/tests/test_apos_exceptions.py
@@ -86,8 +86,8 @@ def test_apos_exception_ipw_normalization():
 def test_apos_exception_properties_and_methods():
     # properties
     dml_obj = DoubleMLAPOS(dml_data, ml_g, ml_m, treatment_levels=0, draw_sample_splitting=False)
-    msg = r"Sample splitting not specified. Draw samples via .draw_sample splitting\(\). External samples not implemented yet."
-    with pytest.raises(ValueError, match=msg):
+    msg = r"Sample splitting not specified. Draw samples via .draw_sample_splitting\(\). External samples not implemented yet."
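+    # smpls now raises NotImplementedError instead of ValueError when sample
+    # splitting is missing (see the corresponding change in doubleml/irm/apos.py).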
+    with pytest.raises(NotImplementedError, match=msg):
         _ = dml_obj.smpls
 
     # methods
diff --git a/doubleml/irm/tests/test_qte_exceptions.py b/doubleml/irm/tests/test_qte_exceptions.py
index 32193c30..9f94f5d4 100644
--- a/doubleml/irm/tests/test_qte_exceptions.py
+++ b/doubleml/irm/tests/test_qte_exceptions.py
@@ -5,8 +5,8 @@
 from sklearn.linear_model import Lasso, LogisticRegression
 
 from doubleml import DoubleMLData, DoubleMLQTE
+from doubleml.data.base_data import DoubleMLBaseData
 from doubleml.datasets import make_irm_data
-from doubleml.double_ml_data import DoubleMLBaseData
 
 np.random.seed(42)
 n = 100
diff --git a/doubleml/irm/tests/test_ssm_exceptions.py b/doubleml/irm/tests/test_ssm_exceptions.py
index 1f5c6d46..6ff276e3 100644
--- a/doubleml/irm/tests/test_ssm_exceptions.py
+++ b/doubleml/irm/tests/test_ssm_exceptions.py
@@ -5,8 +5,8 @@
 from sklearn.linear_model import Lasso, LogisticRegression
 
 from doubleml import DoubleMLSSM
+from doubleml.data.base_data import DoubleMLBaseData
 from doubleml.datasets import make_ssm_data
-from doubleml.double_ml_data import DoubleMLBaseData
 
 np.random.seed(3141)
 n = 100
diff --git a/doubleml/plm/pliv.py b/doubleml/plm/pliv.py
index dc0fbd29..ba022688 100644
--- a/doubleml/plm/pliv.py
+++ b/doubleml/plm/pliv.py
@@ -6,8 +6,8 @@
 from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV
 from sklearn.utils import check_X_y
 
+from ..data.base_data import DoubleMLData
 from ..double_ml import DoubleML
-from ..double_ml_data import DoubleMLData
 from ..double_ml_score_mixins import LinearScoreMixin
 from ..utils._checks import _check_finite_predictions
 from ..utils._estimation import _dml_cv_predict, _dml_tune
diff --git a/doubleml/plm/plr.py b/doubleml/plm/plr.py
index 1b45d865..a81bac48 100644
--- a/doubleml/plm/plr.py
+++ b/doubleml/plm/plr.py
@@ -5,8 +5,8 @@
 from sklearn.base import clone
 from sklearn.utils import check_X_y
 
+from ..data.base_data import DoubleMLData
 from ..double_ml import DoubleML
-from ..double_ml_data import DoubleMLData
 from ..double_ml_score_mixins import LinearScoreMixin
 from ..utils._checks import _check_binary_predictions, _check_finite_predictions, _check_is_propensity, _check_score
 from ..utils._estimation import _dml_cv_predict, _dml_tune
diff --git a/doubleml/tests/_utils.py b/doubleml/tests/_utils.py
index eeeaab3d..c9d042d1 100644
--- a/doubleml/tests/_utils.py
+++ b/doubleml/tests/_utils.py
@@ -4,7 +4,7 @@
 from sklearn.base import clone
 from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
 
-from ..double_ml_data import DoubleMLBaseData
+from ..data.base_data import DoubleMLBaseData
 from ..utils._estimation import _aggregate_coefs_and_ses, _var_est
diff --git a/doubleml/tests/conftest.py b/doubleml/tests/conftest.py
index 248697b8..bf53d788 100644
--- a/doubleml/tests/conftest.py
+++ b/doubleml/tests/conftest.py
@@ -4,7 +4,7 @@
 from sklearn.datasets import make_classification, make_regression, make_spd_matrix
 
 from doubleml import DoubleMLData
-from doubleml.datasets import make_irm_data, make_pliv_CHS2015, make_plr_turrell2018
+from doubleml.datasets import make_pliv_CHS2015, make_plr_turrell2018
 
 
 def _g(x):
@@ -55,26 +55,6 @@ def generate_data1(request):
     return data
 
 
-@pytest.fixture(scope="session", params=[(500, 10), (1000, 20)])
-def generate_data_irm_w_missings(request):
-    n_p = request.param
-    np.random.seed(1111)
-    # setting parameters
-    n = n_p[0]
-    p = n_p[1]
-    theta = 0.5
-
-    # generating data
-    (x, y, d) = make_irm_data(n, p, theta, return_type="array")
-
-    # randomly set some entries to np.nan
-    ind = np.random.choice(np.arange(x.size), replace=False, size=int(x.size * 0.05))
-    x[np.unravel_index(ind, x.shape)] = np.nan
-    data = (x, y, d)
-
-    return data
-
-
 @pytest.fixture(scope="session", params=[(1000, 20)])
 def generate_data_iv(request):
     n_p = request.param
diff --git a/doubleml/tests/test_datasets.py b/doubleml/tests/test_datasets.py
index 2f3ff80a..67f612e8 100644
--- a/doubleml/tests/test_datasets.py
+++ b/doubleml/tests/test_datasets.py
@@ -9,7 +9,6 @@
     fetch_bonus,
     make_confounded_irm_data,
     make_confounded_plr_data,
-    make_did_SZ2020,
     make_heterogeneous_data,
     make_iivm_data,
     make_irm_data,
@@ -165,42 +164,6 @@ def test_make_pliv_multiway_cluster_CKMS2021_return_types():
     _ = make_pliv_multiway_cluster_CKMS2021(N=10, M=10, return_type="matrix")
 
 
-@pytest.fixture(scope="function", params=[False, True])
-def cross_sectional(request):
-    return request.param
-
-
-@pytest.fixture(scope="function", params=[1, 2, 3, 4, 5, 6])
-def dgp_type(request):
-    return request.param
-
-
-@pytest.mark.ci
-def test_make_did_SZ2020_return_types(cross_sectional, dgp_type):
-    np.random.seed(3141)
-    res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=DoubleMLData)
-    assert isinstance(res, DoubleMLData)
-    res = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=pd.DataFrame)
-    assert isinstance(res, pd.DataFrame)
-    if cross_sectional:
-        x, y, d, t = make_did_SZ2020(
-            n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=np.ndarray
-        )
-        assert isinstance(t, np.ndarray)
-    else:
-        x, y, d, _ = make_did_SZ2020(
-            n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type=np.ndarray
-        )
-    assert isinstance(x, np.ndarray)
-    assert isinstance(y, np.ndarray)
-    assert isinstance(d, np.ndarray)
-    with pytest.raises(ValueError, match=msg_inv_return_type):
-        _ = make_did_SZ2020(n_obs=100, dgp_type=dgp_type, cross_sectional_data=cross_sectional, return_type="matrix")
-    msg = "The dgp_type is not valid."
-    with pytest.raises(ValueError, match=msg):
-        _ = make_did_SZ2020(n_obs=100, dgp_type="5", cross_sectional_data=cross_sectional, return_type="matrix")
-
-
 @pytest.fixture(scope="function", params=[True, False])
 def linear(request):
     return request.param
diff --git a/doubleml/tests/test_exceptions.py b/doubleml/tests/test_exceptions.py
index e5fa1924..a4655bb9 100644
--- a/doubleml/tests/test_exceptions.py
+++ b/doubleml/tests/test_exceptions.py
@@ -22,13 +22,13 @@
     DoubleMLQTE,
 )
 from doubleml.datasets import (
-    make_did_SZ2020,
     make_iivm_data,
     make_irm_data,
     make_pliv_CHS2015,
     make_pliv_multiway_cluster_CKMS2021,
     make_plr_CCDDHNR2018,
 )
+from doubleml.did.datasets import make_did_SZ2020
 
 from ._utils import DummyDataClass
diff --git a/doubleml/tests/test_model_defaults.py b/doubleml/tests/test_model_defaults.py
index 401827b1..f55a555c 100644
--- a/doubleml/tests/test_model_defaults.py
+++ b/doubleml/tests/test_model_defaults.py
@@ -5,13 +5,13 @@
 import doubleml as dml
 from doubleml.datasets import (
-    make_did_SZ2020,
     make_iivm_data,
     make_irm_data,
     make_pliv_CHS2015,
     make_plr_CCDDHNR2018,
     make_ssm_data,
 )
+from doubleml.did.datasets import make_did_SZ2020
 
 np.random.seed(3141)
 dml_data_plr = make_plr_CCDDHNR2018(n_obs=100)
diff --git a/doubleml/tests/test_return_types.py b/doubleml/tests/test_return_types.py
index 51c39c24..11ebd624 100644
--- a/doubleml/tests/test_return_types.py
+++ b/doubleml/tests/test_return_types.py
@@ -24,7 +24,6 @@
     DoubleMLSSM,
 )
 from doubleml.datasets import (
-    make_did_SZ2020,
     make_iivm_data,
     make_irm_data,
     make_pliv_CHS2015,
@@ -32,6 +31,7 @@
     make_plr_CCDDHNR2018,
     make_ssm_data,
 )
+from doubleml.did.datasets import make_did_SZ2020
 
 np.random.seed(3141)
 n_obs = 200
diff --git a/doubleml/utils/_aliases.py b/doubleml/utils/_aliases.py
new file mode 100644
index 00000000..e52a5818
--- /dev/null
+++ b/doubleml/utils/_aliases.py
@@ -0,0 +1,29 @@
+import numpy as np
+import pandas as pd
+
+from doubleml.data import DoubleMLClusterData, DoubleMLData
+
+_array_alias = ["array", "np.ndarray", "np.array", np.ndarray]
+_data_frame_alias = ["DataFrame", "pd.DataFrame", pd.DataFrame]
+_dml_data_alias = ["DoubleMLData", DoubleMLData]
+_dml_cluster_data_alias = ["DoubleMLClusterData", DoubleMLClusterData]
+
+
+def _get_array_alias():
+    """Returns the list of array aliases."""
+    return _array_alias
+
+
+def _get_data_frame_alias():
+    """Returns the list of data frame aliases."""
+    return _data_frame_alias
+
+
+def _get_dml_data_alias():
+    """Returns the list of DoubleMLData aliases."""
+    return _dml_data_alias
+
+
+def _get_dml_cluster_data_alias():
+    """Returns the list of DoubleMLClusterData aliases."""
+    return _dml_cluster_data_alias
diff --git a/doubleml/utils/_check_defaults.py b/doubleml/utils/_check_defaults.py
new file mode 100644
index 00000000..5f376000
--- /dev/null
+++ b/doubleml/utils/_check_defaults.py
@@ -0,0 +1,61 @@
+import numpy as np
+import pandas as pd
+
+from doubleml.double_ml import DoubleML
+
+
+def _check_basic_defaults_before_fit(dml_obj):
+    # general parameters
+    assert dml_obj.n_folds == 5
+    assert dml_obj.n_rep == 1
+    assert dml_obj.framework is None
+    pd.testing.assert_frame_equal(dml_obj.summary, pd.DataFrame(columns=["coef", "std err", "t", "P>|t|"]))
+
+    # bootstrap
+    assert dml_obj.boot_method is None
+    assert dml_obj.n_rep_boot is None
+    assert dml_obj.boot_t_stat is None
+
+    # sensitivity
+    assert dml_obj.sensitivity_params is None
+    assert dml_obj.sensitivity_elements is None
+
+
+def _fit_bootstrap(dml_obj):
+    dml_obj.fit()
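+    # bootstrap() is called with the package defaults here; the after-fit check
+    # below asserts these resolve to boot_method "normal" and n_rep_boot == 500.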
+    dml_obj.bootstrap()
+
+
+def _check_basic_defaults_after_fit(dml_obj):
+    # general parameters
+    assert dml_obj.n_folds == 5
+    assert dml_obj.n_rep == 1
+    assert dml_obj.framework is not None
+
+    # coefs and se
+    assert isinstance(dml_obj.coef, np.ndarray)
+    assert isinstance(dml_obj.se, np.ndarray)
+    assert isinstance(dml_obj.all_coef, np.ndarray)
+    assert isinstance(dml_obj.all_se, np.ndarray)
+    assert isinstance(dml_obj.t_stat, np.ndarray)
+    assert isinstance(dml_obj.pval, np.ndarray)
+
+    # bootstrap
+    assert dml_obj.boot_method == "normal"
+    assert dml_obj.n_rep_boot == 500
+    assert isinstance(dml_obj.boot_t_stat, np.ndarray)
+
+    # sensitivity
+    assert dml_obj.sensitivity_params is None
+    assert isinstance(dml_obj.sensitivity_elements, dict)
+
+    # fit method
+    if isinstance(dml_obj, DoubleML):
+        assert dml_obj.predictions is not None
+        assert dml_obj.models is None
+
+    # confint method
+    assert dml_obj.confint().equals(dml_obj.confint(joint=False, level=0.95))
+
+    # p_adjust method
+    assert dml_obj.p_adjust().equals(dml_obj.p_adjust(method="romano-wolf"))
diff --git a/doubleml/utils/_check_return_types.py b/doubleml/utils/_check_return_types.py
new file mode 100644
index 00000000..54462059
--- /dev/null
+++ b/doubleml/utils/_check_return_types.py
@@ -0,0 +1,153 @@
+import numpy as np
+import pandas as pd
+import plotly
+
+from doubleml import DoubleMLFramework
+from doubleml.data import DoubleMLClusterData
+from doubleml.double_ml_score_mixins import NonLinearScoreMixin
+
+
+def check_basic_return_types(dml_obj, cls):
+    # ToDo: A second test case with multiple treatment variables would be helpful
+    assert isinstance(dml_obj.__str__(), str)
+    assert isinstance(dml_obj.summary, pd.DataFrame)
+    assert isinstance(dml_obj.draw_sample_splitting(), cls)
+    if not dml_obj._is_cluster_data:
+        assert isinstance(dml_obj.set_sample_splitting(dml_obj.smpls), cls)
+    else:
+        assert isinstance(dml_obj._dml_data, DoubleMLClusterData)
+    assert isinstance(dml_obj.fit(), cls)
+    assert isinstance(dml_obj.__str__(), str)  # called again after fit, now with numbers
+    assert isinstance(dml_obj.summary, pd.DataFrame)  # called again after fit, now with numbers
+    if not dml_obj._is_cluster_data:
+        assert isinstance(dml_obj.bootstrap(), cls)
+    else:
+        assert isinstance(dml_obj._dml_data, DoubleMLClusterData)
+    assert isinstance(dml_obj.confint(), pd.DataFrame)
+    if not dml_obj._is_cluster_data:
+        assert isinstance(dml_obj.p_adjust(), pd.DataFrame)
+    else:
+        assert isinstance(dml_obj.p_adjust("bonferroni"), pd.DataFrame)
+    assert isinstance(dml_obj._dml_data.__str__(), str)
+
+
+def check_basic_property_types_and_shapes(dml_obj, n_obs, n_treat, n_rep, n_folds, n_rep_boot):
+    # not checked: learner, learner_names, params, params_names, score
+    # already checked: summary
+
+    # check that the setting is still in line with the hard-coded values
+    assert dml_obj._dml_data.n_treat == n_treat
+    assert dml_obj.n_rep == n_rep
+    assert dml_obj.n_folds == n_folds
+    assert dml_obj._dml_data.n_obs == n_obs
+    assert dml_obj.n_rep_boot == n_rep_boot
+
+    assert isinstance(dml_obj.all_coef, np.ndarray)
+    assert dml_obj.all_coef.shape == (n_treat, n_rep)
+
+    assert isinstance(dml_obj.all_se, np.ndarray)
+    assert dml_obj.all_se.shape == (n_treat, n_rep)
+
+    assert isinstance(dml_obj.boot_t_stat, np.ndarray)
+    assert dml_obj.boot_t_stat.shape == (n_rep_boot, n_treat, n_rep)
+
+    assert isinstance(dml_obj.coef, np.ndarray)
+    assert dml_obj.coef.shape == (n_treat,)
+
+    assert isinstance(dml_obj.psi, np.ndarray)
+    assert dml_obj.psi.shape == (
+        n_obs,
+        n_rep,
+        n_treat,
+    )
+
+    is_nonlinear = isinstance(dml_obj, NonLinearScoreMixin)
+    if is_nonlinear:
+        for score_element in dml_obj._score_element_names:
+            assert isinstance(dml_obj.psi_elements[score_element], np.ndarray)
+            assert dml_obj.psi_elements[score_element].shape == (
+                n_obs,
+                n_rep,
+                n_treat,
+            )
+    else:
+        assert isinstance(dml_obj.psi_elements["psi_a"], np.ndarray)
+        assert dml_obj.psi_elements["psi_a"].shape == (
+            n_obs,
+            n_rep,
+            n_treat,
+        )
+
+        assert isinstance(dml_obj.psi_elements["psi_b"], np.ndarray)
+        assert dml_obj.psi_elements["psi_b"].shape == (
+            n_obs,
+            n_rep,
+            n_treat,
+        )
+
+    assert isinstance(dml_obj.framework, DoubleMLFramework)
+    assert isinstance(dml_obj.pval, np.ndarray)
+    assert dml_obj.pval.shape == (n_treat,)
+
+    assert isinstance(dml_obj.se, np.ndarray)
+    assert dml_obj.se.shape == (n_treat,)
+
+    assert isinstance(dml_obj.t_stat, np.ndarray)
+    assert dml_obj.t_stat.shape == (n_treat,)
+
+    assert isinstance(dml_obj._dml_data.binary_treats, pd.Series)
+    assert len(dml_obj._dml_data.binary_treats) == n_treat
+
+    assert isinstance(dml_obj.smpls, list)
+    assert len(dml_obj.smpls) == n_rep
+    all_tuple = all([all([isinstance(tpl, tuple) for tpl in smpl]) for smpl in dml_obj.smpls])
+    assert all_tuple
+    all_pairs = all([all([len(tpl) == 2 for tpl in smpl]) for smpl in dml_obj.smpls])
+    assert all_pairs
+    n_folds_each_smpl = np.array([len(smpl) for smpl in dml_obj.smpls])
+    assert np.all(n_folds_each_smpl == n_folds_each_smpl[0])
+    assert n_folds_each_smpl[0] == n_folds
+
+    return
+
+
+def check_basic_predictions_and_targets(dml_obj, n_obs, n_treat, n_rep):
+
+    expected_keys = dml_obj.params_names
+    for key in expected_keys:
+        assert isinstance(dml_obj.predictions[key], np.ndarray)
+        assert dml_obj.predictions[key].shape == (n_obs, n_rep, n_treat)
+
+        assert isinstance(dml_obj.nuisance_targets[key], np.ndarray)
+        assert dml_obj.nuisance_targets[key].shape == (n_obs, n_rep, n_treat)
+
+        assert isinstance(dml_obj.nuisance_loss[key], np.ndarray)
+        assert dml_obj.nuisance_loss[key].shape == (n_rep, n_treat)
+
+    return
+
+
+def check_sensitivity_return_types(dml_obj, n_obs, n_rep, n_treat, benchmarking_set):
+    assert isinstance(dml_obj.sensitivity_elements, dict)
+    for key in ["sigma2", "nu2"]:
+        assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray)
+        assert dml_obj.sensitivity_elements[key].shape == (1, n_rep, n_treat)
+    for key in ["psi_sigma2", "psi_nu2", "riesz_rep"]:
+        assert isinstance(dml_obj.sensitivity_elements[key], np.ndarray)
+        assert dml_obj.sensitivity_elements[key].shape == (n_obs, n_rep, n_treat)
+
+    assert isinstance(dml_obj.sensitivity_summary, str)
+    dml_obj.sensitivity_analysis()
+    assert isinstance(dml_obj.sensitivity_summary, str)
+    assert isinstance(dml_obj.sensitivity_plot(), plotly.graph_objs._figure.Figure)
+    benchmarks = {"cf_y": [0.1, 0.2], "cf_d": [0.15, 0.2], "name": ["test1", "test2"]}
+    assert isinstance(dml_obj.sensitivity_plot(value="ci", benchmarks=benchmarks), plotly.graph_objs._figure.Figure)
+
+    assert isinstance(dml_obj.framework._calc_sensitivity_analysis(cf_y=0.03, cf_d=0.03, rho=1.0, level=0.95), dict)
+    assert isinstance(
+        dml_obj.framework._calc_robustness_value(null_hypothesis=0.0, level=0.95, rho=1.0, idx_treatment=0), tuple
+    )
+    benchmark = dml_obj.sensitivity_benchmark(benchmarking_set=benchmarking_set)
+    assert isinstance(benchmark, pd.DataFrame)
+
+    return
diff --git a/doubleml/utils/_checks.py b/doubleml/utils/_checks.py
index 90833ded..db1fbf94 100644
--- a/doubleml/utils/_checks.py
+++ b/doubleml/utils/_checks.py
@@ -438,16 +438,17 @@ def _check_cluster_sample_splitting(all_smpls_cluster, dml_data, n_rep, n_folds)
     return smpls_cluster
 
 
-def _check_sample_splitting(all_smpls, all_smpls_cluster, dml_data, is_cluster_data):
+def _check_sample_splitting(all_smpls, all_smpls_cluster, dml_data, is_cluster_data, n_obs=None):
+    # default value for n_obs is None (different for e.g. DoubleMLPanelData)
+    if n_obs is None:
+        n_obs = dml_data.n_obs
     if isinstance(all_smpls, tuple):
         if not len(all_smpls) == 2:
             raise ValueError(
                 "Invalid partition provided. Tuple for train_ind and test_ind must consist of exactly two elements."
             )
-        all_smpls = _check_smpl_split_tpl(all_smpls, dml_data.n_obs)
-        if _check_is_partition([all_smpls], dml_data.n_obs) & _check_is_partition(
-            [(all_smpls[1], all_smpls[0])], dml_data.n_obs
-        ):
+        all_smpls = _check_smpl_split_tpl(all_smpls, n_obs)
+        if _check_is_partition([all_smpls], n_obs) & _check_is_partition([(all_smpls[1], all_smpls[0])], n_obs):
             n_rep = 1
             n_folds = 1
             smpls = [[all_smpls]]
@@ -465,14 +466,14 @@
                 "Invalid partition provided. All tuples for train_ind and test_ind must consist of exactly two elements."
             )
             n_rep = 1
-            all_smpls = _check_smpl_split(all_smpls, dml_data.n_obs)
-            if _check_is_partition(all_smpls, dml_data.n_obs):
-                if (len(all_smpls) == 1) & _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], dml_data.n_obs):
+            all_smpls = _check_smpl_split(all_smpls, n_obs)
+            if _check_is_partition(all_smpls, n_obs):
+                if (len(all_smpls) == 1) & _check_is_partition([(all_smpls[0][1], all_smpls[0][0])], n_obs):
                     n_folds = 1
                     smpls = [all_smpls]
                 else:
                     n_folds = len(all_smpls)
-                    smpls = _check_all_smpls([all_smpls], dml_data.n_obs, check_intersect=True)
+                    smpls = _check_all_smpls([all_smpls], n_obs, check_intersect=True)
             else:
                 raise ValueError("Invalid partition provided. Tuples provided that don't form a partition.")
     else:
@@ -494,13 +495,13 @@
         n_folds_each_smpl = np.array([len(smpl) for smpl in all_smpls])
         if not np.all(n_folds_each_smpl == n_folds_each_smpl[0]):
             raise ValueError("Invalid partition provided. Different number of folds for repeated sample splitting.")
-        all_smpls = _check_all_smpls(all_smpls, dml_data.n_obs)
-        smpls_are_partitions = [_check_is_partition(smpl, dml_data.n_obs) for smpl in all_smpls]
+        all_smpls = _check_all_smpls(all_smpls, n_obs)
+        smpls_are_partitions = [_check_is_partition(smpl, n_obs) for smpl in all_smpls]
 
         if all(smpls_are_partitions):
             n_rep = len(all_smpls)
             n_folds = int(n_folds_each_smpl[0])
-            smpls = _check_all_smpls(all_smpls, dml_data.n_obs, check_intersect=True)
+            smpls = _check_all_smpls(all_smpls, n_obs, check_intersect=True)
         else:
             raise ValueError("Invalid partition provided. At least one inner list does not form a partition.")
diff --git a/pyproject.toml b/pyproject.toml
index 339bd0a3..41f52706 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     "scikit-learn>=1.4.0",
     "statsmodels",
     "matplotlib",
+    "seaborn>=0.13",
     "plotly"
 ]
 classifiers = [