From df03887c56b81d34ad7c499976ceca24e806e71f Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 13 Jan 2025 10:32:40 +0100 Subject: [PATCH 01/23] Logistic regression implementation WIP --- doubleml/__init__.py | 2 + doubleml/double_ml.py | 6 + doubleml/double_ml_data.py | 1049 +++++++++++++++++++++++++++++++++ doubleml/logistic/logistic.py | 463 +++++++++++++++ doubleml/utils/resampling.py | 45 ++ 5 files changed, 1565 insertions(+) create mode 100644 doubleml/double_ml_data.py create mode 100644 doubleml/logistic/logistic.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index 6cf7de96..93549116 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -15,6 +15,7 @@ from .irm.ssm import DoubleMLSSM from .plm.pliv import DoubleMLPLIV from .plm.plr import DoubleMLPLR +from .logistic.logistic import DoubleMLLogit from .utils.blp import DoubleMLBLP from .utils.policytree import DoubleMLPolicyTree @@ -42,6 +43,7 @@ "DoubleMLBLP", "DoubleMLPolicyTree", "DoubleMLSSM", + "DoubleMLLogit", ] __version__ = importlib.metadata.version("doubleml") diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 05481bf1..1cc6bcf9 100644 --- a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -585,6 +585,12 @@ def fit(self, n_jobs_cv=None, store_predictions=True, external_predictions=None, # construct framework for inference self._framework = self.construct_framework() + + + + + + return self def construct_framework(self): diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py new file mode 100644 index 00000000..4f8d7cbc --- /dev/null +++ b/doubleml/double_ml_data.py @@ -0,0 +1,1049 @@ +import numpy as np +import pandas as pd +import io + +from abc import ABC, abstractmethod + +from sklearn.utils.validation import check_array, column_or_1d, check_consistent_length +from sklearn.utils import assert_all_finite +from sklearn.utils.multiclass import type_of_target +from .utils._estimation import _assure_2d_array +from .utils._checks import _check_set + + +class DoubleMLBaseData(ABC): + """Base Class Double machine learning data-backends + """ + def __init__(self, + data): + if not isinstance(data, pd.DataFrame): + raise TypeError('data must be of pd.DataFrame type. ' + f'{str(data)} of type {str(type(data))} was passed.') + if not data.columns.is_unique: + raise ValueError('Invalid pd.DataFrame: ' + 'Contains duplicate column names.') + self._data = data + + def __str__(self): + data_summary = self._data_summary_str() + buf = io.StringIO() + self.data.info(verbose=False, buf=buf) + df_info = buf.getvalue() + res = '================== DoubleMLBaseData Object ==================\n' + \ + '\n------------------ Data summary ------------------\n' + data_summary + \ + '\n------------------ DataFrame info ------------------\n' + df_info + return res + + def _data_summary_str(self): + data_summary = f'No. Observations: {self.n_obs}\n' + return data_summary + + @property + def data(self): + """ + The data. + """ + return self._data + + @property + def all_variables(self): + """ + All variables available in the dataset. + """ + return self.data.columns + + @property + def n_obs(self): + """ + The number of observations. 
+ """ + return self.data.shape[0] + + # TODO: This and the following property does not make sense but the base class DoubleML needs it (especially for the + # multiple treatment variables case) and other things are also build around it, see for example DoubleML._params + @property + def d_cols(self): + return ['theta'] + + @property + def n_treat(self): + """ + The number of treatment variables. + """ + return 1 + + @property + @abstractmethod + def n_coefs(self): + pass + + +class DoubleMLData(DoubleMLBaseData): + """Double machine learning data-backend. + + :class:`DoubleMLData` objects can be initialized from + :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. + + Parameters + ---------- + data : :class:`pandas.DataFrame` + The data. + + y_col : str + The outcome variable. + + d_cols : str or list + The treatment variable(s). + + x_cols : None, str or list + The covariates. + If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor + treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates. + Default is ``None``. + + z_cols : None, str or list + The instrumental variable(s). + Default is ``None``. + + t_col : None or str + The time variable (only relevant/used for DiD Estimators). + Default is ``None``. + + s_col : None or str + The score or selection variable (only relevant/used for RDD or SSM Estimatiors). + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. 
+ + Examples + -------- + >>> from doubleml import DoubleMLData + >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> # initialization from pandas.DataFrame + >>> df = make_plr_CCDDHNR2018(return_type='DataFrame') + >>> obj_dml_data_from_df = DoubleMLData(df, 'y', 'd') + >>> # initialization from np.ndarray + >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array') + >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d) + """ + def __init__(self, + data, + y_col, + d_cols, + x_cols=None, + z_cols=None, + t_col=None, + s_col=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True): + DoubleMLBaseData.__init__(self, data) + + self.y_col = y_col + self.d_cols = d_cols + self.z_cols = z_cols + self.t_col = t_col + self.s_col = s_col + self.x_cols = x_cols + self._check_disjoint_sets_y_d_x_z_t_s() + self.use_other_treat_as_covariate = use_other_treat_as_covariate + self.force_all_x_finite = force_all_x_finite + self._binary_treats = self._check_binary_treats() + self._binary_outcome = self._check_binary_outcome() + self._set_y_z_t_s() + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) + + def __str__(self): + data_summary = self._data_summary_str() + buf = io.StringIO() + self.data.info(verbose=False, buf=buf) + df_info = buf.getvalue() + res = '================== DoubleMLData Object ==================\n' + \ + '\n------------------ Data summary ------------------\n' + data_summary + \ + '\n------------------ DataFrame info ------------------\n' + df_info + return res + + def _data_summary_str(self): + data_summary = f'Outcome variable: {self.y_col}\n' \ + f'Treatment variable(s): {self.d_cols}\n' \ + f'Covariates: {self.x_cols}\n' \ + f'Instrument variable(s): {self.z_cols}\n' + if self.t_col is not None: + data_summary += f'Time variable: {self.t_col}\n' + if self.s_col is not None: + data_summary += f'Score/Selection variable: {self.s_col}\n' + data_summary += f'No. Observations: {self.n_obs}\n' + return data_summary + + @classmethod + def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covariate=True, + force_all_x_finite=True): + """ + Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s. + + Parameters + ---------- + x : :class:`numpy.ndarray` + Array of covariates. + + y : :class:`numpy.ndarray` + Array of the outcome variable. + + d : :class:`numpy.ndarray` + Array of treatment variables. + + z : None or :class:`numpy.ndarray` + Array of instrumental variables. + Default is ``None``. + + t : :class:`numpy.ndarray` + Array of the time variable (only relevant/used for DiD models). + Default is ``None``. + + s : :class:`numpy.ndarray` + Array of the score or selection variable (only relevant/used for RDD and SSM models). + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). 
+ Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + Examples + -------- + >>> from doubleml import DoubleMLData + >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array') + >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d) + """ + if isinstance(force_all_x_finite, str): + if force_all_x_finite != 'allow-nan': + raise ValueError("Invalid force_all_x_finite " + force_all_x_finite + ". " + + "force_all_x_finite must be True, False or 'allow-nan'.") + elif not isinstance(force_all_x_finite, bool): + raise TypeError("Invalid force_all_x_finite. " + + "force_all_x_finite must be True, False or 'allow-nan'.") + + x = check_array(x, ensure_2d=False, allow_nd=False, + force_all_finite=force_all_x_finite) + d = check_array(d, ensure_2d=False, allow_nd=False) + y = column_or_1d(y, warn=True) + + x = _assure_2d_array(x) + d = _assure_2d_array(d) + + y_col = 'y' + if z is None: + check_consistent_length(x, y, d) + z_cols = None + else: + z = check_array(z, ensure_2d=False, allow_nd=False) + z = _assure_2d_array(z) + check_consistent_length(x, y, d, z) + if z.shape[1] == 1: + z_cols = ['z'] + else: + z_cols = [f'z{i + 1}' for i in np.arange(z.shape[1])] + + if t is None: + t_col = None + else: + t = column_or_1d(t, warn=True) + check_consistent_length(x, y, d, t) + t_col = 't' + + if s is None: + s_col = None + else: + s = column_or_1d(s, warn=True) + check_consistent_length(x, y, d, s) + s_col = 's' + + if d.shape[1] == 1: + d_cols = ['d'] + else: + d_cols = [f'd{i+1}' for i in np.arange(d.shape[1])] + + x_cols = [f'X{i+1}' for i in np.arange(x.shape[1])] + + # basline version with features, outcome and treatments + data = pd.DataFrame(np.column_stack((x, y, d)), + columns=x_cols + [y_col] + d_cols) + + if z is not None: + df_z = pd.DataFrame(z, columns=z_cols) + data = pd.concat([data, df_z], axis=1) + + if t is not None: + data[t_col] = t + + if s is not None: + data[s_col] = s + + return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite) + + @property + def x(self): + """ + Array of covariates; + Dynamic! May depend on the currently set treatment variable; + To get an array of all covariates (independent of the currently set treatment variable) + call ``obj.data[obj.x_cols].values``. + """ + return self._X.values + + @property + def y(self): + """ + Array of outcome variable. + """ + return self._y.values + + @property + def d(self): + """ + Array of treatment variable; + Dynamic! Depends on the currently set treatment variable; + To get an array of all treatment variables (independent of the currently set treatment variable) + call ``obj.data[obj.d_cols].values``. + """ + return self._d.values + + @property + def z(self): + """ + Array of instrumental variables. + """ + if self.z_cols is not None: + return self._z.values + else: + return None + + @property + def t(self): + """ + Array of time variable. + """ + if self.t_col is not None: + return self._t.values + else: + return None + + @property + def s(self): + """ + Array of score or selection variable. + """ + if self.s_col is not None: + return self._s.values + else: + return None + + @property + def n_treat(self): + """ + The number of treatment variables. 
+ """ + return len(self.d_cols) + + @property + def n_coefs(self): + """ + The number of coefficients to be estimated. + """ + return self.n_treat + + @property + def n_instr(self): + """ + The number of instruments. + """ + if self.z_cols is not None: + n_instr = len(self.z_cols) + else: + n_instr = 0 + return n_instr + + @property + def binary_treats(self): + """ + Series with logical(s) indicating whether the treatment variable(s) are binary with values 0 and 1. + """ + return self._binary_treats + + @property + def binary_outcome(self): + """ + Logical indicating whether the outcome variable is binary with values 0 and 1. + """ + return self._binary_outcome + + @property + def x_cols(self): + """ + The covariates. + """ + return self._x_cols + + @x_cols.setter + def x_cols(self, value): + reset_value = hasattr(self, '_x_cols') + if value is not None: + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError('The covariates x_cols must be of str or list type (or None). ' + f'{str(value)} of type {str(type(value))} was passed.') + if not len(se + + t(value)) == len(value): + raise ValueError('Invalid covariates x_cols: ' + 'Contains duplicate values.') + if not set(value).issubset(set(self.all_variables)): + raise ValueError('Invalid covariates x_cols. ' + 'At least one covariate is no data column.') + assert set(value).issubset(set(self.all_variables)) + self._x_cols = value + else: + excluded_cols = set.union({self.y_col}, set(self.d_cols)) + if (self.z_cols is not None): + excluded_cols = set.union(excluded_cols, set(self.z_cols)) + for col in [self.t_col, self.s_col]: + col = _check_set(col) + excluded_cols = set.union(excluded_cols, col) + self._x_cols = [col for col in self.data.columns if col not in excluded_cols] + if reset_value: + self._check_disjoint_sets() + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) + + @property + def d_cols(self): + """ + The treatment variable(s). + """ + return self._d_cols + + @d_cols.setter + def d_cols(self, value): + reset_value = hasattr(self, '_d_cols') + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError('The treatment variable(s) d_cols must be of str or list type. ' + f'{str(value)} of type {str(type(value))} was passed.') + if not len(set(value)) == len(value): + raise ValueError('Invalid treatment variable(s) d_cols: ' + 'Contains duplicate values.') + if not set(value).issubset(set(self.all_variables)): + raise ValueError('Invalid treatment variable(s) d_cols. ' + 'At least one treatment variable is no data column.') + self._d_cols = value + if reset_value: + self._check_disjoint_sets() + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) + + @property + def y_col(self): + """ + The outcome variable. + """ + return self._y_col + + @y_col.setter + def y_col(self, value): + reset_value = hasattr(self, '_y_col') + if not isinstance(value, str): + raise TypeError('The outcome variable y_col must be of str type. ' + f'{str(value)} of type {str(type(value))} was passed.') + if value not in self.all_variables: + raise ValueError('Invalid outcome variable y_col. ' + f'{value} is no data column.') + self._y_col = value + if reset_value: + self._check_disjoint_sets() + self._set_y_z_t_s() + + @property + def z_cols(self): + """ + The instrumental variable(s). 
+ """ + return self._z_cols + + @z_cols.setter + def z_cols(self, value): + reset_value = hasattr(self, '_z_cols') + if value is not None: + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError('The instrumental variable(s) z_cols must be of str or list type (or None). ' + f'{str(value)} of type {str(type(value))} was passed.') + if not len(set(value)) == len(value): + raise ValueError('Invalid instrumental variable(s) z_cols: ' + 'Contains duplicate values.') + if not set(value).issubset(set(self.all_variables)): + raise ValueError('Invalid instrumental variable(s) z_cols. ' + 'At least one instrumental variable is no data column.') + self._z_cols = value + else: + self._z_cols = None + if reset_value: + self._check_disjoint_sets() + self._set_y_z_t_s() + + @property + def t_col(self): + """ + The time variable. + """ + return self._t_col + + @t_col.setter + def t_col(self, value): + reset_value = hasattr(self, '_t_col') + if value is not None: + if not isinstance(value, str): + raise TypeError('The time variable t_col must be of str type (or None). ' + f'{str(value)} of type {str(type(value))} was passed.') + if value not in self.all_variables: + raise ValueError('Invalid time variable t_col. ' + f'{value} is no data column.') + self._t_col = value + else: + self._t_col = None + if reset_value: + self._check_disjoint_sets() + self._set_y_z_t_s() + + @property + def s_col(self): + """ + The score or selection variable. + """ + return self._s_col + + @s_col.setter + def s_col(self, value): + reset_value = hasattr(self, '_s_col') + if value is not None: + if not isinstance(value, str): + raise TypeError('The score or selection variable s_col must be of str type (or None). ' + f'{str(value)} of type {str(type(value))} was passed.') + if value not in self.all_variables: + raise ValueError('Invalid score or selection variable s_col. ' + f'{value} is no data column.') + self._s_col = value + else: + self._s_col = None + if reset_value: + self._check_disjoint_sets() + self._set_y_z_t_s() + + @property + def use_other_treat_as_covariate(self): + """ + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + """ + return self._use_other_treat_as_covariate + + @use_other_treat_as_covariate.setter + def use_other_treat_as_covariate(self, value): + reset_value = hasattr(self, '_use_other_treat_as_covariate') + if not isinstance(value, bool): + raise TypeError('use_other_treat_as_covariate must be True or False. ' + f'Got {str(value)}.') + self._use_other_treat_as_covariate = value + if reset_value: + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) + + @property + def force_all_x_finite(self): + """ + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + """ + return self._force_all_x_finite + + @force_all_x_finite.setter + def force_all_x_finite(self, value): + reset_value = hasattr(self, '_force_all_x_finite') + if isinstance(value, str): + if value != 'allow-nan': + raise ValueError("Invalid force_all_x_finite " + value + ". " + + "force_all_x_finite must be True, False or 'allow-nan'.") + elif not isinstance(value, bool): + raise TypeError("Invalid force_all_x_finite. 
" + + "force_all_x_finite must be True, False or 'allow-nan'.") + self._force_all_x_finite = value + if reset_value: + # by default, we initialize to the first treatment variable + self.set_x_d(self.d_cols[0]) + + def _set_y_z_t_s(self): + assert_all_finite(self.data.loc[:, self.y_col]) + self._y = self.data.loc[:, self.y_col] + if self.z_cols is None: + self._z = None + else: + assert_all_finite(self.data.loc[:, self.z_cols]) + self._z = self.data.loc[:, self.z_cols] + + if self.t_col is None: + self._t = None + else: + assert_all_finite(self.data.loc[:, self.t_col]) + self._t = self.data.loc[:, self.t_col] + + if self.s_col is None: + self._s = None + else: + assert_all_finite(self.data.loc[:, self.s_col]) + self._s = self.data.loc[:, self.s_col] + + def set_x_d(self, treatment_var): + """ + Function that assigns the role for the treatment variables in the multiple-treatment case. + + Parameters + ---------- + treatment_var : str + Active treatment variable that will be set to d. + """ + if not isinstance(treatment_var, str): + raise TypeError('treatment_var must be of str type. ' + f'{str(treatment_var)} of type {str(type(treatment_var))} was passed.') + if treatment_var not in self.d_cols: + raise ValueError('Invalid treatment_var. ' + f'{treatment_var} is not in d_cols.') + if self.use_other_treat_as_covariate: + # note that the following line needs to be adapted in case an intersection of x_cols and d_cols as allowed + # (see https://github.com/DoubleML/doubleml-for-py/issues/83) + xd_list = self.x_cols + self.d_cols + xd_list.remove(treatment_var) + else: + xd_list = self.x_cols + assert_all_finite(self.data.loc[:, treatment_var]) + if self.force_all_x_finite: + assert_all_finite(self.data.loc[:, xd_list], + allow_nan=self.force_all_x_finite == 'allow-nan') + self._d = self.data.loc[:, treatment_var] + self._X = self.data.loc[:, xd_list] + + def _check_binary_treats(self): + is_binary = pd.Series(dtype=bool, index=self.d_cols) + for treatment_var in self.d_cols: + this_d = self.data.loc[:, treatment_var] + binary_treat = (type_of_target(this_d) == 'binary') + zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0) + is_binary[treatment_var] = (binary_treat & zero_one_treat) + return is_binary + + def _check_binary_outcome(self): + y = self.data.loc[:, self.y_col] + binary_outcome = (type_of_target(y) == 'binary') + zero_one_outcome = np.all((np.power(y, 2) - y) == 0) + is_binary = (binary_outcome & zero_one_outcome) + return is_binary + + def _check_disjoint_sets(self): + # this function can be extended in inherited subclasses + self._check_disjoint_sets_y_d_x_z_t_s() + + def _check_disjoint_sets_y_d_x_z_t_s(self): + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + + if not y_col_set.isdisjoint(x_cols_set): + raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and covariate in ' + '``x_cols``.') + if not y_col_set.isdisjoint(d_cols_set): + raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and treatment variable in ' + '``d_cols``.') + # note that the line xd_list = self.x_cols + self.d_cols in method set_x_d needs adaption if an intersection of + # x_cols and d_cols as allowed (see https://github.com/DoubleML/doubleml-for-py/issues/83) + if not d_cols_set.isdisjoint(x_cols_set): + raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and as covariate' + '(``x_cols``). 
Consider using parameter ``use_other_treat_as_covariate``.') + + if self.z_cols is not None: + z_cols_set = set(self.z_cols) + if not y_col_set.isdisjoint(z_cols_set): + raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and instrumental ' + 'variable in ``z_cols``.') + if not d_cols_set.isdisjoint(z_cols_set): + raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and ' + 'instrumental variable in ``z_cols``.') + if not x_cols_set.isdisjoint(z_cols_set): + raise ValueError('At least one variable/column is set as covariate (``x_cols``) and instrumental ' + 'variable in ``z_cols``.') + + self._check_disjoint_sets_t_s() + + def _check_disjoint_sets_t_s(self): + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + + if self.t_col is not None: + t_col_set = {self.t_col} + if not t_col_set.isdisjoint(x_cols_set): + raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and covariate in ' + '``x_cols``.') + if not t_col_set.isdisjoint(d_cols_set): + raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and treatment variable in ' + '``d_cols``.') + if not t_col_set.isdisjoint(y_col_set): + raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and outcome variable ' + '``y_col``.') + if self.z_cols is not None: + z_cols_set = set(self.z_cols) + if not t_col_set.isdisjoint(z_cols_set): + raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and instrumental ' + 'variable in ``z_cols``.') + + if self.s_col is not None: + s_col_set = {self.s_col} + if not s_col_set.isdisjoint(x_cols_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and covariate in ' + '``x_cols``.') + if not s_col_set.isdisjoint(d_cols_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and treatment ' + 'variable in ``d_cols``.') + if not s_col_set.isdisjoint(y_col_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and outcome ' + 'variable ``y_col``.') + if self.z_cols is not None: + z_cols_set = set(self.z_cols) + if not s_col_set.isdisjoint(z_cols_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and ' + 'instrumental variable in ``z_cols``.') + if self.t_col is not None: + t_col_set = {self.t_col} + if not s_col_set.isdisjoint(t_col_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and time ' + 'variable ``t_col``.') + + +class DoubleMLClusterData(DoubleMLData): + """Double machine learning data-backend for data with cluster variables. + + :class:`DoubleMLClusterData` objects can be initialized from + :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. + + Parameters + ---------- + data : :class:`pandas.DataFrame` + The data. + + y_col : str + The outcome variable. + + d_cols : str or list + The treatment variable(s). + + cluster_cols : str or list + The cluster variable(s). + + x_cols : None, str or list + The covariates. + If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor + treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates. + Default is ``None``. + + z_cols : None, str or list + The instrumental variable(s). + Default is ``None``. 
+ + t_col : None or str + The time variable (only relevant/used for DiD Estimators). + Default is ``None``. + + s_col : None or str + The score or selection variable (only relevant/used for RDD and SSM Estimatiors). + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + Examples + -------- + >>> from doubleml import DoubleMLClusterData + >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 + >>> # initialization from pandas.DataFrame + >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame') + >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z') + >>> # initialization from np.ndarray + >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') + >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) + """ + def __init__(self, + data, + y_col, + d_cols, + cluster_cols, + x_cols=None, + z_cols=None, + t_col=None, + s_col=None, + use_other_treat_as_covariate=True, + force_all_x_finite=True): + DoubleMLBaseData.__init__(self, data) + + # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter + self.cluster_cols = cluster_cols + self._set_cluster_vars() + DoubleMLData.__init__(self, + data, + y_col, + d_cols, + x_cols, + z_cols, + t_col, + s_col, + use_other_treat_as_covariate, + force_all_x_finite) + self._check_disjoint_sets_cluster_cols() + + def __str__(self): + data_summary = self._data_summary_str() + buf = io.StringIO() + self.data.info(verbose=False, buf=buf) + df_info = buf.getvalue() + res = '================== DoubleMLClusterData Object ==================\n' + \ + '\n------------------ Data summary ------------------\n' + data_summary + \ + '\n------------------ DataFrame info ------------------\n' + df_info + return res + + def _data_summary_str(self): + data_summary = f'Outcome variable: {self.y_col}\n' \ + f'Treatment variable(s): {self.d_cols}\n' \ + f'Cluster variable(s): {self.cluster_cols}\n' \ + f'Covariates: {self.x_cols}\n' \ + f'Instrument variable(s): {self.z_cols}\n' + if self.t_col is not None: + data_summary += f'Time variable: {self.t_col}\n' + if self.s_col is not None: + data_summary += f'Score/Selection variable: {self.s_col}\n' + + data_summary += f'No. Observations: {self.n_obs}\n' + return data_summary + + @classmethod + def from_arrays(cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, + force_all_x_finite=True): + """ + Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s. + + Parameters + ---------- + x : :class:`numpy.ndarray` + Array of covariates. + + y : :class:`numpy.ndarray` + Array of the outcome variable. 
+ + d : :class:`numpy.ndarray` + Array of treatment variables. + + cluster_vars : :class:`numpy.ndarray` + Array of cluster variables. + + z : None or :class:`numpy.ndarray` + Array of instrumental variables. + Default is ``None``. + + t : :class:`numpy.ndarray` + Array of the time variable (only relevant/used for DiD models). + Default is ``None``. + + s : :class:`numpy.ndarray` + Array of the score or selection variable (only relevant/used for RDD or SSM models). + Default is ``None``. + + use_other_treat_as_covariate : bool + Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. + Default is ``True``. + + force_all_x_finite : bool or str + Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. + Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are + allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). + Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used + for the nuisance functions are capable to provide valid predictions with missings and / or infinite values + in the covariates ``x``. + Default is ``True``. + + Examples + -------- + >>> from doubleml import DoubleMLClusterData + >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 + >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') + >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) + """ + dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite) + cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False) + cluster_vars = _assure_2d_array(cluster_vars) + if cluster_vars.shape[1] == 1: + cluster_cols = ['cluster_var'] + else: + cluster_cols = [f'cluster_var{i + 1}' for i in np.arange(cluster_vars.shape[1])] + + data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1) + + return (cls(data, dml_data.y_col, dml_data.d_cols, cluster_cols, + dml_data.x_cols, dml_data.z_cols, dml_data.t_col, dml_data.s_col, + dml_data.use_other_treat_as_covariate, dml_data.force_all_x_finite)) + + @property + def cluster_cols(self): + """ + The cluster variable(s). + """ + return self._cluster_cols + + @cluster_cols.setter + def cluster_cols(self, value): + reset_value = hasattr(self, '_cluster_cols') + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError('The cluster variable(s) cluster_cols must be of str or list type. ' + f'{str(value)} of type {str(type(value))} was passed.') + if not len(set(value)) == len(value): + raise ValueError('Invalid cluster variable(s) cluster_cols: ' + 'Contains duplicate values.') + if not set(value).issubset(set(self.all_variables)): + raise ValueError('Invalid cluster variable(s) cluster_cols. ' + 'At least one cluster variable is no data column.') + self._cluster_cols = value + if reset_value: + self._check_disjoint_sets() + self._set_cluster_vars() + + @property + def n_cluster_vars(self): + """ + The number of cluster variables. + """ + return len(self.cluster_cols) + + @property + def cluster_vars(self): + """ + Array of cluster variable(s). 
+ """ + return self._cluster_vars.values + + @DoubleMLData.x_cols.setter + def x_cols(self, value): + if value is not None: + # this call might become much easier with https://github.com/python/cpython/pull/26194 + super(self.__class__, self.__class__).x_cols.__set__(self, value) + else: + if self.s_col is None: + if (self.z_cols is not None) & (self.t_col is not None): + y_d_z_t = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_z_t] + elif self.z_cols is not None: + y_d_z = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_z] + elif self.t_col is not None: + y_d_t = set.union({self.y_col}, set(self.d_cols), {self.t_col}, set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_t] + else: + y_d = set.union({self.y_col}, set(self.d_cols), set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d] + else: + if (self.z_cols is not None) & (self.t_col is not None): + y_d_z_t_s = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, {self.s_col}, + set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_z_t_s] + elif self.z_cols is not None: + y_d_z_s = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.s_col}, set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_z_s] + elif self.t_col is not None: + y_d_t_s = set.union({self.y_col}, set(self.d_cols), {self.t_col}, {self.s_col}, set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_t_s] + else: + y_d_s = set.union({self.y_col}, set(self.d_cols), {self.s_col}, set(self.cluster_cols)) + x_cols = [col for col in self.data.columns if col not in y_d_s] + # this call might become much easier with https://github.com/python/cpython/pull/26194 + super(self.__class__, self.__class__).x_cols.__set__(self, x_cols) + + def _check_disjoint_sets(self): + # apply the standard checks from the DoubleMLData class + super(DoubleMLClusterData, self)._check_disjoint_sets() + self._check_disjoint_sets_cluster_cols() + + def _check_disjoint_sets_cluster_cols(self): + # apply the standard checks from the DoubleMLData class + super(DoubleMLClusterData, self)._check_disjoint_sets() + + # special checks for the additional cluster variables + cluster_cols_set = set(self.cluster_cols) + y_col_set = {self.y_col} + x_cols_set = set(self.x_cols) + d_cols_set = set(self.d_cols) + t_col_set = {self.t_col} + s_col_set = {self.s_col} + + if not y_col_set.isdisjoint(cluster_cols_set): + raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and cluster ' + 'variable in ``cluster_cols``.') + if not d_cols_set.isdisjoint(cluster_cols_set): + raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and ' + 'cluster variable in ``cluster_cols``.') + # TODO: Is the following combination allowed, or not? 
+ if not x_cols_set.isdisjoint(cluster_cols_set): + raise ValueError('At least one variable/column is set as covariate (``x_cols``) and cluster ' + 'variable in ``cluster_cols``.') + if self.z_cols is not None: + z_cols_set = set(self.z_cols) + if not z_cols_set.isdisjoint(cluster_cols_set): + raise ValueError('At least one variable/column is set as instrumental variable (``z_cols``) and ' + 'cluster variable in ``cluster_cols``.') + if self.t_col is not None: + if not t_col_set.isdisjoint(cluster_cols_set): + raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and ' + 'cluster variable in ``cluster_cols``.') + if self.s_col is not None: + if not s_col_set.isdisjoint(cluster_cols_set): + raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and ' + 'cluster variable in ``cluster_cols``.') + + def _set_cluster_vars(self): + assert_all_finite(self.data.loc[:, self.cluster_cols]) + self._cluster_vars = self.data.loc[:, self.cluster_cols] diff --git a/doubleml/logistic/logistic.py b/doubleml/logistic/logistic.py new file mode 100644 index 00000000..8915215b --- /dev/null +++ b/doubleml/logistic/logistic.py @@ -0,0 +1,463 @@ +import numpy as np +from doubleml.utils._estimation import ( + _dml_cv_predict, + _trimm, + _predict_zero_one_propensity, + _cond_targets, + _get_bracket_guess, + _default_kde, + _normalize_ipw, + _dml_tune, + _solve_ipw_score, +) +from sklearn.base import clone +from sklearn.utils import check_X_y +import scipy +from sklearn.utils.multiclass import type_of_target + +from doubleml import DoubleMLData, DoubleMLBLP +from doubleml.double_ml import DoubleML +from doubleml.double_ml_score_mixins import NonLinearScoreMixin +from doubleml.utils import DoubleMLClusterResampling +from doubleml.utils._checks import _check_score, _check_finite_predictions, _check_is_propensity +from doubleml.utils.resampling import DoubleMLDoubleResampling + + +class DoubleMLLogit(NonLinearScoreMixin, DoubleML): + """Double machine learning for partially linear regression models + + Parameters + ---------- + obj_dml_data : :class:`DoubleMLData` object + The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. + + ml_r : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`\\ell_0(X) = E[Y|X]`. + + ml_m : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`m_0(X) = E[D|X]`. + For binary treatment variables :math:`D` (with values 0 and 1), a classifier implementing ``fit()`` and + ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` returns ``True``, + ``predict_proba()`` is used otherwise ``predict()``. + + ml_g : estimator implementing ``fit()`` and ``predict()`` + A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. + :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function + :math:`g_0(X) = E[Y - D \\theta_0|X]`. + Note: The learner `ml_g` is only required for the score ``'IV-type'``. Optionally, it can be specified and + estimated for callable scores. + + n_folds : int + Number of folds. + Default is ``5``. + + n_rep : int + Number of repetitons for the sample splitting. + Default is ``1``. 
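+
+    n_folds_inner : int
+        Number of folds of the inner sample splitting used for the nested
+        cross-fitting of the nuisance estimates.
+        Default is ``5``.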
+ + score : str or callable + A str (``'partialling out'`` or ``'IV-type'``) specifying the score function + or a callable object / function with signature ``psi_a, psi_b = score(y, d, l_hat, m_hat, g_hat, smpls)``. + Default is ``'partialling out'``. + + draw_sample_splitting : bool + Indicates whether the sample splitting should be drawn during initialization of the object. + Default is ``True``. + + Examples + -------- + >>> import numpy as np + >>> import doubleml as dml + >>> from doubleml.datasets import make_plr_CCDDHNR2018 + >>> from sklearn.ensemble import RandomForestRegressor + >>> from sklearn.base import clone + >>> np.random.seed(3141) + >>> learner = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) + >>> ml_g = learner + >>> ml_m = learner + >>> obj_dml_data = make_plr_CCDDHNR2018(alpha=0.5, n_obs=500, dim_x=20) + >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) + >>> dml_plr_obj.fit().summary + coef std err t P>|t| 2.5 % 97.5 % + d 0.462321 0.04107 11.256983 2.139582e-29 0.381826 0.542816 + + Notes + ----- + **Partially linear regression (PLR)** models take the form + + .. math:: + + Y = D \\theta_0 + g_0(X) + \\zeta, & &\\mathbb{E}(\\zeta | D,X) = 0, + + D = m_0(X) + V, & &\\mathbb{E}(V | X) = 0, + + where :math:`Y` is the outcome variable and :math:`D` is the policy variable of interest. + The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates, + and :math:`\\zeta` and :math:`V` are stochastic errors. + """ + + def __init__(self, + obj_dml_data, + ml_r, + ml_m, + ml_M, + ml_t, + ml_a=None, + n_folds=5, + n_folds_inner=5, + n_rep=1, + score='logistic', + draw_sample_splitting=True): + super().__init__(obj_dml_data, + n_folds, + n_rep, + score, + draw_sample_splitting) + + self._check_data(self._dml_data) + valid_scores = ['logistic'] + _check_score(self.score, valid_scores, allow_callable=True) + + _ = self._check_learner(ml_r, 'ml_r', regressor=True, classifier=False) + _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) + _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) + ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=True) + self._learner = {'ml_l': ml_r, 'ml_m': ml_m, 'ml_t': ml_t, 'ml_M': ml_M} + + if ml_a is not None: + ml_a_is_classifier = self._check_learner(ml_a, 'ml_a', regressor=True, classifier=True) + self._learner['ml_a'] = ml_a + else: + self._learner['ml_a'] = clone(ml_m) + ml_a_is_classifier = ml_m_is_classifier + + self._predict_method = {'ml_r': 'predict', 'ml_t': 'predict', 'ml_M': 'predict_proba'} + + if ml_m_is_classifier: + if self._dml_data.binary_treats.all(): + self._predict_method['ml_m'] = 'predict_proba' + else: + raise ValueError(f'The ml_m learner {str(ml_m)} was identified as classifier ' + 'but at least one treatment variable is not binary with values 0 and 1.') + else: + self._predict_method['ml_m'] = 'predict' + + if ml_a_is_classifier: + if self._dml_data.binary_treats.all(): + self._predict_method['ml_a'] = 'predict_proba' + else: + raise ValueError(f'The ml_a learner {str(ml_a)} was identified as classifier ' + 'but at least one treatment variable is not binary with values 0 and 1.') + else: + self._predict_method['ml_a'] = 'predict' + + self._initialize_ml_nuisance_params() + self._sensitivity_implemented = True + self._external_predictions_implemented = True + + def _initialize_ml_nuisance_params(self): + self._params = {learner: {key: [None] * self.n_rep for 
key in self._dml_data.d_cols} + for learner in self._learner} + + def _check_data(self, obj_dml_data): + if not isinstance(obj_dml_data, DoubleMLData): + raise TypeError('The data must be of DoubleMLData type. ' + f'{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed.') + return + + def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, + n_jobs=None, est_params=None, method='predict'): + res = {} + res['preds'] = np.zeros_like(y) + res['preds_inner'] = np.zeros_like(y) + for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): + res_inner = _dml_cv_predict(estimator, x, y, smpls=smpls_double_split, n_jobs=n_jobs, + est_params=est_params, method=method, + return_models=True) + _check_finite_predictions(res_inner['preds'], estimator, estimator_name, smpls_double_split) + + res['preds_inner'] += res_inner['preds'] + for model in res_inner['models']: + res['models'].append(model) + res['preds'][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) + + res["preds"] /= len(smpls) + res['targets'] = np.copy(y) + + + + def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + x, y = check_X_y(self._dml_data.x, self._dml_data.y, + force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, + force_all_finite=False) + x_d_concat = np.hstack([[d, np.newaxis], x]) + r_external = external_predictions['ml_r'] is not None + m_external = external_predictions['ml_m'] is not None + M_external = external_predictions['ml_M'] is not None + t_external = external_predictions['ml_t'] is not None + if 'ml_a' in self._learner: + a_external = external_predictions['ml_a'] is not None + else: + a_external = False + + # nuisance m + if m_external: + m_hat = {'preds': external_predictions['ml_m'], + 'targets': None, + 'models': None} + else: + m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], + return_models=return_models) + _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) + + if self._check_learner(self._learner['ml_m'], 'ml_m', regressor=True, classifier=True): + _check_is_propensity(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls, eps=1e-12) + + if self._dml_data.binary_treats[self._dml_data.d_cols[self._i_treat]]: + binary_preds = (type_of_target(m_hat['preds']) == 'binary') + zero_one_preds = np.all((np.power(m_hat['preds'], 2) - m_hat['preds']) == 0) + if binary_preds & zero_one_preds: + raise ValueError(f'For the binary treatment variable {self._dml_data.d_cols[self._i_treat]}, ' + f'predictions obtained with the ml_m learner {str(self._learner["ml_m"])} are also ' + 'observed to be binary with values 0 and 1. 
Make sure that for classifiers ' + 'probabilities and not labels are predicted.') + + + if M_external: + M_hat = {'preds': external_predictions['ml_M'], + 'targets': None, + 'models': None} + else: + M_hat = (self.double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=smpls_inner, + n_jobs=n_jobs_cv, + est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) + + if a_external: + a_hat = {'preds': external_predictions['ml_a'], + 'targets': None, + 'models': None} + else: + a_hat = (self.double_dml_cv_predict(self._learner['ml_a'], 'ml_a', x_d_concat, y, smpls=smpls, smpls_inner=smpls_inner, + n_jobs=n_jobs_cv, + est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) + + + W = scipy.special.logit(M_hat['preds']) + d_tilde_full = d - a_hat['preds'] + + beta_notFold = np.zeros_like(d) + + for _, test in smpls: + beta_notFold[test] = np.sum(d_tilde_full[test] * W[test]) / np.sum(d_tilde_full[test] ** 2) + + # nuisance t + if t_external: + t_hat = {'preds': external_predictions['ml_t'], + 'targets': None, + 'models': None} + else: + t_hat = _dml_cv_predict(self._learner['ml_t'], x, W, smpls=smpls, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_t'), method=self._predict_method['ml_t'], + return_models=return_models) + _check_finite_predictions(t_hat['preds'], self._learner['ml_l'], 'ml_l', smpls) + + W = scipy.special.expit(M_hat['preds']) + + # nuisance W + if t_external: + t_hat = {'preds': external_predictions['ml_t'], + 'targets': None, + 'models': None} + else: + t_hat = _dml_cv_predict(self._learner['ml_t'], x, W, smpls=smpls, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_t'), method=self._predict_method['ml_t'], + return_models=return_models) + _check_finite_predictions(t_hat['preds'], self._learner['ml_t'], 'ml_t', smpls) + + r_hat = {} + r_hat['preds'] = t_hat['preds'] - beta_notFold * a_hat['preds'] + + + psi_elements = self._score_elements(y, d, r_hat['preds'], m_hat['preds']) + + preds = {'predictions': {'ml_r': r_hat['preds'], + 'ml_m': m_hat['preds'], + 'ml_a': a_hat['preds'], + 'ml_t': t_hat['preds'], + 'ml_M': M_hat['preds']}, + 'targets': {'ml_r': r_hat['targets'], + 'ml_m': m_hat['targets'], + 'ml_a': a_hat['targets'], + 'ml_t': t_hat['targets'], + 'ml_M': M_hat['targets']}, + 'models': {'ml_r': None, + 'ml_m': m_hat['models'], + 'ml_a': a_hat['models'], + 'ml_t': t_hat['models'], + 'ml_M': M_hat['models']}} + + return psi_elements, preds + + def _score_elements(self, y, d, r_hat, m_hat): + # compute residual + d_tilde = d - m_hat + psi_hat = scipy.special.expit(-r) + score_const = d_tilde * (1 - y) * np.exp(r) + psi_elements = {"y": y, "d": d, "r_hat": r_hat, "m_hat": m_hat, "psi_hat": psi_hat, "score_const": score_const} + + return psi_elements + + def _sensitivity_element_est(self, preds): + pass + + def _nuisance_tuning(self): + pass + + @property + def __smpls__inner(self): + return self._smpls[self._i_rep] + + def draw_sample_splitting(self): + """ + Draw sample splitting for DoubleML models. + + The samples are drawn according to the attributes + ``n_folds`` and ``n_rep``. 
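+        In addition, inner sample splits with ``n_folds_inner`` folds are drawn for
+        the nested cross-fitting of the nuisance estimates.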
+ + Returns + ------- + self : object + """ + + obj_dml_resampling = DoubleMLDoubleResampling(n_folds=self.n_folds, + n_folds_inner=self.n_folds_inner, + n_rep=self.n_rep, + n_obs=self._dml_data.n_obs, + stratify=self._strata) + self._smpls, self._smpls_inner = obj_dml_resampling.split_samples() + + return self + + def set_sample_splitting(self): + raise NotImplementedError('set_sample_splitting is not implemented for DoubleMLLogit.') + + def _compute_score(self, psi_elements, coef): + + score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["r_hat"]) * psi_elements["d_tilde"] + + + return psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) + + def _compute_score_deriv(self, psi_elements, coef, inds=None): + deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["r_hat"]) * psi_elements["d"] + + return psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 + + + def cate(self, basis, is_gate=False): + """ + Calculate conditional average treatment effects (CATE) for a given basis. + + Parameters + ---------- + basis : :class:`pandas.DataFrame` + The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``, + where ``n_obs`` is the number of observations and ``d`` is the number of predictors. + is_gate : bool + Indicates whether the basis is constructed for GATEs (dummy-basis). + Default is ``False``. + + Returns + ------- + model : :class:`doubleML.DoubleMLBLP` + Best linear Predictor model. + """ + if self._dml_data.n_treat > 1: + raise NotImplementedError('Only implemented for single treatment. ' + + f'Number of treatments is {str(self._dml_data.n_treat)}.') + if self.n_rep != 1: + raise NotImplementedError('Only implemented for one repetition. ' + + f'Number of repetitions is {str(self.n_rep)}.') + + Y_tilde, D_tilde = self._partial_out() + + D_basis = basis * D_tilde + model = DoublelMLBLP( + orth_signal=Y_tilde.reshape(-1), + basis=D_basis, + is_gate=is_gate, + ) + model.fit() + + ## TODO: Solve score + + + return model + + def gate(self, groups): + """ + Calculate group average treatment effects (GATE) for groups. + + Parameters + ---------- + groups : :class:`pandas.DataFrame` + The group indicator for estimating the best linear predictor. Groups should be mutually exclusive. + Has to be dummy coded with shape ``(n_obs, d)``, where ``n_obs`` is the number of observations + and ``d`` is the number of groups or ``(n_obs, 1)`` and contain the corresponding groups (as str). + + Returns + ------- + model : :class:`doubleML.DoubleMLBLP` + Best linear Predictor model for Group Effects. + """ + + if not isinstance(groups, pd.DataFrame): + raise TypeError('Groups must be of DataFrame type. ' + f'Groups of type {str(type(groups))} was passed.') + if not all(groups.dtypes == bool) or all(groups.dtypes == int): + if groups.shape[1] == 1: + groups = pd.get_dummies(groups, prefix='Group', prefix_sep='_') + else: + raise TypeError('Columns of groups must be of bool type or int type (dummy coded). ' + 'Alternatively, groups should only contain one column.') + + if any(groups.sum(0) <= 5): + warnings.warn('At least one group effect is estimated with less than 6 observations.') + + model = self.cate(groups, is_gate=True) + return model + + def _partial_out(self): + """ + Helper function. Returns the partialled out quantities of Y and D. + Works with multiple repetitions. + + Returns + ------- + Y_tilde : :class:`numpy.ndarray` + The residual of the regression of Y on X. + D_tilde : :class:`numpy.ndarray` + The residual of the regression of D on X. 
+ """ + if self.predictions is None: + raise ValueError('predictions are None. Call .fit(store_predictions=True) to store the predictions.') + + y = self._dml_data.y.reshape(-1, 1) + d = self._dml_data.d.reshape(-1, 1) + ml_m = self.predictions["ml_m"].squeeze(axis=2) + + if self.score == "partialling out": + ml_l = self.predictions["ml_l"].squeeze(axis=2) + Y_tilde = y - ml_l + D_tilde = d - ml_m + else: + assert self.score == "IV-type" + ml_g = self.predictions["ml_g"].squeeze(axis=2) + Y_tilde = y - (self.coef * ml_m) - ml_g + D_tilde = d - ml_m + + return Y_tilde, D_tilde \ No newline at end of file diff --git a/doubleml/utils/resampling.py b/doubleml/utils/resampling.py index 188d2f24..18153944 100644 --- a/doubleml/utils/resampling.py +++ b/doubleml/utils/resampling.py @@ -25,6 +25,51 @@ def split_samples(self): return smpls +class DoubleMLDoubleResampling: + def __init__(self, + n_folds, + n_folds_inner, + n_rep, + n_obs, + stratify=None): + self.n_folds = n_folds + self.n_rep = n_rep + self.n_obs = n_obs + self.stratify = stratify + + if n_folds < 2: + raise ValueError('n_folds must be greater than 1. ' + 'You can use set_sample_splitting with a tuple to only use one fold.') + if n_folds_inner < 2: + raise ValueError('n_folds_inner must be greater than 1. ' + 'You can use set_sample_splitting with a tuple to only use one fold.') + + + if self.stratify is None: + self.resampling = RepeatedKFold(n_splits=n_folds, n_repeats=n_rep) + self.resampling_inner = RepeatedKFold(n_splits=n_folds_inner) + else: + self.resampling = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_rep) + self.resampling_inner = RepeatedStratifiedKFold(n_splits=n_folds_inner) + + def split_samples(self): + all_smpls = [(train, test) for train, test in self.resampling.split(X=np.zeros(self.n_obs), y=self.stratify)] + smpls = [all_smpls[(i_repeat * self.n_folds):((i_repeat + 1) * self.n_folds)] + for i_repeat in range(self.n_rep)] + smpls_inner = [] + for _ in range(self.n_rep): + smpls_inner_rep = [] + for _, test in all_smpls: + if self.stratify is None: + smpls_inner_rep.append([(train_inner, test_inner) for train_inner, test_inner in self.resampling_inner.split(X=test)]) + else: + smpls_inner_rep.append([(train_inner, test_inner) for train_inner, test_inner in + self.resampling_inner.split(X=np.zeros(len(test)), y=self.stratify[test])]) + smpls_inner.append(smpls_inner_rep) + + return smpls, smpls_inner + + class DoubleMLClusterResampling: def __init__(self, n_folds, n_rep, n_obs, n_cluster_vars, cluster_vars): self.n_folds = n_folds From f5521f142de7ad22e754c7d1d8d7c5f4c18ffa3b Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Jan 2025 10:11:36 +0100 Subject: [PATCH 02/23] First WIP of implementation --- doubleml/double_ml_data.py | 4 +--- doubleml/logistic/logistic.py | 24 +++++++++++++++++------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py index 4f8d7cbc..fdee739d 100644 --- a/doubleml/double_ml_data.py +++ b/doubleml/double_ml_data.py @@ -413,9 +413,7 @@ def x_cols(self, value): if not isinstance(value, list): raise TypeError('The covariates x_cols must be of str or list type (or None). 
' f'{str(value)} of type {str(type(value))} was passed.') - if not len(se - - t(value)) == len(value): + if not len(set(value)) == len(value): raise ValueError('Invalid covariates x_cols: ' 'Contains duplicate values.') if not set(value).issubset(set(self.all_variables)): diff --git a/doubleml/logistic/logistic.py b/doubleml/logistic/logistic.py index 8915215b..26c14a80 100644 --- a/doubleml/logistic/logistic.py +++ b/doubleml/logistic/logistic.py @@ -1,5 +1,5 @@ import numpy as np -from doubleml.utils._estimation import ( +from ..utils._estimation import ( _dml_cv_predict, _trimm, _predict_zero_one_propensity, @@ -15,12 +15,15 @@ import scipy from sklearn.utils.multiclass import type_of_target -from doubleml import DoubleMLData, DoubleMLBLP -from doubleml.double_ml import DoubleML -from doubleml.double_ml_score_mixins import NonLinearScoreMixin -from doubleml.utils import DoubleMLClusterResampling -from doubleml.utils._checks import _check_score, _check_finite_predictions, _check_is_propensity -from doubleml.utils.resampling import DoubleMLDoubleResampling +from .. import DoubleMLData +from ..double_ml import DoubleML +from ..double_ml_score_mixins import NonLinearScoreMixin +from ..utils import DoubleMLClusterResampling +from ..utils._checks import _check_score, _check_finite_predictions, _check_is_propensity +from ..utils.resampling import DoubleMLDoubleResampling + + + class DoubleMLLogit(NonLinearScoreMixin, DoubleML): @@ -110,6 +113,7 @@ def __init__(self, n_rep=1, score='logistic', draw_sample_splitting=True): + self.n_folds_inner = n_folds_inner super().__init__(obj_dml_data, n_folds, n_rep, @@ -165,6 +169,8 @@ def _check_data(self, obj_dml_data): if not isinstance(obj_dml_data, DoubleMLData): raise TypeError('The data must be of DoubleMLData type. ' f'{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed.') + if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): + raise TypeError('The outcome variable y must be binary with values 0 and 1.') return def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, @@ -311,6 +317,10 @@ def _score_elements(self, y, d, r_hat, m_hat): return psi_elements + @property + def _score_element_names(self): + return ['y', 'd', 'r_hat', 'm_hat', 'psi_hat', 'score_const'] + def _sensitivity_element_est(self, preds): pass From bfa756c58797a36943741c2a5d03a9ae57e4e82a Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Fri, 21 Feb 2025 15:07:11 +0100 Subject: [PATCH 03/23] Working implementation. Started on test set-up. 
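
Wire up the inner sample splits for the nested cross-fitting of ml_M and ml_a,
clip the M_hat predictions before the logit transform, restrict the ml_m fit to
the y == 0 subsample and add first manual test utilities for the logistic score.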
--- doubleml/logistic/logistic.py | 189 +++++++--- .../logistic/tests/_utils_logistic_manual.py | 346 +++++++++++++++++ doubleml/logistic/tests/tests_logistic.py | 352 ++++++++++++++++++ doubleml/utils/_estimation.py | 6 +- doubleml/utils/resampling.py | 13 +- 5 files changed, 853 insertions(+), 53 deletions(-) create mode 100644 doubleml/logistic/tests/_utils_logistic_manual.py create mode 100644 doubleml/logistic/tests/tests_logistic.py diff --git a/doubleml/logistic/logistic.py b/doubleml/logistic/logistic.py index 26c14a80..25ba3763 100644 --- a/doubleml/logistic/logistic.py +++ b/doubleml/logistic/logistic.py @@ -103,7 +103,6 @@ class DoubleMLLogit(NonLinearScoreMixin, DoubleML): def __init__(self, obj_dml_data, - ml_r, ml_m, ml_M, ml_t, @@ -119,16 +118,17 @@ def __init__(self, n_rep, score, draw_sample_splitting) + self._coef_bounds = (-1e-2, 1e2) + self._coef_start_val = 1.0 self._check_data(self._dml_data) valid_scores = ['logistic'] _check_score(self.score, valid_scores, allow_callable=True) - _ = self._check_learner(ml_r, 'ml_r', regressor=True, classifier=False) _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=True) - self._learner = {'ml_l': ml_r, 'ml_m': ml_m, 'ml_t': ml_t, 'ml_M': ml_M} + self._learner = {'ml_m': ml_m, 'ml_t': ml_t, 'ml_M': ml_M} if ml_a is not None: ml_a_is_classifier = self._check_learner(ml_a, 'ml_a', regressor=True, classifier=True) @@ -137,7 +137,7 @@ def __init__(self, self._learner['ml_a'] = clone(ml_m) ml_a_is_classifier = ml_m_is_classifier - self._predict_method = {'ml_r': 'predict', 'ml_t': 'predict', 'ml_M': 'predict_proba'} + self._predict_method = {'ml_t': 'predict', 'ml_M': 'predict_proba'} if ml_m_is_classifier: if self._dml_data.binary_treats.all(): @@ -158,7 +158,6 @@ def __init__(self, self._predict_method['ml_a'] = 'predict' self._initialize_ml_nuisance_params() - self._sensitivity_implemented = True self._external_predictions_implemented = True def _initialize_ml_nuisance_params(self): @@ -173,34 +172,40 @@ def _check_data(self, obj_dml_data): raise TypeError('The outcome variable y must be binary with values 0 and 1.') return + def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, n_jobs=None, est_params=None, method='predict'): res = {} res['preds'] = np.zeros_like(y) - res['preds_inner'] = np.zeros_like(y) + res['preds_inner'] = [] + res['models'] = [] for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): res_inner = _dml_cv_predict(estimator, x, y, smpls=smpls_double_split, n_jobs=n_jobs, est_params=est_params, method=method, - return_models=True) + return_models=True, smpls_is_partition=True) _check_finite_predictions(res_inner['preds'], estimator, estimator_name, smpls_double_split) - res['preds_inner'] += res_inner['preds'] + res['preds_inner'].append(res_inner['preds']) for model in res_inner['models']: res['models'].append(model) - res['preds'][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) - + if method == 'predict_proba': + res['preds'][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1] + else: + res['preds'][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) + res["preds_inner"] res["preds"] /= len(smpls) res['targets'] = np.copy(y) + return res def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + # 
TODO: How to deal with smpls_inner? x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) - x_d_concat = np.hstack([[d, np.newaxis], x]) - r_external = external_predictions['ml_r'] is not None + x_d_concat = np.hstack((d.reshape(-1,1), x)) m_external = external_predictions['ml_m'] is not None M_external = external_predictions['ml_M'] is not None t_external = external_predictions['ml_t'] is not None @@ -215,7 +220,11 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'targets': None, 'models': None} else: - m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, + filtered_smpls = [] + for train, test in smpls: + train_filtered = train[y[train] == 0] + filtered_smpls.append((train_filtered, test)) + m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=filtered_smpls, n_jobs=n_jobs_cv, est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], return_models=return_models) _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) @@ -238,7 +247,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'targets': None, 'models': None} else: - M_hat = (self.double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=smpls_inner, + M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=self.__smpls__inner, n_jobs=n_jobs_cv, est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) @@ -247,18 +256,49 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'targets': None, 'models': None} else: - a_hat = (self.double_dml_cv_predict(self._learner['ml_a'], 'ml_a', x_d_concat, y, smpls=smpls, smpls_inner=smpls_inner, + a_hat = (self._double_dml_cv_predict(self._learner['ml_a'], 'ml_a', x, d, smpls=smpls, smpls_inner=self.__smpls__inner, n_jobs=n_jobs_cv, est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) + # r_legacy = np.zeros_like(y) + # smpls_inner = self.__smpls__inner + # M_hat = {} + # a_hat = {} + # M_hat['preds_inner'] = [] + # M_hat['preds'] = np.full_like(y, np.nan) + # a_hat['preds_inner'] = [] + # a_hat['preds'] = np.full_like(y, np.nan) + # for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): + # test = smpls_single_split[1] + # train = smpls_single_split[0] + # # r_legacy[test] = + # Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], + # self._learner['ml_m'], self._learner['ml_M'], + # smpls_single_split, smpls_double_split, y, x, d, + # x_d_concat, n_jobs_cv) + # Mtemp = np.full_like(y, np.nan) + # Mtemp[train] = Mleg + # Atemp = np.full_like(y, np.nan) + # Atemp[train] = aleg + # M_hat['preds_inner'].append(Mtemp) + # a_hat['preds_inner'].append(Atemp) + # a_hat['preds'][test] = a_nf_leg + # + # #r_hat['preds'] = r_legacy + + + + W_inner = [] + beta = np.zeros_like(d) + + for i, (train, test) in enumerate(smpls): + M_iteration = M_hat['preds_inner'][i][train] + M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) + w = scipy.special.logit(M_iteration) + W_inner.append(w) + d_tilde = (d - a_hat['preds_inner'][i])[train] + beta[test] = np.sum(d_tilde * w) / np.sum(d_tilde ** 2) - W = scipy.special.logit(M_hat['preds']) - d_tilde_full = d - a_hat['preds'] - - beta_notFold = np.zeros_like(d) - - for _, test in smpls: - beta_notFold[test] = 
np.sum(d_tilde_full[test] * W[test]) / np.sum(d_tilde_full[test] ** 2) # nuisance t if t_external: @@ -266,26 +306,17 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'targets': None, 'models': None} else: - t_hat = _dml_cv_predict(self._learner['ml_t'], x, W, smpls=smpls, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_t'), method=self._predict_method['ml_t'], - return_models=return_models) - _check_finite_predictions(t_hat['preds'], self._learner['ml_l'], 'ml_l', smpls) - - W = scipy.special.expit(M_hat['preds']) - - # nuisance W - if t_external: - t_hat = {'preds': external_predictions['ml_t'], - 'targets': None, - 'models': None} - else: - t_hat = _dml_cv_predict(self._learner['ml_t'], x, W, smpls=smpls, n_jobs=n_jobs_cv, + t_hat = _dml_cv_predict(self._learner['ml_t'], x, W_inner, smpls=smpls, n_jobs=n_jobs_cv, est_params=self._get_params('ml_t'), method=self._predict_method['ml_t'], return_models=return_models) _check_finite_predictions(t_hat['preds'], self._learner['ml_t'], 'ml_t', smpls) + r_hat = {} - r_hat['preds'] = t_hat['preds'] - beta_notFold * a_hat['preds'] + r_hat['preds'] = t_hat['preds'] - beta * a_hat['preds'] + + + psi_elements = self._score_elements(y, d, r_hat['preds'], m_hat['preds']) @@ -295,7 +326,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'ml_a': a_hat['preds'], 'ml_t': t_hat['preds'], 'ml_M': M_hat['preds']}, - 'targets': {'ml_r': r_hat['targets'], + 'targets': {'ml_r': None, 'ml_m': m_hat['targets'], 'ml_a': a_hat['targets'], 'ml_t': t_hat['targets'], @@ -308,18 +339,86 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa return psi_elements, preds + + def legacy_implementation(self, Yfold: np.ndarray, Xfold: np.ndarray, Afold: np.ndarray, XnotFold: np.ndarray, AnotFold: np.ndarray, + learner, learnerClassifier, smpls_single_split, smpls_double_split, yfull, xfull, afull, x_d_concat, n_jobs_cv, noFolds: int = 5, seed=None, )-> (np.ndarray, np.ndarray, np.ndarray): + + def learn_predict(X, Y, Xpredict, learner, learnerClassifier, fit_args={}): + results = [] + if len(np.unique(Y)) == 2: + learnerClassifier.fit(X, Y, **fit_args) + for x in Xpredict: + results.append(learnerClassifier.predict_proba(x)[:, 1]) + else: + learner.fit(X, Y, **fit_args) + for x in Xpredict: + results.append(learner.predict(x)) + return (*results,) + + nFold = len(Yfold) + i = np.remainder(np.arange(nFold), noFolds) + np.random.default_rng(seed).shuffle(i) + + M = np.zeros((nFold)) + a_hat = np.zeros((nFold)) + a_hat_notFold = np.zeros((len(XnotFold))) + M_notFold = np.zeros((len(XnotFold))) + loss = {} + + a_hat_inner = _dml_cv_predict(self._learner['ml_a'], xfull, afull, smpls=smpls_double_split, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'], + return_models=True, smpls_is_partition=True) + _check_finite_predictions(a_hat_inner['preds'], self._learner['ml_a'], 'ml_a', smpls_double_split) + a_hat_notFold = np.full_like(yfull, 0.) 
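+        # each fitted inner-fold model of ml_a predicts the treatment on the outer
+        # hold-out fold smpls_single_split[1]; the accumulated sum is divided by
+        # noFolds further below, i.e. the predictions are averaged over the inner-fold models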
+ for model in a_hat_inner['models']: + if self._predict_method['ml_a'] == 'predict_proba': + a_hat_notFold[smpls_single_split[1]] += model.predict_proba(xfull[smpls_single_split[1]])[:, 1] + else: + a_hat_notFold[smpls_single_split[1]] += model.predict(xfull[smpls_single_split[1]]) + + M_hat = _dml_cv_predict(self._learner['ml_M'], x_d_concat, yfull, smpls=smpls_double_split, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'], + return_models=True, smpls_is_partition=True) + _check_finite_predictions(M_hat['preds'], self._learner['ml_M'], 'ml_M', smpls_double_split) + + M = M_hat['preds'][~np.isnan(M_hat['preds'])] + a_hat = a_hat_inner['preds'][~np.isnan(a_hat_inner['preds'])] + a_hat_notFold = a_hat_notFold[smpls_single_split[1]] + + np.clip(M, 1e-8, 1 - 1e-8, out=M) +# loss["M"] = compute_loss(Yfold, M) +# loss["a_hat"] = compute_loss(Afold, a_hat) + a_hat_notFold /= noFolds + # M_notFold /= noFolds + np.clip(M_notFold, 1e-8, 1 - 1e-8, out=M_notFold) + + # Obtain preliminary estimate of beta based on M and residual of a + W = scipy.special.logit(M) + A_resid = Afold - a_hat + beta_notFold = sum(A_resid * W) / sum(A_resid ** 2) + # print(beta_notFold) + t_notFold, = learn_predict(Xfold, W, [XnotFold], learner, learnerClassifier) + W_notFold = scipy.special.expit(M_notFold) +# loss["t"] = compute_loss(W_notFold, t_notFold) + + + # Compute r based on estimates for W=logit(M), beta and residual of A + r_notFold = t_notFold - beta_notFold * a_hat_notFold + + return M, a_hat, a_hat_notFold #r_notFold #, a_hat_notFold, M_notFold, t_notFold + def _score_elements(self, y, d, r_hat, m_hat): # compute residual d_tilde = d - m_hat - psi_hat = scipy.special.expit(-r) - score_const = d_tilde * (1 - y) * np.exp(r) - psi_elements = {"y": y, "d": d, "r_hat": r_hat, "m_hat": m_hat, "psi_hat": psi_hat, "score_const": score_const} + psi_hat = scipy.special.expit(-r_hat) + score_const = d_tilde * (1 - y) * np.exp(r_hat) + psi_elements = {"y": y, "d": d, "d_tilde": d_tilde, "r_hat": r_hat, "m_hat": m_hat, "psi_hat": psi_hat, "score_const": score_const} return psi_elements @property def _score_element_names(self): - return ['y', 'd', 'r_hat', 'm_hat', 'psi_hat', 'score_const'] + return ['y', 'd', 'd_tilde', 'r_hat', 'm_hat', 'psi_hat', 'score_const'] def _sensitivity_element_est(self, preds): pass @@ -329,7 +428,7 @@ def _nuisance_tuning(self): @property def __smpls__inner(self): - return self._smpls[self._i_rep] + return self._smpls_inner[self._i_rep] def draw_sample_splitting(self): """ @@ -357,13 +456,13 @@ def set_sample_splitting(self): def _compute_score(self, psi_elements, coef): - score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["r_hat"]) * psi_elements["d_tilde"] + score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] return psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) def _compute_score_deriv(self, psi_elements, coef, inds=None): - deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["r_hat"]) * psi_elements["d"] + deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] return psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 diff --git a/doubleml/logistic/tests/_utils_logistic_manual.py b/doubleml/logistic/tests/_utils_logistic_manual.py new file mode 100644 index 00000000..ae53992a --- /dev/null +++ b/doubleml/logistic/tests/_utils_logistic_manual.py @@ -0,0 +1,346 @@ +import numpy as np +import scipy +from sklearn.base import clone, 
is_classifier + +from ...tests._utils_boot import boot_manual, draw_weights +from ...tests._utils import fit_predict, fit_predict_proba, tune_grid_search + + +def fit_logistic_multitreat(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, + n_rep=1, l_params=None, m_params=None, g_params=None, + use_other_treat_as_covariate=True): + n_obs = len(y) + n_d = d.shape[1] + + thetas = list() + ses = list() + all_l_hat = list() + all_m_hat = list() + all_g_hat = list() + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + thetas_this_rep = np.full(n_d, np.nan) + ses_this_rep = np.full(n_d, np.nan) + all_l_hat_this_rep = list() + all_m_hat_this_rep = list() + all_g_hat_this_rep = list() + + for i_d in range(n_d): + if use_other_treat_as_covariate: + xd = np.hstack((x, np.delete(d, i_d, axis=1))) + else: + xd = x + + l_hat, m_hat, g_hat, thetas_this_rep[i_d], ses_this_rep[i_d] = fit_plr_single_split( + y, xd, d[:, i_d], + learner_l, learner_m, learner_g, + smpls, score, + l_params, m_params, g_params) + all_l_hat_this_rep.append(l_hat) + all_m_hat_this_rep.append(m_hat) + all_g_hat_this_rep.append(g_hat) + + thetas.append(thetas_this_rep) + ses.append(ses_this_rep) + all_l_hat.append(all_l_hat_this_rep) + all_m_hat.append(all_m_hat_this_rep) + all_g_hat.append(all_g_hat_this_rep) + + theta = np.full(n_d, np.nan) + se = np.full(n_d, np.nan) + for i_d in range(n_d): + theta_vec = np.array([xx[i_d] for xx in thetas]) + se_vec = np.array([xx[i_d] for xx in ses]) + theta[i_d] = np.median(theta_vec) + se[i_d] = np.sqrt(np.median(np.power(se_vec, 2) * n_obs + np.power(theta_vec - theta[i_d], 2)) / n_obs) + + res = {'theta': theta, 'se': se, + 'thetas': thetas, 'ses': ses, + 'all_l_hat': all_l_hat, 'all_m_hat': all_m_hat, 'all_g_hat': all_g_hat} + + return res + + +def fit_logistic(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, + n_rep=1, l_params=None, m_params=None, g_params=None): + n_obs = len(y) + + thetas = np.zeros(n_rep) + ses = np.zeros(n_rep) + all_l_hat = list() + all_m_hat = list() + all_g_hat = list() + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + l_hat, m_hat, g_hat, thetas[i_rep], ses[i_rep] = fit_plr_single_split( + y, x, d, + learner_l, learner_m, learner_g, + smpls, score, + l_params, m_params, g_params) + all_l_hat.append(l_hat) + all_m_hat.append(m_hat) + all_g_hat.append(g_hat) + + theta = np.median(thetas) + se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) + + res = {'theta': theta, 'se': se, + 'thetas': thetas, 'ses': ses, + 'all_l_hat': all_l_hat, 'all_m_hat': all_m_hat, 'all_g_hat': all_g_hat} + + return res + + +def fit_plr_logistic_split(y, x, d, learner_l, learner_m, learner_g, smpls, score, + l_params=None, m_params=None, g_params=None): + fit_g = (score == 'IV-type') | callable(score) + if is_classifier(learner_m): + l_hat, m_hat, g_hat = fit_nuisance_plr_classifier(y, x, d, + learner_l, learner_m, learner_g, + smpls, fit_g, + l_params, m_params, g_params) + else: + l_hat, m_hat, g_hat = fit_nuisance_plr(y, x, d, + learner_l, learner_m, learner_g, + smpls, fit_g, + l_params, m_params, g_params) + + theta, se = plr_dml2(y, x, d, l_hat, m_hat, g_hat, + smpls, score) + + return l_hat, m_hat, g_hat, theta, se + + +def fit_nuisance_logistic(y, x, d, learner_l, learner_m, learner_g, smpls, fit_g=True, + l_params=None, m_params=None, g_params=None): + ml_l = clone(learner_l) + l_hat = fit_predict(y, x, ml_l, l_params, smpls) + + ml_m = clone(learner_m) + m_hat = fit_predict(d, x, ml_m, m_params, smpls) + + if 
fit_g: + y_minus_l_hat, d_minus_m_hat, _ = compute_plr_residuals(y, d, l_hat, m_hat, [], smpls) + psi_a = -np.multiply(d_minus_m_hat, d_minus_m_hat) + psi_b = np.multiply(d_minus_m_hat, y_minus_l_hat) + theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) + + ml_g = clone(learner_g) + g_hat = fit_predict(y - theta_initial*d, x, ml_g, g_params, smpls) + else: + g_hat = [] + + return l_hat, m_hat, g_hat + + +def fit_nuisance_logistic_classifier(y, x, d, learner_l, learner_m, learner_g, smpls, fit_g=True, + l_params=None, m_params=None, g_params=None): + ml_l = clone(learner_l) + l_hat = fit_predict(y, x, ml_l, l_params, smpls) + + ml_m = clone(learner_m) + m_hat = fit_predict_proba(d, x, ml_m, m_params, smpls) + + if fit_g: + y_minus_l_hat, d_minus_m_hat, _ = compute_plr_residuals(y, d, l_hat, m_hat, [], smpls) + psi_a = -np.multiply(d_minus_m_hat, d_minus_m_hat) + psi_b = np.multiply(d_minus_m_hat, y_minus_l_hat) + theta_initial = -np.mean(psi_b) / np.mean(psi_a) + + ml_g = clone(learner_g) + g_hat = fit_predict(y - theta_initial*d, x, ml_g, g_params, smpls) + else: + g_hat = [] + + return l_hat, m_hat, g_hat + + +def tune_nuisance_plr(y, x, d, ml_l, ml_m, ml_g, smpls, n_folds_tune, param_grid_l, param_grid_m, param_grid_g, tune_g=True): + l_tune_res = tune_grid_search(y, x, ml_l, smpls, param_grid_l, n_folds_tune) + + m_tune_res = tune_grid_search(d, x, ml_m, smpls, param_grid_m, n_folds_tune) + + if tune_g: + l_hat = np.full_like(y, np.nan) + m_hat = np.full_like(d, np.nan) + for idx, (train_index, _) in enumerate(smpls): + l_hat[train_index] = l_tune_res[idx].predict(x[train_index, :]) + m_hat[train_index] = m_tune_res[idx].predict(x[train_index, :]) + psi_a = -np.multiply(d - m_hat, d - m_hat) + psi_b = np.multiply(d - m_hat, y - l_hat) + theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) + + g_tune_res = tune_grid_search(y - theta_initial*d, x, ml_g, smpls, param_grid_g, n_folds_tune) + g_best_params = [xx.best_params_ for xx in g_tune_res] + else: + g_best_params = [] + + l_best_params = [xx.best_params_ for xx in l_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] + + return l_best_params, m_best_params, g_best_params + + +def compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls): + y_minus_l_hat = np.full_like(y, np.nan, dtype='float64') + d_minus_m_hat = np.full_like(d, np.nan, dtype='float64') + y_minus_g_hat = np.full_like(y, np.nan, dtype='float64') + for idx, (_, test_index) in enumerate(smpls): + y_minus_l_hat[test_index] = y[test_index] - l_hat[idx] + if len(g_hat) > 0: + y_minus_g_hat[test_index] = y[test_index] - g_hat[idx] + d_minus_m_hat[test_index] = d[test_index] - m_hat[idx] + return y_minus_l_hat, d_minus_m_hat, y_minus_g_hat + + +def plr_dml2(y, x, d, l_hat, m_hat, g_hat, smpls, score): + n_obs = len(y) + y_minus_l_hat, d_minus_m_hat, y_minus_g_hat = compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls) + theta_hat = plr_orth(y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, d, score) + se = np.sqrt(var_plr(theta_hat, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score, n_obs)) + + return theta_hat, se + + +def var_plr(theta, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score, n_obs): + if score == 'partialling out': + var = 1/n_obs * 1/np.power(np.mean(np.multiply(d_minus_m_hat, d_minus_m_hat)), 2) * \ + np.mean(np.power(np.multiply(y_minus_l_hat - d_minus_m_hat*theta, d_minus_m_hat), 2)) + else: + assert score == 'IV-type' + var = 1/n_obs * 1/np.power(np.mean(np.multiply(d_minus_m_hat, d)), 2) * \ + 
np.mean(np.power(np.multiply(y_minus_g_hat - d*theta, d_minus_m_hat), 2)) + + return var + + +def plr_orth(y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, d, score): + if score == 'IV-type': + res = np.mean(np.multiply(d_minus_m_hat, y_minus_g_hat))/np.mean(np.multiply(d_minus_m_hat, d)) + else: + assert score == 'partialling out' + res = scipy.linalg.lstsq(d_minus_m_hat.reshape(-1, 1), y_minus_l_hat)[0] + + return res + + +def boot_plr(y, d, thetas, ses, all_l_hat, all_m_hat, all_g_hat, + all_smpls, score, bootstrap, n_rep_boot, + n_rep=1, apply_cross_fitting=True): + all_boot_t_stat = list() + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + if apply_cross_fitting: + n_obs = len(y) + else: + test_index = smpls[0][1] + n_obs = len(test_index) + weights = draw_weights(bootstrap, n_rep_boot, n_obs) + + boot_t_stat = boot_plr_single_split( + thetas[i_rep], y, d, all_l_hat[i_rep], all_m_hat[i_rep], all_g_hat[i_rep], smpls, + score, ses[i_rep], + weights, n_rep_boot, apply_cross_fitting) + all_boot_t_stat.append(boot_t_stat) + + # differently for plr because of n_rep_boot and multiple treatmentsa + boot_t_stat = np.transpose(np.vstack(all_boot_t_stat)) + + return boot_t_stat + + +def boot_plr_multitreat(y, d, thetas, ses, all_l_hat, all_m_hat, all_g_hat, + all_smpls, score, bootstrap, n_rep_boot, + n_rep=1, apply_cross_fitting=True): + n_d = d.shape[1] + all_boot_t_stat = list() + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + if apply_cross_fitting: + n_obs = len(y) + else: + test_index = smpls[0][1] + n_obs = len(test_index) + weights = draw_weights(bootstrap, n_rep_boot, n_obs) + + boot_t_stat = np.full((n_d, n_rep_boot), np.nan) + for i_d in range(n_d): + boot_t_stat[i_d, :] = boot_plr_single_split( + thetas[i_rep][i_d], y, d[:, i_d], + all_l_hat[i_rep][i_d], all_m_hat[i_rep][i_d], all_g_hat[i_rep][i_d], + smpls, score, ses[i_rep][i_d], + weights, n_rep_boot, apply_cross_fitting) + + # transpose for shape (n_rep_boot, n_d) + boot_t_stat = np.transpose(boot_t_stat) + all_boot_t_stat.append(boot_t_stat) + + # stack repetitions along the last axis + boot_t_stat = np.stack(all_boot_t_stat, axis=2) + + return boot_t_stat + + +def boot_plr_single_split(theta, y, d, l_hat, m_hat, g_hat, + smpls, score, se, weights, n_rep, apply_cross_fitting): + y_minus_l_hat, d_minus_m_hat, y_minus_g_hat = compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls) + + if apply_cross_fitting: + if score == 'partialling out': + J = np.mean(-np.multiply(d_minus_m_hat, d_minus_m_hat)) + else: + assert score == 'IV-type' + J = np.mean(-np.multiply(d_minus_m_hat, d)) + else: + test_index = smpls[0][1] + if score == 'partialling out': + J = np.mean(-np.multiply(d_minus_m_hat[test_index], d_minus_m_hat[test_index])) + else: + assert score == 'IV-type' + J = np.mean(-np.multiply(d_minus_m_hat[test_index], d[test_index])) + + if score == 'partialling out': + psi = np.multiply(y_minus_l_hat - d_minus_m_hat * theta, d_minus_m_hat) + else: + assert score == 'IV-type' + psi = np.multiply(y_minus_g_hat - d * theta, d_minus_m_hat) + + boot_t_stat = boot_manual(psi, J, smpls, se, weights, n_rep, apply_cross_fitting) + + return boot_t_stat + + +def fit_sensitivity_elements_plr(y, d, all_coef, predictions, score, n_rep): + n_treat = d.shape[1] + n_obs = len(y) + + sigma2 = np.full(shape=(1, n_rep, n_treat), fill_value=np.nan) + nu2 = np.full(shape=(1, n_rep, n_treat), fill_value=np.nan) + psi_sigma2 = np.full(shape=(n_obs, n_rep, n_treat), fill_value=np.nan) + psi_nu2 = np.full(shape=(n_obs, n_rep, n_treat), 
fill_value=np.nan) + + for i_rep in range(n_rep): + for i_treat in range(n_treat): + d_tilde = d[:, i_treat] + m_hat = predictions['ml_m'][:, i_rep, i_treat] + theta = all_coef[i_treat, i_rep] + if score == 'partialling out': + l_hat = predictions['ml_l'][:, i_rep, i_treat] + sigma2_score_element = np.square(y - l_hat - np.multiply(theta, d_tilde-m_hat)) + else: + assert score == 'IV-type' + g_hat = predictions['ml_g'][:, i_rep, i_treat] + sigma2_score_element = np.square(y - g_hat - np.multiply(theta, d_tilde)) + + sigma2[0, i_rep, i_treat] = np.mean(sigma2_score_element) + psi_sigma2[:, i_rep, i_treat] = sigma2_score_element - sigma2[0, i_rep, i_treat] + + nu2[0, i_rep, i_treat] = np.divide(1.0, np.mean(np.square(d_tilde-m_hat))) + psi_nu2[:, i_rep, i_treat] = nu2[0, i_rep, i_treat] - \ + np.multiply(np.square(d_tilde-m_hat), np.square(nu2[0, i_rep, i_treat])) + + element_dict = {'sigma2': sigma2, + 'nu2': nu2, + 'psi_sigma2': psi_sigma2, + 'psi_nu2': psi_nu2} + return element_dict diff --git a/doubleml/logistic/tests/tests_logistic.py b/doubleml/logistic/tests/tests_logistic.py new file mode 100644 index 00000000..2b97bf76 --- /dev/null +++ b/doubleml/logistic/tests/tests_logistic.py @@ -0,0 +1,352 @@ +import pytest +import math +import scipy +import numpy as np +import pandas as pd + +from sklearn.base import clone + +from sklearn.linear_model import LinearRegression, Lasso +from sklearn.ensemble import RandomForestRegressor + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_logistic_manual import fit_logistic, , boot_plr + + +@pytest.fixture(scope='module', + params=[RandomForestRegressor(max_depth=2, n_estimators=10), + LinearRegression(), + Lasso(alpha=0.1)]) +def learner(request): + return request.param + + +@pytest.fixture(scope='module', + params=['IV-type', 'partialling out']) +def score(request): + return request.param + + +@pytest.fixture(scope="module") +def dml_plr_fixture(generate_data1, learner, score): + boot_methods = ['normal'] + n_folds = 2 + n_rep_boot = 502 + + # collect data + data = generate_data1 + x_cols = data.columns[data.columns.str.startswith('X')].tolist() + + # Set machine learning methods for m & g + ml_l = clone(learner) + ml_m = clone(learner) + ml_g = clone(learner) + + np.random.seed(3141) + obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols) + if score == 'partialling out': + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, + n_folds=n_folds, + score=score) + else: + assert score == 'IV-type' + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, ml_g, + n_folds, + score=score) + + dml_plr_obj.fit() + + np.random.seed(3141) + y = data['y'].values + x = data.loc[:, x_cols].values + d = data['d'].values + n_obs = len(y) + all_smpls = draw_smpls(n_obs, n_folds) + + res_manual = fit_plr(y, x, d, clone(learner), clone(learner), clone(learner), + all_smpls, score) + + np.random.seed(3141) + # test with external nuisance predictions + if score == 'partialling out': + dml_plr_obj_ext = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, + n_folds, + score=score) + else: + assert score == 'IV-type' + dml_plr_obj_ext = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, ml_g, + n_folds, + score=score) + + # synchronize the sample splitting + dml_plr_obj_ext.set_sample_splitting(all_smpls=all_smpls) + + if score == 'partialling out': + prediction_dict = {'d': {'ml_l': dml_plr_obj.predictions['ml_l'].reshape(-1, 1), + 'ml_m': dml_plr_obj.predictions['ml_m'].reshape(-1, 1)}} + else: + assert score == 'IV-type' + prediction_dict = 
{'d': {'ml_l': dml_plr_obj.predictions['ml_l'].reshape(-1, 1), + 'ml_m': dml_plr_obj.predictions['ml_m'].reshape(-1, 1), + 'ml_g': dml_plr_obj.predictions['ml_g'].reshape(-1, 1)}} + + dml_plr_obj_ext.fit(external_predictions=prediction_dict) + + res_dict = {'coef': dml_plr_obj.coef, + 'coef_manual': res_manual['theta'], + 'coef_ext': dml_plr_obj_ext.coef, + 'se': dml_plr_obj.se, + 'se_manual': res_manual['se'], + 'se_ext': dml_plr_obj_ext.se, + 'boot_methods': boot_methods} + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_plr(y, d, res_manual['thetas'], res_manual['ses'], + res_manual['all_l_hat'], res_manual['all_m_hat'], res_manual['all_g_hat'], + all_smpls, score, bootstrap, n_rep_boot) + + np.random.seed(3141) + dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + np.random.seed(3141) + dml_plr_obj_ext.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat + res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) + res_dict['boot_t_stat' + bootstrap + '_ext'] = dml_plr_obj_ext.boot_t_stat + + # sensitivity tests + res_dict['sensitivity_elements'] = dml_plr_obj.sensitivity_elements + res_dict['sensitivity_elements_manual'] = fit_sensitivity_elements_plr(y, d.reshape(-1, 1), + all_coef=dml_plr_obj.all_coef, + predictions=dml_plr_obj.predictions, + score=score, + n_rep=1) + # check if sensitivity score with rho=0 gives equal asymptotic standard deviation + dml_plr_obj.sensitivity_analysis(rho=0.0) + res_dict['sensitivity_ses'] = dml_plr_obj.sensitivity_params['se'] + return res_dict + + +@pytest.mark.ci +def test_dml_plr_coef(dml_plr_fixture): + assert math.isclose(dml_plr_fixture['coef'], + dml_plr_fixture['coef_manual'], + rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_plr_fixture['coef'], + dml_plr_fixture['coef_ext'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_plr_se(dml_plr_fixture): + assert math.isclose(dml_plr_fixture['se'], + dml_plr_fixture['se_manual'], + rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_plr_fixture['se'], + dml_plr_fixture['se_ext'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_plr_boot(dml_plr_fixture): + for bootstrap in dml_plr_fixture['boot_methods']: + assert np.allclose(dml_plr_fixture['boot_t_stat' + bootstrap], + dml_plr_fixture['boot_t_stat' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) + assert np.allclose(dml_plr_fixture['boot_t_stat' + bootstrap], + dml_plr_fixture['boot_t_stat' + bootstrap + '_ext'], + rtol=1e-9, atol=1e-4) + + +@pytest.mark.ci +def test_dml_plr_sensitivity(dml_plr_fixture): + sensitivity_element_names = ['sigma2', 'nu2', 'psi_sigma2', 'psi_nu2'] + for sensitivity_element in sensitivity_element_names: + assert np.allclose(dml_plr_fixture['sensitivity_elements'][sensitivity_element], + dml_plr_fixture['sensitivity_elements_manual'][sensitivity_element]) + + +@pytest.mark.ci +def test_dml_plr_sensitivity_rho0(dml_plr_fixture): + assert np.allclose(dml_plr_fixture['se'], + dml_plr_fixture['sensitivity_ses']['lower'], + rtol=1e-9, atol=1e-4) + assert np.allclose(dml_plr_fixture['se'], + dml_plr_fixture['sensitivity_ses']['upper'], + rtol=1e-9, atol=1e-4) + + +@pytest.fixture(scope="module") +def dml_plr_ols_manual_fixture(generate_data1, score): + learner = LinearRegression() + boot_methods = ['Bayes', 'normal', 'wild'] + n_folds = 2 + n_rep_boot = 501 + + # collect data + data = generate_data1 + x_cols = 
data.columns[data.columns.str.startswith('X')].tolist() + + # Set machine learning methods for m & g + ml_l = clone(learner) + ml_g = clone(learner) + ml_m = clone(learner) + + obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols) + if score == 'partialling out': + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, + n_folds=n_folds, + score=score) + else: + assert score == 'IV-type' + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, + ml_l, ml_m, ml_g, + n_folds, + score=score) + + n = data.shape[0] + this_smpl = list() + xx = int(n/2) + this_smpl.append((np.arange(xx, n), np.arange(0, xx))) + this_smpl.append((np.arange(0, xx), np.arange(xx, n))) + smpls = [this_smpl] + dml_plr_obj.set_sample_splitting(smpls) + + dml_plr_obj.fit() + + y = data['y'].values + x = data.loc[:, x_cols].values + d = data['d'].values + + # add column of ones for intercept + o = np.ones((n, 1)) + x = np.append(x, o, axis=1) + + smpls = dml_plr_obj.smpls[0] + + l_hat = [] + l_hat_vec = np.full_like(y, np.nan) + for (train_index, test_index) in smpls: + ols_est = scipy.linalg.lstsq(x[train_index], y[train_index])[0] + preds = np.dot(x[test_index], ols_est) + l_hat.append(preds) + l_hat_vec[test_index] = preds + + m_hat = [] + m_hat_vec = np.full_like(d, np.nan) + for (train_index, test_index) in smpls: + ols_est = scipy.linalg.lstsq(x[train_index], d[train_index])[0] + preds = np.dot(x[test_index], ols_est) + m_hat.append(preds) + m_hat_vec[test_index] = preds + + g_hat = [] + if score == 'IV-type': + theta_initial = scipy.linalg.lstsq((d - m_hat_vec).reshape(-1, 1), y - l_hat_vec)[0] + for (train_index, test_index) in smpls: + ols_est = scipy.linalg.lstsq(x[train_index], + y[train_index] - d[train_index] * theta_initial)[0] + g_hat.append(np.dot(x[test_index], ols_est)) + + res_manual, se_manual = plr_dml2(y, x, d, + l_hat, m_hat, g_hat, + smpls, score) + + res_dict = {'coef': dml_plr_obj.coef, + 'coef_manual': res_manual, + 'se': dml_plr_obj.se, + 'se_manual': se_manual, + 'boot_methods': boot_methods} + + for bootstrap in boot_methods: + np.random.seed(3141) + boot_t_stat = boot_plr(y, d, [res_manual], [se_manual], + [l_hat], [m_hat], [g_hat], + [smpls], score, bootstrap, n_rep_boot) + + np.random.seed(3141) + dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) + res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat + res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) + + return res_dict + + +@pytest.mark.ci +def test_dml_plr_ols_manual_coef(dml_plr_ols_manual_fixture): + assert math.isclose(dml_plr_ols_manual_fixture['coef'], + dml_plr_ols_manual_fixture['coef_manual'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_plr_ols_manual_se(dml_plr_ols_manual_fixture): + assert math.isclose(dml_plr_ols_manual_fixture['se'], + dml_plr_ols_manual_fixture['se_manual'], + rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_plr_ols_manual_boot(dml_plr_ols_manual_fixture): + for bootstrap in dml_plr_ols_manual_fixture['boot_methods']: + assert np.allclose(dml_plr_ols_manual_fixture['boot_t_stat' + bootstrap], + dml_plr_ols_manual_fixture['boot_t_stat' + bootstrap + '_manual'], + rtol=1e-9, atol=1e-4) + + +@pytest.fixture(scope='module', + params=["nonrobust", "HC0", "HC1", "HC2", "HC3"]) +def cov_type(request): + return request.param + + +@pytest.mark.ci +def test_dml_plr_cate_gate(score, cov_type): + n = 9 + + # collect data + np.random.seed(42) + obj_dml_data = dml.datasets.make_plr_CCDDHNR2018(n_obs=n) + ml_l = LinearRegression() + ml_g = 
LinearRegression() + ml_m = LinearRegression() + + dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, + ml_g, ml_m, ml_l, + n_folds=2, + score=score) + dml_plr_obj.fit() + random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5))) + cate = dml_plr_obj.cate(random_basis, cov_type=cov_type) + assert isinstance(cate, dml.DoubleMLBLP) + assert isinstance(cate.confint(), pd.DataFrame) + assert cate.blp_model.cov_type == cov_type + + groups_1 = pd.DataFrame( + np.column_stack([obj_dml_data.data['X1'] <= 0, + obj_dml_data.data['X1'] > 0.2]), + columns=['Group 1', 'Group 2']) + msg = ('At least one group effect is estimated with less than 6 observations.') + with pytest.warns(UserWarning, match=msg): + gate_1 = dml_plr_obj.gate(groups_1, cov_type=cov_type) + assert isinstance(gate_1, dml.utils.blp.DoubleMLBLP) + assert isinstance(gate_1.confint(), pd.DataFrame) + assert all(gate_1.confint().index == groups_1.columns.tolist()) + assert gate_1.blp_model.cov_type == cov_type + + np.random.seed(42) + groups_2 = pd.DataFrame(np.random.choice(["1", "2"], n)) + msg = ('At least one group effect is estimated with less than 6 observations.') + with pytest.warns(UserWarning, match=msg): + gate_2 = dml_plr_obj.gate(groups_2, cov_type=cov_type) + assert isinstance(gate_2, dml.utils.blp.DoubleMLBLP) + assert isinstance(gate_2.confint(), pd.DataFrame) + assert all(gate_2.confint().index == ["Group_1", "Group_2"]) + assert gate_2.blp_model.cov_type == cov_type diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 3d99d93a..3ed110f3 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -45,10 +45,12 @@ def _fit(estimator, x, y, train_index, idx=None): def _dml_cv_predict( estimator, x, y, smpls=None, n_jobs=None, est_params=None, method="predict", return_train_preds=False, return_models=False -): +, smpls_is_partition=None): n_obs = x.shape[0] - smpls_is_partition = _check_is_partition(smpls, n_obs) + # TODO: Better name for smples_is_partition + if smpls_is_partition is None: + smpls_is_partition = _check_is_partition(smpls, n_obs) fold_specific_params = (est_params is not None) & (not isinstance(est_params, dict)) fold_specific_target = isinstance(y, list) manual_cv_predict = ( diff --git a/doubleml/utils/resampling.py b/doubleml/utils/resampling.py index 18153944..d1014517 100644 --- a/doubleml/utils/resampling.py +++ b/doubleml/utils/resampling.py @@ -33,6 +33,7 @@ def __init__(self, n_obs, stratify=None): self.n_folds = n_folds + self.n_folds_inner = n_folds_inner self.n_rep = n_rep self.n_obs = n_obs self.stratify = stratify @@ -47,10 +48,10 @@ def __init__(self, if self.stratify is None: self.resampling = RepeatedKFold(n_splits=n_folds, n_repeats=n_rep) - self.resampling_inner = RepeatedKFold(n_splits=n_folds_inner) + self.resampling_inner = RepeatedKFold(n_splits=n_folds_inner, n_repeats=1) else: self.resampling = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_rep) - self.resampling_inner = RepeatedStratifiedKFold(n_splits=n_folds_inner) + self.resampling_inner = RepeatedStratifiedKFold(n_splits=n_folds_inner, n_repeats=1) def split_samples(self): all_smpls = [(train, test) for train, test in self.resampling.split(X=np.zeros(self.n_obs), y=self.stratify)] @@ -59,12 +60,12 @@ def split_samples(self): smpls_inner = [] for _ in range(self.n_rep): smpls_inner_rep = [] - for _, test in all_smpls: + for train, test in all_smpls: if self.stratify is None: - smpls_inner_rep.append([(train_inner, test_inner) for train_inner, test_inner in 
self.resampling_inner.split(X=test)]) + smpls_inner_rep.append([(train[train_inner], train[test_inner]) for train_inner, test_inner in self.resampling_inner.split(X=train)]) else: - smpls_inner_rep.append([(train_inner, test_inner) for train_inner, test_inner in - self.resampling_inner.split(X=np.zeros(len(test)), y=self.stratify[test])]) + smpls_inner_rep.append([(train[train_inner], train[test_inner]) for train_inner, test_inner in + self.resampling_inner.split(X=np.zeros(len(train)), y=self.stratify[train])]) smpls_inner.append(smpls_inner_rep) return smpls, smpls_inner From d729d0a2f1ea0752ddeb9c762452e46d3ad43f14 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Thu, 27 Feb 2025 15:24:40 +0100 Subject: [PATCH 04/23] Changed data type of arrays --- doubleml/logistic/logistic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/logistic/logistic.py b/doubleml/logistic/logistic.py index 25ba3763..cfb9926e 100644 --- a/doubleml/logistic/logistic.py +++ b/doubleml/logistic/logistic.py @@ -176,7 +176,7 @@ def _check_data(self, obj_dml_data): def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, n_jobs=None, est_params=None, method='predict'): res = {} - res['preds'] = np.zeros_like(y) + res['preds'] = np.zeros(d.shape, dtype=float) res['preds_inner'] = [] res['models'] = [] for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): @@ -289,7 +289,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa W_inner = [] - beta = np.zeros_like(d) + beta = np.zeros(d.shape, dtype=float) for i, (train, test) in enumerate(smpls): M_iteration = M_hat['preds_inner'][i][train] From 8fe7ca667519d79507fb0a8621bb51e54e2983a5 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Thu, 27 Feb 2025 15:30:40 +0100 Subject: [PATCH 05/23] Fix variable name --- doubleml/logistic/logistic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doubleml/logistic/logistic.py b/doubleml/logistic/logistic.py index cfb9926e..ab10ceb8 100644 --- a/doubleml/logistic/logistic.py +++ b/doubleml/logistic/logistic.py @@ -176,7 +176,7 @@ def _check_data(self, obj_dml_data): def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, n_jobs=None, est_params=None, method='predict'): res = {} - res['preds'] = np.zeros(d.shape, dtype=float) + res['preds'] = np.zeros(y.shape, dtype=float) res['preds_inner'] = [] res['models'] = [] for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): From 18bac23cbc95b0e6d25af918f1924202cea231b5 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Wed, 27 Aug 2025 10:22:19 +0200 Subject: [PATCH 06/23] Moved into plm folder, started testing setup --- doubleml/__init__.py | 2 + doubleml/datasets.py | 1753 +++++++++++++++++ doubleml/double_ml_data.py | 55 +- doubleml/plm/__init__.py | 1 + doubleml/{logistic => plm}/logistic.py | 103 +- .../tests/_utils_logistic_manual.py | 37 +- .../{logistic => plm}/tests/tests_logistic.py | 51 +- doubleml/utils/_estimation.py | 30 +- 8 files changed, 1906 insertions(+), 126 deletions(-) create mode 100644 doubleml/datasets.py rename doubleml/{logistic => plm}/logistic.py (87%) rename doubleml/{logistic => plm}/tests/_utils_logistic_manual.py (87%) rename doubleml/{logistic => plm}/tests/tests_logistic.py (85%) diff --git a/doubleml/__init__.py b/doubleml/__init__.py index 93549116..ba59a07e 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -13,6 +13,8 @@ from .irm.pq import DoubleMLPQ from .irm.qte 
import DoubleMLQTE from .irm.ssm import DoubleMLSSM +from doubleml.plm.logistic import DoubleMLLogit + from .plm.pliv import DoubleMLPLIV from .plm.plr import DoubleMLPLR from .logistic.logistic import DoubleMLLogit diff --git a/doubleml/datasets.py b/doubleml/datasets.py new file mode 100644 index 00000000..629a033a --- /dev/null +++ b/doubleml/datasets.py @@ -0,0 +1,1753 @@ +import pandas as pd +import numpy as np +import warnings + +from scipy.linalg import toeplitz +from scipy.optimize import minimize_scalar +from scipy.special import expit + +from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder +from sklearn.datasets import make_spd_matrix + +from .double_ml_data import DoubleMLData, DoubleMLClusterData + +_array_alias = ['array', 'np.ndarray', 'np.array', np.ndarray] +_data_frame_alias = ['DataFrame', 'pd.DataFrame', pd.DataFrame] +_dml_data_alias = ['DoubleMLData', DoubleMLData] +_dml_cluster_data_alias = ['DoubleMLClusterData', DoubleMLClusterData] + + +def fetch_401K(return_type='DoubleMLData', polynomial_features=False): + """ + Data set on financial wealth and 401(k) plan participation. + + Parameters + ---------- + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + polynomial_features : + If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). + + References + ---------- + Abadie, A. (2003), Semiparametric instrumental variable estimation of treatment response models. Journal of + Econometrics, 113(2): 231-263. + + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 `_. + """ + url = 'https://github.com/VC2015/DMLonGitHub/raw/master/sipp1991.dta' + raw_data = pd.read_stata(url) + + y_col = 'net_tfa' + d_cols = ['e401'] + x_cols = ['age', 'inc', 'educ', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown'] + + data = raw_data.copy() + + if polynomial_features: + raise NotImplementedError('polynomial_features os not implemented yet for fetch_401K.') + + if return_type in _data_frame_alias + _dml_data_alias: + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, y_col, d_cols, x_cols) + else: + raise ValueError('Invalid return_type.') + + +def fetch_bonus(return_type='DoubleMLData', polynomial_features=False): + """ + Data set on the Pennsylvania Reemployment Bonus experiment. + + Parameters + ---------- + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + polynomial_features : + If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). + + References + ---------- + Bilias Y. (2000), Sequential Testing of Duration Data: The Case of Pennsylvania 'Reemployment Bonus' Experiment. + Journal of Applied Econometrics, 15(6): 575-594. + + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 `_. 
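+
+    Examples
+    --------
+    >>> # minimal usage sketch (downloading the data requires internet access)
+    >>> from doubleml.datasets import fetch_bonus
+    >>> dml_data = fetch_bonus()           # returns a DoubleMLData object
+    >>> df = fetch_bonus('DataFrame')      # returns the data as a pd.DataFrame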
+ """ + url = 'https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat' + raw_data = pd.read_csv(url, sep='\s+') + + ind = (raw_data['tg'] == 0) | (raw_data['tg'] == 4) + data = raw_data.copy()[ind] + data.reset_index(inplace=True) + data['tg'] = data['tg'].replace(4, 1) + data['inuidur1'] = np.log(data['inuidur1']) + + # variable dep as factor (dummy encoding) + dummy_enc = OneHotEncoder(drop='first', categories='auto').fit(data.loc[:, ['dep']]) + xx = dummy_enc.transform(data.loc[:, ['dep']]).toarray() + data['dep1'] = xx[:, 0] + data['dep2'] = xx[:, 1] + + y_col = 'inuidur1' + d_cols = ['tg'] + x_cols = ['female', 'black', 'othrace', + 'dep1', 'dep2', + 'q2', 'q3', 'q4', 'q5', 'q6', + 'agelt35', 'agegt54', 'durable', 'lusd', 'husd'] + + if polynomial_features: + poly = PolynomialFeatures(2, include_bias=False) + data_transf = poly.fit_transform(data[x_cols]) + x_cols = list(poly.get_feature_names_out(x_cols)) + + data_transf = pd.DataFrame(data_transf, columns=x_cols) + data = pd.concat((data[[y_col] + d_cols], data_transf), + axis=1, sort=False) + + if return_type in _data_frame_alias + _dml_data_alias: + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, y_col, d_cols, x_cols) + else: + raise ValueError('Invalid return_type.') + + +def _g(x): + return np.power(np.sin(x), 2) + + +def _m(x, nu=0., gamma=1.): + return 0.5 / np.pi * (np.sinh(gamma)) / (np.cosh(gamma) - np.cos(x - nu)) + + +def make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', **kwargs): + """ + Generates data from a partially linear regression model used in Chernozhukov et al. (2018) for Figure 1. + The data generating process is defined as + + .. math:: + + d_i &= m_0(x_i) + s_1 v_i, & &v_i \\sim \\mathcal{N}(0,1), + + y_i &= \\alpha d_i + g_0(x_i) + s_2 \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), + + + with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.7^{|j-k|}`. + The nuisance functions are given by + + .. math:: + + m_0(x_i) &= a_0 x_{i,1} + a_1 \\frac{\\exp(x_{i,3})}{1+\\exp(x_{i,3})}, + + g_0(x_i) &= b_0 \\frac{\\exp(x_{i,1})}{1+\\exp(x_{i,1})} + b_1 x_{i,3}. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + alpha : + The value of the causal parameter. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. + **kwargs + Additional keyword arguments to set non-default values for the parameters + :math:`a_0=1`, :math:`a_1=0.25`, :math:`s_1=1`, :math:`b_0=1`, :math:`b_1=0.25` or :math:`s_2=1`. + + References + ---------- + Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), + Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. + doi:`10.1111/ectj.12097 `_. + """ + a_0 = kwargs.get('a_0', 1.) + a_1 = kwargs.get('a_1', 0.25) + s_1 = kwargs.get('s_1', 1.) + + b_0 = kwargs.get('b_0', 1.) + b_1 = kwargs.get('b_1', 0.25) + s_2 = kwargs.get('s_2', 1.) 
+ + cov_mat = toeplitz([np.power(0.7, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + d = a_0 * x[:, 0] + a_1 * np.divide(np.exp(x[:, 2]), 1 + np.exp(x[:, 2])) \ + + s_1 * np.random.standard_normal(size=[n_obs, ]) + y = alpha * d + b_0 * np.divide(np.exp(x[:, 0]), 1 + np.exp(x[:, 0])) \ + + b_1 * x[:, 2] + s_2 * np.random.standard_normal(size=[n_obs, ]) + + if return_type in _array_alias: + return x, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d)), + columns=x_cols + ['y', 'd']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols) + else: + raise ValueError('Invalid return_type.') + + +def make_plr_turrell2018(n_obs=100, dim_x=20, theta=0.5, return_type='DoubleMLData', **kwargs): + """ + Generates data from a partially linear regression model used in a blog article by Turrell (2018). + The data generating process is defined as + + .. math:: + + d_i &= m_0(x_i' b) + v_i, & &v_i \\sim \\mathcal{N}(0,1), + + y_i &= \\theta d_i + g_0(x_i' b) + u_i, & &u_i \\sim \\mathcal{N}(0,1), + + + with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a random symmetric, + positive-definite matrix generated with :py:meth:`sklearn.datasets.make_spd_matrix`. + :math:`b` is a vector with entries :math:`b_j=\\frac{1}{j}` and the nuisance functions are given by + + .. math:: + + m_0(x_i) &= \\frac{1}{2 \\pi} \\frac{\\sinh(\\gamma)}{\\cosh(\\gamma) - \\cos(x_i-\\nu)}, + + g_0(x_i) &= \\sin(x_i)^2. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. + **kwargs + Additional keyword arguments to set non-default values for the parameters + :math:`\\nu=0`, or :math:`\\gamma=1`. + + References + ---------- + Turrell, A. (2018), Econometrics in Python part I - Double machine learning, Markov Wanderer: A blog on economics, + science, coding and data. `https://aeturrell.com/blog/posts/econometrics-in-python-parti-ml/ + `_. + """ + nu = kwargs.get('nu', 0.) + gamma = kwargs.get('gamma', 1.) + + b = [1 / k for k in range(1, dim_x + 1)] + sigma = make_spd_matrix(dim_x) + + x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=[n_obs, ]) + G = _g(np.dot(x, b)) + M = _m(np.dot(x, b), nu=nu, gamma=gamma) + d = M + np.random.standard_normal(size=[n_obs, ]) + y = np.dot(theta, d) + G + np.random.standard_normal(size=[n_obs, ]) + + if return_type in _array_alias: + return x, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d)), + columns=x_cols + ['y', 'd']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols) + else: + raise ValueError('Invalid return_type.') + + +def make_irm_data(n_obs=500, dim_x=20, theta=0, R2_d=0.5, R2_y=0.5, return_type='DoubleMLData'): + """ + Generates data from a interactive regression (IRM) model. + The data generating process is defined as + + .. 
math:: + + d_i &= 1\\left\\lbrace \\frac{\\exp(c_d x_i' \\beta)}{1+\\exp(c_d x_i' \\beta)} > v_i \\right\\rbrace, & &v_i + \\sim \\mathcal{U}(0,1), + + y_i &= \\theta d_i + c_y x_i' \\beta d_i + \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), + + with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. + :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}` and the constants :math:`c_y` and + :math:`c_d` are given by + + .. math:: + + c_y = \\sqrt{\\frac{R_y^2}{(1-R_y^2) \\beta' \\Sigma \\beta}}, \\qquad c_d = + \\sqrt{\\frac{(\\pi^2 /3) R_d^2}{(1-R_d^2) \\beta' \\Sigma \\beta}}. + + The data generating process is inspired by a process used in the simulation experiment (see Appendix P) of Belloni + et al. (2017). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + R2_d : + The value of the parameter :math:`R_d^2`. + R2_y : + The value of the parameter :math:`R_y^2`. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. + + References + ---------- + Belloni, A., Chernozhukov, V., Fernández‐Val, I. and Hansen, C. (2017). Program Evaluation and Causal Inference With + High‐Dimensional Data. Econometrica, 85: 233-298. + """ + # inspired by https://onlinelibrary.wiley.com/doi/abs/10.3982/ECTA12723, see suplement + v = np.random.uniform(size=[n_obs, ]) + zeta = np.random.standard_normal(size=[n_obs, ]) + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] + b_sigma_b = np.dot(np.dot(cov_mat, beta), beta) + c_y = np.sqrt(R2_y / ((1 - R2_y) * b_sigma_b)) + c_d = np.sqrt(np.pi ** 2 / 3. * R2_d / ((1 - R2_d) * b_sigma_b)) + + xx = np.exp(np.dot(x, np.multiply(beta, c_d))) + d = 1. * ((xx / (1 + xx)) > v) + + y = d * theta + d * np.dot(x, np.multiply(beta, c_y)) + zeta + + if return_type in _array_alias: + return x, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d)), + columns=x_cols + ['y', 'd']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols) + else: + raise ValueError('Invalid return_type.') + + +def make_iivm_data(n_obs=500, dim_x=20, theta=1., alpha_x=0.2, return_type='DoubleMLData'): + """ + Generates data from a interactive IV regression (IIVM) model. + The data generating process is defined as + + .. math:: + + d_i &= 1\\left\\lbrace \\alpha_x Z + v_i > 0 \\right\\rbrace, + + y_i &= \\theta d_i + x_i' \\beta + u_i, + + with :math:`Z \\sim \\text{Bernoulli}(0.5)` and + + .. math:: + + \\left(\\begin{matrix} u_i \\\\ v_i \\end{matrix} \\right) \\sim + \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.3 \\\\ 0.3 & 1 \\end{matrix} \\right) \\right). + + The covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`\\beta` is a `dim_x`-vector with entries + :math:`\\beta_j=\\frac{1}{j^2}`. 
+ + The data generating process is inspired by a process used in the simulation experiment of Farbmacher, Gruber and + Klaassen (2020). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + alpha_x : + The value of the parameter :math:`\\alpha_x`. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. + + References + ---------- + Farbmacher, H., Guber, R. and Klaaßen, S. (2020). Instrument Validity Tests with Causal Forests. MEA Discussion + Paper No. 13-2020. Available at SSRN: http://dx.doi.org/10.2139/ssrn.3619201. + """ + # inspired by https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3619201 + xx = np.random.multivariate_normal(np.zeros(2), + np.array([[1., 0.3], [0.3, 1.]]), + size=[n_obs, ]) + u = xx[:, 0] + v = xx[:, 1] + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] + + z = np.random.binomial(p=0.5, n=1, size=[n_obs, ]) + d = 1. * (alpha_x * z + v > 0) + + y = d * theta + np.dot(x, beta) + u + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), + columns=x_cols + ['y', 'd', 'z']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols, 'z') + else: + raise ValueError('Invalid return_type.') + + +def _make_pliv_data(n_obs=100, dim_x=20, theta=0.5, gamma_z=0.4, return_type='DoubleMLData'): + b = [1 / k for k in range(1, dim_x + 1)] + sigma = make_spd_matrix(dim_x) + + x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=[n_obs, ]) + G = _g(np.dot(x, b)) + # instrument + z = _m(np.dot(x, b)) + np.random.standard_normal(size=[n_obs, ]) + # treatment + M = _m(gamma_z * z + np.dot(x, b)) + d = M + np.random.standard_normal(size=[n_obs, ]) + y = np.dot(theta, d) + G + np.random.standard_normal(size=[n_obs, ]) + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), + columns=x_cols + ['y', 'd', 'z']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols, 'z') + else: + raise ValueError('Invalid return_type.') + + +def make_pliv_CHS2015(n_obs, alpha=1., dim_x=200, dim_z=150, return_type='DoubleMLData'): + """ + Generates data from a partially linear IV regression model used in Chernozhukov, Hansen and Spindler (2015). + The data generating process is defined as + + .. math:: + + z_i &= \\Pi x_i + \\zeta_i, + + d_i &= x_i' \\gamma + z_i' \\delta + u_i, + + y_i &= \\alpha d_i + x_i' \\beta + \\varepsilon_i, + + with + + .. 
math:: + + \\left(\\begin{matrix} \\varepsilon_i \\\\ u_i \\\\ \\zeta_i \\\\ x_i \\end{matrix} \\right) \\sim + \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.6 & 0 & 0 \\\\ 0.6 & 1 & 0 & 0 \\\\ + 0 & 0 & 0.25 I_{p_n^z} & 0 \\\\ 0 & 0 & 0 & \\Sigma \\end{matrix} \\right) \\right) + + where :math:`\\Sigma` is a :math:`p_n^x \\times p_n^x` matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`I_{p_n^z}` is the :math:`p_n^z \\times p_n^z` identity matrix. + :math:`\\beta = \\gamma` is a :math:`p_n^x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}`, + :math:`\\delta` is a :math:`p_n^z`-vector with entries :math:`\\delta_j=\\frac{1}{j^2}` + and :math:`\\Pi = (I_{p_n^z}, 0_{p_n^z \\times (p_n^x - p_n^z)})`. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + alpha : + The value of the causal parameter. + dim_x : + The number of covariates. + dim_z : + The number of instruments. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. + + References + ---------- + Chernozhukov, V., Hansen, C. and Spindler, M. (2015), Post-Selection and Post-Regularization Inference in Linear + Models with Many Controls and Instruments. American Economic Review: Papers and Proceedings, 105 (5): 486-90. + """ + assert dim_x >= dim_z + # see https://assets.aeaweb.org/asset-server/articles-attachments/aer/app/10505/P2015_1022_app.pdf + xx = np.random.multivariate_normal(np.zeros(2), + np.array([[1., 0.6], [0.6, 1.]]), + size=[n_obs, ]) + epsilon = xx[:, 0] + u = xx[:, 1] + + sigma = toeplitz([np.power(0.5, k) for k in range(0, dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), + sigma, + size=[n_obs, ]) + + I_z = np.eye(dim_z) + xi = np.random.multivariate_normal(np.zeros(dim_z), + 0.25 * I_z, + size=[n_obs, ]) + + beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] + gamma = beta + delta = [1 / (k ** 2) for k in range(1, dim_z + 1)] + Pi = np.hstack((I_z, np.zeros((dim_z, dim_x - dim_z)))) + + z = np.dot(x, np.transpose(Pi)) + xi + d = np.dot(x, gamma) + np.dot(z, delta) + u + y = alpha * d + np.dot(x, beta) + epsilon + + if return_type in _array_alias: + return x, y, d, z + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + z_cols = [f'Z{i + 1}' for i in np.arange(dim_z)] + data = pd.DataFrame(np.column_stack((x, y, d, z)), + columns=x_cols + ['y', 'd'] + z_cols) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols, z_cols) + else: + raise ValueError('Invalid return_type.') + + +def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1., return_type='DoubleMLClusterData', **kwargs): + """ + Generates data from a partially linear IV regression model with multiway cluster sample used in Chiang et al. + (2021). The data generating process is defined as + + .. math:: + + Z_{ij} &= X_{ij}' \\xi_0 + V_{ij}, + + D_{ij} &= Z_{ij}' \\pi_{10} + X_{ij}' \\pi_{20} + v_{ij}, + + Y_{ij} &= D_{ij} \\theta + X_{ij}' \\zeta_0 + \\varepsilon_{ij}, + + with + + .. 
math:: + + X_{ij} &= (1 - \\omega_1^X - \\omega_2^X) \\alpha_{ij}^X + + \\omega_1^X \\alpha_{i}^X + \\omega_2^X \\alpha_{j}^X, + + \\varepsilon_{ij} &= (1 - \\omega_1^\\varepsilon - \\omega_2^\\varepsilon) \\alpha_{ij}^\\varepsilon + + \\omega_1^\\varepsilon \\alpha_{i}^\\varepsilon + \\omega_2^\\varepsilon \\alpha_{j}^\\varepsilon, + + v_{ij} &= (1 - \\omega_1^v - \\omega_2^v) \\alpha_{ij}^v + + \\omega_1^v \\alpha_{i}^v + \\omega_2^v \\alpha_{j}^v, + + V_{ij} &= (1 - \\omega_1^V - \\omega_2^V) \\alpha_{ij}^V + + \\omega_1^V \\alpha_{i}^V + \\omega_2^V \\alpha_{j}^V, + + and :math:`\\alpha_{ij}^X, \\alpha_{i}^X, \\alpha_{j}^X \\sim \\mathcal{N}(0, \\Sigma)` + where :math:`\\Sigma` is a :math:`p_x \\times p_x` matrix with entries + :math:`\\Sigma_{kj} = s_X^{|j-k|}`. + Further + + .. math:: + + \\left(\\begin{matrix} \\alpha_{ij}^\\varepsilon \\\\ \\alpha_{ij}^v \\end{matrix}\\right), + \\left(\\begin{matrix} \\alpha_{i}^\\varepsilon \\\\ \\alpha_{i}^v \\end{matrix}\\right), + \\left(\\begin{matrix} \\alpha_{j}^\\varepsilon \\\\ \\alpha_{j}^v \\end{matrix}\\right) + \\sim \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & s_{\\varepsilon v} \\\\ + s_{\\varepsilon v} & 1 \\end{matrix} \\right) \\right) + + + and :math:`\\alpha_{ij}^V, \\alpha_{i}^V, \\alpha_{j}^V \\sim \\mathcal{N}(0, 1)`. + + Parameters + ---------- + N : + The number of observations (first dimension). + M : + The number of observations (second dimension). + dim_X : + The number of covariates. + theta : + The value of the causal parameter. + return_type : + If ``'DoubleMLClusterData'`` or ``DoubleMLClusterData``, returns a ``DoubleMLClusterData`` object where + ``DoubleMLClusterData.data`` is a ``pd.DataFrame``. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s + ``(x, y, d, cluster_vars, z)``. + **kwargs + Additional keyword arguments to set non-default values for the parameters + :math:`\\pi_{10}=1.0`, :math:`\\omega_X = \\omega_{\\varepsilon} = \\omega_V = \\omega_v = (0.25, 0.25)`, + :math:`s_X = s_{\\varepsilon v} = 0.25`, + or the :math:`p_x`-vectors :math:`\\zeta_0 = \\pi_{20} = \\xi_0` with default entries + :math:`(\\zeta_{0})_j = 0.5^j`. + + References + ---------- + Chiang, H. D., Kato K., Ma, Y. and Sasaki, Y. (2021), Multiway Cluster Robust Double/Debiased Machine Learning, + Journal of Business & Economic Statistics, + doi: `10.1080/07350015.2021.1895815 `_, + arXiv:`1909.03489 `_. 
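    A minimal usage sketch (illustrative only; assumes the released ``doubleml`` package, where the generator is
    available via ``doubleml.datasets``):

        import numpy as np
        from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021

        np.random.seed(3141)
        obj_dml_cluster_data = make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1.0)
        print(obj_dml_cluster_data.cluster_cols)   # ['cluster_var_i', 'cluster_var_j']
        print(obj_dml_cluster_data.data.shape)     # (625, 105): two cluster columns, X1..X100, Y, D, Z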
+ """ + # additional parameters specifiable via kwargs + pi_10 = kwargs.get('pi_10', 1.0) + + xx = np.arange(1, dim_X + 1) + zeta_0 = kwargs.get('zeta_0', np.power(0.5, xx)) + pi_20 = kwargs.get('pi_20', np.power(0.5, xx)) + xi_0 = kwargs.get('xi_0', np.power(0.5, xx)) + + omega_X = kwargs.get('omega_X', np.array([0.25, 0.25])) + omega_epsilon = kwargs.get('omega_epsilon', np.array([0.25, 0.25])) + omega_v = kwargs.get('omega_v', np.array([0.25, 0.25])) + omega_V = kwargs.get('omega_V', np.array([0.25, 0.25])) + + s_X = kwargs.get('s_X', 0.25) + s_epsilon_v = kwargs.get('s_epsilon_v', 0.25) + + # use np.tile() and np.repeat() for repeating vectors in different styles, i.e., + # np.tile([v1, v2, v3], 2) [v1, v2, v3, v1, v2, v3] + # np.repeat([v1, v2, v3], 2) [v1, v1, v2, v2, v3, v3] + + alpha_V = np.random.normal(size=(N * M)) + alpha_V_i = np.repeat(np.random.normal(size=N), M) + alpha_V_j = np.tile(np.random.normal(size=M), N) + + cov_mat = np.array([[1, s_epsilon_v], [s_epsilon_v, 1]]) + alpha_eps_v = np.random.multivariate_normal(np.zeros(2), cov_mat, size=[N * M, ]) + alpha_eps = alpha_eps_v[:, 0] + alpha_v = alpha_eps_v[:, 1] + + alpha_eps_v_i = np.random.multivariate_normal(np.zeros(2), cov_mat, size=[N, ]) + alpha_eps_i = np.repeat(alpha_eps_v_i[:, 0], M) + alpha_v_i = np.repeat(alpha_eps_v_i[:, 1], M) + + alpha_eps_v_j = np.random.multivariate_normal(np.zeros(2), cov_mat, size=[M, ]) + alpha_eps_j = np.tile(alpha_eps_v_j[:, 0], N) + alpha_v_j = np.tile(alpha_eps_v_j[:, 1], N) + + cov_mat = toeplitz([np.power(s_X, k) for k in range(dim_X)]) + alpha_X = np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[N * M, ]) + alpha_X_i = np.repeat(np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[N, ]), + M, axis=0) + alpha_X_j = np.tile(np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[M, ]), + (N, 1)) + + # generate variables + x = (1 - omega_X[0] - omega_X[1]) * alpha_X \ + + omega_X[0] * alpha_X_i + omega_X[1] * alpha_X_j + + eps = (1 - omega_epsilon[0] - omega_epsilon[1]) * alpha_eps \ + + omega_epsilon[0] * alpha_eps_i + omega_epsilon[1] * alpha_eps_j + + v = (1 - omega_v[0] - omega_v[1]) * alpha_v \ + + omega_v[0] * alpha_v_i + omega_v[1] * alpha_v_j + + V = (1 - omega_V[0] - omega_V[1]) * alpha_V \ + + omega_V[0] * alpha_V_i + omega_V[1] * alpha_V_j + + z = np.matmul(x, xi_0) + V + d = z * pi_10 + np.matmul(x, pi_20) + v + y = d * theta + np.matmul(x, zeta_0) + eps + + cluster_cols = ['cluster_var_i', 'cluster_var_j'] + cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) + + if return_type in _array_alias: + return x, y, d, cluster_vars.values, z + elif return_type in _data_frame_alias + _dml_cluster_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_X)] + data = pd.concat((cluster_vars, + pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ['Y', 'D', 'Z'])), + axis=1) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLClusterData(data, 'Y', 'D', cluster_cols, x_cols, 'Z') + else: + raise ValueError('Invalid return_type.') + + +def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type='DoubleMLData', **kwargs): + """ + Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020). + The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let + + .. 
math:: + + f_{reg}(W) &= 210 + 27.4 \\cdot W_1 +13.7 \\cdot (W_2 + W_3 + W_4), + + f_{ps}(W) &= 0.75 \\cdot (-W_1 + 0.5 \\cdot W_2 -0.25 \\cdot W_3 - 0.1 \\cdot W_4). + + + Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, + :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. + At first define + + .. math:: + + Y_0(0) &= f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_0, + + Y_1(d) &= 2 \\cdot f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_1(d), + + p(W_{ps}) &= \\frac{\\exp(f_{ps}(W_{ps}))}{1 + \\exp(f_{ps}(W_{ps}))}, + + D &= 1\\{p(W_{ps}) \\ge U\\}, + + where :math:`\\varepsilon_0, \\varepsilon_1(d), d=0, 1` are independent standard normal random variables, + :math:`U \\sim \\mathcal{U}[0, 1]` is a independent standard uniform + and :math:`\\nu(W_{reg}, D)\\sim \\mathcal{N}(D \\cdot f_{reg}(W_{reg}),1)`. + The different data generating processes are defined via + + .. math:: + + DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z + + DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X + + DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z + + DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X + + DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0 + + DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0, + + such that the last two settings correspond to an experimental setting with treatment probability + of :math:`P(D=1) = \\frac{1}{2}.` + For the panel data the outcome is already defined as the difference :math:`Y = Y_1(D) - Y_0(0)`. + For cross-sectional data the flag ``cross_sectional_data`` has to be set to ``True``. + Then the outcome will be defined to be + + .. math:: + + Y = T \\cdot Y_1(D) + (1-T) \\cdot Y_0(0), + + where :math:`T = 1\\{U_T\\le \\lambda_T \\}` with :math:`U_T\\sim \\mathcal{U}[0, 1]` and :math:`\\lambda_T=0.5`. + The true average treatment effect on the treated is zero for all data generating processes. + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dgp_type : + The DGP to be used. Default value is ``1`` (integer). + cross_sectional_data : + Indicates whether the setting is uses cross-sectional or panel data. Default value is ``False``. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. + + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)`` + or ``(x, y, d, t)``. + **kwargs + Additional keyword arguments to set non-default values for the parameter + :math:`xi=0.75`, :math:`c=0.0` and :math:`\\lambda_T=0.5`. + + References + ---------- + Sant’Anna, P. H. and Zhao, J. (2020), + Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. + doi:`10.1016/j.jeconom.2020.06.003 `_. 
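    A minimal usage sketch (illustrative only; assumes the released ``doubleml`` package, where the generator is
    available via ``doubleml.datasets``):

        import numpy as np
        from doubleml.datasets import make_did_SZ2020

        np.random.seed(3141)
        # panel data: the outcome is already the difference Y_1(D) - Y_0(0)
        dml_data_panel = make_did_SZ2020(n_obs=500, dgp_type=1)
        # repeated cross sections: the time variable 't' is added to the data backend
        dml_data_cs = make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=True)
        print(dml_data_cs.t_col)   # 't'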
+ """ + xi = kwargs.get('xi', 0.75) + c = kwargs.get('c', 0.0) + lambda_t = kwargs.get('lambda_t', 0.5) + + def f_reg(w): + res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) + return res + + def f_ps(w, xi): + res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) + return res + + dim_x = 4 + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4)) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + # error terms + epsilon_0 = np.random.normal(loc=0, scale=1, size=n_obs) + epsilon_1 = np.random.normal(loc=0, scale=1, size=[n_obs, 2]) + + if dgp_type == 1: + features_ps = z + features_reg = z + elif dgp_type == 2: + features_ps = x + features_reg = z + elif dgp_type == 3: + features_ps = z + features_reg = x + elif dgp_type == 4: + features_ps = x + features_reg = x + elif dgp_type == 5: + features_ps = None + features_reg = z + elif dgp_type == 6: + features_ps = None + features_reg = x + else: + raise ValueError('The dgp_type is not valid.') + + # treatment and propensities + is_experimental = (dgp_type == 5) or (dgp_type == 6) + if is_experimental: + # Set D to be experimental + p = 0.5 * np.ones(n_obs) + else: + p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) + u = np.random.uniform(low=0, high=1, size=n_obs) + d = 1.0 * (p >= u) + + # potential outcomes + nu = np.random.normal(loc=d * f_reg(features_reg), scale=1, size=n_obs) + y0 = f_reg(features_reg) + nu + epsilon_0 + y1_d0 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 0] + y1_d1 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 1] + y1 = d * y1_d1 + (1 - d) * y1_d0 + + if not cross_sectional_data: + y = y1 - y0 + + if return_type in _array_alias: + return z, y, d + elif return_type in _data_frame_alias + _dml_data_alias: + z_cols = [f'Z{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((z, y, d)), + columns=z_cols + ['y', 'd']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', z_cols) + else: + raise ValueError('Invalid return_type.') + + else: + u_t = np.random.uniform(low=0, high=1, size=n_obs) + t = 1.0 * (u_t <= lambda_t) + y = t * y1 + (1 - t) * y0 + + if return_type in _array_alias: + return z, y, d, t + elif return_type in _data_frame_alias + _dml_data_alias: + z_cols = [f'Z{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((z, y, d, t)), + columns=z_cols + ['y', 'd', 't']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', z_cols, t_col='t') + else: + raise ValueError('Invalid return_type.') + + +def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs): + """ + Generates counfounded data from an interactive regression model. + + The data generating process is defined as follows (inspired by the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). + + Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds + to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. 
math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 + + \\tilde{Z}_5 &= X_5. + + Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. + At first, define the propensity score as + + .. math:: + + m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A + + where + + .. math:: + + p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))}, + + f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4). + + and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`. + Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as + + .. math:: + + P(D=1|X) = p(Z). + + Further, generate the outcome of interest :math:`Y` as + + .. math:: + + Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon + + g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4) + + where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. + This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of + the conditional expectation take the following forms + + .. math:: + + \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + + \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))}) + \\cdot D (Z_5 + 1) + g(Z). + + Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be + set via the parameters ``gamma_a`` and ``beta_a``. + + The observed data is given as :math:`W = (Y, D, Z)`. + Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`, + the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score and + in sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE) + are returned in a dictionary. + + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``500``. + theta : float or int + Average treatment effect. + Default is ``0.0``. + gamma_a : float + Coefficient of the unobserved confounder in the propensity score. + Default is ``0.127``. + beta_a : float + Coefficient of the unobserved confounder in the outcome regression. + Default is ``0.58``. + linear : bool + If ``True``, the Z will be set to X, such that the underlying (short) models are linear/logistic. + Default is ``False``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. + + References + ---------- + Sant’Anna, P. H. and Zhao, J. (2020), + Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. + doi:`10.1016/j.jeconom.2020.06.003 `_. 
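    A minimal usage sketch (illustrative only) showing how to access the oracle confounding measures
    returned by this generator; the dictionary keys used below are the ones defined in this file:

        import numpy as np
        from doubleml.datasets import make_confounded_irm_data

        np.random.seed(3141)
        res = make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58)
        x, y, d = res['x'], res['y'], res['d']
        oracle = res['oracle_values']
        print(oracle['cf_y'], oracle['cf_d_ate'], oracle['cf_d_atte'])   # in-sample confounding strengths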
+ """ + c = 0.0 # the confounding strength is only valid for c=0 + xi = 0.75 + dim_x = kwargs.get('dim_x', 5) + trimming_threshold = kwargs.get('trimming_threshold', 0.01) + var_eps_y = kwargs.get('var_eps_y', 1.0) + + # Specification of main regression function + def f_reg(w): + res = 2.5 + 0.74 * w[:, 0] + 0.25 * w[:, 1] + 0.137 * (w[:, 2] + w[:, 3]) + return res + + # Specification of prop score function + def f_ps(w, xi): + res = xi * (-w[:, 0] + 0.1 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) + return res + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + z_tilde_5 = x[:, 4] + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5)) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + # error terms and unobserved confounder + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + # unobserved confounder + a_bounds = (-1, 1) + a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) + var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 + + # Choose the features used in the models + if linear: + features_ps = x + features_reg = x + else: + features_ps = z + features_reg = z + + p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) + # compute short and long form of propensity score + m_long = p + gamma_a * a + m_short = p + # check propensity score bounds + if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold): + m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold) + m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold) + warnings.warn(f'Propensity score is close to 0 or 1. 
' + f'Trimming is at {trimming_threshold} and {1.0 - trimming_threshold} is applied') + # generate treatment based on long form + u = np.random.uniform(low=0, high=1, size=n_obs) + d = 1.0 * (m_long >= u) + # add treatment heterogeneity + d1x = z[:, 4] + 1 + var_dx = np.var(d * (d1x)) + cov_adx = gamma_a * var_a + # Outcome regression + g_partial_reg = f_reg(features_reg) + # short model + g_short_d0 = g_partial_reg + g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg + g_short = d * g_short_d1 + (1.0 - d) * g_short_d0 + # long model + g_long_d0 = g_partial_reg + beta_a * a + g_long_d1 = theta * d1x + g_partial_reg + beta_a * a + g_long = d * g_long_d1 + (1.0 - d) * g_long_d0 + # Potential outcomes + y_0 = g_long_d0 + eps_y + y_1 = g_long_d1 + eps_y + # Realized outcome + y = d * y_1 + (1.0 - d) * y_0 + # In-sample values for confounding strength + explained_residual_variance = np.square(g_long - g_short) + residual_variance = np.square(y - g_short) + cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance) + # compute the Riesz representation + treated_weight = d / np.mean(d) + untreated_weight = (1.0 - d) / np.mean(d) + # Odds ratios + propensity_ratio_long = m_long / (1.0 - m_long) + rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long) + rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long) + propensity_ratio_short = m_short / (1.0 - m_short) + rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short) + rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short) + cf_d_ate = (np.mean(1 / (m_long * (1 - m_long))) - np.mean(1 / (m_short * (1 - m_short)))) / np.mean( + 1 / (m_long * (1 - m_long))) + cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long) + if (beta_a == 0) | (gamma_a == 0): + rho_ate = 0.0 + rho_atte = 0.0 + else: + rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1] + rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1] + oracle_values = { + 'g_long': g_long, + 'g_short': g_short, + 'm_long': m_long, + 'm_short': m_short, + 'gamma_a': gamma_a, + 'beta_a': beta_a, + 'a': a, + 'y_0': y_0, + 'y_1': y_1, + 'z': z, + 'cf_y': cf_y, + 'cf_d_ate': cf_d_ate, + 'cf_d_atte': cf_d_atte, + 'rho_ate': rho_ate, + 'rho_atte': rho_atte, + } + res_dict = { + 'x': x, + 'y': y, + 'd': d, + 'oracle_values': oracle_values + } + return res_dict + + +def make_confounded_plr_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04, **kwargs): + """ + Generates counfounded data from an partially linear regression model. + + The data generating process is defined as follows (similar to the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, + where :math:`\\Sigma` is a matrix with entries + :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2. + + Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. + At first, define the treatment as + + .. 
math::
+
+        D = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + \\varepsilon_D
+
+    with :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)`.
+    Since :math:`A` is independent of :math:`X`, the long and short forms of the treatment regression are given as
+
+    .. math::
+
+        \\mathbb{E}[D|X,A] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A
+
+        \\mathbb{E}[D|X] = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4.
+
+    Further, generate the outcome of interest :math:`Y` as
+
+    .. math::
+
+        Y &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + \\varepsilon
+
+        g(Z) &= 210 + 27.4 \\cdot Z_1 + 13.7 \\cdot (Z_2 + Z_3 + Z_4)
+
+    where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`.
+    This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of
+    the conditional expectation take the following forms
+
+    .. math::
+
+        \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A
+
+        \\mathbb{E}[Y|D, X] &= (\\theta + \\gamma_A\\beta_A \\frac{\\mathrm{Var}(A)}{\\mathrm{Var}(D)}) \\cdot D + g(Z).
+
+    Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`.
+    Both are chosen to obtain the desired confounding of the outcome and the Riesz representer (in sample).
+
+    The observed data is given as :math:`W = (Y, D, X)`.
+    Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`, the effect :math:`\\theta`,
+    the coefficients :math:`\\gamma_a`, :math:`\\beta_a`, the long and short forms of the main regression and
+    the propensity score are returned in a dictionary.
+
+    Parameters
+    ----------
+    n_obs : int
+        The number of observations to simulate.
+        Default is ``500``.
+    theta : float or int
+        Average treatment effect.
+        Default is ``5.0``.
+    cf_y : float
+        Percentage of the residual variation of the outcome explained by the latent/confounding variable.
+        Default is ``0.04``.
+    cf_d : float
+        Percentage gains in the variation of the Riesz representer generated by the latent/confounding variable.
+        Default is ``0.04``.
+
+    Returns
+    -------
+    res_dict : dictionary
+        Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``.
+
+    References
+    ----------
+    Sant’Anna, P. H. and Zhao, J. (2020),
+    Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122.
+    doi:`10.1016/j.jeconom.2020.06.003 <https://doi.org/10.1016/j.jeconom.2020.06.003>`_.
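    A minimal usage sketch (illustrative only); the coefficients ``gamma_a`` and ``beta_a`` reported in
    ``oracle_values`` are the values calibrated to the requested ``cf_d`` and ``cf_y``:

        import numpy as np
        from doubleml.datasets import make_confounded_plr_data

        np.random.seed(3141)
        res = make_confounded_plr_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04)
        x, y, d = res['x'], res['y'], res['d']
        print(res['oracle_values']['gamma_a'], res['oracle_values']['beta_a'])   # calibrated confounding coefficients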
+ """ + c = kwargs.get('c', 0.0) + dim_x = kwargs.get('dim_x', 4) + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + # error terms + var_eps_y = 5 + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + var_eps_d = 1 + eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) + + # unobserved confounder + a_bounds = (-1, 1) + a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) + var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 + + # get the required impact of the confounder on the propensity score + m_short = -z[:, 0] + 0.5 * z[:, 1] - 0.25 * z[:, 2] - 0.1 * z[:, 3] + + def f_m(gamma_a): + rr_long = eps_d / var_eps_d + rr_short = (gamma_a * a + eps_d) / (gamma_a ** 2 * var_a + var_eps_d) + C2_D = (np.mean(np.square(rr_long)) - np.mean(np.square(rr_short))) / np.mean(np.square(rr_short)) + return np.square(C2_D / (1 + C2_D) - cf_d) + + gamma_a = minimize_scalar(f_m).x + m_long = m_short + gamma_a * a + d = m_long + eps_d + + # short and long version of g + g_partial_reg = 210 + 27.4 * z[:, 0] + 13.7 * (z[:, 1] + z[:, 2] + z[:, 3]) + + var_d = np.var(d) + + def f_g(beta_a): + g_diff = beta_a * (a - gamma_a * (var_a / var_d) * d) + y_diff = eps_y + g_diff + return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y) + + beta_a = minimize_scalar(f_g).x + + g_long = theta * d + g_partial_reg + beta_a * a + g_short = (theta + gamma_a * beta_a * var_a / var_d) * d + g_partial_reg + + y = g_long + eps_y + + oracle_values = {'g_long': g_long, + 'g_short': g_short, + 'm_long': m_long, + 'm_short': m_short, + 'theta': theta, + 'gamma_a': gamma_a, + 'beta_a': beta_a, + 'a': a, + 'z': z} + + res_dict = {'x': x, + 'y': y, + 'd': d, + 'oracle_values': oracle_values} + + return res_dict + + +def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False): + """ + Creates a simple synthetic example for heterogeneous treatment effects. + The data generating process is based on the Monte Carlo simulation from Oprescu et al. (2019). + + The data is generated as + + .. math:: + + Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i + + D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i, + + where :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i + \\sim\\mathcal{U}[-1,1]`. + If the treatment is set to be binary, the treatment is generated as + + .. math:: + D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}. + + The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` both have small random (identical) support + which values are drawn independently from :math:`\\mathcal{U}[0,1]` and :math:`\\mathcal{U}[0,0.3]`. + Further, :math:`\\theta_0(x)` defines the conditional treatment effect, which is defined differently depending + on the dimension of :math:`x`. + + If the heterogeneity is univariate the conditional treatment effect takes the following form + + .. math:: + \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0), + + whereas for the two-dimensional case the conditional treatment effect is defined as + + .. 
math:: + \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1). + + Parameters + ---------- + n_obs : int + Number of observations to simulate. + Default is ``200``. + + p : int + Dimension of covariates. + Default is ``30``. + + support_size : int + Number of relevant (confounding) covariates. + Default is ``5``. + + n_x : int + Dimension of the heterogeneity. Can be either ``1`` or ``2``. + Default is ``1``. + + binary_treatment : bool + Indicates whether the treatment is binary. + Default is ``False``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``data``, ``effects``, ``treatment_effect``. + + """ + # simple input checks + assert n_x in [1, 2], 'n_x must be either 1 or 2.' + assert support_size <= p, 'support_size must be smaller than p.' + assert isinstance(binary_treatment, bool), 'binary_treatment must be a boolean.' + + # define treatment effects + if n_x == 1: + def treatment_effect(x): + return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0]) + else: + assert n_x == 2 + + # redefine treatment effect + def treatment_effect(x): + return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 1]) + + # Outcome support and coefficients + support_y = np.random.choice(np.arange(p), size=support_size, replace=False) + coefs_y = np.random.uniform(0, 1, size=support_size) + # treatment support and coefficients + support_d = support_y + coefs_d = np.random.uniform(0, 0.3, size=support_size) + + # noise + epsilon = np.random.uniform(-1, 1, size=n_obs) + eta = np.random.uniform(-1, 1, size=n_obs) + + # Generate controls, covariates, treatments and outcomes + x = np.random.uniform(0, 1, size=(n_obs, p)) + # Heterogeneous treatment effects + te = treatment_effect(x) + if binary_treatment: + d = 1.0 * (np.dot(x[:, support_d], coefs_d) >= eta) + else: + d = np.dot(x[:, support_d], coefs_d) + eta + y = te * d + np.dot(x[:, support_y], coefs_y) + epsilon + + # Now we build the dataset + y_df = pd.DataFrame({'y': y}) + d_df = pd.DataFrame({'d': d}) + x_df = pd.DataFrame( + data=x, + index=np.arange(x.shape[0]), + columns=[f'X_{i}' for i in range(x.shape[1])] + ) + + data = pd.concat([y_df, d_df, x_df], axis=1) + res_dict = { + 'data': data, + 'effects': te, + 'treatment_effect': treatment_effect} + return res_dict + + +def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleMLData'): + """ + Generates data from a sample selection model (SSM). + The data generating process is defined as + + .. math:: + + y_i &= \\theta d_i + x_i' \\beta d_i + u_i, + + s_i &= 1\\left\\lbrace d_i + \\gamma z_i + x_i' \\beta + v_i > 0 \\right\\rbrace, + + d_i &= 1\\left\\lbrace x_i' \\beta + w_i > 0 \\right\\rbrace, + + with Y being observed if :math:`s_i = 1` and covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma^2_x)`, where + :math:`\\Sigma^2_x` is a matrix with entries + :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. + :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{0.4}{j^2}` + :math:`z_i \\sim \\mathcal{N}(0, 1)`, + :math:`(u_i,v_i) \\sim \\mathcal{N}(0, \\Sigma^2_{u,v})`, + :math:`w_i \\sim \\mathcal{N}(0, 1)`. + + + The data generating process is inspired by a process used in the simulation study (see Appendix E) of Bia, + Huber and Lafférs (2023). + + Parameters + ---------- + n_obs : + The number of observations to simulate. + dim_x : + The number of covariates. + theta : + The value of the causal parameter. + mar: + Boolean. Indicates whether missingness at random holds. + return_type : + If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. 
+ + If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. + + If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z, s)``. + + References + ---------- + Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double Machine Learning for Sample Selection Models, + Journal of Business & Economic Statistics, DOI: 10.1080/07350015.2023.2271071 + """ + if mar: + sigma = np.array([[1, 0], [0, 1]]) + gamma = 0 + else: + sigma = np.array([[1, 0.8], [0.8, 1]]) + gamma = 1 + + e = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=n_obs).T + + cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + beta = [0.4 / (k ** 2) for k in range(1, dim_x + 1)] + + d = np.where(np.dot(x, beta) + np.random.randn(n_obs) > 0, 1, 0) + z = np.random.randn(n_obs) + s = np.where(np.dot(x, beta) + d + gamma * z + e[0] > 0, 1, 0) + + y = np.dot(x, beta) + theta * d + e[1] + y[s == 0] = 0 + + if return_type in _array_alias: + return x, y, d, z, s + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + if mar: + data = pd.DataFrame(np.column_stack((x, y, d, s)), + columns=x_cols + ['y', 'd', 's']) + else: + data = pd.DataFrame(np.column_stack((x, y, d, z, s)), + columns=x_cols + ['y', 'd', 'z', 's']) + if return_type in _data_frame_alias: + return data + else: + if mar: + return DoubleMLData(data, 'y', 'd', x_cols, None, None, 's') + return DoubleMLData(data, 'y', 'd', x_cols, 'z', None, 's') + else: + raise ValueError('Invalid return_type.') + + +def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, linear=False, random_state=None, **kwargs): + """ + Generates data from a interactive regression (IRM) model with multiple treatment levels (based on an + underlying continous treatment). + + The data generating process is defined as follows (similar to the Monte Carlo simulation used + in Sant'Anna and Zhao (2020)). + + Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds + to the identity matrix. + Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, + where + + .. math:: + + \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) + + \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) + + \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 + + \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 + + \\tilde{Z}_5 &= X_5. + + A continuous treatment :math:`D_{\\text{cont}}` is generated as + + .. math:: + + D_{\\text{cont}} = \\xi (-Z_1 + 0.5 Z_2 - 0.25 Z_3 - 0.1 Z_4) + \\varepsilon_D, + + where :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)` and :math:`\\xi=0.3`. The corresponding treatment + effect is defined as + + .. math:: + + \\theta (d) = 0.1 \\exp(d) + 10 \\sin(0.7 d) + 2 d - 0.2 d^2. + + Based on the continous treatment, a discrete treatment :math:`D` is generated as with a baseline level of + :math:`D=0` and additional levels based on the quantiles of :math:`D_{\\text{cont}}`. The number of levels + is defined by :math:`n_{\\text{levels}}`. Each level is chosen to have the same probability of being selected. + + The potential outcomes are defined as + + .. math:: + + Y(0) &= 210 + 27.4 Z_1 + 13.7 (Z_2 + Z_3 + Z_4) + \\varepsilon_Y + + Y(1) &= \\theta (D_{\\text{cont}}) 1\\{D_{\\text{cont}} > 0\\} + Y(0), + + where :math:`\\varepsilon_Y \\sim \\mathcal{N}(0,5)`. Further, the observed outcome is defined as + + .. 
math:: + + Y = Y(1) 1\\{D > 0\\} + Y(0) 1\\{D = 0\\}. + + The data is returned as a dictionary with the entries ``x``, ``y``, ``d`` and ``oracle_values``. + + Parameters + ---------- + n_obs : int + The number of observations to simulate. + Default is ``200``. + + n_levels : int + The number of treatment levels. + Default is ``3``. + + linear : bool + Indicates whether the true underlying regression is linear. + Default is ``False``. + + random_state : int + Random seed for reproducibility. + Default is ``42``. + + Returns + ------- + res_dict : dictionary + Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. + The oracle values contain the continuous treatment, the level bounds, the potential level, ITE + and the potential outcome without treatment. + + """ + if random_state is not None: + np.random.seed(random_state) + xi = kwargs.get('xi', 0.3) + c = kwargs.get('c', 0.0) + dim_x = kwargs.get('dim_x', 5) + + if not isinstance(n_levels, int): + raise ValueError('n_levels must be an integer.') + if n_levels < 2: + raise ValueError('n_levels must be at least 2.') + + # observed covariates + cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) + x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) + + def f_reg(w): + res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) + return res + + def f_treatment(w, xi): + res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) + return res + + def treatment_effect(d, scale=15): + return scale * (1 / (1 + np.exp(-d - 1.2 * np.cos(d)))) - 2 + + z_tilde_1 = np.exp(0.5 * x[:, 0]) + z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) + z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 + z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 + + z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) + z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) + + # error terms + var_eps_y = 5 + eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) + var_eps_d = 1 + eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) + + if linear: + g = f_reg(x) + m = f_treatment(x, xi) + else: + assert not linear + g = f_reg(z) + m = f_treatment(z, xi) + + cont_d = m + eps_d + level_bounds = np.quantile(cont_d, q=np.linspace(0, 1, n_levels + 1)) + potential_level = sum([1.0 * (cont_d >= bound) for bound in level_bounds[1:-1]]) + 1 + eta = np.random.uniform(0, 1, size=n_obs) + d = 1.0 * (eta >= 1 / n_levels) * potential_level + + ite = treatment_effect(cont_d) + y0 = g + eps_y + # only treated for d > 0 compared to the baseline + y = ite * (d > 0) + y0 + + oracle_values = { + 'cont_d': cont_d, + 'level_bounds': level_bounds, + 'potential_level': potential_level, + 'ite': ite, + 'y0': y0, + } + + resul_dict = { + 'x': x, + 'y': y, + 'd': d, + 'oracle_values': oracle_values + } + + return resul_dict + + +def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', **kwargs): + """ + Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), + designed for use in double/debiased machine learning applications. + + The data generating process is defined as follows: + + - Covariates \( x_i \sim \mathcal{N}(0, \Sigma) \), where \( \Sigma_{kj} = 0.7^{|j-k|} \). + - Treatment \( d_i = a_0(x_i) \). + - Propensity score \( p_i = \sigma(\alpha d_i + r_0(x_i)) \), where \( \sigma(\cdot) \) is the logistic function. + - Outcome \( y_i \sim \text{Bernoulli}(p_i) \). 
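    Equivalently, the model for :math:`p_i` can be written on the log-odds scale (this follows directly from the
    definitions above and shows that :math:`\\alpha` enters as a log odds ratio):

    .. math::

        \\log\\left(\\frac{p_i}{1 - p_i}\\right) = \\alpha d_i + r_0(x_i).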
+ + The nuisance functions are defined as: + + .. math:: + + a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\ + &+ 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2 x_{i,7} x_{i,8} - 0.2 x_{i,9} x_{i,10} \\ + + r_0(x_i) &= 0.1 x_{i,1} x_{i,2} x_{i,3} + 0.1 x_{i,4} x_{i,5} + 0.1 x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ + &+ 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ + &+ 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) + + Parameters + ---------- + n_obs : int + Number of observations to simulate. + dim_x : int + Number of covariates. + alpha : float + Value of the causal parameter. + return_type : str + Determines the return format. One of: + + - 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object. + - 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``. + - 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p). + + **kwargs + Optional keyword arguments (currently unused in this implementation). + + Returns + ------- + Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]] + The generated data in the specified format. + + References + ---------- + Liu, Molei, Yi Zhang, and Doudou Zhou. 2021. + "Double/Debiased Machine Learning for Logistic Partially Linear Model." + The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019. + + """ + + def r_0(X): + return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ + 0.1 * X[:, 3] * X[:, 4] + \ + 0.1 * X[:, 5] ** 3 + \ + -0.5 * np.sin(X[:, 6]) ** 2 + \ + 0.5 * np.cos(X[:, 7]) + \ + 1 / (1 + X[:, 8] ** 2) + \ + -1 / (1 + np.exp(X[:, 9])) + \ + 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ + -0.25 * np.where(X[:, 12] > 0, 1, 0) + + def a_0(X): + return 2 / (1 + np.exp(X[:, 0])) + \ + -2 / (1 + np.exp(X[:, 1])) + \ + 1 * np.sin(X[:, 2]) + \ + 1 * np.cos(X[:, 3]) + \ + 0.5 * np.where(X[:, 4] > 0, 1, 0) + \ + -0.5 * np.where(X[:, 5] > 0, 1, 0) + \ + 0.2 * X[:, 6] * X[:, 7] + \ + -0.2 * X[:, 8] * X[:, 9] + + + sigma = np.full((dim_x, dim_x), 0.2) + np.fill_diagonal(sigma, 1) + + x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs) + np.clip(x, -2, 2, out=x) + + d = a_0(x) + + p = expit(alpha * d[:] + r_0(x)) + + y = np.random.binomial(1, p) + + if return_type in _array_alias: + return x, y, d, p + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, p)), + columns=x_cols + ['y', 'd', 'p']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols, p_cols='p') + else: + raise ValueError('Invalid return_type.') diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py index fdee739d..35c9af65 100644 --- a/doubleml/double_ml_data.py +++ b/doubleml/double_ml_data.py @@ -113,6 +113,10 @@ class DoubleMLData(DoubleMLBaseData): The score or selection variable (only relevant/used for RDD or SSM Estimatiors). Default is ``None``. + p_cols : None, str or list, optional + The column(s) containing the probabilities of the outcome (only for simulated, binary data). + Default is ``None``. + use_other_treat_as_covariate : bool Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. Default is ``True``. 
@@ -145,6 +149,7 @@ def __init__(self, z_cols=None, t_col=None, s_col=None, + p_cols=None, use_other_treat_as_covariate=True, force_all_x_finite=True): DoubleMLBaseData.__init__(self, data) @@ -155,6 +160,7 @@ def __init__(self, self.t_col = t_col self.s_col = s_col self.x_cols = x_cols + self.p_cols = p_cols self._check_disjoint_sets_y_d_x_z_t_s() self.use_other_treat_as_covariate = use_other_treat_as_covariate self.force_all_x_finite = force_all_x_finite @@ -187,7 +193,7 @@ def _data_summary_str(self): return data_summary @classmethod - def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covariate=True, + def from_arrays(cls, x, y, d, z=None, t=None, s=None, p=None, use_other_treat_as_covariate=True, force_all_x_finite=True): """ Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s. @@ -215,6 +221,10 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria Array of the score or selection variable (only relevant/used for RDD and SSM models). Default is ``None``. + p : None or :class:`numpy.ndarray` + Array of the probabilities of the outcome (only for simulated, binary data). + Default is ``None``. + use_other_treat_as_covariate : bool Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. Default is ``True``. @@ -299,7 +309,13 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, use_other_treat_as_covaria if s is not None: data[s_col] = s - return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, use_other_treat_as_covariate, force_all_x_finite) + if p is not None: + if p.shape[1] == 1: + d_cols = ['p'] + else: + d_cols = [f'p{i + 1}' for i in np.arange(p.shape[1])] + + return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, p_cols, use_other_treat_as_covariate, force_all_x_finite) @property def x(self): @@ -358,6 +374,41 @@ def s(self): else: return None + @property + def p_cols(self): + """ + The column(s) containing the probabilities of the outcome (only for simulated data). + """ + return self._p_cols + + @p_cols.setter + def p_cols(self, value): + if value is not None: + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise TypeError('The probability column(s) p_cols must be of str or list type (or None). ' + f'{str(value)} of type {str(type(value))} was passed.') + if not len(set(value)) == len(value): + raise ValueError('Invalid probability column(s) p_cols: ' + 'Contains duplicate values.') + if not set(value).issubset(set(self.all_variables)): + raise ValueError('Invalid probability column(s) p_cols. ' + 'At least one probability column is not a data column.') + self._p_cols = value + else: + self._p_cols = None + + @property + def p(self): + """ + Array of probabilities of the outcome (only for simulated data). 
+ """ + if self.p_cols is not None: + return self._p.values + else: + return None + @property def n_treat(self): """ diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py index e81f00c5..88ff26a8 100644 --- a/doubleml/plm/__init__.py +++ b/doubleml/plm/__init__.py @@ -8,4 +8,5 @@ __all__ = [ "DoubleMLPLR", "DoubleMLPLIV", + "DoubleMLLogit" ] diff --git a/doubleml/logistic/logistic.py b/doubleml/plm/logistic.py similarity index 87% rename from doubleml/logistic/logistic.py rename to doubleml/plm/logistic.py index ab10ceb8..d48fb29d 100644 --- a/doubleml/logistic/logistic.py +++ b/doubleml/plm/logistic.py @@ -1,5 +1,9 @@ +import inspect + import numpy as np -from ..utils._estimation import ( +from torch.sparse import sampled_addmm + +from doubleml.utils._estimation import ( _dml_cv_predict, _trimm, _predict_zero_one_propensity, @@ -15,12 +19,12 @@ import scipy from sklearn.utils.multiclass import type_of_target -from .. import DoubleMLData -from ..double_ml import DoubleML -from ..double_ml_score_mixins import NonLinearScoreMixin -from ..utils import DoubleMLClusterResampling -from ..utils._checks import _check_score, _check_finite_predictions, _check_is_propensity -from ..utils.resampling import DoubleMLDoubleResampling +from doubleml import DoubleMLData +from doubleml.double_ml import DoubleML +from doubleml.double_ml_score_mixins import NonLinearScoreMixin +from doubleml.utils import DoubleMLClusterResampling +from doubleml.utils._checks import _check_score, _check_finite_predictions, _check_is_propensity +from doubleml.utils.resampling import DoubleMLDoubleResampling @@ -61,7 +65,7 @@ class DoubleMLLogit(NonLinearScoreMixin, DoubleML): Default is ``1``. score : str or callable - A str (``'partialling out'`` or ``'IV-type'``) specifying the score function + A str (``'nuisance_space'`` or ``'instrument'``) specifying the score function or a callable object / function with signature ``psi_a, psi_b = score(y, d, l_hat, m_hat, g_hat, smpls)``. Default is ``'partialling out'``. 
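The hunks below reorder the constructor arguments of ``DoubleMLLogit`` and rename the score options. A rough sketch of the intended call pattern, based purely on the WIP signature in this patch (argument names, order and the data-generator location may still change, and the branch may not run end-to-end yet):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
    import doubleml as dml
    from doubleml.datasets import make_logistic_LZZ2020   # added by this patch; module location may differ

    np.random.seed(3141)
    obj_dml_data = make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5)

    ml_M = RandomForestClassifier(n_estimators=100)   # classifier for P(Y=1 | D, X)
    ml_t = RandomForestRegressor(n_estimators=100)    # regression nuisance
    ml_m = RandomForestRegressor(n_estimators=100)    # regressor for E[D | X] (y is binary here, so a regressor is expected)
    dml_logit = dml.DoubleMLLogit(obj_dml_data, ml_M, ml_t, ml_m,
                                  n_folds=5, score='nuisance_space')
    dml_logit.fit()
    print(dml_logit.summary)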
@@ -103,14 +107,14 @@ class DoubleMLLogit(NonLinearScoreMixin, DoubleML): def __init__(self, obj_dml_data, - ml_m, ml_M, ml_t, + ml_m, ml_a=None, n_folds=5, n_folds_inner=5, n_rep=1, - score='logistic', + score='nuisance_space', draw_sample_splitting=True): self.n_folds_inner = n_folds_inner super().__init__(obj_dml_data, @@ -122,12 +126,16 @@ def __init__(self, self._coef_start_val = 1.0 self._check_data(self._dml_data) - valid_scores = ['logistic'] + valid_scores = ['nuisance_space', 'instrument'] _check_score(self.score, valid_scores, allow_callable=True) _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) - ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=True) + + if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): + ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=False, classifier=True) + else: + ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=False) self._learner = {'ml_m': ml_m, 'ml_t': ml_t, 'ml_M': ml_M} if ml_a is not None: @@ -157,6 +165,11 @@ def __init__(self, else: self._predict_method['ml_a'] = 'predict' + if score == 'instrument': + sig = inspect.signature(self.learner['ml_a'].fit) + if not 'sample_weight' in sig.parameters: + raise ValueError('Learner \"ml_a\" who supports sample_weight is required for score type \"instrument\"') + self._initialize_ml_nuisance_params() self._external_predictions_implemented = True @@ -174,7 +187,7 @@ def _check_data(self, obj_dml_data): def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, - n_jobs=None, est_params=None, method='predict'): + n_jobs=None, est_params=None, method='predict', sample_weights=None): res = {} res['preds'] = np.zeros(y.shape, dtype=float) res['preds_inner'] = [] @@ -182,7 +195,7 @@ def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, s for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): res_inner = _dml_cv_predict(estimator, x, y, smpls=smpls_double_split, n_jobs=n_jobs, est_params=est_params, method=method, - return_models=True, smpls_is_partition=True) + return_models=True, smpls_is_partition=True, sample_weights=sample_weights) _check_finite_predictions(res_inner['preds'], estimator, estimator_name, smpls_double_split) res['preds_inner'].append(res_inner['preds']) @@ -214,19 +227,41 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa else: a_external = False + if M_external: + M_hat = {'preds': external_predictions['ml_M'], + 'targets': None, + 'models': None} + else: + M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) + + # TODO + #if self._score_type == "instrument": + + # nuisance m if m_external: m_hat = {'preds': external_predictions['ml_m'], 'targets': None, 'models': None} else: - filtered_smpls = [] - for train, test in smpls: - train_filtered = train[y[train] == 0] - filtered_smpls.append((train_filtered, test)) - m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=filtered_smpls, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], - return_models=return_models) + if self.score == 'instrument': + weights = [] + for i, (train, test) in enumerate(smpls): + weights.append( 
M_hat['preds_inner'][i][train] * (1-M_hat['preds_inner'][i][train])) + m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], + return_models=return_models, weights=weights) + + else: + filtered_smpls = [] + for train, test in smpls: + train_filtered = train[y[train] == 0] + filtered_smpls.append((train_filtered, test)) + m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=filtered_smpls, n_jobs=n_jobs_cv, + est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], + return_models=return_models) _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) if self._check_learner(self._learner['ml_m'], 'ml_m', regressor=True, classifier=True): @@ -242,14 +277,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'probabilities and not labels are predicted.') - if M_external: - M_hat = {'preds': external_predictions['ml_M'], - 'targets': None, - 'models': None} - else: - M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, smpls_inner=self.__smpls__inner, - n_jobs=n_jobs_cv, - est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) + if a_external: a_hat = {'preds': external_predictions['ml_a'], @@ -456,15 +484,22 @@ def set_sample_splitting(self): def _compute_score(self, psi_elements, coef): - score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] - + if self._score_type == 'nuisance_space': + score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] + score = psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) + else: + score = (psi_elements["y"] - np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"])) * psi_elements["d_tilde"] - return psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) + return score def _compute_score_deriv(self, psi_elements, coef, inds=None): - deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] + if self._score_type == 'nuisance_space': + deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] + deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 + else: + deriv = - psi_elements["d"] * np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"]) * psi_elements["d_tilde"] - return psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 + return deriv def cate(self, basis, is_gate=False): diff --git a/doubleml/logistic/tests/_utils_logistic_manual.py b/doubleml/plm/tests/_utils_logistic_manual.py similarity index 87% rename from doubleml/logistic/tests/_utils_logistic_manual.py rename to doubleml/plm/tests/_utils_logistic_manual.py index ae53992a..af4d034e 100644 --- a/doubleml/logistic/tests/_utils_logistic_manual.py +++ b/doubleml/plm/tests/_utils_logistic_manual.py @@ -2,8 +2,8 @@ import scipy from sklearn.base import clone, is_classifier -from ...tests._utils_boot import boot_manual, draw_weights -from ...tests._utils import fit_predict, fit_predict_proba, tune_grid_search +from doubleml.tests._utils_boot import boot_manual, draw_weights +from doubleml.tests._utils import fit_predict, fit_predict_proba, tune_grid_search def fit_logistic_multitreat(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, @@ -155,32 +155,6 @@ def fit_nuisance_logistic_classifier(y, x, d, learner_l, learner_m, learner_g, s return l_hat, m_hat, g_hat -def 
tune_nuisance_plr(y, x, d, ml_l, ml_m, ml_g, smpls, n_folds_tune, param_grid_l, param_grid_m, param_grid_g, tune_g=True): - l_tune_res = tune_grid_search(y, x, ml_l, smpls, param_grid_l, n_folds_tune) - - m_tune_res = tune_grid_search(d, x, ml_m, smpls, param_grid_m, n_folds_tune) - - if tune_g: - l_hat = np.full_like(y, np.nan) - m_hat = np.full_like(d, np.nan) - for idx, (train_index, _) in enumerate(smpls): - l_hat[train_index] = l_tune_res[idx].predict(x[train_index, :]) - m_hat[train_index] = m_tune_res[idx].predict(x[train_index, :]) - psi_a = -np.multiply(d - m_hat, d - m_hat) - psi_b = np.multiply(d - m_hat, y - l_hat) - theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) - - g_tune_res = tune_grid_search(y - theta_initial*d, x, ml_g, smpls, param_grid_g, n_folds_tune) - g_best_params = [xx.best_params_ for xx in g_tune_res] - else: - g_best_params = [] - - l_best_params = [xx.best_params_ for xx in l_tune_res] - m_best_params = [xx.best_params_ for xx in m_tune_res] - - return l_best_params, m_best_params, g_best_params - - def compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls): y_minus_l_hat = np.full_like(y, np.nan, dtype='float64') d_minus_m_hat = np.full_like(d, np.nan, dtype='float64') @@ -193,13 +167,6 @@ def compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls): return y_minus_l_hat, d_minus_m_hat, y_minus_g_hat -def plr_dml2(y, x, d, l_hat, m_hat, g_hat, smpls, score): - n_obs = len(y) - y_minus_l_hat, d_minus_m_hat, y_minus_g_hat = compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls) - theta_hat = plr_orth(y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, d, score) - se = np.sqrt(var_plr(theta_hat, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score, n_obs)) - - return theta_hat, se def var_plr(theta, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score, n_obs): diff --git a/doubleml/logistic/tests/tests_logistic.py b/doubleml/plm/tests/tests_logistic.py similarity index 85% rename from doubleml/logistic/tests/tests_logistic.py rename to doubleml/plm/tests/tests_logistic.py index 2b97bf76..a77db7a6 100644 --- a/doubleml/logistic/tests/tests_logistic.py +++ b/doubleml/plm/tests/tests_logistic.py @@ -11,8 +11,8 @@ import doubleml as dml -from ...tests._utils import draw_smpls -from ._utils_logistic_manual import fit_logistic, , boot_plr +from doubleml.tests._utils import draw_smpls +from ._utils_logistic_manual import fit_logistic, boot_plr @pytest.fixture(scope='module', @@ -304,49 +304,4 @@ def test_dml_plr_ols_manual_boot(dml_plr_ols_manual_fixture): @pytest.fixture(scope='module', params=["nonrobust", "HC0", "HC1", "HC2", "HC3"]) def cov_type(request): - return request.param - - -@pytest.mark.ci -def test_dml_plr_cate_gate(score, cov_type): - n = 9 - - # collect data - np.random.seed(42) - obj_dml_data = dml.datasets.make_plr_CCDDHNR2018(n_obs=n) - ml_l = LinearRegression() - ml_g = LinearRegression() - ml_m = LinearRegression() - - dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, - ml_g, ml_m, ml_l, - n_folds=2, - score=score) - dml_plr_obj.fit() - random_basis = pd.DataFrame(np.random.normal(0, 1, size=(n, 5))) - cate = dml_plr_obj.cate(random_basis, cov_type=cov_type) - assert isinstance(cate, dml.DoubleMLBLP) - assert isinstance(cate.confint(), pd.DataFrame) - assert cate.blp_model.cov_type == cov_type - - groups_1 = pd.DataFrame( - np.column_stack([obj_dml_data.data['X1'] <= 0, - obj_dml_data.data['X1'] > 0.2]), - columns=['Group 1', 'Group 2']) - msg = ('At least one group effect is estimated with less than 6 observations.') - with pytest.warns(UserWarning, 
match=msg): - gate_1 = dml_plr_obj.gate(groups_1, cov_type=cov_type) - assert isinstance(gate_1, dml.utils.blp.DoubleMLBLP) - assert isinstance(gate_1.confint(), pd.DataFrame) - assert all(gate_1.confint().index == groups_1.columns.tolist()) - assert gate_1.blp_model.cov_type == cov_type - - np.random.seed(42) - groups_2 = pd.DataFrame(np.random.choice(["1", "2"], n)) - msg = ('At least one group effect is estimated with less than 6 observations.') - with pytest.warns(UserWarning, match=msg): - gate_2 = dml_plr_obj.gate(groups_2, cov_type=cov_type) - assert isinstance(gate_2, dml.utils.blp.DoubleMLBLP) - assert isinstance(gate_2.confint(), pd.DataFrame) - assert all(gate_2.confint().index == ["Group_1", "Group_2"]) - assert gate_2.blp_model.cov_type == cov_type + return request.param \ No newline at end of file diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 3ed110f3..6029dfd9 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -43,9 +43,9 @@ def _fit(estimator, x, y, train_index, idx=None): return estimator, idx -def _dml_cv_predict( - estimator, x, y, smpls=None, n_jobs=None, est_params=None, method="predict", return_train_preds=False, return_models=False -, smpls_is_partition=None): +def _dml_cv_predict(estimator, x, y, smpls=None, + n_jobs=None, est_params=None, method='predict', return_train_preds=False, return_models=False, + smpls_is_partition=None, sample_weights=None): n_obs = x.shape[0] # TODO: Better name for smples_is_partition @@ -53,9 +53,9 @@ def _dml_cv_predict( smpls_is_partition = _check_is_partition(smpls, n_obs) fold_specific_params = (est_params is not None) & (not isinstance(est_params, dict)) fold_specific_target = isinstance(y, list) - manual_cv_predict = ( - (not smpls_is_partition) | return_train_preds | fold_specific_params | fold_specific_target | return_models - ) + manual_cv_predict = (not smpls_is_partition) | return_train_preds | fold_specific_params | fold_specific_target \ + | return_models | bool(sample_weights) + #TODO: Check if cross_val_predict supports weights res = {"models": None} if not manual_cv_predict: @@ -187,6 +187,22 @@ def _draw_weights(method, n_rep_boot, n_obs): return weights +def _trimm(preds, trimming_rule, trimming_threshold): + if trimming_rule == 'truncate': + preds[preds < trimming_threshold] = trimming_threshold + preds[preds > 1 - trimming_threshold] = 1 - trimming_threshold + return preds + + +def _normalize_ipw(propensity, treatment): + mean_treat1 = np.mean(np.divide(treatment, propensity)) + mean_treat0 = np.mean(np.divide(1.0 - treatment, 1.0 - propensity)) + normalized_weights = np.multiply(treatment, np.multiply(propensity, mean_treat1)) \ + + np.multiply(1.0 - treatment, 1.0 - np.multiply(1.0 - propensity, mean_treat0)) + + return normalized_weights + + def _rmse(y_true, y_pred): subset = np.logical_not(np.isnan(y_true)) rmse = root_mean_squared_error(y_true[subset], y_pred[subset]) @@ -302,7 +318,7 @@ def _var_est(psi, psi_deriv, smpls, is_cluster_data, cluster_vars=None, smpls_cl J_l = test_cluster_inds[1] const = np.divide(min(len(I_k), len(J_l)), (np.square(len(I_k) * len(J_l)))) for cluster_value in I_k: - ind_cluster = (first_cluster_var == cluster_value) & np.isin(second_cluster_var, J_l) + ind_cluster = (first_cluster_var == cluster_value) & np.in1d(second_cluster_var, J_l) gamma_hat += const * np.sum(np.outer(psi[ind_cluster], psi[ind_cluster])) for cluster_value in J_l: ind_cluster = (second_cluster_var == cluster_value) & np.isin(first_cluster_var, 
I_k) From c6e600d2f67abf33aa59d8f074453c49ebd60c77 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Wed, 27 Aug 2025 19:18:16 +0200 Subject: [PATCH 07/23] Fixed bug in score computation --- doubleml/double_ml_data.py | 14 ++- doubleml/plm/logistic.py | 183 ++++++++----------------------------- 2 files changed, 49 insertions(+), 148 deletions(-) diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py index 35c9af65..612e6b7f 100644 --- a/doubleml/double_ml_data.py +++ b/doubleml/double_ml_data.py @@ -288,6 +288,15 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, p=None, use_other_treat_as check_consistent_length(x, y, d, s) s_col = 's' + + if p is None: + p_cols = None + else: + if p.shape[1] == 1: + p_cols = ['p'] + else: + p_cols = [f'p{i + 1}' for i in np.arange(p.shape[1])] + if d.shape[1] == 1: d_cols = ['d'] else: @@ -310,10 +319,7 @@ def from_arrays(cls, x, y, d, z=None, t=None, s=None, p=None, use_other_treat_as data[s_col] = s if p is not None: - if p.shape[1] == 1: - d_cols = ['p'] - else: - d_cols = [f'p{i + 1}' for i in np.arange(p.shape[1])] + data[p_cols] = p return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, p_cols, use_other_treat_as_covariate, force_all_x_finite) diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index d48fb29d..3e04d15d 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -215,9 +215,9 @@ def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, s def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): # TODO: How to deal with smpls_inner? x, y = check_X_y(self._dml_data.x, self._dml_data.y, - force_all_finite=False) + ensure_all_finite=False) x, d = check_X_y(x, self._dml_data.d, - force_all_finite=False) + ensure_all_finite=False) x_d_concat = np.hstack((d.reshape(-1,1), x)) m_external = external_predictions['ml_m'] is not None M_external = external_predictions['ml_M'] is not None @@ -236,9 +236,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa n_jobs=n_jobs_cv, est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) - # TODO - #if self._score_type == "instrument": - # nuisance m if m_external: @@ -254,7 +251,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], return_models=return_models, weights=weights) - else: + elif self.score == 'nuisance_space': filtered_smpls = [] for train, test in smpls: train_filtered = train[y[train] == 0] @@ -262,6 +259,8 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=filtered_smpls, n_jobs=n_jobs_cv, est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], return_models=return_models) + else: + raise NotImplementedError _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) if self._check_learner(self._learner['ml_m'], 'ml_m', regressor=True, classifier=True): @@ -288,31 +287,32 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa n_jobs=n_jobs_cv, est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) - # r_legacy = np.zeros_like(y) - # smpls_inner = self.__smpls__inner - # M_hat = {} - # a_hat = {} - # M_hat['preds_inner'] = [] - # M_hat['preds'] = np.full_like(y, np.nan) - # a_hat['preds_inner'] = [] - # a_hat['preds'] = np.full_like(y, np.nan) - # for 
smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): - # test = smpls_single_split[1] - # train = smpls_single_split[0] - # # r_legacy[test] = - # Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], - # self._learner['ml_m'], self._learner['ml_M'], - # smpls_single_split, smpls_double_split, y, x, d, - # x_d_concat, n_jobs_cv) - # Mtemp = np.full_like(y, np.nan) - # Mtemp[train] = Mleg - # Atemp = np.full_like(y, np.nan) - # Atemp[train] = aleg - # M_hat['preds_inner'].append(Mtemp) - # a_hat['preds_inner'].append(Atemp) - # a_hat['preds'][test] = a_nf_leg - # - # #r_hat['preds'] = r_legacy + + r_legacy = np.zeros_like(y) + smpls_inner = self.__smpls__inner + M_hat_l = {} + a_hat_l = {} + M_hat_l['preds_inner'] = [] + M_hat_l['preds'] = np.full_like(y, np.nan) + a_hat_l['preds_inner'] = [] + a_hat_l['preds'] = np.full_like(y, np.nan) + for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): + test = smpls_single_split[1] + train = smpls_single_split[0] + # r_legacy[test] = + Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], + self._learner['ml_m'], self._learner['ml_M'], + smpls_single_split, smpls_double_split, y, x, d, + x_d_concat, n_jobs_cv) + Mtemp = np.full_like(y, np.nan) + Mtemp[train] = Mleg + Atemp = np.full_like(y, np.nan) + Atemp[train] = aleg + M_hat_l['preds_inner'].append(Mtemp) + a_hat_l['preds_inner'].append(Atemp) + a_hat_l['preds'][test] = a_nf_leg + + #r_hat['preds'] = r_legacy @@ -343,10 +343,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa r_hat = {} r_hat['preds'] = t_hat['preds'] - beta * a_hat['preds'] - - - - psi_elements = self._score_elements(y, d, r_hat['preds'], m_hat['preds']) preds = {'predictions': {'ml_r': r_hat['preds'], @@ -484,124 +480,23 @@ def set_sample_splitting(self): def _compute_score(self, psi_elements, coef): - if self._score_type == 'nuisance_space': + if self.score == 'nuisance_space': score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] score = psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) - else: + elif self.score == 'instrument': score = (psi_elements["y"] - np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"])) * psi_elements["d_tilde"] + else: + raise NotImplementedError return score def _compute_score_deriv(self, psi_elements, coef, inds=None): - if self._score_type == 'nuisance_space': + if self.score == 'nuisance_space': deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 - else: + elif self.score == 'instrument': deriv = - psi_elements["d"] * np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"]) * psi_elements["d_tilde"] - - return deriv - - - def cate(self, basis, is_gate=False): - """ - Calculate conditional average treatment effects (CATE) for a given basis. - - Parameters - ---------- - basis : :class:`pandas.DataFrame` - The basis for estimating the best linear predictor. Has to have the shape ``(n_obs, d)``, - where ``n_obs`` is the number of observations and ``d`` is the number of predictors. - is_gate : bool - Indicates whether the basis is constructed for GATEs (dummy-basis). - Default is ``False``. - - Returns - ------- - model : :class:`doubleML.DoubleMLBLP` - Best linear Predictor model. - """ - if self._dml_data.n_treat > 1: - raise NotImplementedError('Only implemented for single treatment. 
' + - f'Number of treatments is {str(self._dml_data.n_treat)}.') - if self.n_rep != 1: - raise NotImplementedError('Only implemented for one repetition. ' + - f'Number of repetitions is {str(self.n_rep)}.') - - Y_tilde, D_tilde = self._partial_out() - - D_basis = basis * D_tilde - model = DoublelMLBLP( - orth_signal=Y_tilde.reshape(-1), - basis=D_basis, - is_gate=is_gate, - ) - model.fit() - - ## TODO: Solve score - - - return model - - def gate(self, groups): - """ - Calculate group average treatment effects (GATE) for groups. - - Parameters - ---------- - groups : :class:`pandas.DataFrame` - The group indicator for estimating the best linear predictor. Groups should be mutually exclusive. - Has to be dummy coded with shape ``(n_obs, d)``, where ``n_obs`` is the number of observations - and ``d`` is the number of groups or ``(n_obs, 1)`` and contain the corresponding groups (as str). - - Returns - ------- - model : :class:`doubleML.DoubleMLBLP` - Best linear Predictor model for Group Effects. - """ - - if not isinstance(groups, pd.DataFrame): - raise TypeError('Groups must be of DataFrame type. ' - f'Groups of type {str(type(groups))} was passed.') - if not all(groups.dtypes == bool) or all(groups.dtypes == int): - if groups.shape[1] == 1: - groups = pd.get_dummies(groups, prefix='Group', prefix_sep='_') - else: - raise TypeError('Columns of groups must be of bool type or int type (dummy coded). ' - 'Alternatively, groups should only contain one column.') - - if any(groups.sum(0) <= 5): - warnings.warn('At least one group effect is estimated with less than 6 observations.') - - model = self.cate(groups, is_gate=True) - return model - - def _partial_out(self): - """ - Helper function. Returns the partialled out quantities of Y and D. - Works with multiple repetitions. - - Returns - ------- - Y_tilde : :class:`numpy.ndarray` - The residual of the regression of Y on X. - D_tilde : :class:`numpy.ndarray` - The residual of the regression of D on X. - """ - if self.predictions is None: - raise ValueError('predictions are None. Call .fit(store_predictions=True) to store the predictions.') - - y = self._dml_data.y.reshape(-1, 1) - d = self._dml_data.d.reshape(-1, 1) - ml_m = self.predictions["ml_m"].squeeze(axis=2) - - if self.score == "partialling out": - ml_l = self.predictions["ml_l"].squeeze(axis=2) - Y_tilde = y - ml_l - D_tilde = d - ml_m else: - assert self.score == "IV-type" - ml_g = self.predictions["ml_g"].squeeze(axis=2) - Y_tilde = y - (self.coef * ml_m) - ml_g - D_tilde = d - ml_m + raise NotImplementedError - return Y_tilde, D_tilde \ No newline at end of file + return deriv \ No newline at end of file From 6f556e02caaf3e39e8b11e2655361178305ca183 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Wed, 27 Aug 2025 22:02:40 +0200 Subject: [PATCH 08/23] Reverted from ensure_all_finite to force_all_finite --- doubleml/plm/logistic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index 3e04d15d..a716497d 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -215,9 +215,9 @@ def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, s def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): # TODO: How to deal with smpls_inner? 
x, y = check_X_y(self._dml_data.x, self._dml_data.y, - ensure_all_finite=False) + force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, - ensure_all_finite=False) + force_all_finite=False) x_d_concat = np.hstack((d.reshape(-1,1), x)) m_external = external_predictions['ml_m'] is not None M_external = external_predictions['ml_M'] is not None From 3a332bf91e97af94780805130f21b7688238d29d Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Thu, 28 Aug 2025 15:59:29 +0200 Subject: [PATCH 09/23] Fixes to instrument score --- doubleml/plm/logistic.py | 53 ++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index a716497d..e19fc1e4 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -249,7 +249,7 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa weights.append( M_hat['preds_inner'][i][train] * (1-M_hat['preds_inner'][i][train])) m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], - return_models=return_models, weights=weights) + return_models=return_models, sample_weights=weights) elif self.score == 'nuisance_space': filtered_smpls = [] @@ -288,29 +288,29 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) - r_legacy = np.zeros_like(y) - smpls_inner = self.__smpls__inner - M_hat_l = {} - a_hat_l = {} - M_hat_l['preds_inner'] = [] - M_hat_l['preds'] = np.full_like(y, np.nan) - a_hat_l['preds_inner'] = [] - a_hat_l['preds'] = np.full_like(y, np.nan) - for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): - test = smpls_single_split[1] - train = smpls_single_split[0] - # r_legacy[test] = - Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], - self._learner['ml_m'], self._learner['ml_M'], - smpls_single_split, smpls_double_split, y, x, d, - x_d_concat, n_jobs_cv) - Mtemp = np.full_like(y, np.nan) - Mtemp[train] = Mleg - Atemp = np.full_like(y, np.nan) - Atemp[train] = aleg - M_hat_l['preds_inner'].append(Mtemp) - a_hat_l['preds_inner'].append(Atemp) - a_hat_l['preds'][test] = a_nf_leg + # r_legacy = np.zeros_like(y) + # smpls_inner = self.__smpls__inner + # M_hat_l = {} + # a_hat_l = {} + # M_hat_l['preds_inner'] = [] + # M_hat_l['preds'] = np.full_like(y, np.nan) + # a_hat_l['preds_inner'] = [] + # a_hat_l['preds'] = np.full_like(y, np.nan) + # for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): + # test = smpls_single_split[1] + # train = smpls_single_split[0] + # # r_legacy[test] = + # Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], + # self._learner['ml_m'], self._learner['ml_M'], + # smpls_single_split, smpls_double_split, y, x, d, + # x_d_concat, n_jobs_cv) + # Mtemp = np.full_like(y, np.nan) + # Mtemp[train] = Mleg + # Atemp = np.full_like(y, np.nan) + # Atemp[train] = aleg + # M_hat_l['preds_inner'].append(Mtemp) + # a_hat_l['preds_inner'].append(Atemp) + # a_hat_l['preds'][test] = a_nf_leg #r_hat['preds'] = r_legacy @@ -484,7 +484,7 @@ def _compute_score(self, psi_elements, coef): score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] score = psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) elif self.score == 'instrument': - score = 
(psi_elements["y"] - np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"])) * psi_elements["d_tilde"] + score = (psi_elements["y"] - scipy.special.expit(coef * psi_elements["d"]+ psi_elements["r_hat"])) * psi_elements["d_tilde"] else: raise NotImplementedError @@ -495,7 +495,8 @@ def _compute_score_deriv(self, psi_elements, coef, inds=None): deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 elif self.score == 'instrument': - deriv = - psi_elements["d"] * np.exp(coef * psi_elements["d"]+ psi_elements["r_hat"]) * psi_elements["d_tilde"] + expit = scipy.special.expit(coef * psi_elements["d"]+ psi_elements["r_hat"]) + deriv = - psi_elements["d"] * expit * (1-expit) * psi_elements["d_tilde"] else: raise NotImplementedError From b41a773c92a3d0aab04e76bfdb7d1343ff129122 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Wed, 3 Sep 2025 14:52:48 +0200 Subject: [PATCH 10/23] Added option for exception on convergence failure --- doubleml/double_ml_score_mixins.py | 44 ++++++++++++++++++------------ doubleml/plm/logistic.py | 4 ++- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/doubleml/double_ml_score_mixins.py b/doubleml/double_ml_score_mixins.py index 57dd6e62..b0c69c25 100644 --- a/doubleml/double_ml_score_mixins.py +++ b/doubleml/double_ml_score_mixins.py @@ -86,6 +86,7 @@ class NonLinearScoreMixin: _score_type = "nonlinear" _coef_start_val = np.nan _coef_bounds = None + _error_on_convergence_failure = False @property @abstractmethod @@ -149,12 +150,14 @@ def score_deriv(theta): theta_hat = root_res.root if not root_res.converged: score_val = score(theta_hat) - warnings.warn( - "Could not find a root of the score function.\n " - f"Flag: {root_res.flag}.\n" - f"Score value found is {score_val} " - f"for parameter theta equal to {theta_hat}." - ) + msg = ('Could not find a root of the score function.\n ' + f'Flag: {root_res.flag}.\n' + f'Score value found is {score_val} ' + f'for parameter theta equal to {theta_hat}.') + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) else: signs_different, bracket_guess = _get_bracket_guess(score, self._coef_start_val, self._coef_bounds) @@ -182,16 +185,19 @@ def score_squared(theta): else: score_val_sign = np.sign(score(alt_coef_start)) if score_val_sign > 0: + theta_hat_array, score_val, _ = fmin_l_bfgs_b( score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - warnings.warn( - "Could not find a root of the score function.\n " - f"Minimum score value found is {score_val} " - f"for parameter theta equal to {theta_hat}.\n " - "No theta found such that the score function evaluates to a negative value." - ) + msg = ('Could not find a root of the score function.\n ' + f'Minimum score value found is {score_val} ' + f'for parameter theta equal to {theta_hat}.\n ' + 'No theta found such that the score function evaluates to a negative value.') + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) else: def neg_score(theta): @@ -202,11 +208,13 @@ def neg_score(theta): neg_score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - warnings.warn( - "Could not find a root of the score function. " - f"Maximum score value found is {-1 * neg_score_val} " - f"for parameter theta equal to {theta_hat}. 
" - "No theta found such that the score function evaluates to a positive value." - ) + msg = ('Could not find a root of the score function. ' + f'Maximum score value found is {-1*neg_score_val} ' + f'for parameter theta equal to {theta_hat}. ' + 'No theta found such that the score function evaluates to a positive value.') + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) return theta_hat diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index e19fc1e4..9e1bb875 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -115,13 +115,15 @@ def __init__(self, n_folds_inner=5, n_rep=1, score='nuisance_space', - draw_sample_splitting=True): + draw_sample_splitting=True, + error_on_convergence_failure=False,): self.n_folds_inner = n_folds_inner super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting) + self._error_on_convergence_failure = error_on_convergence_failure self._coef_bounds = (-1e-2, 1e2) self._coef_start_val = 1.0 From c434667ec8a668ca271d6639194807fda1ca26f6 Mon Sep 17 00:00:00 2001 From: bbd5721 Date: Mon, 29 Sep 2025 10:38:13 -0700 Subject: [PATCH 11/23] Added unbalanced dataset option, bug fixes --- doubleml/datasets.py | 34 ++++++++++----- doubleml/plm/logistic.py | 80 ++++++++++++++++++++++++++++++++++- doubleml/utils/_estimation.py | 28 ++++++------ 3 files changed, 115 insertions(+), 27 deletions(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index 629a033a..dad8b9f7 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1651,7 +1651,7 @@ def treatment_effect(d, scale=15): return resul_dict -def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', **kwargs): +def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, **kwargs): """ Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), designed for use in double/debiased machine learning applications. 
@@ -1705,16 +1705,28 @@ def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLD """ - def r_0(X): - return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ - 0.1 * X[:, 3] * X[:, 4] + \ - 0.1 * X[:, 5] ** 3 + \ - -0.5 * np.sin(X[:, 6]) ** 2 + \ - 0.5 * np.cos(X[:, 7]) + \ - 1 / (1 + X[:, 8] ** 2) + \ - -1 / (1 + np.exp(X[:, 9])) + \ - 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ - -0.25 * np.where(X[:, 12] > 0, 1, 0) + if balanced_r0: + def r_0(X): + return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ + 0.1 * X[:, 3] * X[:, 4] + \ + 0.1 * X[:, 5] ** 3 + \ + -0.5 * np.sin(X[:, 6]) ** 2 + \ + 0.5 * np.cos(X[:, 7]) + \ + 1 / (1 + X[:, 8] ** 2) + \ + -1 / (1 + np.exp(X[:, 9])) + \ + 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ + -0.25 * np.where(X[:, 12] > 0, 1, 0) + else: + def r_0(X): + return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ + 0.1 * X[:, 3] * X[:, 4] + \ + 0.1 * X[:, 5] ** 3 + \ + -0.5 * np.sin(X[:, 6]) ** 2 + \ + 0.5 * np.cos(X[:, 7]) + \ + 3 / (1 + X[:, 8] ** 2) + \ + -1 / (1 + np.exp(X[:, 9])) + \ + 0.5 * np.where(X[:, 10] > 0, 1, 0) + \ + -0.25 * np.where(X[:, 12] > 0, 1, 0) def a_0(X): return 2 / (1 + np.exp(X[:, 0])) + \ diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index 9e1bb875..7314debd 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -143,9 +143,11 @@ def __init__(self, if ml_a is not None: ml_a_is_classifier = self._check_learner(ml_a, 'ml_a', regressor=True, classifier=True) self._learner['ml_a'] = ml_a + self._ml_a_provided = True else: self._learner['ml_a'] = clone(ml_m) ml_a_is_classifier = ml_m_is_classifier + self._ml_a_provided = False self._predict_method = {'ml_t': 'predict', 'ml_M': 'predict_proba'} @@ -449,8 +451,82 @@ def _score_element_names(self): def _sensitivity_element_est(self, preds): pass - def _nuisance_tuning(self): - pass + def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, + search_mode, n_iter_randomized_search): + # TODO: test + x, y = check_X_y(self._dml_data.x, self._dml_data.y, + force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, + force_all_finite=False) + x_d_concat = np.hstack((d.reshape(-1, 1), x)) + + if scoring_methods is None: + scoring_methods = {'ml_m': None, + 'ml_M': None, + 'ml_a': None, + 'ml_t': None} + + train_inds = [train_index for (train_index, _) in smpls] + M_tune_res = _dml_tune(y, x_d_concat, train_inds, + self._learner['ml_M'], param_grids['ml_M'], scoring_methods['ml_M'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + + if self.score == 'nuisance_space': + filtered_smpls = [] + for train, test in smpls: + train_filtered = train[y[train] == 0] + filtered_smpls.append(train_filtered) + filtered_train_inds = [train_index for (train_index, _) in smpls] + elif self.score == 'instrument': + filtered_train_inds = train_inds + else: + raise NotImplementedError + m_tune_res = _dml_tune(d, x, filtered_train_inds, + self._learner['ml_m'], param_grids['ml_m'], scoring_methods['ml_m'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + + a_tune_res = _dml_tune(d, x, train_inds, + self._learner['ml_a'], param_grids['ml_a'], scoring_methods['ml_a'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + + M_best_params = [xx.best_params_ for xx in M_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] + a_best_params = [xx.best_params_ for xx in a_tune_res] + + # Create targets for tuning ml_t + M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, 
smpls=smpls, + smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=M_best_params, method=self._predict_method['ml_M'])) + + W_inner = [] + for i, (train, test) in enumerate(smpls): + M_iteration = M_hat['preds_inner'][i][train] + M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) + w = scipy.special.logit(M_iteration) + W_inner.append(w) + + t_tune_res = _dml_tune(W_inner, x, train_inds, + self._learner['ml_t'], param_grids['ml_t'], scoring_methods['ml_t'], + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + t_best_params = [xx.best_params_ for xx in t_tune_res] + + + + # Update params and tune_res to include ml_a and ml_t + params = {'ml_M': M_best_params, + 'ml_m': m_best_params, + 'ml_a': a_best_params, + 'ml_t': t_best_params} + tune_res = {'M_tune': M_tune_res, + 'm_tune': m_tune_res, + 'a_tune': a_tune_res, + 't_tune': t_tune_res} + + res = {'params': params, + 'tune_res': tune_res} + + return res @property def __smpls__inner(self): diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 6029dfd9..8086322a 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -149,25 +149,25 @@ def _dml_cv_predict(estimator, x, y, smpls=None, return res -def _dml_tune( - y, x, train_inds, learner, param_grid, scoring_method, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search -): +def _dml_tune(y, x, train_inds, + learner, param_grid, scoring_method, + n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search, fold_specific_target=False): tune_res = list() - for train_index in train_inds: + for i, train_index in enumerate(train_inds): tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True) if search_mode == "grid_search": g_grid_search = GridSearchCV(learner, param_grid, scoring=scoring_method, cv=tune_resampling, n_jobs=n_jobs_cv) else: - assert search_mode == "randomized_search" - g_grid_search = RandomizedSearchCV( - learner, - param_grid, - scoring=scoring_method, - cv=tune_resampling, - n_jobs=n_jobs_cv, - n_iter=n_iter_randomized_search, - ) - tune_res.append(g_grid_search.fit(x[train_index, :], y[train_index])) + assert search_mode == 'randomized_search' + g_grid_search = RandomizedSearchCV(learner, param_grid, + scoring=scoring_method, + cv=tune_resampling, + n_jobs=n_jobs_cv, + n_iter=n_iter_randomized_search) + if fold_specific_target: + tune_res.append(g_grid_search.fit(x[train_index, :], y[i])) + else: + tune_res.append(g_grid_search.fit(x[train_index, :], y[train_index])) return tune_res From 443d82ddcfa530f8151e47ad467bb17cddb2b0ed Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Tue, 7 Oct 2025 15:42:38 -0700 Subject: [PATCH 12/23] Added binary treatment dataset, fixed bug for model check --- doubleml/datasets.py | 11 +++++++++-- doubleml/plm/logistic.py | 3 +-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index dad8b9f7..b555b3bc 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1651,7 +1651,7 @@ def treatment_effect(d, scale=15): return resul_dict -def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, **kwargs): +def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, treatment="continuous", **kwargs): """ Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), designed for use in double/debiased machine learning applications. 
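The hunk below maps `a_0(x)` to the treatment according to the new `treatment` argument. A rough way to see the difference between the two binary modes is to compare the resulting treatment shares; the sketch below is illustrative only, and, as the model-check fix in this patch implies, a binary `d` means `DoubleMLLogit` then expects a classifier for `ml_m`.

    # Sketch: 'binary' centres a_0(x) before applying expit, so d is roughly balanced,
    # while 'binary_unbalanced' skips the centring and yields a skewed treatment share.
    import numpy as np
    import doubleml as dml

    np.random.seed(42)
    for mode in ("binary", "binary_unbalanced"):
        x, y, d, p = dml.datasets.make_logistic_LZZ2020(
            n_obs=5000, alpha=0.5, return_type="array", treatment=mode)
        print(f"treatment={mode!r}: mean(d) = {d.mean():.3f}")
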
@@ -1745,7 +1745,14 @@ def a_0(X): x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs) np.clip(x, -2, 2, out=x) - d = a_0(x) + if treatment == "continuous": + d = a_0(x) + elif treatment == "binary": + d_cont = a_0(x) + d = np.random.binomial(1, expit(d_cont - d_cont.mean())) + elif treatment == "binary_unbalanced": + d_cont = a_0(x) + d = np.random.binomial(1, expit(d_cont)) p = expit(alpha * d[:] + r_0(x)) diff --git a/doubleml/plm/logistic.py b/doubleml/plm/logistic.py index 7314debd..3e21cbf0 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/logistic.py @@ -1,7 +1,6 @@ import inspect import numpy as np -from torch.sparse import sampled_addmm from doubleml.utils._estimation import ( _dml_cv_predict, @@ -134,7 +133,7 @@ def __init__(self, _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) - if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): + if np.array_equal(np.unique(obj_dml_data.d), [0, 1]): ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=False, classifier=True) else: ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=False) From 774c74dfb98d7cb3b461bd962a0f37b74fce3257 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Tue, 7 Oct 2025 15:45:10 -0700 Subject: [PATCH 13/23] Adjusted dataset balancing --- doubleml/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doubleml/datasets.py b/doubleml/datasets.py index b555b3bc..6d9acfc8 100644 --- a/doubleml/datasets.py +++ b/doubleml/datasets.py @@ -1723,9 +1723,9 @@ def r_0(X): 0.1 * X[:, 5] ** 3 + \ -0.5 * np.sin(X[:, 6]) ** 2 + \ 0.5 * np.cos(X[:, 7]) + \ - 3 / (1 + X[:, 8] ** 2) + \ + 4 / (1 + X[:, 8] ** 2) + \ -1 / (1 + np.exp(X[:, 9])) + \ - 0.5 * np.where(X[:, 10] > 0, 1, 0) + \ + 1.5 * np.where(X[:, 10] > 0, 1, 0) + \ -0.25 * np.where(X[:, 12] > 0, 1, 0) def a_0(X): From 9695820f2cefa6bd1b63659fcca96e9f6f6a805a Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Oct 2025 13:54:16 -0700 Subject: [PATCH 14/23] Renamed Logistic to LPLR Added test set-up --- doubleml/__init__.py | 4 +- doubleml/plm/__init__.py | 3 +- doubleml/plm/datasets/__init__.py | 2 + doubleml/plm/datasets/dgp_lplr_LZZ2020.py | 139 +++++++++ doubleml/plm/{logistic.py => lplr.py} | 213 +++---------- doubleml/plm/tests/_utils_lplr_manual.py | 335 +++++++++++++++++++++ doubleml/plm/tests/test_lplr.py | 105 +++++++ doubleml/plm/tests/test_lplr_exceptions.py | 293 ++++++++++++++++++ doubleml/plm/tests/test_lplr_tune.py | 227 ++++++++++++++ 9 files changed, 1155 insertions(+), 166 deletions(-) create mode 100644 doubleml/plm/datasets/dgp_lplr_LZZ2020.py rename doubleml/plm/{logistic.py => lplr.py} (69%) create mode 100644 doubleml/plm/tests/_utils_lplr_manual.py create mode 100644 doubleml/plm/tests/test_lplr.py create mode 100644 doubleml/plm/tests/test_lplr_exceptions.py create mode 100644 doubleml/plm/tests/test_lplr_tune.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index ba59a07e..7c8ead97 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -13,7 +13,7 @@ from .irm.pq import DoubleMLPQ from .irm.qte import DoubleMLQTE from .irm.ssm import DoubleMLSSM -from doubleml.plm.logistic import DoubleMLLogit +from doubleml.plm.lplr import DoubleMLLPLR from .plm.pliv import DoubleMLPLIV from .plm.plr import DoubleMLPLR @@ -45,7 +45,7 @@ "DoubleMLBLP", "DoubleMLPolicyTree", "DoubleMLSSM", - "DoubleMLLogit", + "DoubleMLLPLR", ] __version__ = 
importlib.metadata.version("doubleml") diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py index 88ff26a8..37262ed9 100644 --- a/doubleml/plm/__init__.py +++ b/doubleml/plm/__init__.py @@ -4,9 +4,10 @@ from .pliv import DoubleMLPLIV from .plr import DoubleMLPLR +from .lplr import DoubleMLLPLR __all__ = [ "DoubleMLPLR", "DoubleMLPLIV", - "DoubleMLLogit" + "DoubleMLLPLR" ] diff --git a/doubleml/plm/datasets/__init__.py b/doubleml/plm/datasets/__init__.py index b2bb7df0..5f433ae7 100644 --- a/doubleml/plm/datasets/__init__.py +++ b/doubleml/plm/datasets/__init__.py @@ -8,6 +8,7 @@ from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021 from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 from .dgp_plr_turrell2018 import make_plr_turrell2018 +from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020 __all__ = [ "make_plr_CCDDHNR2018", @@ -15,5 +16,6 @@ "make_confounded_plr_data", "make_pliv_CHS2015", "make_pliv_multiway_cluster_CKMS2021", + "make_lplr_LZZ2020", "_make_pliv_data", ] diff --git a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py new file mode 100644 index 00000000..007e2b91 --- /dev/null +++ b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py @@ -0,0 +1,139 @@ +import numpy as np +import pandas as pd +from scipy.special import expit + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + +def make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, treatment="continuous", **kwargs): + """ + Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), + designed for use in double/debiased machine learning applications. + + The data generating process is defined as follows: + + - Covariates \( x_i \sim \mathcal{N}(0, \Sigma) \), where \( \Sigma_{kj} = 0.7^{|j-k|} \). + - Treatment \( d_i = a_0(x_i) \). + - Propensity score \( p_i = \sigma(\alpha d_i + r_0(x_i)) \), where \( \sigma(\cdot) \) is the logistic function. + - Outcome \( y_i \sim \text{Bernoulli}(p_i) \). + + The nuisance functions are defined as: + + .. math:: + + a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\ + &+ 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2 x_{i,7} x_{i,8} - 0.2 x_{i,9} x_{i,10} \\ + + r_0(x_i) &= 0.1 x_{i,1} x_{i,2} x_{i,3} + 0.1 x_{i,4} x_{i,5} + 0.1 x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ + &+ 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ + &+ 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) + + Parameters + ---------- + n_obs : int + Number of observations to simulate. + dim_x : int + Number of covariates. + alpha : float + Value of the causal parameter. + return_type : str + Determines the return format. One of: + + - 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object. + - 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``. + - 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p). + balanced_r0 : bool, default True + If True, uses the "balanced" r_0 specification (smaller magnitude / more balanced + heterogeneity). If False, uses an "unbalanced" r_0 specification with larger + share of Y=0. 
+ treatment : {'continuous', 'binary', 'binary_unbalanced'}, default 'continuous' + Determines how the treatment d is generated from a_0(x): + - 'continuous': d = a_0(x) (continuous treatment). + - 'binary': d ~ Bernoulli( sigmoid(a_0(x) - mean(a_0(x))) ) . + - 'binary_unbalanced': d ~ Bernoulli( sigmoid(a_0(x)) ). + + **kwargs + Optional keyword arguments (currently unused in this implementation). + + Returns + ------- + Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]] + The generated data in the specified format. + + References + ---------- + Liu, Molei, Yi Zhang, and Doudou Zhou. 2021. + "Double/Debiased Machine Learning for Logistic Partially Linear Model." + The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019. + + """ + + if balanced_r0: + def r_0(X): + return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ + 0.1 * X[:, 3] * X[:, 4] + \ + 0.1 * X[:, 5] ** 3 + \ + -0.5 * np.sin(X[:, 6]) ** 2 + \ + 0.5 * np.cos(X[:, 7]) + \ + 1 / (1 + X[:, 8] ** 2) + \ + -1 / (1 + np.exp(X[:, 9])) + \ + 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ + -0.25 * np.where(X[:, 12] > 0, 1, 0) + else: + def r_0(X): + return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ + 0.1 * X[:, 3] * X[:, 4] + \ + 0.1 * X[:, 5] ** 3 + \ + -0.5 * np.sin(X[:, 6]) ** 2 + \ + 0.5 * np.cos(X[:, 7]) + \ + 4 / (1 + X[:, 8] ** 2) + \ + -1 / (1 + np.exp(X[:, 9])) + \ + 1.5 * np.where(X[:, 10] > 0, 1, 0) + \ + -0.25 * np.where(X[:, 12] > 0, 1, 0) + + def a_0(X): + return 2 / (1 + np.exp(X[:, 0])) + \ + -2 / (1 + np.exp(X[:, 1])) + \ + 1 * np.sin(X[:, 2]) + \ + 1 * np.cos(X[:, 3]) + \ + 0.5 * np.where(X[:, 4] > 0, 1, 0) + \ + -0.5 * np.where(X[:, 5] > 0, 1, 0) + \ + 0.2 * X[:, 6] * X[:, 7] + \ + -0.2 * X[:, 8] * X[:, 9] + + + sigma = np.full((dim_x, dim_x), 0.2) + np.fill_diagonal(sigma, 1) + + x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs) + np.clip(x, -2, 2, out=x) + + if treatment == "continuous": + d = a_0(x) + elif treatment == "binary": + d_cont = a_0(x) + d = np.random.binomial(1, expit(d_cont - d_cont.mean())) + elif treatment == "binary_unbalanced": + d_cont = a_0(x) + d = np.random.binomial(1, expit(d_cont)) + + p = expit(alpha * d[:] + r_0(x)) + + y = np.random.binomial(1, p) + + if return_type in _array_alias: + return x, y, d, p + elif return_type in _data_frame_alias + _dml_data_alias: + x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, p)), + columns=x_cols + ['y', 'd', 'p']) + if return_type in _data_frame_alias: + return data + else: + return DoubleMLData(data, 'y', 'd', x_cols) + else: + raise ValueError('Invalid return_type.') \ No newline at end of file diff --git a/doubleml/plm/logistic.py b/doubleml/plm/lplr.py similarity index 69% rename from doubleml/plm/logistic.py rename to doubleml/plm/lplr.py index 3e21cbf0..1ed00810 100644 --- a/doubleml/plm/logistic.py +++ b/doubleml/plm/lplr.py @@ -29,79 +29,64 @@ -class DoubleMLLogit(NonLinearScoreMixin, DoubleML): - """Double machine learning for partially linear regression models +class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): + """Double machine learning for partially logistic models (binary outcomes) Parameters ---------- - obj_dml_data : :class:`DoubleMLData` object - The :class:`DoubleMLData` object providing the data and specifying the variables for the causal model. - - ml_r : estimator implementing ``fit()`` and ``predict()`` - A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. 
- :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`\\ell_0(X) = E[Y|X]`. - - ml_m : estimator implementing ``fit()`` and ``predict()`` - A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. - :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function :math:`m_0(X) = E[D|X]`. - For binary treatment variables :math:`D` (with values 0 and 1), a classifier implementing ``fit()`` and - ``predict_proba()`` can also be specified. If :py:func:`sklearn.base.is_classifier` returns ``True``, - ``predict_proba()`` is used otherwise ``predict()``. - - ml_g : estimator implementing ``fit()`` and ``predict()`` - A machine learner implementing ``fit()`` and ``predict()`` methods (e.g. - :py:class:`sklearn.ensemble.RandomForestRegressor`) for the nuisance function - :math:`g_0(X) = E[Y - D \\theta_0|X]`. - Note: The learner `ml_g` is only required for the score ``'IV-type'``. Optionally, it can be specified and - estimated for callable scores. - - n_folds : int - Number of folds. - Default is ``5``. - - n_rep : int - Number of repetitons for the sample splitting. - Default is ``1``. - - score : str or callable - A str (``'nuisance_space'`` or ``'instrument'``) specifying the score function - or a callable object / function with signature ``psi_a, psi_b = score(y, d, l_hat, m_hat, g_hat, smpls)``. - Default is ``'partialling out'``. - - draw_sample_splitting : bool - Indicates whether the sample splitting should be drawn during initialization of the object. - Default is ``True``. + obj_dml_data : DoubleMLData + The DoubleMLData object providing the data and variable specification. + The outcome variable y must be binary with values {0, 1}. + ml_M : estimator + Classifier for M_0(D, X) = P[Y = 1 | D, X]. Must implement fit() and predict_proba(). + ml_t : estimator + Regressor for the auxiliary regression used to predict log-odds. Must implement fit() and predict(). + ml_m : estimator + Learner for m_0(X) = E[D | X]. For binary treatments a classifier with predict_proba() is expected; + for continuous treatments a regressor with predict() is expected. + ml_a : estimator, optional + Optional alternative learner for E[D | X]. If not provided, a clone of ml_m is used. + Must support the same prediction interface as ml_m. + n_folds : int, default=5 + Number of outer cross-fitting folds. + n_folds_inner : int, default=5 + Number of inner folds for nested resampling used internally. + n_rep : int, default=1 + Number of repetitions for sample splitting. + score : {'nuisance_space', 'instrument'} or callable, default='nuisance_space' + Score to use. 'nuisance_space' estimates m on subsamples with y=0; 'instrument' uses an instrument-type score. + draw_sample_splitting : bool, default=True + Whether to draw sample splitting during initialization. + error_on_convergence_failure : bool, default=False + If True, raise an error on convergence failure of score. 
Examples -------- >>> import numpy as np >>> import doubleml as dml - >>> from doubleml.datasets import make_plr_CCDDHNR2018 - >>> from sklearn.ensemble import RandomForestRegressor + >>> from doubleml.plm.datasets import make_lplr_LZZ2020 + >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier >>> from sklearn.base import clone >>> np.random.seed(3141) - >>> learner = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) - >>> ml_g = learner - >>> ml_m = learner - >>> obj_dml_data = make_plr_CCDDHNR2018(alpha=0.5, n_obs=500, dim_x=20) - >>> dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, ml_g, ml_m) - >>> dml_plr_obj.fit().summary - coef std err t P>|t| 2.5 % 97.5 % - d 0.462321 0.04107 11.256983 2.139582e-29 0.381826 0.542816 + >>> ml_t = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) + >>> ml_m = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) + >>> ml_M = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2) + >>> obj_dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=500, dim_x=20) + >>> dml_lplr_obj = dml.DoubleMLPLR(obj_dml_data, ml_M, ml_t, ml_m) + >>> dml_lplr_obj.fit().summary + coef std err t P>|t| 2.5 % 97.5 % + d 0.480691 0.040533 11.859129 1.929729e-32 0.401247 0.560135 Notes ----- - **Partially linear regression (PLR)** models take the form + **Partially logistic regression (PLR)** models take the form .. math:: - Y = D \\theta_0 + g_0(X) + \\zeta, & &\\mathbb{E}(\\zeta | D,X) = 0, - - D = m_0(X) + V, & &\\mathbb{E}(V | X) = 0, + Y = \\text{expit} ( D \\theta_0 + r_0(X)) where :math:`Y` is the outcome variable and :math:`D` is the policy variable of interest. - The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates, - and :math:`\\zeta` and :math:`V` are stochastic errors. + The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates. """ def __init__(self, @@ -122,13 +107,18 @@ def __init__(self, n_rep, score, draw_sample_splitting) + + # Ensure outcome only contains 0 and 1 (validate early in constructor) + if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): + raise TypeError("The outcome variable y must be binary with values 0 and 1.") + self._error_on_convergence_failure = error_on_convergence_failure self._coef_bounds = (-1e-2, 1e2) self._coef_start_val = 1.0 self._check_data(self._dml_data) valid_scores = ['nuisance_space', 'instrument'] - _check_score(self.score, valid_scores, allow_callable=True) + _check_score(self.score, valid_scores, allow_callable=False) _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) @@ -208,7 +198,6 @@ def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, s res['preds'][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1] else: res['preds'][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) - res["preds_inner"] res["preds"] /= len(smpls) res['targets'] = np.copy(y) return res @@ -216,7 +205,6 @@ def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, s def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): - # TODO: How to deal with smpls_inner? 
x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, @@ -278,9 +266,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa 'observed to be binary with values 0 and 1. Make sure that for classifiers ' 'probabilities and not labels are predicted.') - - - if a_external: a_hat = {'preds': external_predictions['ml_a'], 'targets': None, @@ -290,35 +275,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa n_jobs=n_jobs_cv, est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) - - # r_legacy = np.zeros_like(y) - # smpls_inner = self.__smpls__inner - # M_hat_l = {} - # a_hat_l = {} - # M_hat_l['preds_inner'] = [] - # M_hat_l['preds'] = np.full_like(y, np.nan) - # a_hat_l['preds_inner'] = [] - # a_hat_l['preds'] = np.full_like(y, np.nan) - # for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): - # test = smpls_single_split[1] - # train = smpls_single_split[0] - # # r_legacy[test] = - # Mleg, aleg, a_nf_leg = self.legacy_implementation(y[train], x[train], d[train], x[test], d[test], - # self._learner['ml_m'], self._learner['ml_M'], - # smpls_single_split, smpls_double_split, y, x, d, - # x_d_concat, n_jobs_cv) - # Mtemp = np.full_like(y, np.nan) - # Mtemp[train] = Mleg - # Atemp = np.full_like(y, np.nan) - # Atemp[train] = aleg - # M_hat_l['preds_inner'].append(Mtemp) - # a_hat_l['preds_inner'].append(Atemp) - # a_hat_l['preds'][test] = a_nf_leg - - #r_hat['preds'] = r_legacy - - - W_inner = [] beta = np.zeros(d.shape, dtype=float) @@ -366,74 +322,6 @@ def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=Fa return psi_elements, preds - - def legacy_implementation(self, Yfold: np.ndarray, Xfold: np.ndarray, Afold: np.ndarray, XnotFold: np.ndarray, AnotFold: np.ndarray, - learner, learnerClassifier, smpls_single_split, smpls_double_split, yfull, xfull, afull, x_d_concat, n_jobs_cv, noFolds: int = 5, seed=None, )-> (np.ndarray, np.ndarray, np.ndarray): - - def learn_predict(X, Y, Xpredict, learner, learnerClassifier, fit_args={}): - results = [] - if len(np.unique(Y)) == 2: - learnerClassifier.fit(X, Y, **fit_args) - for x in Xpredict: - results.append(learnerClassifier.predict_proba(x)[:, 1]) - else: - learner.fit(X, Y, **fit_args) - for x in Xpredict: - results.append(learner.predict(x)) - return (*results,) - - nFold = len(Yfold) - i = np.remainder(np.arange(nFold), noFolds) - np.random.default_rng(seed).shuffle(i) - - M = np.zeros((nFold)) - a_hat = np.zeros((nFold)) - a_hat_notFold = np.zeros((len(XnotFold))) - M_notFold = np.zeros((len(XnotFold))) - loss = {} - - a_hat_inner = _dml_cv_predict(self._learner['ml_a'], xfull, afull, smpls=smpls_double_split, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'], - return_models=True, smpls_is_partition=True) - _check_finite_predictions(a_hat_inner['preds'], self._learner['ml_a'], 'ml_a', smpls_double_split) - a_hat_notFold = np.full_like(yfull, 0.) 
- for model in a_hat_inner['models']: - if self._predict_method['ml_a'] == 'predict_proba': - a_hat_notFold[smpls_single_split[1]] += model.predict_proba(xfull[smpls_single_split[1]])[:, 1] - else: - a_hat_notFold[smpls_single_split[1]] += model.predict(xfull[smpls_single_split[1]]) - - M_hat = _dml_cv_predict(self._learner['ml_M'], x_d_concat, yfull, smpls=smpls_double_split, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'], - return_models=True, smpls_is_partition=True) - _check_finite_predictions(M_hat['preds'], self._learner['ml_M'], 'ml_M', smpls_double_split) - - M = M_hat['preds'][~np.isnan(M_hat['preds'])] - a_hat = a_hat_inner['preds'][~np.isnan(a_hat_inner['preds'])] - a_hat_notFold = a_hat_notFold[smpls_single_split[1]] - - np.clip(M, 1e-8, 1 - 1e-8, out=M) -# loss["M"] = compute_loss(Yfold, M) -# loss["a_hat"] = compute_loss(Afold, a_hat) - a_hat_notFold /= noFolds - # M_notFold /= noFolds - np.clip(M_notFold, 1e-8, 1 - 1e-8, out=M_notFold) - - # Obtain preliminary estimate of beta based on M and residual of a - W = scipy.special.logit(M) - A_resid = Afold - a_hat - beta_notFold = sum(A_resid * W) / sum(A_resid ** 2) - # print(beta_notFold) - t_notFold, = learn_predict(Xfold, W, [XnotFold], learner, learnerClassifier) - W_notFold = scipy.special.expit(M_notFold) -# loss["t"] = compute_loss(W_notFold, t_notFold) - - - # Compute r based on estimates for W=logit(M), beta and residual of A - r_notFold = t_notFold - beta_notFold * a_hat_notFold - - return M, a_hat, a_hat_notFold #r_notFold #, a_hat_notFold, M_notFold, t_notFold - def _score_elements(self, y, d, r_hat, m_hat): # compute residual d_tilde = d - m_hat @@ -470,12 +358,11 @@ def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_ self._learner['ml_M'], param_grids['ml_M'], scoring_methods['ml_M'], n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + filtered_train_inds = [] if self.score == 'nuisance_space': - filtered_smpls = [] for train, test in smpls: train_filtered = train[y[train] == 0] - filtered_smpls.append(train_filtered) - filtered_train_inds = [train_index for (train_index, _) in smpls] + filtered_train_inds.append(train_filtered) elif self.score == 'instrument': filtered_train_inds = train_inds else: @@ -553,7 +440,7 @@ def draw_sample_splitting(self): return self def set_sample_splitting(self): - raise NotImplementedError('set_sample_splitting is not implemented for DoubleMLLogit.') + raise NotImplementedError('set_sample_splitting is not implemented for DoubleMLLPLR.') def _compute_score(self, psi_elements, coef): @@ -577,4 +464,4 @@ def _compute_score_deriv(self, psi_elements, coef, inds=None): else: raise NotImplementedError - return deriv \ No newline at end of file + return deriv diff --git a/doubleml/plm/tests/_utils_lplr_manual.py b/doubleml/plm/tests/_utils_lplr_manual.py new file mode 100644 index 00000000..f14a1f66 --- /dev/null +++ b/doubleml/plm/tests/_utils_lplr_manual.py @@ -0,0 +1,335 @@ +import numpy as np +from sklearn.base import clone +from sklearn.model_selection import train_test_split + +from ...tests._utils import fit_predict, fit_predict_proba, tune_grid_search +from ...utils._estimation import _predict_zero_one_propensity +from ...utils._propensity_score import _trimm + + +def fit_selection( + y, + x, + d, + z, + s, + learner_g, + learner_pi, + learner_m, + all_smpls, + score, + trimming_rule="truncate", + trimming_threshold=1e-2, + normalize_ipw=True, + n_rep=1, + g_d0_params=None, + g_d1_params=None, 
+ pi_params=None, + m_params=None, +): + n_obs = len(y) + + thetas = np.zeros(n_rep) + ses = np.zeros(n_rep) + + all_g_d1_hat = list() + all_g_d0_hat = list() + all_pi_hat = list() + all_m_hat = list() + + all_psi_a = list() + all_psi_b = list() + + for i_rep in range(n_rep): + smpls = all_smpls[i_rep] + + g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list = fit_nuisance_selection( + y, + x, + d, + z, + s, + learner_g, + learner_pi, + learner_m, + smpls, + score, + trimming_rule=trimming_rule, + trimming_threshold=trimming_threshold, + g_d0_params=g_d0_params, + g_d1_params=g_d1_params, + pi_params=pi_params, + m_params=m_params, + ) + all_g_d1_hat.append(g_hat_d1_list) + all_g_d0_hat.append(g_hat_d0_list) + all_pi_hat.append(pi_hat_list) + all_m_hat.append(m_hat_list) + + g_hat_d1, g_hat_d0, pi_hat, m_hat = compute_selection(y, g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list, smpls) + + dtreat = d == 1 + dcontrol = d == 0 + psi_a, psi_b = selection_score_elements(dtreat, dcontrol, g_hat_d1, g_hat_d0, pi_hat, m_hat, s, y, normalize_ipw) + + all_psi_a.append(psi_a) + all_psi_b.append(psi_b) + + thetas[i_rep], ses[i_rep] = selection_dml2(psi_a, psi_b) + + theta = np.median(thetas) + se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) + + res = { + "theta": theta, + "se": se, + "thetas": thetas, + "ses": ses, + "all_g_d1_hat": all_g_d1_hat, + "all_g_d0_hat": all_g_d0_hat, + "all_pi_hat": all_pi_hat, + "all_m_hat": all_m_hat, + "all_psi_a": all_psi_a, + "all_psi_b": all_psi_b, + } + + return res + + +def fit_nuisance_selection( + y, + x, + d, + z, + s, + learner_g, + learner_pi, + learner_m, + smpls, + score, + trimming_rule="truncate", + trimming_threshold=1e-2, + g_d0_params=None, + g_d1_params=None, + pi_params=None, + m_params=None, +): + ml_g_d1 = clone(learner_g) + ml_g_d0 = clone(learner_g) + ml_pi = clone(learner_pi) + ml_m = clone(learner_m) + + if z is None: + dx = np.column_stack((d, x)) + else: + dx = np.column_stack((d, x, z)) + + if score == "missing-at-random": + pi_hat_list = fit_predict_proba(s, dx, ml_pi, pi_params, smpls, trimming_threshold=trimming_threshold) + + m_hat_list = fit_predict_proba(d, x, ml_m, m_params, smpls) + + train_cond_d1_s1 = np.intersect1d(np.where(d == 1)[0], np.where(s == 1)[0]) + g_hat_d1_list = fit_predict(y, x, ml_g_d1, g_d1_params, smpls, train_cond=train_cond_d1_s1) + + train_cond_d0_s1 = np.intersect1d(np.where(d == 0)[0], np.where(s == 1)[0]) + g_hat_d0_list = fit_predict(y, x, ml_g_d0, g_d0_params, smpls, train_cond=train_cond_d0_s1) + else: + # initialize empty lists + g_hat_d1_list = [] + g_hat_d0_list = [] + pi_hat_list = [] + m_hat_list = [] + + # create strata for splitting + strata = d.reshape(-1, 1) + 2 * s.reshape(-1, 1) + + # POTENTIAL OUTCOME Y(1) + for i_fold, _ in enumerate(smpls): + ml_g_d1 = clone(learner_g) + ml_pi = clone(learner_pi) + ml_m = clone(learner_m) + + # set the params for the nuisance learners + if g_d1_params is not None: + ml_g_d1.set_params(**g_d1_params[i_fold]) + if g_d0_params is not None: + ml_g_d0.set_params(**g_d0_params[i_fold]) + if pi_params is not None: + ml_pi.set_params(**pi_params[i_fold]) + if m_params is not None: + ml_m.set_params(**m_params[i_fold]) + + train_inds = smpls[i_fold][0] + test_inds = smpls[i_fold][1] + + # start nested crossfitting + train_inds_1, train_inds_2 = train_test_split( + train_inds, test_size=0.5, random_state=42, stratify=strata[train_inds] + ) + + s_train_1 = s[train_inds_1] + dx_train_1 = dx[train_inds_1, :] + + # preliminary 
propensity score for selection + ml_pi_prelim = clone(ml_pi) + # fit on first part of training set + ml_pi_prelim.fit(dx_train_1, s_train_1) + pi_hat_prelim = _predict_zero_one_propensity(ml_pi_prelim, dx) + + # predictions for small pi in denominator + pi_hat = pi_hat_prelim[test_inds] + + # add selection indicator to covariates + xpi = np.column_stack((x, pi_hat_prelim)) + + # estimate propensity score p using the second training sample + xpi_train_2 = xpi[train_inds_2, :] + d_train_2 = d[train_inds_2] + xpi_test = xpi[test_inds, :] + + ml_m.fit(xpi_train_2, d_train_2) + + m_hat = _predict_zero_one_propensity(ml_m, xpi_test) + + # estimate conditional outcome on second training sample -- treatment + s1_d1_train_2_indices = np.intersect1d(np.where(d == 1)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) + xpi_s1_d1_train_2 = xpi[s1_d1_train_2_indices, :] + y_s1_d1_train_2 = y[s1_d1_train_2_indices] + + ml_g_d1.fit(xpi_s1_d1_train_2, y_s1_d1_train_2) + + # predict conditional outcome + g_hat_d1 = ml_g_d1.predict(xpi_test) + + # estimate conditional outcome on second training sample -- control + s1_d0_train_2_indices = np.intersect1d(np.where(d == 0)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) + xpi_s1_d0_train_2 = xpi[s1_d0_train_2_indices, :] + y_s1_d0_train_2 = y[s1_d0_train_2_indices] + + ml_g_d0.fit(xpi_s1_d0_train_2, y_s1_d0_train_2) + + # predict conditional outcome + g_hat_d0 = ml_g_d0.predict(xpi_test) + + m_hat = _trimm(m_hat, trimming_rule, trimming_threshold) + + # append predictions on test sample to final list of predictions + g_hat_d1_list.append(g_hat_d1) + g_hat_d0_list.append(g_hat_d0) + pi_hat_list.append(pi_hat) + m_hat_list.append(m_hat) + + return g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list + + +def compute_selection(y, g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list, smpls): + g_hat_d1 = np.full_like(y, np.nan, dtype="float64") + g_hat_d0 = np.full_like(y, np.nan, dtype="float64") + pi_hat = np.full_like(y, np.nan, dtype="float64") + m_hat = np.full_like(y, np.nan, dtype="float64") + + for idx, (_, test_index) in enumerate(smpls): + g_hat_d1[test_index] = g_hat_d1_list[idx] + g_hat_d0[test_index] = g_hat_d0_list[idx] + pi_hat[test_index] = pi_hat_list[idx] + m_hat[test_index] = m_hat_list[idx] + + return g_hat_d1, g_hat_d0, pi_hat, m_hat + + +def selection_score_elements(dtreat, dcontrol, g_d1, g_d0, pi, m, s, y, normalize_ipw): + # psi_a + psi_a = -1 * np.ones_like(y) + + # psi_b + if normalize_ipw: + weight_treat = sum(dtreat) / sum((dtreat * s) / (m * pi)) + weight_control = sum(dcontrol) / sum((dcontrol * s) / ((1 - m) * pi)) + + psi_b1 = weight_treat * ((dtreat * s * (y - g_d1)) / (m * pi)) + g_d1 + psi_b0 = weight_control * ((dcontrol * s * (y - g_d0)) / ((1 - m) * pi)) + g_d0 + + else: + psi_b1 = (dtreat * s * (y - g_d1)) / (m * pi) + g_d1 + psi_b0 = (dcontrol * s * (y - g_d0)) / ((1 - m) * pi) + g_d0 + + psi_b = psi_b1 - psi_b0 + + return psi_a, psi_b + + +def selection_dml2(psi_a, psi_b): + n_obs = len(psi_a) + theta_hat = -np.mean(psi_b) / np.mean(psi_a) + se = np.sqrt(var_selection(theta_hat, psi_a, psi_b, n_obs)) + + return theta_hat, se + + +def var_selection(theta, psi_a, psi_b, n_obs): + J = np.mean(psi_a) + var = 1 / n_obs * np.mean(np.power(np.multiply(psi_a, theta) + psi_b, 2)) / np.power(J, 2) + return var + + +def tune_nuisance_ssm_mar(y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, param_grid_g, param_grid_pi, param_grid_m): + d0_s1 = np.intersect1d(np.where(d == 0)[0], np.where(s == 1)[0]) + d1_s1 = 
np.intersect1d(np.where(d == 1)[0], np.where(s == 1)[0]) + + g0_tune_res = tune_grid_search(y, x, ml_g, smpls, param_grid_g, n_folds_tune, train_cond=d0_s1) + g1_tune_res = tune_grid_search(y, x, ml_g, smpls, param_grid_g, n_folds_tune, train_cond=d1_s1) + + dx = np.column_stack((x, d)) + + pi_tune_res = tune_grid_search(s, dx, ml_pi, smpls, param_grid_pi, n_folds_tune) + + m_tune_res = tune_grid_search(d, x, ml_m, smpls, param_grid_m, n_folds_tune) + + g0_best_params = [xx.best_params_ for xx in g0_tune_res] + g1_best_params = [xx.best_params_ for xx in g1_tune_res] + pi_best_params = [xx.best_params_ for xx in pi_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] + + return g0_best_params, g1_best_params, pi_best_params, m_best_params + + +def tune_nuisance_ssm_nonignorable( + y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, param_grid_g, param_grid_pi, param_grid_m +): + + train_inds = [tr for (tr, _) in smpls] + + inner0_list, inner1_list = [], [] + for tr in train_inds: + i0, i1 = train_test_split(tr, test_size=0.5, stratify=d[tr] + 2 * s[tr], random_state=42) + inner0_list.append(i0) + inner1_list.append(i1) + + X_dz = np.c_[x, d.reshape(-1, 1), z.reshape(-1, 1)] + pi_tune_res = tune_grid_search(s, X_dz, ml_pi, [(i0, np.array([])) for i0 in inner0_list], param_grid_pi, n_folds_tune) + pi_best_params = [gs.best_params_ for gs in pi_tune_res] + + pi_hat_full = np.full_like(s, np.nan, dtype=float) + for i0, i1, gs in zip(inner0_list, inner1_list, pi_tune_res): + ml_pi_temp = clone(ml_pi) + ml_pi_temp.set_params(**gs.best_params_) + ml_pi_temp.fit(X_dz[i0], s[i0]) + ph = _predict_zero_one_propensity(ml_pi_temp, X_dz) + pi_hat_full[i1] = ph[i1] + + X_pi = np.c_[x, pi_hat_full] + m_tune_res = tune_grid_search(d, X_pi, ml_m, [(i1, np.array([])) for i1 in inner1_list], param_grid_m, n_folds_tune) + m_best_params = [gs.best_params_ for gs in m_tune_res] + + X_pi_d = np.c_[x, d.reshape(-1, 1), pi_hat_full.reshape(-1, 1)] + inner1_d0_s1 = [i1[(d[i1] == 0) & (s[i1] == 1)] for i1 in inner1_list] + inner1_d1_s1 = [i1[(d[i1] == 1) & (s[i1] == 1)] for i1 in inner1_list] + + g0_tune_res = tune_grid_search(y, X_pi_d, ml_g, [(idx, np.array([])) for idx in inner1_d0_s1], param_grid_g, n_folds_tune) + g1_tune_res = tune_grid_search(y, X_pi_d, ml_g, [(idx, np.array([])) for idx in inner1_d1_s1], param_grid_g, n_folds_tune) + + g0_best_params = [gs.best_params_ for gs in g0_tune_res] + g1_best_params = [gs.best_params_ for gs in g1_tune_res] + + return g0_best_params, g1_best_params, pi_best_params, m_best_params diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py new file mode 100644 index 00000000..c561d9fe --- /dev/null +++ b/doubleml/plm/tests/test_lplr.py @@ -0,0 +1,105 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.linear_model import LassoCV, LogisticRegressionCV + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_ssm_manual import fit_selection + + +@pytest.fixture(scope="module", params=[[LassoCV(), LogisticRegressionCV(penalty="l1", solver="liblinear")]]) +def learner(request): + return request.param + + +@pytest.fixture(scope="module", params=["missing-at-random", "nonignorable"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope="module", params=[0.01]) +def trimming_threshold(request): + return request.param + + 
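For orientation (editorial note, not part of the patch): the `normalize_ipw` fixture above toggles the option that `fit_selection` passes through to `selection_score_elements`, where it rescales the inverse-probability weights so they average to one within the treated and the control group (a Hajek-type normalization). A minimal sketch of the two score-element variants, using hypothetical toy inputs and the formulas from `selection_score_elements`:

    # Toy illustration (hypothetical values, not part of the patch):
    # unnormalized vs. normalized IPW terms of psi_b.
    import numpy as np

    rng = np.random.default_rng(0)
    n = 8
    y = rng.normal(size=n)
    d = np.array([1, 1, 1, 1, 0, 0, 0, 0])   # treatment indicator
    s = np.ones(n)                            # selection indicator (all observed)
    m = np.full(n, 0.5)                       # treatment propensity m(X)
    pi = np.full(n, 0.8)                      # selection probability pi(D, X)
    g1 = np.zeros(n)                          # toy outcome regressions g(1, X)
    g0 = np.zeros(n)                          # toy outcome regressions g(0, X)

    dtreat, dcontrol = d == 1, d == 0

    # unnormalized score component
    psi_b_raw = (dtreat * s * (y - g1)) / (m * pi) + g1 \
        - ((dcontrol * s * (y - g0)) / ((1 - m) * pi) + g0)

    # normalized (Hajek-type) version: rescale the IPW terms per group
    w1 = dtreat.sum() / ((dtreat * s) / (m * pi)).sum()
    w0 = dcontrol.sum() / ((dcontrol * s) / ((1 - m) * pi)).sum()
    psi_b_norm = w1 * (dtreat * s * (y - g1)) / (m * pi) + g1 \
        - (w0 * (dcontrol * s * (y - g0)) / ((1 - m) * pi) + g0)

In both variants `psi_a = -1`, so the per-repetition estimate in `selection_dml2` is simply the mean of `psi_b`; the fixture below exercises both settings against the `DoubleMLSSM` fit.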
+@pytest.fixture(scope="module") +def dml_selection_fixture( + generate_data_selection_mar, generate_data_selection_nonignorable, learner, score, trimming_threshold, normalize_ipw +): + n_folds = 3 + + # collect data + np.random.seed(42) + if score == "missing-at-random": + (x, y, d, z, s) = generate_data_selection_mar + else: + (x, y, d, z, s) = generate_data_selection_nonignorable + + ml_g = clone(learner[0]) + ml_pi = clone(learner[1]) + ml_m = clone(learner[1]) + + np.random.seed(42) + n_obs = len(y) + all_smpls = draw_smpls(n_obs, n_folds) + + np.random.seed(42) + if score == "missing-at-random": + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s) + dml_sel_obj = dml.DoubleMLSSM(obj_dml_data, ml_g, ml_pi, ml_m, n_folds=n_folds, score=score) + else: + assert score == "nonignorable" + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=z, s=s) + dml_sel_obj = dml.DoubleMLSSM(obj_dml_data, ml_g, ml_pi, ml_m, n_folds=n_folds, score=score) + + np.random.seed(42) + dml_sel_obj.set_sample_splitting(all_smpls=all_smpls) + dml_sel_obj.fit() + + np.random.seed(42) + res_manual = fit_selection( + y, + x, + d, + z, + s, + clone(learner[0]), + clone(learner[1]), + clone(learner[1]), + all_smpls, + score, + trimming_rule="truncate", + trimming_threshold=trimming_threshold, + normalize_ipw=normalize_ipw, + ) + + res_dict = { + "coef": dml_sel_obj.coef[0], + "coef_manual": res_manual["theta"], + "se": dml_sel_obj.se[0], + "se_manual": res_manual["se"], + } + + # sensitivity tests + # TODO + + return res_dict + + +@pytest.mark.ci +def test_dml_selection_coef(dml_selection_fixture): + assert math.isclose(dml_selection_fixture["coef"], dml_selection_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-2) + + +@pytest.mark.ci +def test_dml_selection_se(dml_selection_fixture): + assert math.isclose(dml_selection_fixture["se"], dml_selection_fixture["se_manual"], rel_tol=1e-9, abs_tol=5e-2) diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py new file mode 100644 index 00000000..4361e7c7 --- /dev/null +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -0,0 +1,293 @@ +import numpy as np +import pandas as pd +import pytest +from sklearn.base import BaseEstimator +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import Lasso, LogisticRegression + +from doubleml import DoubleMLLPLR +from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData +from doubleml.plm.datasets import make_lplr_LZZ2020 + +np.random.seed(3141) +n = 100 +# create test data and basic learners +dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=10) +ml_M = RandomForestClassifier() +ml_t = RandomForestRegressor() +ml_m = RandomForestRegressor() +dml_lplr = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) +dml_lplr_instrument = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="instrument") + +@pytest.mark.ci +def test_lplr_exception_data(): + msg = ( + r"The data must be of DoubleMLData type\. .* of type " + r" was passed\." 
+ ) + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(pd.DataFrame(), ml_M, ml_t, ml_m) + + dml_data_nb = make_lplr_LZZ2020(alpha=0.5, n_obs=50, dim_x=5) + dml_data_nb.data[dml_data_nb.y_col] = dml_data_nb.data[dml_data_nb.y_col] + 1 + dml_data_nb._set_y_z() + with pytest.raises(TypeError, match="The outcome variable y must be binary with values 0 and 1."): + _ = DoubleMLLPLR(dml_data_nb, ml_M, ml_t, ml_m) + + +@pytest.mark.ci +def test_lplr_exception_scores(): + # LPLR valid scores are 'nuisance_space' and 'instrument' + msg = "Invalid score MAR" + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="MAR") + msg = "score should be string. 0 was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score=0) + +@pytest.mark.ci +def test_ssm_exception_resampling(): + msg = "The number of folds must be of int type. 1.5 of type was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_folds=1.5) + + msg = "The number of repetitions for the sample splitting must be of int type. 1.5 of type was passed." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_rep=1.5) + + msg = "The number of folds must be positive. 0 was passed." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_folds=0) + + msg = "The number of repetitions for the sample splitting must be positive. 0 was passed." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_rep=0) + + msg = "draw_sample_splitting must be True or False. Got true." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, draw_sample_splitting="true") + + +@pytest.mark.ci +def test_lplr_exception_get_params(): + msg = "Invalid nuisance learner ml_x. Valid nuisance learner ml_M or ml_g_t or ml_m or ml_a." + with pytest.raises(ValueError, match=msg): + dml_lplr.get_params("ml_x") + +@pytest.mark.ci +def test_lplr_exception_smpls(): + msg = ( + "Sample splitting not specified. " + r"Either draw samples via .draw_sample splitting\(\) or set external samples via .set_sample_splitting\(\)." + ) + dml_plr_no_smpls = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, draw_sample_splitting=False) + with pytest.raises(ValueError, match=msg): + _ = dml_plr_no_smpls.smpls + +@pytest.mark.ci +def test_lplr_exception_fit(): + msg = "The number of CPUs used to fit the learners must be of int type. 5 of type was passed." + with pytest.raises(TypeError, match=msg): + dml_lplr.fit(n_jobs_cv="5") + msg = "store_predictions must be True or False. Got 1." + with pytest.raises(TypeError, match=msg): + dml_lplr.fit(store_predictions=1) + msg = "store_models must be True or False. Got 1." + with pytest.raises(TypeError, match=msg): + dml_lplr.fit(store_models=1) + +@pytest.mark.ci +def test_lplr_exception_bootstrap(): + dml_lplr_boot = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) + msg = r"Apply fit\(\) before bootstrap\(\)." + with pytest.raises(ValueError, match=msg): + dml_lplr_boot.bootstrap() + + dml_lplr_boot.fit() + msg = 'Method must be "Bayes", "normal" or "wild". Got Gaussian.' + with pytest.raises(ValueError, match=msg): + dml_lplr_boot.bootstrap(method="Gaussian") + msg = "The number of bootstrap replications must be of int type. 500 of type was passed." 
+ with pytest.raises(TypeError, match=msg): + dml_lplr_boot.bootstrap(n_rep_boot="500") + msg = "The number of bootstrap replications must be positive. 0 was passed." + with pytest.raises(ValueError, match=msg): + dml_lplr_boot.bootstrap(n_rep_boot=0) + + +@pytest.mark.ci +def test_lplr_exception_confint(): + dml_lplr_conf = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) + msg = r"Apply fit\(\) before confint\(\)." + with pytest.raises(ValueError, match=msg): + dml_lplr_conf.confint() + dml_lplr_conf.fit() + + msg = "joint must be True or False. Got 1." + with pytest.raises(TypeError, match=msg): + dml_lplr_conf.confint(joint=1) + msg = "The confidence level must be of float type. 5% of type was passed." + with pytest.raises(TypeError, match=msg): + dml_lplr_conf.confint(level="5%") + msg = r"The confidence level must be in \(0,1\). 0.0 was passed." + with pytest.raises(ValueError, match=msg): + dml_lplr_conf.confint(level=0.0) + + msg = r"Apply bootstrap\(\) before confint\(joint=True\)." + with pytest.raises(ValueError, match=msg): + dml_lplr_conf.confint(joint=True) + dml_lplr_conf.bootstrap() + df_lplr_ci = dml_lplr_conf.confint(joint=True) + assert isinstance(df_lplr_ci, pd.DataFrame) + + +@pytest.mark.ci +def test_lplr_exception_set_ml_nuisance_params(): + # invalid learner name + msg = "Invalid nuisance learner g. Valid nuisance learner ml_M or ml_t or ml_m or ml_a." + with pytest.raises(ValueError, match=msg): + dml_lplr.set_ml_nuisance_params("g", "d", {"alpha": 0.1}) + # invalid treatment variable + msg = "Invalid treatment variable y. Valid treatment variable d." + with pytest.raises(ValueError, match=msg): + dml_lplr.set_ml_nuisance_params("ml_M", "y", {"alpha": 0.1}) + + +class _DummyNoSetParams: + def fit(self): + pass + + +class _DummyNoGetParams(_DummyNoSetParams): + def set_params(self): + pass + + +class _DummyNoClassifier(_DummyNoGetParams): + def get_params(self): + pass + + def predict_proba(self): + pass + + +class LogisticRegressionManipulatedType(LogisticRegression): + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.estimator_type = None + return tags + + +@pytest.mark.ci +@pytest.mark.filterwarnings( + r"ignore:.*is \(probably\) neither a regressor nor a classifier.*:UserWarning", +) +def test_lplr_exception_learner(): + err_msg_prefix = "Invalid learner provided for ml_t: " + + msg = err_msg_prefix + "provide an instance of a learner instead of a class." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, Lasso, ml_m) + msg = err_msg_prefix + r"BaseEstimator\(\) has no method .fit\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, BaseEstimator(), ml_m) + msg = r"has no method .set_params\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, _DummyNoSetParams(), ml_m) + msg = r"has no method .get_params\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, _DummyNoGetParams(), ml_m) + + # ml_m may not be a classifier when treatment is not binary + msg = ( + r"The ml_m learner LogisticRegression\(\) was identified as classifier " + r"but at least one treatment variable is not binary with values 0 and 1\." 
+ ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, LogisticRegression()) + + # construct a classifier which is not identifiable as classifier via is_classifier by sklearn + log_reg = LogisticRegressionManipulatedType() + # TODO(0.11) can be removed if the sklearn dependency is bumped to 1.6.0 + log_reg._estimator_type = None + msg = ( + r"Learner provided for ml_m is probably invalid: LogisticRegressionManipulatedType\(\) is \(probably\) " + r"no classifier\." + ) + with pytest.warns(UserWarning, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, log_reg) + + +@pytest.mark.ci +@pytest.mark.filterwarnings( + r"ignore:.*is \(probably\) neither a regressor nor a classifier.*:UserWarning", + r"ignore: Learner provided for ml_m is probably invalid.*is \(probably\) no classifier.*:UserWarning", +) +def test_lplr_exception_and_warning_learner(): + # invalid ml_M (must be a classifier with predict_proba) + with pytest.raises(TypeError): + _ = DoubleMLLPLR(dml_data, _DummyNoClassifier(), ml_t, ml_m) + msg = "Invalid learner provided for ml_M: " + r"Lasso\(\) has no method .predict_proba\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, Lasso(), ml_t, ml_m) + + +class LassoWithNanPred(Lasso): + def predict(self, X): + preds = super().predict(X) + n_obs = len(preds) + preds[np.random.randint(0, n_obs, 1)] = np.nan + return preds + + +class LassoWithInfPred(Lasso): + def predict(self, X): + preds = super().predict(X) + n_obs = len(preds) + preds[np.random.randint(0, n_obs, 1)] = np.inf + return preds + + +@pytest.mark.ci +def test_lplr_nan_prediction(): + msg = r"Predictions from learner LassoWithNanPred\(\) for ml_t are not finite." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, LassoWithNanPred(), ml_m).fit() + msg = r"Predictions from learner LassoWithInfPred\(\) for ml_t are not finite." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, LassoWithInfPred(), ml_m).fit() + + +@pytest.mark.ci +def test_double_ml_exception_evaluate_learner(): + dml_lplr_obj = DoubleMLLPLR( + dml_data, + ml_M=LogisticRegression(), + ml_t=Lasso(), + ml_m=RandomForestRegressor(), + n_folds=5, + score="nuisance_space", + ) + + msg = r"Apply fit\(\) before evaluate_learners\(\)." + with pytest.raises(ValueError, match=msg): + dml_lplr_obj.evaluate_learners() + + dml_lplr_obj.fit() + + msg = "metric should be a callable. 'mse' was passed." + with pytest.raises(TypeError, match=msg): + dml_lplr_obj.evaluate_learners(metric="mse") + + msg = ( + r"The learners have to be a subset of \['ml_M', 'ml_t', 'ml_m', 'ml_a'\]\. " + r"Learners \['ml_mu', 'ml_p'\] provided." 
+ ) + with pytest.raises(ValueError, match=msg): + dml_lplr_obj.evaluate_learners(learners=["ml_mu", "ml_p"]) + + def eval_fct(y_pred, y_true): + return np.nan + + with pytest.raises(ValueError): + dml_lplr_obj.evaluate_learners(metric=eval_fct) diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py new file mode 100644 index 00000000..0e0fa7bf --- /dev/null +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -0,0 +1,227 @@ +import math + +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestRegressor +from sklearn.linear_model import LogisticRegression + +import doubleml as dml + +from ...tests._utils import draw_smpls +from ._utils_lplr_manual import fit_selection, tune_nuisance_ssm_mar, tune_nuisance_ssm_nonignorable + + +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_g(request): + return request.param + + +@pytest.fixture(scope="module", params=[LogisticRegression(random_state=42)]) +def learner_m(request): + return request.param + + +@pytest.fixture(scope="module", params=["missing-at-random", "nonignorable"]) +def score(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def normalize_ipw(request): + return request.param + + +@pytest.fixture(scope="module", params=[True, False]) +def tune_on_folds(request): + return request.param + + +def get_par_grid(learner): + if learner.__class__ in [RandomForestRegressor]: + par_grid = {"n_estimators": [5, 10, 20]} + else: + assert learner.__class__ in [LogisticRegression] + par_grid = {"C": np.logspace(-2, 2, 10)} + return par_grid + + +@pytest.fixture(scope="module") +def dml_ssm_fixture( + generate_data_selection_mar, + generate_data_selection_nonignorable, + learner_g, + learner_m, + score, + normalize_ipw, + tune_on_folds, +): + par_grid = {"ml_g": get_par_grid(learner_g), "ml_pi": get_par_grid(learner_m), "ml_m": get_par_grid(learner_m)} + n_folds_tune = 4 + n_folds = 2 + + # collect data + np.random.seed(42) + if score == "missing-at-random": + (x, y, d, z, s) = generate_data_selection_mar + else: + (x, y, d, z, s) = generate_data_selection_nonignorable + + n_obs = len(y) + all_smpls = draw_smpls(n_obs, n_folds) + + ml_g = clone(learner_g) + ml_pi = clone(learner_m) + ml_m = clone(learner_m) + + np.random.seed(42) + if score == "missing-at-random": + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s) + dml_sel_obj = dml.DoubleMLSSM( + obj_dml_data, + ml_g, + ml_pi, + ml_m, + n_folds=n_folds, + score=score, + normalize_ipw=normalize_ipw, + draw_sample_splitting=False, + ) + else: + assert score == "nonignorable" + obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=z, s=s) + dml_sel_obj = dml.DoubleMLSSM( + obj_dml_data, + ml_g, + ml_pi, + ml_m, + n_folds=n_folds, + score=score, + normalize_ipw=normalize_ipw, + draw_sample_splitting=False, + ) + + # synchronize the sample splitting + np.random.seed(42) + dml_sel_obj.set_sample_splitting(all_smpls=all_smpls) + + np.random.seed(42) + # tune hyperparameters + tune_res = dml_sel_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False) + assert isinstance(tune_res, dml.DoubleMLSSM) + + dml_sel_obj.fit() + + np.random.seed(42) + smpls = all_smpls[0] + if tune_on_folds: + if score == "missing-at-random": + g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_mar( + y, + x, + d, + z, + s, + clone(learner_g), + clone(learner_m), + 
clone(learner_m), + smpls, + n_folds_tune, + par_grid["ml_g"], + par_grid["ml_pi"], + par_grid["ml_m"], + ) + elif score == "nonignorable": + g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_nonignorable( + y, + x, + d, + z, + s, + clone(learner_g), + clone(learner_m), + clone(learner_m), + smpls, + n_folds_tune, + par_grid["ml_g"], + par_grid["ml_pi"], + par_grid["ml_m"], + ) + + else: + xx = [(np.arange(len(y)), np.array([]))] + if score == "missing-at-random": + g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_mar( + y, + x, + d, + z, + s, + clone(learner_g), + clone(learner_m), + clone(learner_m), + xx, + n_folds_tune, + par_grid["ml_g"], + par_grid["ml_pi"], + par_grid["ml_m"], + ) + elif score == "nonignorable": + g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_nonignorable( + y, + x, + d, + z, + s, + clone(learner_g), + clone(learner_m), + clone(learner_m), + xx, + n_folds_tune, + par_grid["ml_g"], + par_grid["ml_pi"], + par_grid["ml_m"], + ) + + g0_best_params = g0_best_params * n_folds + g1_best_params = g1_best_params * n_folds + pi_best_params = pi_best_params * n_folds + m_best_params = m_best_params * n_folds + + np.random.seed(42) + res_manual = fit_selection( + y, + x, + d, + z, + s, + clone(learner_g), + clone(learner_m), + clone(learner_m), + all_smpls, + score, + normalize_ipw=normalize_ipw, + g_d0_params=g0_best_params, + g_d1_params=g1_best_params, + pi_params=pi_best_params, + m_params=m_best_params, + ) + + res_dict = { + "coef": dml_sel_obj.coef[0], + "coef_manual": res_manual["theta"], + "se": dml_sel_obj.se[0], + "se_manual": res_manual["se"], + } + + return res_dict + + +@pytest.mark.ci +def test_dml_ssm_coef(dml_ssm_fixture): + assert math.isclose(dml_ssm_fixture["coef"], dml_ssm_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) + + +@pytest.mark.ci +def test_dml_ssm_se(dml_ssm_fixture): + assert math.isclose(dml_ssm_fixture["se"], dml_ssm_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) From dbfea737dc092c7f3c32531fdaf670b47892a5f6 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Oct 2025 14:19:32 -0700 Subject: [PATCH 15/23] Clean-up of branch --- doubleml/datasets.py | 1772 --------------------------------- doubleml/double_ml.py | 6 - doubleml/double_ml_data.py | 1104 -------------------- doubleml/utils/_estimation.py | 16 - 4 files changed, 2898 deletions(-) delete mode 100644 doubleml/datasets.py delete mode 100644 doubleml/double_ml_data.py diff --git a/doubleml/datasets.py b/doubleml/datasets.py deleted file mode 100644 index 6d9acfc8..00000000 --- a/doubleml/datasets.py +++ /dev/null @@ -1,1772 +0,0 @@ -import pandas as pd -import numpy as np -import warnings - -from scipy.linalg import toeplitz -from scipy.optimize import minimize_scalar -from scipy.special import expit - -from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder -from sklearn.datasets import make_spd_matrix - -from .double_ml_data import DoubleMLData, DoubleMLClusterData - -_array_alias = ['array', 'np.ndarray', 'np.array', np.ndarray] -_data_frame_alias = ['DataFrame', 'pd.DataFrame', pd.DataFrame] -_dml_data_alias = ['DoubleMLData', DoubleMLData] -_dml_cluster_data_alias = ['DoubleMLClusterData', DoubleMLClusterData] - - -def fetch_401K(return_type='DoubleMLData', polynomial_features=False): - """ - Data set on financial wealth and 401(k) plan participation. 
- - Parameters - ---------- - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - polynomial_features : - If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). - - References - ---------- - Abadie, A. (2003), Semiparametric instrumental variable estimation of treatment response models. Journal of - Econometrics, 113(2): 231-263. - - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. - """ - url = 'https://github.com/VC2015/DMLonGitHub/raw/master/sipp1991.dta' - raw_data = pd.read_stata(url) - - y_col = 'net_tfa' - d_cols = ['e401'] - x_cols = ['age', 'inc', 'educ', 'fsize', 'marr', 'twoearn', 'db', 'pira', 'hown'] - - data = raw_data.copy() - - if polynomial_features: - raise NotImplementedError('polynomial_features os not implemented yet for fetch_401K.') - - if return_type in _data_frame_alias + _dml_data_alias: - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, y_col, d_cols, x_cols) - else: - raise ValueError('Invalid return_type.') - - -def fetch_bonus(return_type='DoubleMLData', polynomial_features=False): - """ - Data set on the Pennsylvania Reemployment Bonus experiment. - - Parameters - ---------- - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - polynomial_features : - If ``True`` polynomial features are added (see replication files of Chernozhukov et al. (2018)). - - References - ---------- - Bilias Y. (2000), Sequential Testing of Duration Data: The Case of Pennsylvania 'Reemployment Bonus' Experiment. - Journal of Applied Econometrics, 15(6): 575-594. - - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. 
- """ - url = 'https://raw.githubusercontent.com/VC2015/DMLonGitHub/master/penn_jae.dat' - raw_data = pd.read_csv(url, sep='\s+') - - ind = (raw_data['tg'] == 0) | (raw_data['tg'] == 4) - data = raw_data.copy()[ind] - data.reset_index(inplace=True) - data['tg'] = data['tg'].replace(4, 1) - data['inuidur1'] = np.log(data['inuidur1']) - - # variable dep as factor (dummy encoding) - dummy_enc = OneHotEncoder(drop='first', categories='auto').fit(data.loc[:, ['dep']]) - xx = dummy_enc.transform(data.loc[:, ['dep']]).toarray() - data['dep1'] = xx[:, 0] - data['dep2'] = xx[:, 1] - - y_col = 'inuidur1' - d_cols = ['tg'] - x_cols = ['female', 'black', 'othrace', - 'dep1', 'dep2', - 'q2', 'q3', 'q4', 'q5', 'q6', - 'agelt35', 'agegt54', 'durable', 'lusd', 'husd'] - - if polynomial_features: - poly = PolynomialFeatures(2, include_bias=False) - data_transf = poly.fit_transform(data[x_cols]) - x_cols = list(poly.get_feature_names_out(x_cols)) - - data_transf = pd.DataFrame(data_transf, columns=x_cols) - data = pd.concat((data[[y_col] + d_cols], data_transf), - axis=1, sort=False) - - if return_type in _data_frame_alias + _dml_data_alias: - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, y_col, d_cols, x_cols) - else: - raise ValueError('Invalid return_type.') - - -def _g(x): - return np.power(np.sin(x), 2) - - -def _m(x, nu=0., gamma=1.): - return 0.5 / np.pi * (np.sinh(gamma)) / (np.cosh(gamma) - np.cos(x - nu)) - - -def make_plr_CCDDHNR2018(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', **kwargs): - """ - Generates data from a partially linear regression model used in Chernozhukov et al. (2018) for Figure 1. - The data generating process is defined as - - .. math:: - - d_i &= m_0(x_i) + s_1 v_i, & &v_i \\sim \\mathcal{N}(0,1), - - y_i &= \\alpha d_i + g_0(x_i) + s_2 \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), - - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.7^{|j-k|}`. - The nuisance functions are given by - - .. math:: - - m_0(x_i) &= a_0 x_{i,1} + a_1 \\frac{\\exp(x_{i,3})}{1+\\exp(x_{i,3})}, - - g_0(x_i) &= b_0 \\frac{\\exp(x_{i,1})}{1+\\exp(x_{i,1})} + b_1 x_{i,3}. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - alpha : - The value of the causal parameter. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`a_0=1`, :math:`a_1=0.25`, :math:`s_1=1`, :math:`b_0=1`, :math:`b_1=0.25` or :math:`s_2=1`. - - References - ---------- - Chernozhukov, V., Chetverikov, D., Demirer, M., Duflo, E., Hansen, C., Newey, W. and Robins, J. (2018), - Double/debiased machine learning for treatment and structural parameters. The Econometrics Journal, 21: C1-C68. - doi:`10.1111/ectj.12097 `_. - """ - a_0 = kwargs.get('a_0', 1.) - a_1 = kwargs.get('a_1', 0.25) - s_1 = kwargs.get('s_1', 1.) - - b_0 = kwargs.get('b_0', 1.) - b_1 = kwargs.get('b_1', 0.25) - s_2 = kwargs.get('s_2', 1.) 
- - cov_mat = toeplitz([np.power(0.7, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - d = a_0 * x[:, 0] + a_1 * np.divide(np.exp(x[:, 2]), 1 + np.exp(x[:, 2])) \ - + s_1 * np.random.standard_normal(size=[n_obs, ]) - y = alpha * d + b_0 * np.divide(np.exp(x[:, 0]), 1 + np.exp(x[:, 0])) \ - + b_1 * x[:, 2] + s_2 * np.random.standard_normal(size=[n_obs, ]) - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), - columns=x_cols + ['y', 'd']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols) - else: - raise ValueError('Invalid return_type.') - - -def make_plr_turrell2018(n_obs=100, dim_x=20, theta=0.5, return_type='DoubleMLData', **kwargs): - """ - Generates data from a partially linear regression model used in a blog article by Turrell (2018). - The data generating process is defined as - - .. math:: - - d_i &= m_0(x_i' b) + v_i, & &v_i \\sim \\mathcal{N}(0,1), - - y_i &= \\theta d_i + g_0(x_i' b) + u_i, & &u_i \\sim \\mathcal{N}(0,1), - - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a random symmetric, - positive-definite matrix generated with :py:meth:`sklearn.datasets.make_spd_matrix`. - :math:`b` is a vector with entries :math:`b_j=\\frac{1}{j}` and the nuisance functions are given by - - .. math:: - - m_0(x_i) &= \\frac{1}{2 \\pi} \\frac{\\sinh(\\gamma)}{\\cosh(\\gamma) - \\cos(x_i-\\nu)}, - - g_0(x_i) &= \\sin(x_i)^2. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`\\nu=0`, or :math:`\\gamma=1`. - - References - ---------- - Turrell, A. (2018), Econometrics in Python part I - Double machine learning, Markov Wanderer: A blog on economics, - science, coding and data. `https://aeturrell.com/blog/posts/econometrics-in-python-parti-ml/ - `_. - """ - nu = kwargs.get('nu', 0.) - gamma = kwargs.get('gamma', 1.) - - b = [1 / k for k in range(1, dim_x + 1)] - sigma = make_spd_matrix(dim_x) - - x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=[n_obs, ]) - G = _g(np.dot(x, b)) - M = _m(np.dot(x, b), nu=nu, gamma=gamma) - d = M + np.random.standard_normal(size=[n_obs, ]) - y = np.dot(theta, d) + G + np.random.standard_normal(size=[n_obs, ]) - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), - columns=x_cols + ['y', 'd']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols) - else: - raise ValueError('Invalid return_type.') - - -def make_irm_data(n_obs=500, dim_x=20, theta=0, R2_d=0.5, R2_y=0.5, return_type='DoubleMLData'): - """ - Generates data from a interactive regression (IRM) model. - The data generating process is defined as - - .. 
math:: - - d_i &= 1\\left\\lbrace \\frac{\\exp(c_d x_i' \\beta)}{1+\\exp(c_d x_i' \\beta)} > v_i \\right\\rbrace, & &v_i - \\sim \\mathcal{U}(0,1), - - y_i &= \\theta d_i + c_y x_i' \\beta d_i + \\zeta_i, & &\\zeta_i \\sim \\mathcal{N}(0,1), - - with covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. - :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}` and the constants :math:`c_y` and - :math:`c_d` are given by - - .. math:: - - c_y = \\sqrt{\\frac{R_y^2}{(1-R_y^2) \\beta' \\Sigma \\beta}}, \\qquad c_d = - \\sqrt{\\frac{(\\pi^2 /3) R_d^2}{(1-R_d^2) \\beta' \\Sigma \\beta}}. - - The data generating process is inspired by a process used in the simulation experiment (see Appendix P) of Belloni - et al. (2017). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - R2_d : - The value of the parameter :math:`R_d^2`. - R2_y : - The value of the parameter :math:`R_y^2`. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)``. - - References - ---------- - Belloni, A., Chernozhukov, V., Fernández‐Val, I. and Hansen, C. (2017). Program Evaluation and Causal Inference With - High‐Dimensional Data. Econometrica, 85: 233-298. - """ - # inspired by https://onlinelibrary.wiley.com/doi/abs/10.3982/ECTA12723, see suplement - v = np.random.uniform(size=[n_obs, ]) - zeta = np.random.standard_normal(size=[n_obs, ]) - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] - b_sigma_b = np.dot(np.dot(cov_mat, beta), beta) - c_y = np.sqrt(R2_y / ((1 - R2_y) * b_sigma_b)) - c_d = np.sqrt(np.pi ** 2 / 3. * R2_d / ((1 - R2_d) * b_sigma_b)) - - xx = np.exp(np.dot(x, np.multiply(beta, c_d))) - d = 1. * ((xx / (1 + xx)) > v) - - y = d * theta + d * np.dot(x, np.multiply(beta, c_y)) + zeta - - if return_type in _array_alias: - return x, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d)), - columns=x_cols + ['y', 'd']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols) - else: - raise ValueError('Invalid return_type.') - - -def make_iivm_data(n_obs=500, dim_x=20, theta=1., alpha_x=0.2, return_type='DoubleMLData'): - """ - Generates data from a interactive IV regression (IIVM) model. - The data generating process is defined as - - .. math:: - - d_i &= 1\\left\\lbrace \\alpha_x Z + v_i > 0 \\right\\rbrace, - - y_i &= \\theta d_i + x_i' \\beta + u_i, - - with :math:`Z \\sim \\text{Bernoulli}(0.5)` and - - .. math:: - - \\left(\\begin{matrix} u_i \\\\ v_i \\end{matrix} \\right) \\sim - \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.3 \\\\ 0.3 & 1 \\end{matrix} \\right) \\right). - - The covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`\\beta` is a `dim_x`-vector with entries - :math:`\\beta_j=\\frac{1}{j^2}`. 
- - The data generating process is inspired by a process used in the simulation experiment of Farbmacher, Gruber and - Klaassen (2020). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - alpha_x : - The value of the parameter :math:`\\alpha_x`. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. - - References - ---------- - Farbmacher, H., Guber, R. and Klaaßen, S. (2020). Instrument Validity Tests with Causal Forests. MEA Discussion - Paper No. 13-2020. Available at SSRN: http://dx.doi.org/10.2139/ssrn.3619201. - """ - # inspired by https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3619201 - xx = np.random.multivariate_normal(np.zeros(2), - np.array([[1., 0.3], [0.3, 1.]]), - size=[n_obs, ]) - u = xx[:, 0] - v = xx[:, 1] - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] - - z = np.random.binomial(p=0.5, n=1, size=[n_obs, ]) - d = 1. * (alpha_x * z + v > 0) - - y = d * theta + np.dot(x, beta) + u - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), - columns=x_cols + ['y', 'd', 'z']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols, 'z') - else: - raise ValueError('Invalid return_type.') - - -def _make_pliv_data(n_obs=100, dim_x=20, theta=0.5, gamma_z=0.4, return_type='DoubleMLData'): - b = [1 / k for k in range(1, dim_x + 1)] - sigma = make_spd_matrix(dim_x) - - x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=[n_obs, ]) - G = _g(np.dot(x, b)) - # instrument - z = _m(np.dot(x, b)) + np.random.standard_normal(size=[n_obs, ]) - # treatment - M = _m(gamma_z * z + np.dot(x, b)) - d = M + np.random.standard_normal(size=[n_obs, ]) - y = np.dot(theta, d) + G + np.random.standard_normal(size=[n_obs, ]) - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), - columns=x_cols + ['y', 'd', 'z']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols, 'z') - else: - raise ValueError('Invalid return_type.') - - -def make_pliv_CHS2015(n_obs, alpha=1., dim_x=200, dim_z=150, return_type='DoubleMLData'): - """ - Generates data from a partially linear IV regression model used in Chernozhukov, Hansen and Spindler (2015). - The data generating process is defined as - - .. math:: - - z_i &= \\Pi x_i + \\zeta_i, - - d_i &= x_i' \\gamma + z_i' \\delta + u_i, - - y_i &= \\alpha d_i + x_i' \\beta + \\varepsilon_i, - - with - - .. 
math:: - - \\left(\\begin{matrix} \\varepsilon_i \\\\ u_i \\\\ \\zeta_i \\\\ x_i \\end{matrix} \\right) \\sim - \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & 0.6 & 0 & 0 \\\\ 0.6 & 1 & 0 & 0 \\\\ - 0 & 0 & 0.25 I_{p_n^z} & 0 \\\\ 0 & 0 & 0 & \\Sigma \\end{matrix} \\right) \\right) - - where :math:`\\Sigma` is a :math:`p_n^x \\times p_n^x` matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}` and :math:`I_{p_n^z}` is the :math:`p_n^z \\times p_n^z` identity matrix. - :math:`\\beta = \\gamma` is a :math:`p_n^x`-vector with entries :math:`\\beta_j=\\frac{1}{j^2}`, - :math:`\\delta` is a :math:`p_n^z`-vector with entries :math:`\\delta_j=\\frac{1}{j^2}` - and :math:`\\Pi = (I_{p_n^z}, 0_{p_n^z \\times (p_n^x - p_n^z)})`. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - alpha : - The value of the causal parameter. - dim_x : - The number of covariates. - dim_z : - The number of instruments. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z)``. - - References - ---------- - Chernozhukov, V., Hansen, C. and Spindler, M. (2015), Post-Selection and Post-Regularization Inference in Linear - Models with Many Controls and Instruments. American Economic Review: Papers and Proceedings, 105 (5): 486-90. - """ - assert dim_x >= dim_z - # see https://assets.aeaweb.org/asset-server/articles-attachments/aer/app/10505/P2015_1022_app.pdf - xx = np.random.multivariate_normal(np.zeros(2), - np.array([[1., 0.6], [0.6, 1.]]), - size=[n_obs, ]) - epsilon = xx[:, 0] - u = xx[:, 1] - - sigma = toeplitz([np.power(0.5, k) for k in range(0, dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), - sigma, - size=[n_obs, ]) - - I_z = np.eye(dim_z) - xi = np.random.multivariate_normal(np.zeros(dim_z), - 0.25 * I_z, - size=[n_obs, ]) - - beta = [1 / (k ** 2) for k in range(1, dim_x + 1)] - gamma = beta - delta = [1 / (k ** 2) for k in range(1, dim_z + 1)] - Pi = np.hstack((I_z, np.zeros((dim_z, dim_x - dim_z)))) - - z = np.dot(x, np.transpose(Pi)) + xi - d = np.dot(x, gamma) + np.dot(z, delta) + u - y = alpha * d + np.dot(x, beta) + epsilon - - if return_type in _array_alias: - return x, y, d, z - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - z_cols = [f'Z{i + 1}' for i in np.arange(dim_z)] - data = pd.DataFrame(np.column_stack((x, y, d, z)), - columns=x_cols + ['y', 'd'] + z_cols) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols, z_cols) - else: - raise ValueError('Invalid return_type.') - - -def make_pliv_multiway_cluster_CKMS2021(N=25, M=25, dim_X=100, theta=1., return_type='DoubleMLClusterData', **kwargs): - """ - Generates data from a partially linear IV regression model with multiway cluster sample used in Chiang et al. - (2021). The data generating process is defined as - - .. math:: - - Z_{ij} &= X_{ij}' \\xi_0 + V_{ij}, - - D_{ij} &= Z_{ij}' \\pi_{10} + X_{ij}' \\pi_{20} + v_{ij}, - - Y_{ij} &= D_{ij} \\theta + X_{ij}' \\zeta_0 + \\varepsilon_{ij}, - - with - - .. 
math:: - - X_{ij} &= (1 - \\omega_1^X - \\omega_2^X) \\alpha_{ij}^X - + \\omega_1^X \\alpha_{i}^X + \\omega_2^X \\alpha_{j}^X, - - \\varepsilon_{ij} &= (1 - \\omega_1^\\varepsilon - \\omega_2^\\varepsilon) \\alpha_{ij}^\\varepsilon - + \\omega_1^\\varepsilon \\alpha_{i}^\\varepsilon + \\omega_2^\\varepsilon \\alpha_{j}^\\varepsilon, - - v_{ij} &= (1 - \\omega_1^v - \\omega_2^v) \\alpha_{ij}^v - + \\omega_1^v \\alpha_{i}^v + \\omega_2^v \\alpha_{j}^v, - - V_{ij} &= (1 - \\omega_1^V - \\omega_2^V) \\alpha_{ij}^V - + \\omega_1^V \\alpha_{i}^V + \\omega_2^V \\alpha_{j}^V, - - and :math:`\\alpha_{ij}^X, \\alpha_{i}^X, \\alpha_{j}^X \\sim \\mathcal{N}(0, \\Sigma)` - where :math:`\\Sigma` is a :math:`p_x \\times p_x` matrix with entries - :math:`\\Sigma_{kj} = s_X^{|j-k|}`. - Further - - .. math:: - - \\left(\\begin{matrix} \\alpha_{ij}^\\varepsilon \\\\ \\alpha_{ij}^v \\end{matrix}\\right), - \\left(\\begin{matrix} \\alpha_{i}^\\varepsilon \\\\ \\alpha_{i}^v \\end{matrix}\\right), - \\left(\\begin{matrix} \\alpha_{j}^\\varepsilon \\\\ \\alpha_{j}^v \\end{matrix}\\right) - \\sim \\mathcal{N}\\left(0, \\left(\\begin{matrix} 1 & s_{\\varepsilon v} \\\\ - s_{\\varepsilon v} & 1 \\end{matrix} \\right) \\right) - - - and :math:`\\alpha_{ij}^V, \\alpha_{i}^V, \\alpha_{j}^V \\sim \\mathcal{N}(0, 1)`. - - Parameters - ---------- - N : - The number of observations (first dimension). - M : - The number of observations (second dimension). - dim_X : - The number of covariates. - theta : - The value of the causal parameter. - return_type : - If ``'DoubleMLClusterData'`` or ``DoubleMLClusterData``, returns a ``DoubleMLClusterData`` object where - ``DoubleMLClusterData.data`` is a ``pd.DataFrame``. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s - ``(x, y, d, cluster_vars, z)``. - **kwargs - Additional keyword arguments to set non-default values for the parameters - :math:`\\pi_{10}=1.0`, :math:`\\omega_X = \\omega_{\\varepsilon} = \\omega_V = \\omega_v = (0.25, 0.25)`, - :math:`s_X = s_{\\varepsilon v} = 0.25`, - or the :math:`p_x`-vectors :math:`\\zeta_0 = \\pi_{20} = \\xi_0` with default entries - :math:`(\\zeta_{0})_j = 0.5^j`. - - References - ---------- - Chiang, H. D., Kato K., Ma, Y. and Sasaki, Y. (2021), Multiway Cluster Robust Double/Debiased Machine Learning, - Journal of Business & Economic Statistics, - doi: `10.1080/07350015.2021.1895815 `_, - arXiv:`1909.03489 `_. 
- """ - # additional parameters specifiable via kwargs - pi_10 = kwargs.get('pi_10', 1.0) - - xx = np.arange(1, dim_X + 1) - zeta_0 = kwargs.get('zeta_0', np.power(0.5, xx)) - pi_20 = kwargs.get('pi_20', np.power(0.5, xx)) - xi_0 = kwargs.get('xi_0', np.power(0.5, xx)) - - omega_X = kwargs.get('omega_X', np.array([0.25, 0.25])) - omega_epsilon = kwargs.get('omega_epsilon', np.array([0.25, 0.25])) - omega_v = kwargs.get('omega_v', np.array([0.25, 0.25])) - omega_V = kwargs.get('omega_V', np.array([0.25, 0.25])) - - s_X = kwargs.get('s_X', 0.25) - s_epsilon_v = kwargs.get('s_epsilon_v', 0.25) - - # use np.tile() and np.repeat() for repeating vectors in different styles, i.e., - # np.tile([v1, v2, v3], 2) [v1, v2, v3, v1, v2, v3] - # np.repeat([v1, v2, v3], 2) [v1, v1, v2, v2, v3, v3] - - alpha_V = np.random.normal(size=(N * M)) - alpha_V_i = np.repeat(np.random.normal(size=N), M) - alpha_V_j = np.tile(np.random.normal(size=M), N) - - cov_mat = np.array([[1, s_epsilon_v], [s_epsilon_v, 1]]) - alpha_eps_v = np.random.multivariate_normal(np.zeros(2), cov_mat, size=[N * M, ]) - alpha_eps = alpha_eps_v[:, 0] - alpha_v = alpha_eps_v[:, 1] - - alpha_eps_v_i = np.random.multivariate_normal(np.zeros(2), cov_mat, size=[N, ]) - alpha_eps_i = np.repeat(alpha_eps_v_i[:, 0], M) - alpha_v_i = np.repeat(alpha_eps_v_i[:, 1], M) - - alpha_eps_v_j = np.random.multivariate_normal(np.zeros(2), cov_mat, size=[M, ]) - alpha_eps_j = np.tile(alpha_eps_v_j[:, 0], N) - alpha_v_j = np.tile(alpha_eps_v_j[:, 1], N) - - cov_mat = toeplitz([np.power(s_X, k) for k in range(dim_X)]) - alpha_X = np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[N * M, ]) - alpha_X_i = np.repeat(np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[N, ]), - M, axis=0) - alpha_X_j = np.tile(np.random.multivariate_normal(np.zeros(dim_X), cov_mat, size=[M, ]), - (N, 1)) - - # generate variables - x = (1 - omega_X[0] - omega_X[1]) * alpha_X \ - + omega_X[0] * alpha_X_i + omega_X[1] * alpha_X_j - - eps = (1 - omega_epsilon[0] - omega_epsilon[1]) * alpha_eps \ - + omega_epsilon[0] * alpha_eps_i + omega_epsilon[1] * alpha_eps_j - - v = (1 - omega_v[0] - omega_v[1]) * alpha_v \ - + omega_v[0] * alpha_v_i + omega_v[1] * alpha_v_j - - V = (1 - omega_V[0] - omega_V[1]) * alpha_V \ - + omega_V[0] * alpha_V_i + omega_V[1] * alpha_V_j - - z = np.matmul(x, xi_0) + V - d = z * pi_10 + np.matmul(x, pi_20) + v - y = d * theta + np.matmul(x, zeta_0) + eps - - cluster_cols = ['cluster_var_i', 'cluster_var_j'] - cluster_vars = pd.MultiIndex.from_product([range(N), range(M)]).to_frame(name=cluster_cols).reset_index(drop=True) - - if return_type in _array_alias: - return x, y, d, cluster_vars.values, z - elif return_type in _data_frame_alias + _dml_cluster_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_X)] - data = pd.concat((cluster_vars, - pd.DataFrame(np.column_stack((x, y, d, z)), columns=x_cols + ['Y', 'D', 'Z'])), - axis=1) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLClusterData(data, 'Y', 'D', cluster_cols, x_cols, 'Z') - else: - raise ValueError('Invalid return_type.') - - -def make_did_SZ2020(n_obs=500, dgp_type=1, cross_sectional_data=False, return_type='DoubleMLData', **kwargs): - """ - Generates data from a difference-in-differences model used in Sant'Anna and Zhao (2020). - The data generating process is defined as follows. For a generic :math:`W=(W_1, W_2, W_3, W_4)^T`, let - - .. 
math:: - - f_{reg}(W) &= 210 + 27.4 \\cdot W_1 +13.7 \\cdot (W_2 + W_3 + W_4), - - f_{ps}(W) &= 0.75 \\cdot (-W_1 + 0.5 \\cdot W_2 -0.25 \\cdot W_3 - 0.1 \\cdot W_4). - - - Let :math:`X= (X_1, X_2, X_3, X_4)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where :math:`\\tilde{Z}_1 = \\exp(0.5 \\cdot X_1)`, :math:`\\tilde{Z}_2 = 10 + X_2/(1 + \\exp(X_1))`, - :math:`\\tilde{Z}_3 = (0.6 + X_1 \\cdot X_3 / 25)^3` and :math:`\\tilde{Z}_4 = (20 + X_2 + X_4)^2`. - At first define - - .. math:: - - Y_0(0) &= f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_0, - - Y_1(d) &= 2 \\cdot f_{reg}(W_{reg}) + \\nu(W_{reg}, D) + \\varepsilon_1(d), - - p(W_{ps}) &= \\frac{\\exp(f_{ps}(W_{ps}))}{1 + \\exp(f_{ps}(W_{ps}))}, - - D &= 1\\{p(W_{ps}) \\ge U\\}, - - where :math:`\\varepsilon_0, \\varepsilon_1(d), d=0, 1` are independent standard normal random variables, - :math:`U \\sim \\mathcal{U}[0, 1]` is a independent standard uniform - and :math:`\\nu(W_{reg}, D)\\sim \\mathcal{N}(D \\cdot f_{reg}(W_{reg}),1)`. - The different data generating processes are defined via - - .. math:: - - DGP1:\\quad W_{reg} &= Z \\quad W_{ps} = Z - - DGP2:\\quad W_{reg} &= Z \\quad W_{ps} = X - - DGP3:\\quad W_{reg} &= X \\quad W_{ps} = Z - - DGP4:\\quad W_{reg} &= X \\quad W_{ps} = X - - DGP5:\\quad W_{reg} &= Z \\quad W_{ps} = 0 - - DGP6:\\quad W_{reg} &= X \\quad W_{ps} = 0, - - such that the last two settings correspond to an experimental setting with treatment probability - of :math:`P(D=1) = \\frac{1}{2}.` - For the panel data the outcome is already defined as the difference :math:`Y = Y_1(D) - Y_0(0)`. - For cross-sectional data the flag ``cross_sectional_data`` has to be set to ``True``. - Then the outcome will be defined to be - - .. math:: - - Y = T \\cdot Y_1(D) + (1-T) \\cdot Y_0(0), - - where :math:`T = 1\\{U_T\\le \\lambda_T \\}` with :math:`U_T\\sim \\mathcal{U}[0, 1]` and :math:`\\lambda_T=0.5`. - The true average treatment effect on the treated is zero for all data generating processes. - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dgp_type : - The DGP to be used. Default value is ``1`` (integer). - cross_sectional_data : - Indicates whether the setting is uses cross-sectional or panel data. Default value is ``False``. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. - - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d)`` - or ``(x, y, d, t)``. - **kwargs - Additional keyword arguments to set non-default values for the parameter - :math:`xi=0.75`, :math:`c=0.0` and :math:`\\lambda_T=0.5`. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. 
- """ - xi = kwargs.get('xi', 0.75) - c = kwargs.get('c', 0.0) - lambda_t = kwargs.get('lambda_t', 0.5) - - def f_reg(w): - res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) - return res - - def f_ps(w, xi): - res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - dim_x = 4 - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4)) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - epsilon_0 = np.random.normal(loc=0, scale=1, size=n_obs) - epsilon_1 = np.random.normal(loc=0, scale=1, size=[n_obs, 2]) - - if dgp_type == 1: - features_ps = z - features_reg = z - elif dgp_type == 2: - features_ps = x - features_reg = z - elif dgp_type == 3: - features_ps = z - features_reg = x - elif dgp_type == 4: - features_ps = x - features_reg = x - elif dgp_type == 5: - features_ps = None - features_reg = z - elif dgp_type == 6: - features_ps = None - features_reg = x - else: - raise ValueError('The dgp_type is not valid.') - - # treatment and propensities - is_experimental = (dgp_type == 5) or (dgp_type == 6) - if is_experimental: - # Set D to be experimental - p = 0.5 * np.ones(n_obs) - else: - p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) - u = np.random.uniform(low=0, high=1, size=n_obs) - d = 1.0 * (p >= u) - - # potential outcomes - nu = np.random.normal(loc=d * f_reg(features_reg), scale=1, size=n_obs) - y0 = f_reg(features_reg) + nu + epsilon_0 - y1_d0 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 0] - y1_d1 = 2 * f_reg(features_reg) + nu + epsilon_1[:, 1] - y1 = d * y1_d1 + (1 - d) * y1_d0 - - if not cross_sectional_data: - y = y1 - y0 - - if return_type in _array_alias: - return z, y, d - elif return_type in _data_frame_alias + _dml_data_alias: - z_cols = [f'Z{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((z, y, d)), - columns=z_cols + ['y', 'd']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', z_cols) - else: - raise ValueError('Invalid return_type.') - - else: - u_t = np.random.uniform(low=0, high=1, size=n_obs) - t = 1.0 * (u_t <= lambda_t) - y = t * y1 + (1 - t) * y0 - - if return_type in _array_alias: - return z, y, d, t - elif return_type in _data_frame_alias + _dml_data_alias: - z_cols = [f'Z{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((z, y, d, t)), - columns=z_cols + ['y', 'd', 't']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', z_cols, t_col='t') - else: - raise ValueError('Invalid return_type.') - - -def make_confounded_irm_data(n_obs=500, theta=0.0, gamma_a=0.127, beta_a=0.58, linear=False, **kwargs): - """ - Generates counfounded data from an interactive regression model. - - The data generating process is defined as follows (inspired by the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). - - Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds - to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. 
math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 - - \\tilde{Z}_5 &= X_5. - - Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. - At first, define the propensity score as - - .. math:: - - m(X, A) = P(D=1|X,A) = p(Z) + \\gamma_A \\cdot A - - where - - .. math:: - - p(Z) &= \\frac{\\exp(f_{ps}(Z))}{1 + \\exp(f_{ps}(Z))}, - - f_{ps}(Z) &= 0.75 \\cdot (-Z_1 + 0.1 \\cdot Z_2 -0.25 \\cdot Z_3 - 0.1 \\cdot Z_4). - - and generate the treatment :math:`D = 1\\{m(X, A) \\ge U\\}` with :math:`U \\sim \\mathcal{U}[0, 1]`. - Since :math:`A` is independent of :math:`X`, the short form of the propensity score is given as - - .. math:: - - P(D=1|X) = p(Z). - - Further, generate the outcome of interest :math:`Y` as - - .. math:: - - Y &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A + \\varepsilon - - g(Z) &= 2.5 + 0.74 \\cdot Z_1 + 0.25 \\cdot Z_2 + 0.137 \\cdot (Z_3 + Z_4) - - where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. - This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of - the conditional expectation take the following forms - - .. math:: - - \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D (Z_5 + 1) + g(Z) + \\beta_A \\cdot A - - \\mathbb{E}[Y|D, X] &= (\\theta + \\beta_A \\frac{\\mathrm{Cov}(A, D(Z_5 + 1))}{\\mathrm{Var}(D(Z_5 + 1))}) - \\cdot D (Z_5 + 1) + g(Z). - - Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`, which can be - set via the parameters ``gamma_a`` and ``beta_a``. - - The observed data is given as :math:`W = (Y, D, Z)`. - Further, orcale values of the confounder :math:`A`, the transformed covariated :math:`Z`, - the potential outcomes of :math:`Y`, the long and short forms of the main regression and the propensity score and - in sample versions of the confounding parameters :math:`cf_d` and :math:`cf_y` (for ATE and ATTE) - are returned in a dictionary. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``500``. - theta : float or int - Average treatment effect. - Default is ``0.0``. - gamma_a : float - Coefficient of the unobserved confounder in the propensity score. - Default is ``0.127``. - beta_a : float - Coefficient of the unobserved confounder in the outcome regression. - Default is ``0.58``. - linear : bool - If ``True``, the Z will be set to X, such that the underlying (short) models are linear/logistic. - Default is ``False``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_. 
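A minimal usage sketch for ``make_confounded_irm_data``, assuming the generator is importable from ``doubleml.datasets`` as in the released package::

    import numpy as np
    from doubleml.datasets import make_confounded_irm_data  # assumed import path

    np.random.seed(42)
    res = make_confounded_irm_data(n_obs=2000, theta=1.0, gamma_a=0.127, beta_a=0.58)
    x, y, d = res['x'], res['y'], res['d']
    oracle = res['oracle_values']
    # in-sample confounding strength and oracle nuisance values
    print(oracle['cf_y'], oracle['cf_d_ate'], oracle['cf_d_atte'])
    print(oracle['m_long'].shape, oracle['g_long'].shape)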
- """ - c = 0.0 # the confounding strength is only valid for c=0 - xi = 0.75 - dim_x = kwargs.get('dim_x', 5) - trimming_threshold = kwargs.get('trimming_threshold', 0.01) - var_eps_y = kwargs.get('var_eps_y', 1.0) - - # Specification of main regression function - def f_reg(w): - res = 2.5 + 0.74 * w[:, 0] + 0.25 * w[:, 1] + 0.137 * (w[:, 2] + w[:, 3]) - return res - - # Specification of prop score function - def f_ps(w, xi): - res = xi * (-w[:, 0] + 0.1 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - z_tilde_5 = x[:, 4] - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, z_tilde_5)) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - # error terms and unobserved confounder - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - # unobserved confounder - a_bounds = (-1, 1) - a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) - var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 - - # Choose the features used in the models - if linear: - features_ps = x - features_reg = x - else: - features_ps = z - features_reg = z - - p = np.exp(f_ps(features_ps, xi)) / (1 + np.exp(f_ps(features_ps, xi))) - # compute short and long form of propensity score - m_long = p + gamma_a * a - m_short = p - # check propensity score bounds - if np.any(m_long < trimming_threshold) or np.any(m_long > 1.0 - trimming_threshold): - m_long = np.clip(m_long, trimming_threshold, 1.0 - trimming_threshold) - m_short = np.clip(m_short, trimming_threshold, 1.0 - trimming_threshold) - warnings.warn(f'Propensity score is close to 0 or 1. 
' - f'Trimming is at {trimming_threshold} and {1.0 - trimming_threshold} is applied') - # generate treatment based on long form - u = np.random.uniform(low=0, high=1, size=n_obs) - d = 1.0 * (m_long >= u) - # add treatment heterogeneity - d1x = z[:, 4] + 1 - var_dx = np.var(d * (d1x)) - cov_adx = gamma_a * var_a - # Outcome regression - g_partial_reg = f_reg(features_reg) - # short model - g_short_d0 = g_partial_reg - g_short_d1 = (theta + beta_a * cov_adx / var_dx) * d1x + g_partial_reg - g_short = d * g_short_d1 + (1.0 - d) * g_short_d0 - # long model - g_long_d0 = g_partial_reg + beta_a * a - g_long_d1 = theta * d1x + g_partial_reg + beta_a * a - g_long = d * g_long_d1 + (1.0 - d) * g_long_d0 - # Potential outcomes - y_0 = g_long_d0 + eps_y - y_1 = g_long_d1 + eps_y - # Realized outcome - y = d * y_1 + (1.0 - d) * y_0 - # In-sample values for confounding strength - explained_residual_variance = np.square(g_long - g_short) - residual_variance = np.square(y - g_short) - cf_y = np.mean(explained_residual_variance) / np.mean(residual_variance) - # compute the Riesz representation - treated_weight = d / np.mean(d) - untreated_weight = (1.0 - d) / np.mean(d) - # Odds ratios - propensity_ratio_long = m_long / (1.0 - m_long) - rr_long_ate = d / m_long - (1.0 - d) / (1.0 - m_long) - rr_long_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_long) - propensity_ratio_short = m_short / (1.0 - m_short) - rr_short_ate = d / m_short - (1.0 - d) / (1.0 - m_short) - rr_short_atte = treated_weight - np.multiply(untreated_weight, propensity_ratio_short) - cf_d_ate = (np.mean(1 / (m_long * (1 - m_long))) - np.mean(1 / (m_short * (1 - m_short)))) / np.mean( - 1 / (m_long * (1 - m_long))) - cf_d_atte = (np.mean(propensity_ratio_long) - np.mean(propensity_ratio_short)) / np.mean(propensity_ratio_long) - if (beta_a == 0) | (gamma_a == 0): - rho_ate = 0.0 - rho_atte = 0.0 - else: - rho_ate = np.corrcoef((g_long - g_short), (rr_long_ate - rr_short_ate))[0, 1] - rho_atte = np.corrcoef((g_long - g_short), (rr_long_atte - rr_short_atte))[0, 1] - oracle_values = { - 'g_long': g_long, - 'g_short': g_short, - 'm_long': m_long, - 'm_short': m_short, - 'gamma_a': gamma_a, - 'beta_a': beta_a, - 'a': a, - 'y_0': y_0, - 'y_1': y_1, - 'z': z, - 'cf_y': cf_y, - 'cf_d_ate': cf_d_ate, - 'cf_d_atte': cf_d_atte, - 'rho_ate': rho_ate, - 'rho_atte': rho_atte, - } - res_dict = { - 'x': x, - 'y': y, - 'd': d, - 'oracle_values': oracle_values - } - return res_dict - - -def make_confounded_plr_data(n_obs=500, theta=5.0, cf_y=0.04, cf_d=0.04, **kwargs): - """ - Generates counfounded data from an partially linear regression model. - - The data generating process is defined as follows (similar to the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, - where :math:`\\Sigma` is a matrix with entries - :math:`\\Sigma_{kj} = c^{|j-k|}`. The default value is :math:`c = 0`, corresponding to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2. - - Additionally, generate a confounder :math:`A \\sim \\mathcal{U}[-1, 1]`. - At first, define the treatment as - - .. 
math:: - - D = -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A + \\varepsilon_D - - with :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)`. - Since :math:`A` is independent of :math:`X`, the long and short forms of the treatment regression are given as - - .. math:: - - \\mathbb{E}[D|X, A] &= -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4 + \\gamma_A \\cdot A - - \\mathbb{E}[D|X] &= -Z_1 + 0.5 \\cdot Z_2 - 0.25 \\cdot Z_3 - 0.1 \\cdot Z_4. - - Further, generate the outcome of interest :math:`Y` as - - .. math:: - - Y &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A + \\varepsilon - - g(Z) &= 210 + 27.4 \\cdot Z_1 + 13.7 \\cdot (Z_2 + Z_3 + Z_4) - - where :math:`\\varepsilon \\sim \\mathcal{N}(0,5)`. - This implies an average treatment effect of :math:`\\theta`. Additionally, the long and short forms of - the conditional expectation take the following forms - - .. math:: - - \\mathbb{E}[Y|D, X, A] &= \\theta \\cdot D + g(Z) + \\beta_A \\cdot A - - \\mathbb{E}[Y|D, X] &= (\\theta + \\gamma_A\\beta_A \\frac{\\mathrm{Var}(A)}{\\mathrm{Var}(D)}) \\cdot D + g(Z). - - Consequently, the strength of confounding is determined via :math:`\\gamma_A` and :math:`\\beta_A`. - Both are chosen to obtain the desired confounding of the outcome and the Riesz representer (in sample). - - The observed data is given as :math:`W = (Y, D, X)`. - Further, oracle values of the confounder :math:`A`, the transformed covariates :math:`Z`, the effect :math:`\\theta`, - the coefficients :math:`\\gamma_a` and :math:`\\beta_a`, and the long and short forms of the main regression and - the treatment regression are returned in a dictionary. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``500``. - theta : float or int - Average treatment effect. - Default is ``5.0``. - cf_y : float - Percentage of the residual variation of the outcome explained by the latent/confounding variable. - Default is ``0.04``. - cf_d : float - Percentage gains in the variation of the Riesz representer generated by the latent/confounding variable. - Default is ``0.04``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - - References - ---------- - Sant’Anna, P. H. and Zhao, J. (2020), - Doubly robust difference-in-differences estimators. Journal of Econometrics, 219(1), 101-122. - doi:`10.1016/j.jeconom.2020.06.003 `_.
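A minimal usage sketch for ``make_confounded_plr_data``, assuming the generator is importable from ``doubleml.datasets``; note that :math:`\gamma_A` and :math:`\beta_A` are calibrated numerically to match the requested ``cf_d`` and ``cf_y``::

    import numpy as np
    from doubleml.datasets import make_confounded_plr_data  # assumed import path

    np.random.seed(42)
    res = make_confounded_plr_data(n_obs=2000, theta=5.0, cf_y=0.04, cf_d=0.04)
    x, y, d = res['x'], res['y'], res['d']
    oracle = res['oracle_values']
    # calibrated confounder coefficients and the long/short nuisance values
    print(oracle['gamma_a'], oracle['beta_a'])
    print(oracle['g_long'].shape, oracle['m_short'].shape)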
- """ - c = kwargs.get('c', 0.0) - dim_x = kwargs.get('dim_x', 4) - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - var_eps_y = 5 - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - var_eps_d = 1 - eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) - - # unobserved confounder - a_bounds = (-1, 1) - a = np.random.uniform(low=a_bounds[0], high=a_bounds[1], size=n_obs) - var_a = np.square(a_bounds[1] - a_bounds[0]) / 12 - - # get the required impact of the confounder on the propensity score - m_short = -z[:, 0] + 0.5 * z[:, 1] - 0.25 * z[:, 2] - 0.1 * z[:, 3] - - def f_m(gamma_a): - rr_long = eps_d / var_eps_d - rr_short = (gamma_a * a + eps_d) / (gamma_a ** 2 * var_a + var_eps_d) - C2_D = (np.mean(np.square(rr_long)) - np.mean(np.square(rr_short))) / np.mean(np.square(rr_short)) - return np.square(C2_D / (1 + C2_D) - cf_d) - - gamma_a = minimize_scalar(f_m).x - m_long = m_short + gamma_a * a - d = m_long + eps_d - - # short and long version of g - g_partial_reg = 210 + 27.4 * z[:, 0] + 13.7 * (z[:, 1] + z[:, 2] + z[:, 3]) - - var_d = np.var(d) - - def f_g(beta_a): - g_diff = beta_a * (a - gamma_a * (var_a / var_d) * d) - y_diff = eps_y + g_diff - return np.square(np.mean(np.square(g_diff)) / np.mean(np.square(y_diff)) - cf_y) - - beta_a = minimize_scalar(f_g).x - - g_long = theta * d + g_partial_reg + beta_a * a - g_short = (theta + gamma_a * beta_a * var_a / var_d) * d + g_partial_reg - - y = g_long + eps_y - - oracle_values = {'g_long': g_long, - 'g_short': g_short, - 'm_long': m_long, - 'm_short': m_short, - 'theta': theta, - 'gamma_a': gamma_a, - 'beta_a': beta_a, - 'a': a, - 'z': z} - - res_dict = {'x': x, - 'y': y, - 'd': d, - 'oracle_values': oracle_values} - - return res_dict - - -def make_heterogeneous_data(n_obs=200, p=30, support_size=5, n_x=1, binary_treatment=False): - """ - Creates a simple synthetic example for heterogeneous treatment effects. - The data generating process is based on the Monte Carlo simulation from Oprescu et al. (2019). - - The data is generated as - - .. math:: - - Y_i & = \\theta_0(X_i)D_i + \\langle X_i,\\gamma_0\\rangle + \\epsilon_i - - D_i & = \\langle X_i,\\beta_0\\rangle + \\eta_i, - - where :math:`X_i\\sim\\mathcal{U}[0,1]^{p}` and :math:`\\epsilon_i,\\eta_i - \\sim\\mathcal{U}[-1,1]`. - If the treatment is set to be binary, the treatment is generated as - - .. math:: - D_i = 1\\{\\langle X_i,\\beta_0\\rangle \\ge \\eta_i\\}. - - The coefficient vectors :math:`\\gamma_0` and :math:`\\beta_0` both have small random (identical) support - which values are drawn independently from :math:`\\mathcal{U}[0,1]` and :math:`\\mathcal{U}[0,0.3]`. - Further, :math:`\\theta_0(x)` defines the conditional treatment effect, which is defined differently depending - on the dimension of :math:`x`. - - If the heterogeneity is univariate the conditional treatment effect takes the following form - - .. math:: - \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_0), - - whereas for the two-dimensional case the conditional treatment effect is defined as - - .. 
math:: - \\theta_0(x) = \\exp(2x_0) + 3\\sin(4x_1). - - Parameters - ---------- - n_obs : int - Number of observations to simulate. - Default is ``200``. - - p : int - Dimension of covariates. - Default is ``30``. - - support_size : int - Number of relevant (confounding) covariates. - Default is ``5``. - - n_x : int - Dimension of the heterogeneity. Can be either ``1`` or ``2``. - Default is ``1``. - - binary_treatment : bool - Indicates whether the treatment is binary. - Default is ``False``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``data``, ``effects``, ``treatment_effect``. - - """ - # simple input checks - assert n_x in [1, 2], 'n_x must be either 1 or 2.' - assert support_size <= p, 'support_size must be smaller than p.' - assert isinstance(binary_treatment, bool), 'binary_treatment must be a boolean.' - - # define treatment effects - if n_x == 1: - def treatment_effect(x): - return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 0]) - else: - assert n_x == 2 - - # redefine treatment effect - def treatment_effect(x): - return np.exp(2 * x[:, 0]) + 3 * np.sin(4 * x[:, 1]) - - # Outcome support and coefficients - support_y = np.random.choice(np.arange(p), size=support_size, replace=False) - coefs_y = np.random.uniform(0, 1, size=support_size) - # treatment support and coefficients - support_d = support_y - coefs_d = np.random.uniform(0, 0.3, size=support_size) - - # noise - epsilon = np.random.uniform(-1, 1, size=n_obs) - eta = np.random.uniform(-1, 1, size=n_obs) - - # Generate controls, covariates, treatments and outcomes - x = np.random.uniform(0, 1, size=(n_obs, p)) - # Heterogeneous treatment effects - te = treatment_effect(x) - if binary_treatment: - d = 1.0 * (np.dot(x[:, support_d], coefs_d) >= eta) - else: - d = np.dot(x[:, support_d], coefs_d) + eta - y = te * d + np.dot(x[:, support_y], coefs_y) + epsilon - - # Now we build the dataset - y_df = pd.DataFrame({'y': y}) - d_df = pd.DataFrame({'d': d}) - x_df = pd.DataFrame( - data=x, - index=np.arange(x.shape[0]), - columns=[f'X_{i}' for i in range(x.shape[1])] - ) - - data = pd.concat([y_df, d_df, x_df], axis=1) - res_dict = { - 'data': data, - 'effects': te, - 'treatment_effect': treatment_effect} - return res_dict - - -def make_ssm_data(n_obs=8000, dim_x=100, theta=1, mar=True, return_type='DoubleMLData'): - """ - Generates data from a sample selection model (SSM). - The data generating process is defined as - - .. math:: - - y_i &= \\theta d_i + x_i' \\beta d_i + u_i, - - s_i &= 1\\left\\lbrace d_i + \\gamma z_i + x_i' \\beta + v_i > 0 \\right\\rbrace, - - d_i &= 1\\left\\lbrace x_i' \\beta + w_i > 0 \\right\\rbrace, - - with Y being observed if :math:`s_i = 1` and covariates :math:`x_i \\sim \\mathcal{N}(0, \\Sigma^2_x)`, where - :math:`\\Sigma^2_x` is a matrix with entries - :math:`\\Sigma_{kj} = 0.5^{|j-k|}`. - :math:`\\beta` is a `dim_x`-vector with entries :math:`\\beta_j=\\frac{0.4}{j^2}` - :math:`z_i \\sim \\mathcal{N}(0, 1)`, - :math:`(u_i,v_i) \\sim \\mathcal{N}(0, \\Sigma^2_{u,v})`, - :math:`w_i \\sim \\mathcal{N}(0, 1)`. - - - The data generating process is inspired by a process used in the simulation study (see Appendix E) of Bia, - Huber and Lafférs (2023). - - Parameters - ---------- - n_obs : - The number of observations to simulate. - dim_x : - The number of covariates. - theta : - The value of the causal parameter. - mar: - Boolean. Indicates whether missingness at random holds. - return_type : - If ``'DoubleMLData'`` or ``DoubleMLData``, returns a ``DoubleMLData`` object. 
- - If ``'DataFrame'``, ``'pd.DataFrame'`` or ``pd.DataFrame``, returns a ``pd.DataFrame``. - - If ``'array'``, ``'np.ndarray'``, ``'np.array'`` or ``np.ndarray``, returns ``np.ndarray``'s ``(x, y, d, z, s)``. - - References - ---------- - Michela Bia, Martin Huber & Lukáš Lafférs (2023) Double Machine Learning for Sample Selection Models, - Journal of Business & Economic Statistics, DOI: 10.1080/07350015.2023.2271071 - """ - if mar: - sigma = np.array([[1, 0], [0, 1]]) - gamma = 0 - else: - sigma = np.array([[1, 0.8], [0.8, 1]]) - gamma = 1 - - e = np.random.multivariate_normal(mean=[0, 0], cov=sigma, size=n_obs).T - - cov_mat = toeplitz([np.power(0.5, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - beta = [0.4 / (k ** 2) for k in range(1, dim_x + 1)] - - d = np.where(np.dot(x, beta) + np.random.randn(n_obs) > 0, 1, 0) - z = np.random.randn(n_obs) - s = np.where(np.dot(x, beta) + d + gamma * z + e[0] > 0, 1, 0) - - y = np.dot(x, beta) + theta * d + e[1] - y[s == 0] = 0 - - if return_type in _array_alias: - return x, y, d, z, s - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - if mar: - data = pd.DataFrame(np.column_stack((x, y, d, s)), - columns=x_cols + ['y', 'd', 's']) - else: - data = pd.DataFrame(np.column_stack((x, y, d, z, s)), - columns=x_cols + ['y', 'd', 'z', 's']) - if return_type in _data_frame_alias: - return data - else: - if mar: - return DoubleMLData(data, 'y', 'd', x_cols, None, None, 's') - return DoubleMLData(data, 'y', 'd', x_cols, 'z', None, 's') - else: - raise ValueError('Invalid return_type.') - - -def make_irm_data_discrete_treatments(n_obs=200, n_levels=3, linear=False, random_state=None, **kwargs): - """ - Generates data from a interactive regression (IRM) model with multiple treatment levels (based on an - underlying continous treatment). - - The data generating process is defined as follows (similar to the Monte Carlo simulation used - in Sant'Anna and Zhao (2020)). - - Let :math:`X= (X_1, X_2, X_3, X_4, X_5)^T \\sim \\mathcal{N}(0, \\Sigma)`, where :math:`\\Sigma` corresponds - to the identity matrix. - Further, define :math:`Z_j = (\\tilde{Z_j} - \\mathbb{E}[\\tilde{Z}_j]) / \\sqrt{\\text{Var}(\\tilde{Z}_j)}`, - where - - .. math:: - - \\tilde{Z}_1 &= \\exp(0.5 \\cdot X_1) - - \\tilde{Z}_2 &= 10 + X_2/(1 + \\exp(X_1)) - - \\tilde{Z}_3 &= (0.6 + X_1 \\cdot X_3 / 25)^3 - - \\tilde{Z}_4 &= (20 + X_2 + X_4)^2 - - \\tilde{Z}_5 &= X_5. - - A continuous treatment :math:`D_{\\text{cont}}` is generated as - - .. math:: - - D_{\\text{cont}} = \\xi (-Z_1 + 0.5 Z_2 - 0.25 Z_3 - 0.1 Z_4) + \\varepsilon_D, - - where :math:`\\varepsilon_D \\sim \\mathcal{N}(0,1)` and :math:`\\xi=0.3`. The corresponding treatment - effect is defined as - - .. math:: - - \\theta (d) = 0.1 \\exp(d) + 10 \\sin(0.7 d) + 2 d - 0.2 d^2. - - Based on the continous treatment, a discrete treatment :math:`D` is generated as with a baseline level of - :math:`D=0` and additional levels based on the quantiles of :math:`D_{\\text{cont}}`. The number of levels - is defined by :math:`n_{\\text{levels}}`. Each level is chosen to have the same probability of being selected. - - The potential outcomes are defined as - - .. math:: - - Y(0) &= 210 + 27.4 Z_1 + 13.7 (Z_2 + Z_3 + Z_4) + \\varepsilon_Y - - Y(1) &= \\theta (D_{\\text{cont}}) 1\\{D_{\\text{cont}} > 0\\} + Y(0), - - where :math:`\\varepsilon_Y \\sim \\mathcal{N}(0,5)`. Further, the observed outcome is defined as - - .. 
math:: - - Y = Y(1) 1\\{D > 0\\} + Y(0) 1\\{D = 0\\}. - - The data is returned as a dictionary with the entries ``x``, ``y``, ``d`` and ``oracle_values``. - - Parameters - ---------- - n_obs : int - The number of observations to simulate. - Default is ``200``. - - n_levels : int - The number of treatment levels. - Default is ``3``. - - linear : bool - Indicates whether the true underlying regression is linear. - Default is ``False``. - - random_state : int - Random seed for reproducibility. - Default is ``42``. - - Returns - ------- - res_dict : dictionary - Dictionary with entries ``x``, ``y``, ``d`` and ``oracle_values``. - The oracle values contain the continuous treatment, the level bounds, the potential level, ITE - and the potential outcome without treatment. - - """ - if random_state is not None: - np.random.seed(random_state) - xi = kwargs.get('xi', 0.3) - c = kwargs.get('c', 0.0) - dim_x = kwargs.get('dim_x', 5) - - if not isinstance(n_levels, int): - raise ValueError('n_levels must be an integer.') - if n_levels < 2: - raise ValueError('n_levels must be at least 2.') - - # observed covariates - cov_mat = toeplitz([np.power(c, k) for k in range(dim_x)]) - x = np.random.multivariate_normal(np.zeros(dim_x), cov_mat, size=[n_obs, ]) - - def f_reg(w): - res = 210 + 27.4 * w[:, 0] + 13.7 * (w[:, 1] + w[:, 2] + w[:, 3]) - return res - - def f_treatment(w, xi): - res = xi * (-w[:, 0] + 0.5 * w[:, 1] - 0.25 * w[:, 2] - 0.1 * w[:, 3]) - return res - - def treatment_effect(d, scale=15): - return scale * (1 / (1 + np.exp(-d - 1.2 * np.cos(d)))) - 2 - - z_tilde_1 = np.exp(0.5 * x[:, 0]) - z_tilde_2 = 10 + x[:, 1] / (1 + np.exp(x[:, 0])) - z_tilde_3 = (0.6 + x[:, 0] * x[:, 2] / 25) ** 3 - z_tilde_4 = (20 + x[:, 1] + x[:, 3]) ** 2 - - z_tilde = np.column_stack((z_tilde_1, z_tilde_2, z_tilde_3, z_tilde_4, x[:, 4:])) - z = (z_tilde - np.mean(z_tilde, axis=0)) / np.std(z_tilde, axis=0) - - # error terms - var_eps_y = 5 - eps_y = np.random.normal(loc=0, scale=np.sqrt(var_eps_y), size=n_obs) - var_eps_d = 1 - eps_d = np.random.normal(loc=0, scale=np.sqrt(var_eps_d), size=n_obs) - - if linear: - g = f_reg(x) - m = f_treatment(x, xi) - else: - assert not linear - g = f_reg(z) - m = f_treatment(z, xi) - - cont_d = m + eps_d - level_bounds = np.quantile(cont_d, q=np.linspace(0, 1, n_levels + 1)) - potential_level = sum([1.0 * (cont_d >= bound) for bound in level_bounds[1:-1]]) + 1 - eta = np.random.uniform(0, 1, size=n_obs) - d = 1.0 * (eta >= 1 / n_levels) * potential_level - - ite = treatment_effect(cont_d) - y0 = g + eps_y - # only treated for d > 0 compared to the baseline - y = ite * (d > 0) + y0 - - oracle_values = { - 'cont_d': cont_d, - 'level_bounds': level_bounds, - 'potential_level': potential_level, - 'ite': ite, - 'y0': y0, - } - - resul_dict = { - 'x': x, - 'y': y, - 'd': d, - 'oracle_values': oracle_values - } - - return resul_dict - - -def make_logistic_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, treatment="continuous", **kwargs): - """ - Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), - designed for use in double/debiased machine learning applications. - - The data generating process is defined as follows: - - - Covariates \( x_i \sim \mathcal{N}(0, \Sigma) \), where \( \Sigma_{kj} = 0.7^{|j-k|} \). - - Treatment \( d_i = a_0(x_i) \). - - Propensity score \( p_i = \sigma(\alpha d_i + r_0(x_i)) \), where \( \sigma(\cdot) \) is the logistic function. 
- - Outcome \( y_i \sim \text{Bernoulli}(p_i) \). - - The nuisance functions are defined as: - - .. math:: - - a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\ - &+ 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2 x_{i,7} x_{i,8} - 0.2 x_{i,9} x_{i,10} \\ - - r_0(x_i) &= 0.1 x_{i,1} x_{i,2} x_{i,3} + 0.1 x_{i,4} x_{i,5} + 0.1 x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ - &+ 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ - &+ 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) - - Parameters - ---------- - n_obs : int - Number of observations to simulate. - dim_x : int - Number of covariates. - alpha : float - Value of the causal parameter. - return_type : str - Determines the return format. One of: - - - 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object. - - 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``. - - 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p). - - **kwargs - Optional keyword arguments (currently unused in this implementation). - - Returns - ------- - Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]] - The generated data in the specified format. - - References - ---------- - Liu, Molei, Yi Zhang, and Doudou Zhou. 2021. - "Double/Debiased Machine Learning for Logistic Partially Linear Model." - The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019. - - """ - - if balanced_r0: - def r_0(X): - return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ - 0.1 * X[:, 3] * X[:, 4] + \ - 0.1 * X[:, 5] ** 3 + \ - -0.5 * np.sin(X[:, 6]) ** 2 + \ - 0.5 * np.cos(X[:, 7]) + \ - 1 / (1 + X[:, 8] ** 2) + \ - -1 / (1 + np.exp(X[:, 9])) + \ - 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ - -0.25 * np.where(X[:, 12] > 0, 1, 0) - else: - def r_0(X): - return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ - 0.1 * X[:, 3] * X[:, 4] + \ - 0.1 * X[:, 5] ** 3 + \ - -0.5 * np.sin(X[:, 6]) ** 2 + \ - 0.5 * np.cos(X[:, 7]) + \ - 4 / (1 + X[:, 8] ** 2) + \ - -1 / (1 + np.exp(X[:, 9])) + \ - 1.5 * np.where(X[:, 10] > 0, 1, 0) + \ - -0.25 * np.where(X[:, 12] > 0, 1, 0) - - def a_0(X): - return 2 / (1 + np.exp(X[:, 0])) + \ - -2 / (1 + np.exp(X[:, 1])) + \ - 1 * np.sin(X[:, 2]) + \ - 1 * np.cos(X[:, 3]) + \ - 0.5 * np.where(X[:, 4] > 0, 1, 0) + \ - -0.5 * np.where(X[:, 5] > 0, 1, 0) + \ - 0.2 * X[:, 6] * X[:, 7] + \ - -0.2 * X[:, 8] * X[:, 9] - - - sigma = np.full((dim_x, dim_x), 0.2) - np.fill_diagonal(sigma, 1) - - x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs) - np.clip(x, -2, 2, out=x) - - if treatment == "continuous": - d = a_0(x) - elif treatment == "binary": - d_cont = a_0(x) - d = np.random.binomial(1, expit(d_cont - d_cont.mean())) - elif treatment == "binary_unbalanced": - d_cont = a_0(x) - d = np.random.binomial(1, expit(d_cont)) - - p = expit(alpha * d[:] + r_0(x)) - - y = np.random.binomial(1, p) - - if return_type in _array_alias: - return x, y, d, p - elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, p)), - columns=x_cols + ['y', 'd', 'p']) - if return_type in _data_frame_alias: - return data - else: - return DoubleMLData(data, 'y', 'd', x_cols, p_cols='p') - else: - raise ValueError('Invalid return_type.') diff --git a/doubleml/double_ml.py b/doubleml/double_ml.py index 1cc6bcf9..05481bf1 100644 --- 
a/doubleml/double_ml.py +++ b/doubleml/double_ml.py @@ -585,12 +585,6 @@ def fit(self, n_jobs_cv=None, store_predictions=True, external_predictions=None, # construct framework for inference self._framework = self.construct_framework() - - - - - - return self def construct_framework(self): diff --git a/doubleml/double_ml_data.py b/doubleml/double_ml_data.py deleted file mode 100644 index 612e6b7f..00000000 --- a/doubleml/double_ml_data.py +++ /dev/null @@ -1,1104 +0,0 @@ -import numpy as np -import pandas as pd -import io - -from abc import ABC, abstractmethod - -from sklearn.utils.validation import check_array, column_or_1d, check_consistent_length -from sklearn.utils import assert_all_finite -from sklearn.utils.multiclass import type_of_target -from .utils._estimation import _assure_2d_array -from .utils._checks import _check_set - - -class DoubleMLBaseData(ABC): - """Base Class Double machine learning data-backends - """ - def __init__(self, - data): - if not isinstance(data, pd.DataFrame): - raise TypeError('data must be of pd.DataFrame type. ' - f'{str(data)} of type {str(type(data))} was passed.') - if not data.columns.is_unique: - raise ValueError('Invalid pd.DataFrame: ' - 'Contains duplicate column names.') - self._data = data - - def __str__(self): - data_summary = self._data_summary_str() - buf = io.StringIO() - self.data.info(verbose=False, buf=buf) - df_info = buf.getvalue() - res = '================== DoubleMLBaseData Object ==================\n' + \ - '\n------------------ Data summary ------------------\n' + data_summary + \ - '\n------------------ DataFrame info ------------------\n' + df_info - return res - - def _data_summary_str(self): - data_summary = f'No. Observations: {self.n_obs}\n' - return data_summary - - @property - def data(self): - """ - The data. - """ - return self._data - - @property - def all_variables(self): - """ - All variables available in the dataset. - """ - return self.data.columns - - @property - def n_obs(self): - """ - The number of observations. - """ - return self.data.shape[0] - - # TODO: This and the following property does not make sense but the base class DoubleML needs it (especially for the - # multiple treatment variables case) and other things are also build around it, see for example DoubleML._params - @property - def d_cols(self): - return ['theta'] - - @property - def n_treat(self): - """ - The number of treatment variables. - """ - return 1 - - @property - @abstractmethod - def n_coefs(self): - pass - - -class DoubleMLData(DoubleMLBaseData): - """Double machine learning data-backend. - - :class:`DoubleMLData` objects can be initialized from - :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. - - Parameters - ---------- - data : :class:`pandas.DataFrame` - The data. - - y_col : str - The outcome variable. - - d_cols : str or list - The treatment variable(s). - - x_cols : None, str or list - The covariates. - If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor - treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates. - Default is ``None``. - - z_cols : None, str or list - The instrumental variable(s). - Default is ``None``. - - t_col : None or str - The time variable (only relevant/used for DiD Estimators). - Default is ``None``. - - s_col : None or str - The score or selection variable (only relevant/used for RDD or SSM Estimatiors). - Default is ``None``. 
- - p_cols : None, str or list, optional - The column(s) containing the probabilities of the outcome (only for simulated, binary data). - Default is ``None``. - - use_other_treat_as_covariate : bool - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - Default is ``True``. - - force_all_x_finite : bool or str - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are - allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). - Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used - for the nuisance functions are capable to provide valid predictions with missings and / or infinite values - in the covariates ``x``. - Default is ``True``. - - Examples - -------- - >>> from doubleml import DoubleMLData - >>> from doubleml.datasets import make_plr_CCDDHNR2018 - >>> # initialization from pandas.DataFrame - >>> df = make_plr_CCDDHNR2018(return_type='DataFrame') - >>> obj_dml_data_from_df = DoubleMLData(df, 'y', 'd') - >>> # initialization from np.ndarray - >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array') - >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d) - """ - def __init__(self, - data, - y_col, - d_cols, - x_cols=None, - z_cols=None, - t_col=None, - s_col=None, - p_cols=None, - use_other_treat_as_covariate=True, - force_all_x_finite=True): - DoubleMLBaseData.__init__(self, data) - - self.y_col = y_col - self.d_cols = d_cols - self.z_cols = z_cols - self.t_col = t_col - self.s_col = s_col - self.x_cols = x_cols - self.p_cols = p_cols - self._check_disjoint_sets_y_d_x_z_t_s() - self.use_other_treat_as_covariate = use_other_treat_as_covariate - self.force_all_x_finite = force_all_x_finite - self._binary_treats = self._check_binary_treats() - self._binary_outcome = self._check_binary_outcome() - self._set_y_z_t_s() - # by default, we initialize to the first treatment variable - self.set_x_d(self.d_cols[0]) - - def __str__(self): - data_summary = self._data_summary_str() - buf = io.StringIO() - self.data.info(verbose=False, buf=buf) - df_info = buf.getvalue() - res = '================== DoubleMLData Object ==================\n' + \ - '\n------------------ Data summary ------------------\n' + data_summary + \ - '\n------------------ DataFrame info ------------------\n' + df_info - return res - - def _data_summary_str(self): - data_summary = f'Outcome variable: {self.y_col}\n' \ - f'Treatment variable(s): {self.d_cols}\n' \ - f'Covariates: {self.x_cols}\n' \ - f'Instrument variable(s): {self.z_cols}\n' - if self.t_col is not None: - data_summary += f'Time variable: {self.t_col}\n' - if self.s_col is not None: - data_summary += f'Score/Selection variable: {self.s_col}\n' - data_summary += f'No. Observations: {self.n_obs}\n' - return data_summary - - @classmethod - def from_arrays(cls, x, y, d, z=None, t=None, s=None, p=None, use_other_treat_as_covariate=True, - force_all_x_finite=True): - """ - Initialize :class:`DoubleMLData` from :class:`numpy.ndarray`'s. - - Parameters - ---------- - x : :class:`numpy.ndarray` - Array of covariates. - - y : :class:`numpy.ndarray` - Array of the outcome variable. - - d : :class:`numpy.ndarray` - Array of treatment variables. - - z : None or :class:`numpy.ndarray` - Array of instrumental variables. - Default is ``None``. 
- - t : :class:`numpy.ndarray` - Array of the time variable (only relevant/used for DiD models). - Default is ``None``. - - s : :class:`numpy.ndarray` - Array of the score or selection variable (only relevant/used for RDD and SSM models). - Default is ``None``. - - p : None or :class:`numpy.ndarray` - Array of the probabilities of the outcome (only for simulated, binary data). - Default is ``None``. - - use_other_treat_as_covariate : bool - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - Default is ``True``. - - force_all_x_finite : bool or str - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are - allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). - Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used - for the nuisance functions are capable to provide valid predictions with missings and / or infinite values - in the covariates ``x``. - Default is ``True``. - - Examples - -------- - >>> from doubleml import DoubleMLData - >>> from doubleml.datasets import make_plr_CCDDHNR2018 - >>> (x, y, d) = make_plr_CCDDHNR2018(return_type='array') - >>> obj_dml_data_from_array = DoubleMLData.from_arrays(x, y, d) - """ - if isinstance(force_all_x_finite, str): - if force_all_x_finite != 'allow-nan': - raise ValueError("Invalid force_all_x_finite " + force_all_x_finite + ". " + - "force_all_x_finite must be True, False or 'allow-nan'.") - elif not isinstance(force_all_x_finite, bool): - raise TypeError("Invalid force_all_x_finite. " + - "force_all_x_finite must be True, False or 'allow-nan'.") - - x = check_array(x, ensure_2d=False, allow_nd=False, - force_all_finite=force_all_x_finite) - d = check_array(d, ensure_2d=False, allow_nd=False) - y = column_or_1d(y, warn=True) - - x = _assure_2d_array(x) - d = _assure_2d_array(d) - - y_col = 'y' - if z is None: - check_consistent_length(x, y, d) - z_cols = None - else: - z = check_array(z, ensure_2d=False, allow_nd=False) - z = _assure_2d_array(z) - check_consistent_length(x, y, d, z) - if z.shape[1] == 1: - z_cols = ['z'] - else: - z_cols = [f'z{i + 1}' for i in np.arange(z.shape[1])] - - if t is None: - t_col = None - else: - t = column_or_1d(t, warn=True) - check_consistent_length(x, y, d, t) - t_col = 't' - - if s is None: - s_col = None - else: - s = column_or_1d(s, warn=True) - check_consistent_length(x, y, d, s) - s_col = 's' - - - if p is None: - p_cols = None - else: - if p.shape[1] == 1: - p_cols = ['p'] - else: - p_cols = [f'p{i + 1}' for i in np.arange(p.shape[1])] - - if d.shape[1] == 1: - d_cols = ['d'] - else: - d_cols = [f'd{i+1}' for i in np.arange(d.shape[1])] - - x_cols = [f'X{i+1}' for i in np.arange(x.shape[1])] - - # basline version with features, outcome and treatments - data = pd.DataFrame(np.column_stack((x, y, d)), - columns=x_cols + [y_col] + d_cols) - - if z is not None: - df_z = pd.DataFrame(z, columns=z_cols) - data = pd.concat([data, df_z], axis=1) - - if t is not None: - data[t_col] = t - - if s is not None: - data[s_col] = s - - if p is not None: - data[p_cols] = p - - return cls(data, y_col, d_cols, x_cols, z_cols, t_col, s_col, p_cols, use_other_treat_as_covariate, force_all_x_finite) - - @property - def x(self): - """ - Array of covariates; - Dynamic! 
May depend on the currently set treatment variable; - To get an array of all covariates (independent of the currently set treatment variable) - call ``obj.data[obj.x_cols].values``. - """ - return self._X.values - - @property - def y(self): - """ - Array of outcome variable. - """ - return self._y.values - - @property - def d(self): - """ - Array of treatment variable; - Dynamic! Depends on the currently set treatment variable; - To get an array of all treatment variables (independent of the currently set treatment variable) - call ``obj.data[obj.d_cols].values``. - """ - return self._d.values - - @property - def z(self): - """ - Array of instrumental variables. - """ - if self.z_cols is not None: - return self._z.values - else: - return None - - @property - def t(self): - """ - Array of time variable. - """ - if self.t_col is not None: - return self._t.values - else: - return None - - @property - def s(self): - """ - Array of score or selection variable. - """ - if self.s_col is not None: - return self._s.values - else: - return None - - @property - def p_cols(self): - """ - The column(s) containing the probabilities of the outcome (only for simulated data). - """ - return self._p_cols - - @p_cols.setter - def p_cols(self, value): - if value is not None: - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError('The probability column(s) p_cols must be of str or list type (or None). ' - f'{str(value)} of type {str(type(value))} was passed.') - if not len(set(value)) == len(value): - raise ValueError('Invalid probability column(s) p_cols: ' - 'Contains duplicate values.') - if not set(value).issubset(set(self.all_variables)): - raise ValueError('Invalid probability column(s) p_cols. ' - 'At least one probability column is not a data column.') - self._p_cols = value - else: - self._p_cols = None - - @property - def p(self): - """ - Array of probabilities of the outcome (only for simulated data). - """ - if self.p_cols is not None: - return self._p.values - else: - return None - - @property - def n_treat(self): - """ - The number of treatment variables. - """ - return len(self.d_cols) - - @property - def n_coefs(self): - """ - The number of coefficients to be estimated. - """ - return self.n_treat - - @property - def n_instr(self): - """ - The number of instruments. - """ - if self.z_cols is not None: - n_instr = len(self.z_cols) - else: - n_instr = 0 - return n_instr - - @property - def binary_treats(self): - """ - Series with logical(s) indicating whether the treatment variable(s) are binary with values 0 and 1. - """ - return self._binary_treats - - @property - def binary_outcome(self): - """ - Logical indicating whether the outcome variable is binary with values 0 and 1. - """ - return self._binary_outcome - - @property - def x_cols(self): - """ - The covariates. - """ - return self._x_cols - - @x_cols.setter - def x_cols(self, value): - reset_value = hasattr(self, '_x_cols') - if value is not None: - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError('The covariates x_cols must be of str or list type (or None). ' - f'{str(value)} of type {str(type(value))} was passed.') - if not len(set(value)) == len(value): - raise ValueError('Invalid covariates x_cols: ' - 'Contains duplicate values.') - if not set(value).issubset(set(self.all_variables)): - raise ValueError('Invalid covariates x_cols. 
' - 'At least one covariate is no data column.') - assert set(value).issubset(set(self.all_variables)) - self._x_cols = value - else: - excluded_cols = set.union({self.y_col}, set(self.d_cols)) - if (self.z_cols is not None): - excluded_cols = set.union(excluded_cols, set(self.z_cols)) - for col in [self.t_col, self.s_col]: - col = _check_set(col) - excluded_cols = set.union(excluded_cols, col) - self._x_cols = [col for col in self.data.columns if col not in excluded_cols] - if reset_value: - self._check_disjoint_sets() - # by default, we initialize to the first treatment variable - self.set_x_d(self.d_cols[0]) - - @property - def d_cols(self): - """ - The treatment variable(s). - """ - return self._d_cols - - @d_cols.setter - def d_cols(self, value): - reset_value = hasattr(self, '_d_cols') - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError('The treatment variable(s) d_cols must be of str or list type. ' - f'{str(value)} of type {str(type(value))} was passed.') - if not len(set(value)) == len(value): - raise ValueError('Invalid treatment variable(s) d_cols: ' - 'Contains duplicate values.') - if not set(value).issubset(set(self.all_variables)): - raise ValueError('Invalid treatment variable(s) d_cols. ' - 'At least one treatment variable is no data column.') - self._d_cols = value - if reset_value: - self._check_disjoint_sets() - # by default, we initialize to the first treatment variable - self.set_x_d(self.d_cols[0]) - - @property - def y_col(self): - """ - The outcome variable. - """ - return self._y_col - - @y_col.setter - def y_col(self, value): - reset_value = hasattr(self, '_y_col') - if not isinstance(value, str): - raise TypeError('The outcome variable y_col must be of str type. ' - f'{str(value)} of type {str(type(value))} was passed.') - if value not in self.all_variables: - raise ValueError('Invalid outcome variable y_col. ' - f'{value} is no data column.') - self._y_col = value - if reset_value: - self._check_disjoint_sets() - self._set_y_z_t_s() - - @property - def z_cols(self): - """ - The instrumental variable(s). - """ - return self._z_cols - - @z_cols.setter - def z_cols(self, value): - reset_value = hasattr(self, '_z_cols') - if value is not None: - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError('The instrumental variable(s) z_cols must be of str or list type (or None). ' - f'{str(value)} of type {str(type(value))} was passed.') - if not len(set(value)) == len(value): - raise ValueError('Invalid instrumental variable(s) z_cols: ' - 'Contains duplicate values.') - if not set(value).issubset(set(self.all_variables)): - raise ValueError('Invalid instrumental variable(s) z_cols. ' - 'At least one instrumental variable is no data column.') - self._z_cols = value - else: - self._z_cols = None - if reset_value: - self._check_disjoint_sets() - self._set_y_z_t_s() - - @property - def t_col(self): - """ - The time variable. - """ - return self._t_col - - @t_col.setter - def t_col(self, value): - reset_value = hasattr(self, '_t_col') - if value is not None: - if not isinstance(value, str): - raise TypeError('The time variable t_col must be of str type (or None). ' - f'{str(value)} of type {str(type(value))} was passed.') - if value not in self.all_variables: - raise ValueError('Invalid time variable t_col. 
' - f'{value} is no data column.') - self._t_col = value - else: - self._t_col = None - if reset_value: - self._check_disjoint_sets() - self._set_y_z_t_s() - - @property - def s_col(self): - """ - The score or selection variable. - """ - return self._s_col - - @s_col.setter - def s_col(self, value): - reset_value = hasattr(self, '_s_col') - if value is not None: - if not isinstance(value, str): - raise TypeError('The score or selection variable s_col must be of str type (or None). ' - f'{str(value)} of type {str(type(value))} was passed.') - if value not in self.all_variables: - raise ValueError('Invalid score or selection variable s_col. ' - f'{value} is no data column.') - self._s_col = value - else: - self._s_col = None - if reset_value: - self._check_disjoint_sets() - self._set_y_z_t_s() - - @property - def use_other_treat_as_covariate(self): - """ - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - """ - return self._use_other_treat_as_covariate - - @use_other_treat_as_covariate.setter - def use_other_treat_as_covariate(self, value): - reset_value = hasattr(self, '_use_other_treat_as_covariate') - if not isinstance(value, bool): - raise TypeError('use_other_treat_as_covariate must be True or False. ' - f'Got {str(value)}.') - self._use_other_treat_as_covariate = value - if reset_value: - # by default, we initialize to the first treatment variable - self.set_x_d(self.d_cols[0]) - - @property - def force_all_x_finite(self): - """ - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - """ - return self._force_all_x_finite - - @force_all_x_finite.setter - def force_all_x_finite(self, value): - reset_value = hasattr(self, '_force_all_x_finite') - if isinstance(value, str): - if value != 'allow-nan': - raise ValueError("Invalid force_all_x_finite " + value + ". " + - "force_all_x_finite must be True, False or 'allow-nan'.") - elif not isinstance(value, bool): - raise TypeError("Invalid force_all_x_finite. " + - "force_all_x_finite must be True, False or 'allow-nan'.") - self._force_all_x_finite = value - if reset_value: - # by default, we initialize to the first treatment variable - self.set_x_d(self.d_cols[0]) - - def _set_y_z_t_s(self): - assert_all_finite(self.data.loc[:, self.y_col]) - self._y = self.data.loc[:, self.y_col] - if self.z_cols is None: - self._z = None - else: - assert_all_finite(self.data.loc[:, self.z_cols]) - self._z = self.data.loc[:, self.z_cols] - - if self.t_col is None: - self._t = None - else: - assert_all_finite(self.data.loc[:, self.t_col]) - self._t = self.data.loc[:, self.t_col] - - if self.s_col is None: - self._s = None - else: - assert_all_finite(self.data.loc[:, self.s_col]) - self._s = self.data.loc[:, self.s_col] - - def set_x_d(self, treatment_var): - """ - Function that assigns the role for the treatment variables in the multiple-treatment case. - - Parameters - ---------- - treatment_var : str - Active treatment variable that will be set to d. - """ - if not isinstance(treatment_var, str): - raise TypeError('treatment_var must be of str type. ' - f'{str(treatment_var)} of type {str(type(treatment_var))} was passed.') - if treatment_var not in self.d_cols: - raise ValueError('Invalid treatment_var. 
' - f'{treatment_var} is not in d_cols.') - if self.use_other_treat_as_covariate: - # note that the following line needs to be adapted in case an intersection of x_cols and d_cols as allowed - # (see https://github.com/DoubleML/doubleml-for-py/issues/83) - xd_list = self.x_cols + self.d_cols - xd_list.remove(treatment_var) - else: - xd_list = self.x_cols - assert_all_finite(self.data.loc[:, treatment_var]) - if self.force_all_x_finite: - assert_all_finite(self.data.loc[:, xd_list], - allow_nan=self.force_all_x_finite == 'allow-nan') - self._d = self.data.loc[:, treatment_var] - self._X = self.data.loc[:, xd_list] - - def _check_binary_treats(self): - is_binary = pd.Series(dtype=bool, index=self.d_cols) - for treatment_var in self.d_cols: - this_d = self.data.loc[:, treatment_var] - binary_treat = (type_of_target(this_d) == 'binary') - zero_one_treat = np.all((np.power(this_d, 2) - this_d) == 0) - is_binary[treatment_var] = (binary_treat & zero_one_treat) - return is_binary - - def _check_binary_outcome(self): - y = self.data.loc[:, self.y_col] - binary_outcome = (type_of_target(y) == 'binary') - zero_one_outcome = np.all((np.power(y, 2) - y) == 0) - is_binary = (binary_outcome & zero_one_outcome) - return is_binary - - def _check_disjoint_sets(self): - # this function can be extended in inherited subclasses - self._check_disjoint_sets_y_d_x_z_t_s() - - def _check_disjoint_sets_y_d_x_z_t_s(self): - y_col_set = {self.y_col} - x_cols_set = set(self.x_cols) - d_cols_set = set(self.d_cols) - - if not y_col_set.isdisjoint(x_cols_set): - raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and covariate in ' - '``x_cols``.') - if not y_col_set.isdisjoint(d_cols_set): - raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and treatment variable in ' - '``d_cols``.') - # note that the line xd_list = self.x_cols + self.d_cols in method set_x_d needs adaption if an intersection of - # x_cols and d_cols as allowed (see https://github.com/DoubleML/doubleml-for-py/issues/83) - if not d_cols_set.isdisjoint(x_cols_set): - raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and as covariate' - '(``x_cols``). 
Consider using parameter ``use_other_treat_as_covariate``.') - - if self.z_cols is not None: - z_cols_set = set(self.z_cols) - if not y_col_set.isdisjoint(z_cols_set): - raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and instrumental ' - 'variable in ``z_cols``.') - if not d_cols_set.isdisjoint(z_cols_set): - raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and ' - 'instrumental variable in ``z_cols``.') - if not x_cols_set.isdisjoint(z_cols_set): - raise ValueError('At least one variable/column is set as covariate (``x_cols``) and instrumental ' - 'variable in ``z_cols``.') - - self._check_disjoint_sets_t_s() - - def _check_disjoint_sets_t_s(self): - y_col_set = {self.y_col} - x_cols_set = set(self.x_cols) - d_cols_set = set(self.d_cols) - - if self.t_col is not None: - t_col_set = {self.t_col} - if not t_col_set.isdisjoint(x_cols_set): - raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and covariate in ' - '``x_cols``.') - if not t_col_set.isdisjoint(d_cols_set): - raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and treatment variable in ' - '``d_cols``.') - if not t_col_set.isdisjoint(y_col_set): - raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and outcome variable ' - '``y_col``.') - if self.z_cols is not None: - z_cols_set = set(self.z_cols) - if not t_col_set.isdisjoint(z_cols_set): - raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and instrumental ' - 'variable in ``z_cols``.') - - if self.s_col is not None: - s_col_set = {self.s_col} - if not s_col_set.isdisjoint(x_cols_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and covariate in ' - '``x_cols``.') - if not s_col_set.isdisjoint(d_cols_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and treatment ' - 'variable in ``d_cols``.') - if not s_col_set.isdisjoint(y_col_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and outcome ' - 'variable ``y_col``.') - if self.z_cols is not None: - z_cols_set = set(self.z_cols) - if not s_col_set.isdisjoint(z_cols_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and ' - 'instrumental variable in ``z_cols``.') - if self.t_col is not None: - t_col_set = {self.t_col} - if not s_col_set.isdisjoint(t_col_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and time ' - 'variable ``t_col``.') - - -class DoubleMLClusterData(DoubleMLData): - """Double machine learning data-backend for data with cluster variables. - - :class:`DoubleMLClusterData` objects can be initialized from - :class:`pandas.DataFrame`'s as well as :class:`numpy.ndarray`'s. - - Parameters - ---------- - data : :class:`pandas.DataFrame` - The data. - - y_col : str - The outcome variable. - - d_cols : str or list - The treatment variable(s). - - cluster_cols : str or list - The cluster variable(s). - - x_cols : None, str or list - The covariates. - If ``None``, all variables (columns of ``data``) which are neither specified as outcome variable ``y_col``, nor - treatment variables ``d_cols``, nor instrumental variables ``z_cols`` are used as covariates. - Default is ``None``. - - z_cols : None, str or list - The instrumental variable(s). - Default is ``None``. 
- - t_col : None or str - The time variable (only relevant/used for DiD Estimators). - Default is ``None``. - - s_col : None or str - The score or selection variable (only relevant/used for RDD and SSM Estimatiors). - Default is ``None``. - - use_other_treat_as_covariate : bool - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - Default is ``True``. - - force_all_x_finite : bool or str - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are - allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). - Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used - for the nuisance functions are capable to provide valid predictions with missings and / or infinite values - in the covariates ``x``. - Default is ``True``. - - Examples - -------- - >>> from doubleml import DoubleMLClusterData - >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 - >>> # initialization from pandas.DataFrame - >>> df = make_pliv_multiway_cluster_CKMS2021(return_type='DataFrame') - >>> obj_dml_data_from_df = DoubleMLClusterData(df, 'Y', 'D', ['cluster_var_i', 'cluster_var_j'], z_cols='Z') - >>> # initialization from np.ndarray - >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') - >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) - """ - def __init__(self, - data, - y_col, - d_cols, - cluster_cols, - x_cols=None, - z_cols=None, - t_col=None, - s_col=None, - use_other_treat_as_covariate=True, - force_all_x_finite=True): - DoubleMLBaseData.__init__(self, data) - - # we need to set cluster_cols (needs _data) before call to the super __init__ because of the x_cols setter - self.cluster_cols = cluster_cols - self._set_cluster_vars() - DoubleMLData.__init__(self, - data, - y_col, - d_cols, - x_cols, - z_cols, - t_col, - s_col, - use_other_treat_as_covariate, - force_all_x_finite) - self._check_disjoint_sets_cluster_cols() - - def __str__(self): - data_summary = self._data_summary_str() - buf = io.StringIO() - self.data.info(verbose=False, buf=buf) - df_info = buf.getvalue() - res = '================== DoubleMLClusterData Object ==================\n' + \ - '\n------------------ Data summary ------------------\n' + data_summary + \ - '\n------------------ DataFrame info ------------------\n' + df_info - return res - - def _data_summary_str(self): - data_summary = f'Outcome variable: {self.y_col}\n' \ - f'Treatment variable(s): {self.d_cols}\n' \ - f'Cluster variable(s): {self.cluster_cols}\n' \ - f'Covariates: {self.x_cols}\n' \ - f'Instrument variable(s): {self.z_cols}\n' - if self.t_col is not None: - data_summary += f'Time variable: {self.t_col}\n' - if self.s_col is not None: - data_summary += f'Score/Selection variable: {self.s_col}\n' - - data_summary += f'No. Observations: {self.n_obs}\n' - return data_summary - - @classmethod - def from_arrays(cls, x, y, d, cluster_vars, z=None, t=None, s=None, use_other_treat_as_covariate=True, - force_all_x_finite=True): - """ - Initialize :class:`DoubleMLClusterData` from :class:`numpy.ndarray`'s. - - Parameters - ---------- - x : :class:`numpy.ndarray` - Array of covariates. - - y : :class:`numpy.ndarray` - Array of the outcome variable. 
- - d : :class:`numpy.ndarray` - Array of treatment variables. - - cluster_vars : :class:`numpy.ndarray` - Array of cluster variables. - - z : None or :class:`numpy.ndarray` - Array of instrumental variables. - Default is ``None``. - - t : :class:`numpy.ndarray` - Array of the time variable (only relevant/used for DiD models). - Default is ``None``. - - s : :class:`numpy.ndarray` - Array of the score or selection variable (only relevant/used for RDD or SSM models). - Default is ``None``. - - use_other_treat_as_covariate : bool - Indicates whether in the multiple-treatment case the other treatment variables should be added as covariates. - Default is ``True``. - - force_all_x_finite : bool or str - Indicates whether to raise an error on infinite values and / or missings in the covariates ``x``. - Possible values are: ``True`` (neither missings ``np.nan``, ``pd.NA`` nor infinite values ``np.inf`` are - allowed), ``False`` (missings and infinite values are allowed), ``'allow-nan'`` (only missings are allowed). - Note that the choice ``False`` and ``'allow-nan'`` are only reasonable if the machine learning methods used - for the nuisance functions are capable to provide valid predictions with missings and / or infinite values - in the covariates ``x``. - Default is ``True``. - - Examples - -------- - >>> from doubleml import DoubleMLClusterData - >>> from doubleml.datasets import make_pliv_multiway_cluster_CKMS2021 - >>> (x, y, d, cluster_vars, z) = make_pliv_multiway_cluster_CKMS2021(return_type='array') - >>> obj_dml_data_from_array = DoubleMLClusterData.from_arrays(x, y, d, cluster_vars, z) - """ - dml_data = DoubleMLData.from_arrays(x, y, d, z, t, s, use_other_treat_as_covariate, force_all_x_finite) - cluster_vars = check_array(cluster_vars, ensure_2d=False, allow_nd=False) - cluster_vars = _assure_2d_array(cluster_vars) - if cluster_vars.shape[1] == 1: - cluster_cols = ['cluster_var'] - else: - cluster_cols = [f'cluster_var{i + 1}' for i in np.arange(cluster_vars.shape[1])] - - data = pd.concat((pd.DataFrame(cluster_vars, columns=cluster_cols), dml_data.data), axis=1) - - return (cls(data, dml_data.y_col, dml_data.d_cols, cluster_cols, - dml_data.x_cols, dml_data.z_cols, dml_data.t_col, dml_data.s_col, - dml_data.use_other_treat_as_covariate, dml_data.force_all_x_finite)) - - @property - def cluster_cols(self): - """ - The cluster variable(s). - """ - return self._cluster_cols - - @cluster_cols.setter - def cluster_cols(self, value): - reset_value = hasattr(self, '_cluster_cols') - if isinstance(value, str): - value = [value] - if not isinstance(value, list): - raise TypeError('The cluster variable(s) cluster_cols must be of str or list type. ' - f'{str(value)} of type {str(type(value))} was passed.') - if not len(set(value)) == len(value): - raise ValueError('Invalid cluster variable(s) cluster_cols: ' - 'Contains duplicate values.') - if not set(value).issubset(set(self.all_variables)): - raise ValueError('Invalid cluster variable(s) cluster_cols. ' - 'At least one cluster variable is no data column.') - self._cluster_cols = value - if reset_value: - self._check_disjoint_sets() - self._set_cluster_vars() - - @property - def n_cluster_vars(self): - """ - The number of cluster variables. - """ - return len(self.cluster_cols) - - @property - def cluster_vars(self): - """ - Array of cluster variable(s). 
- """ - return self._cluster_vars.values - - @DoubleMLData.x_cols.setter - def x_cols(self, value): - if value is not None: - # this call might become much easier with https://github.com/python/cpython/pull/26194 - super(self.__class__, self.__class__).x_cols.__set__(self, value) - else: - if self.s_col is None: - if (self.z_cols is not None) & (self.t_col is not None): - y_d_z_t = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_z_t] - elif self.z_cols is not None: - y_d_z = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_z] - elif self.t_col is not None: - y_d_t = set.union({self.y_col}, set(self.d_cols), {self.t_col}, set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_t] - else: - y_d = set.union({self.y_col}, set(self.d_cols), set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d] - else: - if (self.z_cols is not None) & (self.t_col is not None): - y_d_z_t_s = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.t_col}, {self.s_col}, - set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_z_t_s] - elif self.z_cols is not None: - y_d_z_s = set.union({self.y_col}, set(self.d_cols), set(self.z_cols), {self.s_col}, set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_z_s] - elif self.t_col is not None: - y_d_t_s = set.union({self.y_col}, set(self.d_cols), {self.t_col}, {self.s_col}, set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_t_s] - else: - y_d_s = set.union({self.y_col}, set(self.d_cols), {self.s_col}, set(self.cluster_cols)) - x_cols = [col for col in self.data.columns if col not in y_d_s] - # this call might become much easier with https://github.com/python/cpython/pull/26194 - super(self.__class__, self.__class__).x_cols.__set__(self, x_cols) - - def _check_disjoint_sets(self): - # apply the standard checks from the DoubleMLData class - super(DoubleMLClusterData, self)._check_disjoint_sets() - self._check_disjoint_sets_cluster_cols() - - def _check_disjoint_sets_cluster_cols(self): - # apply the standard checks from the DoubleMLData class - super(DoubleMLClusterData, self)._check_disjoint_sets() - - # special checks for the additional cluster variables - cluster_cols_set = set(self.cluster_cols) - y_col_set = {self.y_col} - x_cols_set = set(self.x_cols) - d_cols_set = set(self.d_cols) - t_col_set = {self.t_col} - s_col_set = {self.s_col} - - if not y_col_set.isdisjoint(cluster_cols_set): - raise ValueError(f'{str(self.y_col)} cannot be set as outcome variable ``y_col`` and cluster ' - 'variable in ``cluster_cols``.') - if not d_cols_set.isdisjoint(cluster_cols_set): - raise ValueError('At least one variable/column is set as treatment variable (``d_cols``) and ' - 'cluster variable in ``cluster_cols``.') - # TODO: Is the following combination allowed, or not? 
- if not x_cols_set.isdisjoint(cluster_cols_set): - raise ValueError('At least one variable/column is set as covariate (``x_cols``) and cluster ' - 'variable in ``cluster_cols``.') - if self.z_cols is not None: - z_cols_set = set(self.z_cols) - if not z_cols_set.isdisjoint(cluster_cols_set): - raise ValueError('At least one variable/column is set as instrumental variable (``z_cols``) and ' - 'cluster variable in ``cluster_cols``.') - if self.t_col is not None: - if not t_col_set.isdisjoint(cluster_cols_set): - raise ValueError(f'{str(self.t_col)} cannot be set as time variable ``t_col`` and ' - 'cluster variable in ``cluster_cols``.') - if self.s_col is not None: - if not s_col_set.isdisjoint(cluster_cols_set): - raise ValueError(f'{str(self.s_col)} cannot be set as score or selection variable ``s_col`` and ' - 'cluster variable in ``cluster_cols``.') - - def _set_cluster_vars(self): - assert_all_finite(self.data.loc[:, self.cluster_cols]) - self._cluster_vars = self.data.loc[:, self.cluster_cols] diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 8086322a..7f24fde5 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -187,22 +187,6 @@ def _draw_weights(method, n_rep_boot, n_obs): return weights -def _trimm(preds, trimming_rule, trimming_threshold): - if trimming_rule == 'truncate': - preds[preds < trimming_threshold] = trimming_threshold - preds[preds > 1 - trimming_threshold] = 1 - trimming_threshold - return preds - - -def _normalize_ipw(propensity, treatment): - mean_treat1 = np.mean(np.divide(treatment, propensity)) - mean_treat0 = np.mean(np.divide(1.0 - treatment, 1.0 - propensity)) - normalized_weights = np.multiply(treatment, np.multiply(propensity, mean_treat1)) \ - + np.multiply(1.0 - treatment, 1.0 - np.multiply(1.0 - propensity, mean_treat0)) - - return normalized_weights - - def _rmse(y_true, y_pred): subset = np.logical_not(np.isnan(y_true)) rmse = root_mean_squared_error(y_true[subset], y_pred[subset]) From 29114ce4ac7663618b5113285f660b86c46298fe Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Oct 2025 14:35:46 -0700 Subject: [PATCH 16/23] Ruff checks and formatting --- doubleml/__init__.py | 4 +- doubleml/double_ml_score_mixins.py | 31 +- doubleml/plm/datasets/dgp_lplr_LZZ2020.py | 98 ++-- doubleml/plm/lplr.py | 523 +++++++++++-------- doubleml/plm/tests/_utils_logistic_manual.py | 313 ----------- doubleml/plm/tests/_utils_lplr_manual.py | 1 - doubleml/plm/tests/test_lplr_exceptions.py | 6 +- doubleml/plm/tests/tests_logistic.py | 307 ----------- doubleml/utils/_estimation.py | 59 ++- doubleml/utils/resampling.py | 38 +- 10 files changed, 447 insertions(+), 933 deletions(-) delete mode 100644 doubleml/plm/tests/_utils_logistic_manual.py delete mode 100644 doubleml/plm/tests/tests_logistic.py diff --git a/doubleml/__init__.py b/doubleml/__init__.py index 7c8ead97..cb3891ba 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -13,11 +13,9 @@ from .irm.pq import DoubleMLPQ from .irm.qte import DoubleMLQTE from .irm.ssm import DoubleMLSSM -from doubleml.plm.lplr import DoubleMLLPLR - +from .plm.lplr import DoubleMLLPLR from .plm.pliv import DoubleMLPLIV from .plm.plr import DoubleMLPLR -from .logistic.logistic import DoubleMLLogit from .utils.blp import DoubleMLBLP from .utils.policytree import DoubleMLPolicyTree diff --git a/doubleml/double_ml_score_mixins.py b/doubleml/double_ml_score_mixins.py index b0c69c25..f1112db9 100644 --- a/doubleml/double_ml_score_mixins.py +++ 
b/doubleml/double_ml_score_mixins.py @@ -150,10 +150,12 @@ def score_deriv(theta): theta_hat = root_res.root if not root_res.converged: score_val = score(theta_hat) - msg = ('Could not find a root of the score function.\n ' - f'Flag: {root_res.flag}.\n' - f'Score value found is {score_val} ' - f'for parameter theta equal to {theta_hat}.') + msg = ( + "Could not find a root of the score function.\n " + f"Flag: {root_res.flag}.\n" + f"Score value found is {score_val} " + f"for parameter theta equal to {theta_hat}." + ) if self._error_on_convergence_failure: raise ValueError(msg) else: @@ -185,15 +187,16 @@ def score_squared(theta): else: score_val_sign = np.sign(score(alt_coef_start)) if score_val_sign > 0: - theta_hat_array, score_val, _ = fmin_l_bfgs_b( score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - msg = ('Could not find a root of the score function.\n ' - f'Minimum score value found is {score_val} ' - f'for parameter theta equal to {theta_hat}.\n ' - 'No theta found such that the score function evaluates to a negative value.') + msg = ( + "Could not find a root of the score function.\n " + f"Minimum score value found is {score_val} " + f"for parameter theta equal to {theta_hat}.\n " + "No theta found such that the score function evaluates to a negative value." + ) if self._error_on_convergence_failure: raise ValueError(msg) else: @@ -208,10 +211,12 @@ def neg_score(theta): neg_score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - msg = ('Could not find a root of the score function. ' - f'Maximum score value found is {-1*neg_score_val} ' - f'for parameter theta equal to {theta_hat}. ' - 'No theta found such that the score function evaluates to a positive value.') + msg = ( + "Could not find a root of the score function. " + f"Maximum score value found is {-1 * neg_score_val} " + f"for parameter theta equal to {theta_hat}. " + "No theta found such that the score function evaluates to a positive value." + ) if self._error_on_convergence_failure: raise ValueError(msg) else: diff --git a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py index 007e2b91..3d6d7127 100644 --- a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py +++ b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py @@ -9,28 +9,32 @@ _data_frame_alias = _get_data_frame_alias() _dml_data_alias = _get_dml_data_alias() -def make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData', balanced_r0=True, treatment="continuous", **kwargs): - """ + +def make_lplr_LZZ2020( + n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", balanced_r0=True, treatment="continuous", **kwargs +): + r""" Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), designed for use in double/debiased machine learning applications. The data generating process is defined as follows: - - Covariates \( x_i \sim \mathcal{N}(0, \Sigma) \), where \( \Sigma_{kj} = 0.7^{|j-k|} \). - - Treatment \( d_i = a_0(x_i) \). - - Propensity score \( p_i = \sigma(\alpha d_i + r_0(x_i)) \), where \( \sigma(\cdot) \) is the logistic function. - - Outcome \( y_i \sim \text{Bernoulli}(p_i) \). + - Covariates :math:`x_i \sim \mathcal{N}(0, \Sigma)`, where :math:`\Sigma_{kj} = 0.7^{|j-k|}`. + - Treatment :math:`d_i = a_0(x_i)`. + - Propensity score :math:`p_i = \sigma(\alpha d_i + r_0(x_i))`, where :math:`\sigma(\cdot)` is the logistic function. 
+ - Outcome :math:`y_i \sim \text{Bernoulli}(p_i)`. The nuisance functions are defined as: .. math:: - + \begin{aligned} a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\ - &+ 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2 x_{i,7} x_{i,8} - 0.2 x_{i,9} x_{i,10} \\ - - r_0(x_i) &= 0.1 x_{i,1} x_{i,2} x_{i,3} + 0.1 x_{i,4} x_{i,5} + 0.1 x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ - &+ 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ - &+ 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) + &\quad + 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2\, x_{i,7} x_{i,8} + - 0.2\, x_{i,9} x_{i,10} \\ + r_0(x_i) &= 0.1\, x_{i,1} x_{i,2} x_{i,3} + 0.1\, x_{i,4} x_{i,5} + 0.1\, x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ + &\quad + 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ + &\quad + 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) + \end{aligned} Parameters ---------- @@ -73,38 +77,45 @@ def make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5, return_type='DoubleMLData' """ if balanced_r0: + def r_0(X): - return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ - 0.1 * X[:, 3] * X[:, 4] + \ - 0.1 * X[:, 5] ** 3 + \ - -0.5 * np.sin(X[:, 6]) ** 2 + \ - 0.5 * np.cos(X[:, 7]) + \ - 1 / (1 + X[:, 8] ** 2) + \ - -1 / (1 + np.exp(X[:, 9])) + \ - 0.25 * np.where(X[:, 10] > 0, 1, 0) + \ - -0.25 * np.where(X[:, 12] > 0, 1, 0) + return ( + 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + + 0.1 * X[:, 3] * X[:, 4] + + 0.1 * X[:, 5] ** 3 + + -0.5 * np.sin(X[:, 6]) ** 2 + + 0.5 * np.cos(X[:, 7]) + + 1 / (1 + X[:, 8] ** 2) + + -1 / (1 + np.exp(X[:, 9])) + + 0.25 * np.where(X[:, 10] > 0, 1, 0) + + -0.25 * np.where(X[:, 12] > 0, 1, 0) + ) else: + def r_0(X): - return 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + \ - 0.1 * X[:, 3] * X[:, 4] + \ - 0.1 * X[:, 5] ** 3 + \ - -0.5 * np.sin(X[:, 6]) ** 2 + \ - 0.5 * np.cos(X[:, 7]) + \ - 4 / (1 + X[:, 8] ** 2) + \ - -1 / (1 + np.exp(X[:, 9])) + \ - 1.5 * np.where(X[:, 10] > 0, 1, 0) + \ - -0.25 * np.where(X[:, 12] > 0, 1, 0) + return ( + 0.1 * X[:, 0] * X[:, 1] * X[:, 2] + + 0.1 * X[:, 3] * X[:, 4] + + 0.1 * X[:, 5] ** 3 + + -0.5 * np.sin(X[:, 6]) ** 2 + + 0.5 * np.cos(X[:, 7]) + + 4 / (1 + X[:, 8] ** 2) + + -1 / (1 + np.exp(X[:, 9])) + + 1.5 * np.where(X[:, 10] > 0, 1, 0) + + -0.25 * np.where(X[:, 12] > 0, 1, 0) + ) def a_0(X): - return 2 / (1 + np.exp(X[:, 0])) + \ - -2 / (1 + np.exp(X[:, 1])) + \ - 1 * np.sin(X[:, 2]) + \ - 1 * np.cos(X[:, 3]) + \ - 0.5 * np.where(X[:, 4] > 0, 1, 0) + \ - -0.5 * np.where(X[:, 5] > 0, 1, 0) + \ - 0.2 * X[:, 6] * X[:, 7] + \ - -0.2 * X[:, 8] * X[:, 9] - + return ( + 2 / (1 + np.exp(X[:, 0])) + + -2 / (1 + np.exp(X[:, 1])) + + 1 * np.sin(X[:, 2]) + + 1 * np.cos(X[:, 3]) + + 0.5 * np.where(X[:, 4] > 0, 1, 0) + + -0.5 * np.where(X[:, 5] > 0, 1, 0) + + 0.2 * X[:, 6] * X[:, 7] + + -0.2 * X[:, 8] * X[:, 9] + ) sigma = np.full((dim_x, dim_x), 0.2) np.fill_diagonal(sigma, 1) @@ -128,12 +139,11 @@ def a_0(X): if return_type in _array_alias: return x, y, d, p elif return_type in _data_frame_alias + _dml_data_alias: - x_cols = [f'X{i + 1}' for i in np.arange(dim_x)] - data = pd.DataFrame(np.column_stack((x, y, d, p)), - columns=x_cols + ['y', 'd', 'p']) + x_cols = [f"X{i + 1}" for i in np.arange(dim_x)] + data = pd.DataFrame(np.column_stack((x, y, d, p)), columns=x_cols + ["y", "d", "p"]) if return_type in _data_frame_alias: return data else: - return DoubleMLData(data, 'y', 'd', 
x_cols) + return DoubleMLData(data, "y", "d", x_cols) else: - raise ValueError('Invalid return_type.') \ No newline at end of file + raise ValueError("Invalid return_type.") diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 1ed00810..edf17f08 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -1,34 +1,22 @@ import inspect import numpy as np - -from doubleml.utils._estimation import ( - _dml_cv_predict, - _trimm, - _predict_zero_one_propensity, - _cond_targets, - _get_bracket_guess, - _default_kde, - _normalize_ipw, - _dml_tune, - _solve_ipw_score, -) +import scipy from sklearn.base import clone from sklearn.utils import check_X_y -import scipy from sklearn.utils.multiclass import type_of_target from doubleml import DoubleMLData from doubleml.double_ml import DoubleML from doubleml.double_ml_score_mixins import NonLinearScoreMixin -from doubleml.utils import DoubleMLClusterResampling -from doubleml.utils._checks import _check_score, _check_finite_predictions, _check_is_propensity +from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score +from doubleml.utils._estimation import ( + _dml_cv_predict, + _dml_tune, +) from doubleml.utils.resampling import DoubleMLDoubleResampling - - - class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): """Double machine learning for partially logistic models (binary outcomes) @@ -89,24 +77,22 @@ class DoubleMLLPLR(NonLinearScoreMixin, DoubleML): The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates. """ - def __init__(self, - obj_dml_data, - ml_M, - ml_t, - ml_m, - ml_a=None, - n_folds=5, - n_folds_inner=5, - n_rep=1, - score='nuisance_space', - draw_sample_splitting=True, - error_on_convergence_failure=False,): + def __init__( + self, + obj_dml_data, + ml_M, + ml_t, + ml_m, + ml_a=None, + n_folds=5, + n_folds_inner=5, + n_rep=1, + score="nuisance_space", + draw_sample_splitting=True, + error_on_convergence_failure=False, + ): self.n_folds_inner = n_folds_inner - super().__init__(obj_dml_data, - n_folds, - n_rep, - score, - draw_sample_splitting) + super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting) # Ensure outcome only contains 0 and 1 (validate early in constructor) if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): @@ -117,208 +103,264 @@ def __init__(self, self._coef_start_val = 1.0 self._check_data(self._dml_data) - valid_scores = ['nuisance_space', 'instrument'] + valid_scores = ["nuisance_space", "instrument"] _check_score(self.score, valid_scores, allow_callable=False) - _ = self._check_learner(ml_t, 'ml_t', regressor=True, classifier=False) - _ = self._check_learner(ml_M, 'ml_M', regressor=False, classifier=True) + _ = self._check_learner(ml_t, "ml_t", regressor=True, classifier=False) + _ = self._check_learner(ml_M, "ml_M", regressor=False, classifier=True) if np.array_equal(np.unique(obj_dml_data.d), [0, 1]): - ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=False, classifier=True) + ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=False, classifier=True) else: - ml_m_is_classifier = self._check_learner(ml_m, 'ml_m', regressor=True, classifier=False) - self._learner = {'ml_m': ml_m, 'ml_t': ml_t, 'ml_M': ml_M} + ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=False) + self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M} if ml_a is not None: - ml_a_is_classifier = self._check_learner(ml_a, 'ml_a', regressor=True, classifier=True) - 
self._learner['ml_a'] = ml_a + ml_a_is_classifier = self._check_learner(ml_a, "ml_a", regressor=True, classifier=True) + self._learner["ml_a"] = ml_a self._ml_a_provided = True else: - self._learner['ml_a'] = clone(ml_m) + self._learner["ml_a"] = clone(ml_m) ml_a_is_classifier = ml_m_is_classifier self._ml_a_provided = False - self._predict_method = {'ml_t': 'predict', 'ml_M': 'predict_proba'} + self._predict_method = {"ml_t": "predict", "ml_M": "predict_proba"} if ml_m_is_classifier: if self._dml_data.binary_treats.all(): - self._predict_method['ml_m'] = 'predict_proba' + self._predict_method["ml_m"] = "predict_proba" else: - raise ValueError(f'The ml_m learner {str(ml_m)} was identified as classifier ' - 'but at least one treatment variable is not binary with values 0 and 1.') + raise ValueError( + f"The ml_m learner {str(ml_m)} was identified as classifier " + "but at least one treatment variable is not binary with values 0 and 1." + ) else: - self._predict_method['ml_m'] = 'predict' + self._predict_method["ml_m"] = "predict" if ml_a_is_classifier: if self._dml_data.binary_treats.all(): - self._predict_method['ml_a'] = 'predict_proba' + self._predict_method["ml_a"] = "predict_proba" else: - raise ValueError(f'The ml_a learner {str(ml_a)} was identified as classifier ' - 'but at least one treatment variable is not binary with values 0 and 1.') + raise ValueError( + f"The ml_a learner {str(ml_a)} was identified as classifier " + "but at least one treatment variable is not binary with values 0 and 1." + ) else: - self._predict_method['ml_a'] = 'predict' + self._predict_method["ml_a"] = "predict" - if score == 'instrument': - sig = inspect.signature(self.learner['ml_a'].fit) - if not 'sample_weight' in sig.parameters: - raise ValueError('Learner \"ml_a\" who supports sample_weight is required for score type \"instrument\"') + if score == "instrument": + sig = inspect.signature(self.learner["ml_a"].fit) + if "sample_weight" not in sig.parameters: + raise ValueError('Learner "ml_a" who supports sample_weight is required for score type "instrument"') self._initialize_ml_nuisance_params() self._external_predictions_implemented = True def _initialize_ml_nuisance_params(self): - self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} - for learner in self._learner} + self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner} def _check_data(self, obj_dml_data): if not isinstance(obj_dml_data, DoubleMLData): - raise TypeError('The data must be of DoubleMLData type. ' - f'{str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed.') + raise TypeError( + f"The data must be of DoubleMLData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed." 
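+        # A minimal usage sketch, assuming one plausible learner configuration (the exception tests
+        # added in this patch construct DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m); ml_M must be a
+        # classifier, ml_t and ml_m regressors for the continuous-treatment DGP):
+        #   >>> from sklearn.linear_model import Lasso, LogisticRegression
+        #   >>> from doubleml import DoubleMLLPLR
+        #   >>> from doubleml.plm.datasets import make_lplr_LZZ2020
+        #   >>> dml_data = make_lplr_LZZ2020(n_obs=500, dim_x=20, alpha=0.5)
+        #   >>> dml_lplr = DoubleMLLPLR(dml_data, ml_M=LogisticRegression(), ml_t=Lasso(), ml_m=Lasso())
+        #   >>> _ = dml_lplr.fit()  # doctest: +SKIP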
+ ) if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): - raise TypeError('The outcome variable y must be binary with values 0 and 1.') + raise TypeError("The outcome variable y must be binary with values 0 and 1.") return - - def _double_dml_cv_predict(self, estimator, estimator_name, x, y, smpls=None, smpls_inner=None, - n_jobs=None, est_params=None, method='predict', sample_weights=None): + def _double_dml_cv_predict( + self, + estimator, + estimator_name, + x, + y, + smpls=None, + smpls_inner=None, + n_jobs=None, + est_params=None, + method="predict", + sample_weights=None, + ): res = {} - res['preds'] = np.zeros(y.shape, dtype=float) - res['preds_inner'] = [] - res['models'] = [] + res["preds"] = np.zeros(y.shape, dtype=float) + res["preds_inner"] = [] + res["models"] = [] for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): - res_inner = _dml_cv_predict(estimator, x, y, smpls=smpls_double_split, n_jobs=n_jobs, - est_params=est_params, method=method, - return_models=True, smpls_is_partition=True, sample_weights=sample_weights) - _check_finite_predictions(res_inner['preds'], estimator, estimator_name, smpls_double_split) - - res['preds_inner'].append(res_inner['preds']) - for model in res_inner['models']: - res['models'].append(model) - if method == 'predict_proba': - res['preds'][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1] + res_inner = _dml_cv_predict( + estimator, + x, + y, + smpls=smpls_double_split, + n_jobs=n_jobs, + est_params=est_params, + method=method, + return_models=True, + smpls_is_partition=True, + sample_weights=sample_weights, + ) + _check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split) + + res["preds_inner"].append(res_inner["preds"]) + for model in res_inner["models"]: + res["models"].append(model) + if method == "predict_proba": + res["preds"][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1] else: - res['preds'][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) + res["preds"][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) res["preds"] /= len(smpls) - res['targets'] = np.copy(y) + res["targets"] = np.copy(y) return res - - def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): - x, y = check_X_y(self._dml_data.x, self._dml_data.y, - force_all_finite=False) - x, d = check_X_y(x, self._dml_data.d, - force_all_finite=False) - x_d_concat = np.hstack((d.reshape(-1,1), x)) - m_external = external_predictions['ml_m'] is not None - M_external = external_predictions['ml_M'] is not None - t_external = external_predictions['ml_t'] is not None - if 'ml_a' in self._learner: - a_external = external_predictions['ml_a'] is not None + x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) + x_d_concat = np.hstack((d.reshape(-1, 1), x)) + m_external = external_predictions["ml_m"] is not None + M_external = external_predictions["ml_M"] is not None + t_external = external_predictions["ml_t"] is not None + if "ml_a" in self._learner: + a_external = external_predictions["ml_a"] is not None else: a_external = False if M_external: - M_hat = {'preds': external_predictions['ml_M'], - 'targets': None, - 'models': None} + M_hat = {"preds": external_predictions["ml_M"], "targets": None, "models": None} else: - M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, 
smpls_inner=self.__smpls__inner, - n_jobs=n_jobs_cv, - est_params=self._get_params('ml_M'), method=self._predict_method['ml_M'])) - + M_hat = self._double_dml_cv_predict( + self._learner["ml_M"], + "ml_M", + x_d_concat, + y, + smpls=smpls, + smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_M"), + method=self._predict_method["ml_M"], + ) # nuisance m if m_external: - m_hat = {'preds': external_predictions['ml_m'], - 'targets': None, - 'models': None} + m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None} else: - if self.score == 'instrument': + if self.score == "instrument": weights = [] for i, (train, test) in enumerate(smpls): - weights.append( M_hat['preds_inner'][i][train] * (1-M_hat['preds_inner'][i][train])) - m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=smpls, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], - return_models=return_models, sample_weights=weights) - - elif self.score == 'nuisance_space': + weights.append(M_hat["preds_inner"][i][train] * (1 - M_hat["preds_inner"][i][train])) + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + sample_weights=weights, + ) + + elif self.score == "nuisance_space": filtered_smpls = [] for train, test in smpls: train_filtered = train[y[train] == 0] filtered_smpls.append((train_filtered, test)) - m_hat = _dml_cv_predict(self._learner['ml_m'], x, d, smpls=filtered_smpls, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_m'), method=self._predict_method['ml_m'], - return_models=return_models) + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=filtered_smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + ) else: raise NotImplementedError - _check_finite_predictions(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls) + _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls) - if self._check_learner(self._learner['ml_m'], 'ml_m', regressor=True, classifier=True): - _check_is_propensity(m_hat['preds'], self._learner['ml_m'], 'ml_m', smpls, eps=1e-12) + if self._check_learner(self._learner["ml_m"], "ml_m", regressor=True, classifier=True): + _check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12) if self._dml_data.binary_treats[self._dml_data.d_cols[self._i_treat]]: - binary_preds = (type_of_target(m_hat['preds']) == 'binary') - zero_one_preds = np.all((np.power(m_hat['preds'], 2) - m_hat['preds']) == 0) + binary_preds = type_of_target(m_hat["preds"]) == "binary" + zero_one_preds = np.all((np.power(m_hat["preds"], 2) - m_hat["preds"]) == 0) if binary_preds & zero_one_preds: - raise ValueError(f'For the binary treatment variable {self._dml_data.d_cols[self._i_treat]}, ' - f'predictions obtained with the ml_m learner {str(self._learner["ml_m"])} are also ' - 'observed to be binary with values 0 and 1. Make sure that for classifiers ' - 'probabilities and not labels are predicted.') + raise ValueError( + f"For the binary treatment variable {self._dml_data.d_cols[self._i_treat]}, " + f"predictions obtained with the ml_m learner {str(self._learner['ml_m'])} are also " + "observed to be binary with values 0 and 1. Make sure that for classifiers " + "probabilities and not labels are predicted." 
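+        # The block below uses the inner-fold predictions: the treatment is residualized with ml_a
+        # and the logit of the inner-fold ml_M probabilities is projected onto that residual, giving
+        # the per-fold least-squares slope
+        #     beta_k = sum(d_tilde * logit(M_hat_inner)) / sum(d_tilde ** 2),  d_tilde = d - a_hat_inner,
+        # which enters the offset r_hat = t_hat - beta * a_hat further down.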
+ ) if a_external: - a_hat = {'preds': external_predictions['ml_a'], - 'targets': None, - 'models': None} + a_hat = {"preds": external_predictions["ml_a"], "targets": None, "models": None} else: - a_hat = (self._double_dml_cv_predict(self._learner['ml_a'], 'ml_a', x, d, smpls=smpls, smpls_inner=self.__smpls__inner, - n_jobs=n_jobs_cv, - est_params=self._get_params('ml_a'), method=self._predict_method['ml_a'])) + a_hat = self._double_dml_cv_predict( + self._learner["ml_a"], + "ml_a", + x, + d, + smpls=smpls, + smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_a"), + method=self._predict_method["ml_a"], + ) W_inner = [] beta = np.zeros(d.shape, dtype=float) for i, (train, test) in enumerate(smpls): - M_iteration = M_hat['preds_inner'][i][train] + M_iteration = M_hat["preds_inner"][i][train] M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) w = scipy.special.logit(M_iteration) W_inner.append(w) - d_tilde = (d - a_hat['preds_inner'][i])[train] - beta[test] = np.sum(d_tilde * w) / np.sum(d_tilde ** 2) - + d_tilde = (d - a_hat["preds_inner"][i])[train] + beta[test] = np.sum(d_tilde * w) / np.sum(d_tilde**2) # nuisance t if t_external: - t_hat = {'preds': external_predictions['ml_t'], - 'targets': None, - 'models': None} + t_hat = {"preds": external_predictions["ml_t"], "targets": None, "models": None} else: - t_hat = _dml_cv_predict(self._learner['ml_t'], x, W_inner, smpls=smpls, n_jobs=n_jobs_cv, - est_params=self._get_params('ml_t'), method=self._predict_method['ml_t'], - return_models=return_models) - _check_finite_predictions(t_hat['preds'], self._learner['ml_t'], 'ml_t', smpls) - + t_hat = _dml_cv_predict( + self._learner["ml_t"], + x, + W_inner, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_t"), + method=self._predict_method["ml_t"], + return_models=return_models, + ) + _check_finite_predictions(t_hat["preds"], self._learner["ml_t"], "ml_t", smpls) r_hat = {} - r_hat['preds'] = t_hat['preds'] - beta * a_hat['preds'] - - psi_elements = self._score_elements(y, d, r_hat['preds'], m_hat['preds']) - - preds = {'predictions': {'ml_r': r_hat['preds'], - 'ml_m': m_hat['preds'], - 'ml_a': a_hat['preds'], - 'ml_t': t_hat['preds'], - 'ml_M': M_hat['preds']}, - 'targets': {'ml_r': None, - 'ml_m': m_hat['targets'], - 'ml_a': a_hat['targets'], - 'ml_t': t_hat['targets'], - 'ml_M': M_hat['targets']}, - 'models': {'ml_r': None, - 'ml_m': m_hat['models'], - 'ml_a': a_hat['models'], - 'ml_t': t_hat['models'], - 'ml_M': M_hat['models']}} + r_hat["preds"] = t_hat["preds"] - beta * a_hat["preds"] + + psi_elements = self._score_elements(y, d, r_hat["preds"], m_hat["preds"]) + + preds = { + "predictions": { + "ml_r": r_hat["preds"], + "ml_m": m_hat["preds"], + "ml_a": a_hat["preds"], + "ml_t": t_hat["preds"], + "ml_M": M_hat["preds"], + }, + "targets": { + "ml_r": None, + "ml_m": m_hat["targets"], + "ml_a": a_hat["targets"], + "ml_t": t_hat["targets"], + "ml_M": M_hat["targets"], + }, + "models": { + "ml_r": None, + "ml_m": m_hat["models"], + "ml_a": a_hat["models"], + "ml_t": t_hat["models"], + "ml_M": M_hat["models"], + }, + } return psi_elements, preds @@ -327,90 +369,128 @@ def _score_elements(self, y, d, r_hat, m_hat): d_tilde = d - m_hat psi_hat = scipy.special.expit(-r_hat) score_const = d_tilde * (1 - y) * np.exp(r_hat) - psi_elements = {"y": y, "d": d, "d_tilde": d_tilde, "r_hat": r_hat, "m_hat": m_hat, "psi_hat": psi_hat, "score_const": score_const} + psi_elements = { + "y": y, + "d": d, + "d_tilde": d_tilde, + "r_hat": r_hat, + "m_hat": 
m_hat, + "psi_hat": psi_hat, + "score_const": score_const, + } return psi_elements @property def _score_element_names(self): - return ['y', 'd', 'd_tilde', 'r_hat', 'm_hat', 'psi_hat', 'score_const'] + return ["y", "d", "d_tilde", "r_hat", "m_hat", "psi_hat", "score_const"] def _sensitivity_element_est(self, preds): - pass + pass - def _nuisance_tuning(self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, - search_mode, n_iter_randomized_search): + def _nuisance_tuning( + self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search + ): # TODO: test - x, y = check_X_y(self._dml_data.x, self._dml_data.y, - force_all_finite=False) - x, d = check_X_y(x, self._dml_data.d, - force_all_finite=False) + x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) x_d_concat = np.hstack((d.reshape(-1, 1), x)) if scoring_methods is None: - scoring_methods = {'ml_m': None, - 'ml_M': None, - 'ml_a': None, - 'ml_t': None} + scoring_methods = {"ml_m": None, "ml_M": None, "ml_a": None, "ml_t": None} train_inds = [train_index for (train_index, _) in smpls] - M_tune_res = _dml_tune(y, x_d_concat, train_inds, - self._learner['ml_M'], param_grids['ml_M'], scoring_methods['ml_M'], - n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + M_tune_res = _dml_tune( + y, + x_d_concat, + train_inds, + self._learner["ml_M"], + param_grids["ml_M"], + scoring_methods["ml_M"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) filtered_train_inds = [] - if self.score == 'nuisance_space': + if self.score == "nuisance_space": for train, test in smpls: train_filtered = train[y[train] == 0] filtered_train_inds.append(train_filtered) - elif self.score == 'instrument': + elif self.score == "instrument": filtered_train_inds = train_inds else: raise NotImplementedError - m_tune_res = _dml_tune(d, x, filtered_train_inds, - self._learner['ml_m'], param_grids['ml_m'], scoring_methods['ml_m'], - n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) - - a_tune_res = _dml_tune(d, x, train_inds, - self._learner['ml_a'], param_grids['ml_a'], scoring_methods['ml_a'], - n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + m_tune_res = _dml_tune( + d, + x, + filtered_train_inds, + self._learner["ml_m"], + param_grids["ml_m"], + scoring_methods["ml_m"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) + + a_tune_res = _dml_tune( + d, + x, + train_inds, + self._learner["ml_a"], + param_grids["ml_a"], + scoring_methods["ml_a"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) M_best_params = [xx.best_params_ for xx in M_tune_res] m_best_params = [xx.best_params_ for xx in m_tune_res] a_best_params = [xx.best_params_ for xx in a_tune_res] # Create targets for tuning ml_t - M_hat = (self._double_dml_cv_predict(self._learner['ml_M'], 'ml_M', x_d_concat, y, smpls=smpls, - smpls_inner=self.__smpls__inner, - n_jobs=n_jobs_cv, - est_params=M_best_params, method=self._predict_method['ml_M'])) + M_hat = self._double_dml_cv_predict( + self._learner["ml_M"], + "ml_M", + x_d_concat, + y, + smpls=smpls, + smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=M_best_params, + method=self._predict_method["ml_M"], + ) W_inner = [] for i, (train, test) in enumerate(smpls): - M_iteration = M_hat['preds_inner'][i][train] + M_iteration = M_hat["preds_inner"][i][train] M_iteration = np.clip(M_iteration, 
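+            # Clip before the logit transform: scipy.special.logit returns +/-inf at 0 and 1, which
+            # would make the regression targets used to tune ml_t non-finite.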
1e-8, 1 - 1e-8) w = scipy.special.logit(M_iteration) W_inner.append(w) - t_tune_res = _dml_tune(W_inner, x, train_inds, - self._learner['ml_t'], param_grids['ml_t'], scoring_methods['ml_t'], - n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search) + t_tune_res = _dml_tune( + W_inner, + x, + train_inds, + self._learner["ml_t"], + param_grids["ml_t"], + scoring_methods["ml_t"], + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + ) t_best_params = [xx.best_params_ for xx in t_tune_res] - - # Update params and tune_res to include ml_a and ml_t - params = {'ml_M': M_best_params, - 'ml_m': m_best_params, - 'ml_a': a_best_params, - 'ml_t': t_best_params} - tune_res = {'M_tune': M_tune_res, - 'm_tune': m_tune_res, - 'a_tune': a_tune_res, - 't_tune': t_tune_res} - - res = {'params': params, - 'tune_res': tune_res} + params = {"ml_M": M_best_params, "ml_m": m_best_params, "ml_a": a_best_params, "ml_t": t_best_params} + tune_res = {"M_tune": M_tune_res, "m_tune": m_tune_res, "a_tune": a_tune_res, "t_tune": t_tune_res} + + res = {"params": params, "tune_res": tune_res} return res @@ -430,37 +510,40 @@ def draw_sample_splitting(self): self : object """ - obj_dml_resampling = DoubleMLDoubleResampling(n_folds=self.n_folds, - n_folds_inner=self.n_folds_inner, - n_rep=self.n_rep, - n_obs=self._dml_data.n_obs, - stratify=self._strata) + obj_dml_resampling = DoubleMLDoubleResampling( + n_folds=self.n_folds, + n_folds_inner=self.n_folds_inner, + n_rep=self.n_rep, + n_obs=self._dml_data.n_obs, + stratify=self._strata, + ) self._smpls, self._smpls_inner = obj_dml_resampling.split_samples() return self def set_sample_splitting(self): - raise NotImplementedError('set_sample_splitting is not implemented for DoubleMLLPLR.') + raise NotImplementedError("set_sample_splitting is not implemented for DoubleMLLPLR.") def _compute_score(self, psi_elements, coef): - - if self.score == 'nuisance_space': + if self.score == "nuisance_space": score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"] score = psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"]) - elif self.score == 'instrument': - score = (psi_elements["y"] - scipy.special.expit(coef * psi_elements["d"]+ psi_elements["r_hat"])) * psi_elements["d_tilde"] + elif self.score == "instrument": + score = (psi_elements["y"] - scipy.special.expit(coef * psi_elements["d"] + psi_elements["r_hat"])) * psi_elements[ + "d_tilde" + ] else: raise NotImplementedError return score def _compute_score_deriv(self, psi_elements, coef, inds=None): - if self.score == 'nuisance_space': - deriv_1 = - psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] - deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 - elif self.score == 'instrument': - expit = scipy.special.expit(coef * psi_elements["d"]+ psi_elements["r_hat"]) - deriv = - psi_elements["d"] * expit * (1-expit) * psi_elements["d_tilde"] + if self.score == "nuisance_space": + deriv_1 = -psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"] + deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1 + elif self.score == "instrument": + expit = scipy.special.expit(coef * psi_elements["d"] + psi_elements["r_hat"]) + deriv = -psi_elements["d"] * expit * (1 - expit) * psi_elements["d_tilde"] else: raise NotImplementedError diff --git a/doubleml/plm/tests/_utils_logistic_manual.py b/doubleml/plm/tests/_utils_logistic_manual.py deleted file mode 100644 index af4d034e..00000000 --- 
a/doubleml/plm/tests/_utils_logistic_manual.py +++ /dev/null @@ -1,313 +0,0 @@ -import numpy as np -import scipy -from sklearn.base import clone, is_classifier - -from doubleml.tests._utils_boot import boot_manual, draw_weights -from doubleml.tests._utils import fit_predict, fit_predict_proba, tune_grid_search - - -def fit_logistic_multitreat(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, - n_rep=1, l_params=None, m_params=None, g_params=None, - use_other_treat_as_covariate=True): - n_obs = len(y) - n_d = d.shape[1] - - thetas = list() - ses = list() - all_l_hat = list() - all_m_hat = list() - all_g_hat = list() - for i_rep in range(n_rep): - smpls = all_smpls[i_rep] - thetas_this_rep = np.full(n_d, np.nan) - ses_this_rep = np.full(n_d, np.nan) - all_l_hat_this_rep = list() - all_m_hat_this_rep = list() - all_g_hat_this_rep = list() - - for i_d in range(n_d): - if use_other_treat_as_covariate: - xd = np.hstack((x, np.delete(d, i_d, axis=1))) - else: - xd = x - - l_hat, m_hat, g_hat, thetas_this_rep[i_d], ses_this_rep[i_d] = fit_plr_single_split( - y, xd, d[:, i_d], - learner_l, learner_m, learner_g, - smpls, score, - l_params, m_params, g_params) - all_l_hat_this_rep.append(l_hat) - all_m_hat_this_rep.append(m_hat) - all_g_hat_this_rep.append(g_hat) - - thetas.append(thetas_this_rep) - ses.append(ses_this_rep) - all_l_hat.append(all_l_hat_this_rep) - all_m_hat.append(all_m_hat_this_rep) - all_g_hat.append(all_g_hat_this_rep) - - theta = np.full(n_d, np.nan) - se = np.full(n_d, np.nan) - for i_d in range(n_d): - theta_vec = np.array([xx[i_d] for xx in thetas]) - se_vec = np.array([xx[i_d] for xx in ses]) - theta[i_d] = np.median(theta_vec) - se[i_d] = np.sqrt(np.median(np.power(se_vec, 2) * n_obs + np.power(theta_vec - theta[i_d], 2)) / n_obs) - - res = {'theta': theta, 'se': se, - 'thetas': thetas, 'ses': ses, - 'all_l_hat': all_l_hat, 'all_m_hat': all_m_hat, 'all_g_hat': all_g_hat} - - return res - - -def fit_logistic(y, x, d, learner_l, learner_m, learner_g, all_smpls, score, - n_rep=1, l_params=None, m_params=None, g_params=None): - n_obs = len(y) - - thetas = np.zeros(n_rep) - ses = np.zeros(n_rep) - all_l_hat = list() - all_m_hat = list() - all_g_hat = list() - for i_rep in range(n_rep): - smpls = all_smpls[i_rep] - l_hat, m_hat, g_hat, thetas[i_rep], ses[i_rep] = fit_plr_single_split( - y, x, d, - learner_l, learner_m, learner_g, - smpls, score, - l_params, m_params, g_params) - all_l_hat.append(l_hat) - all_m_hat.append(m_hat) - all_g_hat.append(g_hat) - - theta = np.median(thetas) - se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) - - res = {'theta': theta, 'se': se, - 'thetas': thetas, 'ses': ses, - 'all_l_hat': all_l_hat, 'all_m_hat': all_m_hat, 'all_g_hat': all_g_hat} - - return res - - -def fit_plr_logistic_split(y, x, d, learner_l, learner_m, learner_g, smpls, score, - l_params=None, m_params=None, g_params=None): - fit_g = (score == 'IV-type') | callable(score) - if is_classifier(learner_m): - l_hat, m_hat, g_hat = fit_nuisance_plr_classifier(y, x, d, - learner_l, learner_m, learner_g, - smpls, fit_g, - l_params, m_params, g_params) - else: - l_hat, m_hat, g_hat = fit_nuisance_plr(y, x, d, - learner_l, learner_m, learner_g, - smpls, fit_g, - l_params, m_params, g_params) - - theta, se = plr_dml2(y, x, d, l_hat, m_hat, g_hat, - smpls, score) - - return l_hat, m_hat, g_hat, theta, se - - -def fit_nuisance_logistic(y, x, d, learner_l, learner_m, learner_g, smpls, fit_g=True, - l_params=None, m_params=None, g_params=None): - 
ml_l = clone(learner_l) - l_hat = fit_predict(y, x, ml_l, l_params, smpls) - - ml_m = clone(learner_m) - m_hat = fit_predict(d, x, ml_m, m_params, smpls) - - if fit_g: - y_minus_l_hat, d_minus_m_hat, _ = compute_plr_residuals(y, d, l_hat, m_hat, [], smpls) - psi_a = -np.multiply(d_minus_m_hat, d_minus_m_hat) - psi_b = np.multiply(d_minus_m_hat, y_minus_l_hat) - theta_initial = -np.nanmean(psi_b) / np.nanmean(psi_a) - - ml_g = clone(learner_g) - g_hat = fit_predict(y - theta_initial*d, x, ml_g, g_params, smpls) - else: - g_hat = [] - - return l_hat, m_hat, g_hat - - -def fit_nuisance_logistic_classifier(y, x, d, learner_l, learner_m, learner_g, smpls, fit_g=True, - l_params=None, m_params=None, g_params=None): - ml_l = clone(learner_l) - l_hat = fit_predict(y, x, ml_l, l_params, smpls) - - ml_m = clone(learner_m) - m_hat = fit_predict_proba(d, x, ml_m, m_params, smpls) - - if fit_g: - y_minus_l_hat, d_minus_m_hat, _ = compute_plr_residuals(y, d, l_hat, m_hat, [], smpls) - psi_a = -np.multiply(d_minus_m_hat, d_minus_m_hat) - psi_b = np.multiply(d_minus_m_hat, y_minus_l_hat) - theta_initial = -np.mean(psi_b) / np.mean(psi_a) - - ml_g = clone(learner_g) - g_hat = fit_predict(y - theta_initial*d, x, ml_g, g_params, smpls) - else: - g_hat = [] - - return l_hat, m_hat, g_hat - - -def compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls): - y_minus_l_hat = np.full_like(y, np.nan, dtype='float64') - d_minus_m_hat = np.full_like(d, np.nan, dtype='float64') - y_minus_g_hat = np.full_like(y, np.nan, dtype='float64') - for idx, (_, test_index) in enumerate(smpls): - y_minus_l_hat[test_index] = y[test_index] - l_hat[idx] - if len(g_hat) > 0: - y_minus_g_hat[test_index] = y[test_index] - g_hat[idx] - d_minus_m_hat[test_index] = d[test_index] - m_hat[idx] - return y_minus_l_hat, d_minus_m_hat, y_minus_g_hat - - - - -def var_plr(theta, d, y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, score, n_obs): - if score == 'partialling out': - var = 1/n_obs * 1/np.power(np.mean(np.multiply(d_minus_m_hat, d_minus_m_hat)), 2) * \ - np.mean(np.power(np.multiply(y_minus_l_hat - d_minus_m_hat*theta, d_minus_m_hat), 2)) - else: - assert score == 'IV-type' - var = 1/n_obs * 1/np.power(np.mean(np.multiply(d_minus_m_hat, d)), 2) * \ - np.mean(np.power(np.multiply(y_minus_g_hat - d*theta, d_minus_m_hat), 2)) - - return var - - -def plr_orth(y_minus_l_hat, d_minus_m_hat, y_minus_g_hat, d, score): - if score == 'IV-type': - res = np.mean(np.multiply(d_minus_m_hat, y_minus_g_hat))/np.mean(np.multiply(d_minus_m_hat, d)) - else: - assert score == 'partialling out' - res = scipy.linalg.lstsq(d_minus_m_hat.reshape(-1, 1), y_minus_l_hat)[0] - - return res - - -def boot_plr(y, d, thetas, ses, all_l_hat, all_m_hat, all_g_hat, - all_smpls, score, bootstrap, n_rep_boot, - n_rep=1, apply_cross_fitting=True): - all_boot_t_stat = list() - for i_rep in range(n_rep): - smpls = all_smpls[i_rep] - if apply_cross_fitting: - n_obs = len(y) - else: - test_index = smpls[0][1] - n_obs = len(test_index) - weights = draw_weights(bootstrap, n_rep_boot, n_obs) - - boot_t_stat = boot_plr_single_split( - thetas[i_rep], y, d, all_l_hat[i_rep], all_m_hat[i_rep], all_g_hat[i_rep], smpls, - score, ses[i_rep], - weights, n_rep_boot, apply_cross_fitting) - all_boot_t_stat.append(boot_t_stat) - - # differently for plr because of n_rep_boot and multiple treatmentsa - boot_t_stat = np.transpose(np.vstack(all_boot_t_stat)) - - return boot_t_stat - - -def boot_plr_multitreat(y, d, thetas, ses, all_l_hat, all_m_hat, all_g_hat, - all_smpls, score, bootstrap, 
n_rep_boot, - n_rep=1, apply_cross_fitting=True): - n_d = d.shape[1] - all_boot_t_stat = list() - for i_rep in range(n_rep): - smpls = all_smpls[i_rep] - if apply_cross_fitting: - n_obs = len(y) - else: - test_index = smpls[0][1] - n_obs = len(test_index) - weights = draw_weights(bootstrap, n_rep_boot, n_obs) - - boot_t_stat = np.full((n_d, n_rep_boot), np.nan) - for i_d in range(n_d): - boot_t_stat[i_d, :] = boot_plr_single_split( - thetas[i_rep][i_d], y, d[:, i_d], - all_l_hat[i_rep][i_d], all_m_hat[i_rep][i_d], all_g_hat[i_rep][i_d], - smpls, score, ses[i_rep][i_d], - weights, n_rep_boot, apply_cross_fitting) - - # transpose for shape (n_rep_boot, n_d) - boot_t_stat = np.transpose(boot_t_stat) - all_boot_t_stat.append(boot_t_stat) - - # stack repetitions along the last axis - boot_t_stat = np.stack(all_boot_t_stat, axis=2) - - return boot_t_stat - - -def boot_plr_single_split(theta, y, d, l_hat, m_hat, g_hat, - smpls, score, se, weights, n_rep, apply_cross_fitting): - y_minus_l_hat, d_minus_m_hat, y_minus_g_hat = compute_plr_residuals(y, d, l_hat, m_hat, g_hat, smpls) - - if apply_cross_fitting: - if score == 'partialling out': - J = np.mean(-np.multiply(d_minus_m_hat, d_minus_m_hat)) - else: - assert score == 'IV-type' - J = np.mean(-np.multiply(d_minus_m_hat, d)) - else: - test_index = smpls[0][1] - if score == 'partialling out': - J = np.mean(-np.multiply(d_minus_m_hat[test_index], d_minus_m_hat[test_index])) - else: - assert score == 'IV-type' - J = np.mean(-np.multiply(d_minus_m_hat[test_index], d[test_index])) - - if score == 'partialling out': - psi = np.multiply(y_minus_l_hat - d_minus_m_hat * theta, d_minus_m_hat) - else: - assert score == 'IV-type' - psi = np.multiply(y_minus_g_hat - d * theta, d_minus_m_hat) - - boot_t_stat = boot_manual(psi, J, smpls, se, weights, n_rep, apply_cross_fitting) - - return boot_t_stat - - -def fit_sensitivity_elements_plr(y, d, all_coef, predictions, score, n_rep): - n_treat = d.shape[1] - n_obs = len(y) - - sigma2 = np.full(shape=(1, n_rep, n_treat), fill_value=np.nan) - nu2 = np.full(shape=(1, n_rep, n_treat), fill_value=np.nan) - psi_sigma2 = np.full(shape=(n_obs, n_rep, n_treat), fill_value=np.nan) - psi_nu2 = np.full(shape=(n_obs, n_rep, n_treat), fill_value=np.nan) - - for i_rep in range(n_rep): - for i_treat in range(n_treat): - d_tilde = d[:, i_treat] - m_hat = predictions['ml_m'][:, i_rep, i_treat] - theta = all_coef[i_treat, i_rep] - if score == 'partialling out': - l_hat = predictions['ml_l'][:, i_rep, i_treat] - sigma2_score_element = np.square(y - l_hat - np.multiply(theta, d_tilde-m_hat)) - else: - assert score == 'IV-type' - g_hat = predictions['ml_g'][:, i_rep, i_treat] - sigma2_score_element = np.square(y - g_hat - np.multiply(theta, d_tilde)) - - sigma2[0, i_rep, i_treat] = np.mean(sigma2_score_element) - psi_sigma2[:, i_rep, i_treat] = sigma2_score_element - sigma2[0, i_rep, i_treat] - - nu2[0, i_rep, i_treat] = np.divide(1.0, np.mean(np.square(d_tilde-m_hat))) - psi_nu2[:, i_rep, i_treat] = nu2[0, i_rep, i_treat] - \ - np.multiply(np.square(d_tilde-m_hat), np.square(nu2[0, i_rep, i_treat])) - - element_dict = {'sigma2': sigma2, - 'nu2': nu2, - 'psi_sigma2': psi_sigma2, - 'psi_nu2': psi_nu2} - return element_dict diff --git a/doubleml/plm/tests/_utils_lplr_manual.py b/doubleml/plm/tests/_utils_lplr_manual.py index f14a1f66..8f45b5b0 100644 --- a/doubleml/plm/tests/_utils_lplr_manual.py +++ b/doubleml/plm/tests/_utils_lplr_manual.py @@ -297,7 +297,6 @@ def tune_nuisance_ssm_mar(y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, 
n_folds_tune, def tune_nuisance_ssm_nonignorable( y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, param_grid_g, param_grid_pi, param_grid_m ): - train_inds = [tr for (tr, _) in smpls] inner0_list, inner1_list = [], [] diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index 4361e7c7..8a55fe59 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -6,7 +6,6 @@ from sklearn.linear_model import Lasso, LogisticRegression from doubleml import DoubleMLLPLR -from doubleml.data.base_data import DoubleMLBaseData, DoubleMLData from doubleml.plm.datasets import make_lplr_LZZ2020 np.random.seed(3141) @@ -19,6 +18,7 @@ dml_lplr = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) dml_lplr_instrument = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="instrument") + @pytest.mark.ci def test_lplr_exception_data(): msg = ( @@ -45,6 +45,7 @@ def test_lplr_exception_scores(): with pytest.raises(TypeError, match=msg): _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score=0) + @pytest.mark.ci def test_ssm_exception_resampling(): msg = "The number of folds must be of int type. 1.5 of type was passed." @@ -74,6 +75,7 @@ def test_lplr_exception_get_params(): with pytest.raises(ValueError, match=msg): dml_lplr.get_params("ml_x") + @pytest.mark.ci def test_lplr_exception_smpls(): msg = ( @@ -84,6 +86,7 @@ def test_lplr_exception_smpls(): with pytest.raises(ValueError, match=msg): _ = dml_plr_no_smpls.smpls + @pytest.mark.ci def test_lplr_exception_fit(): msg = "The number of CPUs used to fit the learners must be of int type. 5 of type was passed." @@ -96,6 +99,7 @@ def test_lplr_exception_fit(): with pytest.raises(TypeError, match=msg): dml_lplr.fit(store_models=1) + @pytest.mark.ci def test_lplr_exception_bootstrap(): dml_lplr_boot = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m) diff --git a/doubleml/plm/tests/tests_logistic.py b/doubleml/plm/tests/tests_logistic.py deleted file mode 100644 index a77db7a6..00000000 --- a/doubleml/plm/tests/tests_logistic.py +++ /dev/null @@ -1,307 +0,0 @@ -import pytest -import math -import scipy -import numpy as np -import pandas as pd - -from sklearn.base import clone - -from sklearn.linear_model import LinearRegression, Lasso -from sklearn.ensemble import RandomForestRegressor - -import doubleml as dml - -from doubleml.tests._utils import draw_smpls -from ._utils_logistic_manual import fit_logistic, boot_plr - - -@pytest.fixture(scope='module', - params=[RandomForestRegressor(max_depth=2, n_estimators=10), - LinearRegression(), - Lasso(alpha=0.1)]) -def learner(request): - return request.param - - -@pytest.fixture(scope='module', - params=['IV-type', 'partialling out']) -def score(request): - return request.param - - -@pytest.fixture(scope="module") -def dml_plr_fixture(generate_data1, learner, score): - boot_methods = ['normal'] - n_folds = 2 - n_rep_boot = 502 - - # collect data - data = generate_data1 - x_cols = data.columns[data.columns.str.startswith('X')].tolist() - - # Set machine learning methods for m & g - ml_l = clone(learner) - ml_m = clone(learner) - ml_g = clone(learner) - - np.random.seed(3141) - obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols) - if score == 'partialling out': - dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, - n_folds=n_folds, - score=score) - else: - assert score == 'IV-type' - dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, ml_g, - n_folds, - score=score) - - dml_plr_obj.fit() - - np.random.seed(3141) - y = data['y'].values - 
x = data.loc[:, x_cols].values - d = data['d'].values - n_obs = len(y) - all_smpls = draw_smpls(n_obs, n_folds) - - res_manual = fit_plr(y, x, d, clone(learner), clone(learner), clone(learner), - all_smpls, score) - - np.random.seed(3141) - # test with external nuisance predictions - if score == 'partialling out': - dml_plr_obj_ext = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, - n_folds, - score=score) - else: - assert score == 'IV-type' - dml_plr_obj_ext = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, ml_g, - n_folds, - score=score) - - # synchronize the sample splitting - dml_plr_obj_ext.set_sample_splitting(all_smpls=all_smpls) - - if score == 'partialling out': - prediction_dict = {'d': {'ml_l': dml_plr_obj.predictions['ml_l'].reshape(-1, 1), - 'ml_m': dml_plr_obj.predictions['ml_m'].reshape(-1, 1)}} - else: - assert score == 'IV-type' - prediction_dict = {'d': {'ml_l': dml_plr_obj.predictions['ml_l'].reshape(-1, 1), - 'ml_m': dml_plr_obj.predictions['ml_m'].reshape(-1, 1), - 'ml_g': dml_plr_obj.predictions['ml_g'].reshape(-1, 1)}} - - dml_plr_obj_ext.fit(external_predictions=prediction_dict) - - res_dict = {'coef': dml_plr_obj.coef, - 'coef_manual': res_manual['theta'], - 'coef_ext': dml_plr_obj_ext.coef, - 'se': dml_plr_obj.se, - 'se_manual': res_manual['se'], - 'se_ext': dml_plr_obj_ext.se, - 'boot_methods': boot_methods} - - for bootstrap in boot_methods: - np.random.seed(3141) - boot_t_stat = boot_plr(y, d, res_manual['thetas'], res_manual['ses'], - res_manual['all_l_hat'], res_manual['all_m_hat'], res_manual['all_g_hat'], - all_smpls, score, bootstrap, n_rep_boot) - - np.random.seed(3141) - dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) - np.random.seed(3141) - dml_plr_obj_ext.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) - res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat - res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) - res_dict['boot_t_stat' + bootstrap + '_ext'] = dml_plr_obj_ext.boot_t_stat - - # sensitivity tests - res_dict['sensitivity_elements'] = dml_plr_obj.sensitivity_elements - res_dict['sensitivity_elements_manual'] = fit_sensitivity_elements_plr(y, d.reshape(-1, 1), - all_coef=dml_plr_obj.all_coef, - predictions=dml_plr_obj.predictions, - score=score, - n_rep=1) - # check if sensitivity score with rho=0 gives equal asymptotic standard deviation - dml_plr_obj.sensitivity_analysis(rho=0.0) - res_dict['sensitivity_ses'] = dml_plr_obj.sensitivity_params['se'] - return res_dict - - -@pytest.mark.ci -def test_dml_plr_coef(dml_plr_fixture): - assert math.isclose(dml_plr_fixture['coef'], - dml_plr_fixture['coef_manual'], - rel_tol=1e-9, abs_tol=1e-4) - assert math.isclose(dml_plr_fixture['coef'], - dml_plr_fixture['coef_ext'], - rel_tol=1e-9, abs_tol=1e-4) - - -@pytest.mark.ci -def test_dml_plr_se(dml_plr_fixture): - assert math.isclose(dml_plr_fixture['se'], - dml_plr_fixture['se_manual'], - rel_tol=1e-9, abs_tol=1e-4) - assert math.isclose(dml_plr_fixture['se'], - dml_plr_fixture['se_ext'], - rel_tol=1e-9, abs_tol=1e-4) - - -@pytest.mark.ci -def test_dml_plr_boot(dml_plr_fixture): - for bootstrap in dml_plr_fixture['boot_methods']: - assert np.allclose(dml_plr_fixture['boot_t_stat' + bootstrap], - dml_plr_fixture['boot_t_stat' + bootstrap + '_manual'], - rtol=1e-9, atol=1e-4) - assert np.allclose(dml_plr_fixture['boot_t_stat' + bootstrap], - dml_plr_fixture['boot_t_stat' + bootstrap + '_ext'], - rtol=1e-9, atol=1e-4) - - -@pytest.mark.ci -def test_dml_plr_sensitivity(dml_plr_fixture): - 
sensitivity_element_names = ['sigma2', 'nu2', 'psi_sigma2', 'psi_nu2'] - for sensitivity_element in sensitivity_element_names: - assert np.allclose(dml_plr_fixture['sensitivity_elements'][sensitivity_element], - dml_plr_fixture['sensitivity_elements_manual'][sensitivity_element]) - - -@pytest.mark.ci -def test_dml_plr_sensitivity_rho0(dml_plr_fixture): - assert np.allclose(dml_plr_fixture['se'], - dml_plr_fixture['sensitivity_ses']['lower'], - rtol=1e-9, atol=1e-4) - assert np.allclose(dml_plr_fixture['se'], - dml_plr_fixture['sensitivity_ses']['upper'], - rtol=1e-9, atol=1e-4) - - -@pytest.fixture(scope="module") -def dml_plr_ols_manual_fixture(generate_data1, score): - learner = LinearRegression() - boot_methods = ['Bayes', 'normal', 'wild'] - n_folds = 2 - n_rep_boot = 501 - - # collect data - data = generate_data1 - x_cols = data.columns[data.columns.str.startswith('X')].tolist() - - # Set machine learning methods for m & g - ml_l = clone(learner) - ml_g = clone(learner) - ml_m = clone(learner) - - obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols) - if score == 'partialling out': - dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, - n_folds=n_folds, - score=score) - else: - assert score == 'IV-type' - dml_plr_obj = dml.DoubleMLPLR(obj_dml_data, - ml_l, ml_m, ml_g, - n_folds, - score=score) - - n = data.shape[0] - this_smpl = list() - xx = int(n/2) - this_smpl.append((np.arange(xx, n), np.arange(0, xx))) - this_smpl.append((np.arange(0, xx), np.arange(xx, n))) - smpls = [this_smpl] - dml_plr_obj.set_sample_splitting(smpls) - - dml_plr_obj.fit() - - y = data['y'].values - x = data.loc[:, x_cols].values - d = data['d'].values - - # add column of ones for intercept - o = np.ones((n, 1)) - x = np.append(x, o, axis=1) - - smpls = dml_plr_obj.smpls[0] - - l_hat = [] - l_hat_vec = np.full_like(y, np.nan) - for (train_index, test_index) in smpls: - ols_est = scipy.linalg.lstsq(x[train_index], y[train_index])[0] - preds = np.dot(x[test_index], ols_est) - l_hat.append(preds) - l_hat_vec[test_index] = preds - - m_hat = [] - m_hat_vec = np.full_like(d, np.nan) - for (train_index, test_index) in smpls: - ols_est = scipy.linalg.lstsq(x[train_index], d[train_index])[0] - preds = np.dot(x[test_index], ols_est) - m_hat.append(preds) - m_hat_vec[test_index] = preds - - g_hat = [] - if score == 'IV-type': - theta_initial = scipy.linalg.lstsq((d - m_hat_vec).reshape(-1, 1), y - l_hat_vec)[0] - for (train_index, test_index) in smpls: - ols_est = scipy.linalg.lstsq(x[train_index], - y[train_index] - d[train_index] * theta_initial)[0] - g_hat.append(np.dot(x[test_index], ols_est)) - - res_manual, se_manual = plr_dml2(y, x, d, - l_hat, m_hat, g_hat, - smpls, score) - - res_dict = {'coef': dml_plr_obj.coef, - 'coef_manual': res_manual, - 'se': dml_plr_obj.se, - 'se_manual': se_manual, - 'boot_methods': boot_methods} - - for bootstrap in boot_methods: - np.random.seed(3141) - boot_t_stat = boot_plr(y, d, [res_manual], [se_manual], - [l_hat], [m_hat], [g_hat], - [smpls], score, bootstrap, n_rep_boot) - - np.random.seed(3141) - dml_plr_obj.bootstrap(method=bootstrap, n_rep_boot=n_rep_boot) - res_dict['boot_t_stat' + bootstrap] = dml_plr_obj.boot_t_stat - res_dict['boot_t_stat' + bootstrap + '_manual'] = boot_t_stat.reshape(-1, 1, 1) - - return res_dict - - -@pytest.mark.ci -def test_dml_plr_ols_manual_coef(dml_plr_ols_manual_fixture): - assert math.isclose(dml_plr_ols_manual_fixture['coef'], - dml_plr_ols_manual_fixture['coef_manual'], - rel_tol=1e-9, abs_tol=1e-4) - - -@pytest.mark.ci -def 
test_dml_plr_ols_manual_se(dml_plr_ols_manual_fixture): - assert math.isclose(dml_plr_ols_manual_fixture['se'], - dml_plr_ols_manual_fixture['se_manual'], - rel_tol=1e-9, abs_tol=1e-4) - - -@pytest.mark.ci -def test_dml_plr_ols_manual_boot(dml_plr_ols_manual_fixture): - for bootstrap in dml_plr_ols_manual_fixture['boot_methods']: - assert np.allclose(dml_plr_ols_manual_fixture['boot_t_stat' + bootstrap], - dml_plr_ols_manual_fixture['boot_t_stat' + bootstrap + '_manual'], - rtol=1e-9, atol=1e-4) - - -@pytest.fixture(scope='module', - params=["nonrobust", "HC0", "HC1", "HC2", "HC3"]) -def cov_type(request): - return request.param \ No newline at end of file diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py index 7f24fde5..d10ae48b 100644 --- a/doubleml/utils/_estimation.py +++ b/doubleml/utils/_estimation.py @@ -43,9 +43,19 @@ def _fit(estimator, x, y, train_index, idx=None): return estimator, idx -def _dml_cv_predict(estimator, x, y, smpls=None, - n_jobs=None, est_params=None, method='predict', return_train_preds=False, return_models=False, - smpls_is_partition=None, sample_weights=None): +def _dml_cv_predict( + estimator, + x, + y, + smpls=None, + n_jobs=None, + est_params=None, + method="predict", + return_train_preds=False, + return_models=False, + smpls_is_partition=None, + sample_weights=None, +): n_obs = x.shape[0] # TODO: Better name for smples_is_partition @@ -53,9 +63,15 @@ def _dml_cv_predict(estimator, x, y, smpls=None, smpls_is_partition = _check_is_partition(smpls, n_obs) fold_specific_params = (est_params is not None) & (not isinstance(est_params, dict)) fold_specific_target = isinstance(y, list) - manual_cv_predict = (not smpls_is_partition) | return_train_preds | fold_specific_params | fold_specific_target \ - | return_models | bool(sample_weights) - #TODO: Check if cross_val_predict supports weights + manual_cv_predict = ( + (not smpls_is_partition) + | return_train_preds + | fold_specific_params + | fold_specific_target + | return_models + | bool(sample_weights) + ) + # TODO: Check if cross_val_predict supports weights res = {"models": None} if not manual_cv_predict: @@ -149,21 +165,34 @@ def _dml_cv_predict(estimator, x, y, smpls=None, return res -def _dml_tune(y, x, train_inds, - learner, param_grid, scoring_method, - n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search, fold_specific_target=False): +def _dml_tune( + y, + x, + train_inds, + learner, + param_grid, + scoring_method, + n_folds_tune, + n_jobs_cv, + search_mode, + n_iter_randomized_search, + fold_specific_target=False, +): tune_res = list() for i, train_index in enumerate(train_inds): tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True) if search_mode == "grid_search": g_grid_search = GridSearchCV(learner, param_grid, scoring=scoring_method, cv=tune_resampling, n_jobs=n_jobs_cv) else: - assert search_mode == 'randomized_search' - g_grid_search = RandomizedSearchCV(learner, param_grid, - scoring=scoring_method, - cv=tune_resampling, - n_jobs=n_jobs_cv, - n_iter=n_iter_randomized_search) + assert search_mode == "randomized_search" + g_grid_search = RandomizedSearchCV( + learner, + param_grid, + scoring=scoring_method, + cv=tune_resampling, + n_jobs=n_jobs_cv, + n_iter=n_iter_randomized_search, + ) if fold_specific_target: tune_res.append(g_grid_search.fit(x[train_index, :], y[i])) else: diff --git a/doubleml/utils/resampling.py b/doubleml/utils/resampling.py index d1014517..38c1ac59 100644 --- a/doubleml/utils/resampling.py +++ b/doubleml/utils/resampling.py @@ -26,12 
+26,7 @@ def split_samples(self): class DoubleMLDoubleResampling: - def __init__(self, - n_folds, - n_folds_inner, - n_rep, - n_obs, - stratify=None): + def __init__(self, n_folds, n_folds_inner, n_rep, n_obs, stratify=None): self.n_folds = n_folds self.n_folds_inner = n_folds_inner self.n_rep = n_rep @@ -39,12 +34,13 @@ def __init__(self, self.stratify = stratify if n_folds < 2: - raise ValueError('n_folds must be greater than 1. ' - 'You can use set_sample_splitting with a tuple to only use one fold.') + raise ValueError( + "n_folds must be greater than 1. You can use set_sample_splitting with a tuple to only use one fold." + ) if n_folds_inner < 2: - raise ValueError('n_folds_inner must be greater than 1. ' - 'You can use set_sample_splitting with a tuple to only use one fold.') - + raise ValueError( + "n_folds_inner must be greater than 1. You can use set_sample_splitting with a tuple to only use one fold." + ) if self.stratify is None: self.resampling = RepeatedKFold(n_splits=n_folds, n_repeats=n_rep) @@ -55,17 +51,27 @@ def __init__(self, def split_samples(self): all_smpls = [(train, test) for train, test in self.resampling.split(X=np.zeros(self.n_obs), y=self.stratify)] - smpls = [all_smpls[(i_repeat * self.n_folds):((i_repeat + 1) * self.n_folds)] - for i_repeat in range(self.n_rep)] + smpls = [all_smpls[(i_repeat * self.n_folds) : ((i_repeat + 1) * self.n_folds)] for i_repeat in range(self.n_rep)] smpls_inner = [] for _ in range(self.n_rep): smpls_inner_rep = [] for train, test in all_smpls: if self.stratify is None: - smpls_inner_rep.append([(train[train_inner], train[test_inner]) for train_inner, test_inner in self.resampling_inner.split(X=train)]) + smpls_inner_rep.append( + [ + (train[train_inner], train[test_inner]) + for train_inner, test_inner in self.resampling_inner.split(X=train) + ] + ) else: - smpls_inner_rep.append([(train[train_inner], train[test_inner]) for train_inner, test_inner in - self.resampling_inner.split(X=np.zeros(len(train)), y=self.stratify[train])]) + smpls_inner_rep.append( + [ + (train[train_inner], train[test_inner]) + for train_inner, test_inner in self.resampling_inner.split( + X=np.zeros(len(train)), y=self.stratify[train] + ) + ] + ) smpls_inner.append(smpls_inner_rep) return smpls, smpls_inner From 5d2d1ed24deec8ca565b9ebe1260e3f9b0584b94 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Oct 2025 21:56:22 -0700 Subject: [PATCH 17/23] Unit tests work and bug fix in lplr --- doubleml/plm/lplr.py | 6 +- doubleml/plm/tests/_utils_lplr_manual.py | 371 +++++++-------------- doubleml/plm/tests/test_lplr.py | 31 +- doubleml/plm/tests/test_lplr_exceptions.py | 18 +- doubleml/plm/tests/test_lplr_tune.py | 163 +++------ 5 files changed, 205 insertions(+), 384 deletions(-) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index edf17f08..08a6bbfa 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -109,10 +109,8 @@ def __init__( _ = self._check_learner(ml_t, "ml_t", regressor=True, classifier=False) _ = self._check_learner(ml_M, "ml_M", regressor=False, classifier=True) - if np.array_equal(np.unique(obj_dml_data.d), [0, 1]): - ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=False, classifier=True) - else: - ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=False) + + ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True) self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M} if ml_a is not None: diff --git 
a/doubleml/plm/tests/_utils_lplr_manual.py b/doubleml/plm/tests/_utils_lplr_manual.py index 8f45b5b0..69904701 100644 --- a/doubleml/plm/tests/_utils_lplr_manual.py +++ b/doubleml/plm/tests/_utils_lplr_manual.py @@ -8,74 +8,54 @@ def fit_selection( - y, - x, - d, - z, - s, - learner_g, - learner_pi, - learner_m, - all_smpls, - score, - trimming_rule="truncate", - trimming_threshold=1e-2, - normalize_ipw=True, - n_rep=1, - g_d0_params=None, - g_d1_params=None, - pi_params=None, - m_params=None, + y, + x, + d, + learner_M, + learner_t, + learner_m, + all_smpls, + score, + trimming_rule="truncate", + trimming_threshold=1e-2, + n_rep=1, + M_params=None, + t_params=None, + m_params=None, ): n_obs = len(y) thetas = np.zeros(n_rep) ses = np.zeros(n_rep) - all_g_d1_hat = list() - all_g_d0_hat = list() - all_pi_hat = list() + all_M_hat = list() + all_t_hat = list() all_m_hat = list() - all_psi_a = list() - all_psi_b = list() - for i_rep in range(n_rep): smpls = all_smpls[i_rep] - g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list = fit_nuisance_selection( + M_hat_list, t_hat_list, m_hat_list = fit_nuisance_selection( y, x, d, - z, - s, - learner_g, - learner_pi, + learner_M, + learner_t, learner_m, smpls, score, trimming_rule=trimming_rule, trimming_threshold=trimming_threshold, - g_d0_params=g_d0_params, - g_d1_params=g_d1_params, - pi_params=pi_params, + M_params=M_params, + t_params=t_params, m_params=m_params, ) - all_g_d1_hat.append(g_hat_d1_list) - all_g_d0_hat.append(g_hat_d0_list) - all_pi_hat.append(pi_hat_list) - all_m_hat.append(m_hat_list) - - g_hat_d1, g_hat_d0, pi_hat, m_hat = compute_selection(y, g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list, smpls) - - dtreat = d == 1 - dcontrol = d == 0 - psi_a, psi_b = selection_score_elements(dtreat, dcontrol, g_hat_d1, g_hat_d0, pi_hat, m_hat, s, y, normalize_ipw) - all_psi_a.append(psi_a) - all_psi_b.append(psi_b) + all_M_hat.append(M_hat) + all_t_hat.append(t_hat) + all_m_hat.append(m_hat) - thetas[i_rep], ses[i_rep] = selection_dml2(psi_a, psi_b) + thetas[i_rep], ses[i_rep] = solve_score(M_hat_list, t_hat_list, m_hat_list) theta = np.median(thetas) se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) @@ -85,9 +65,8 @@ def fit_selection( "se": se, "thetas": thetas, "ses": ses, - "all_g_d1_hat": all_g_d1_hat, - "all_g_d0_hat": all_g_d0_hat, - "all_pi_hat": all_pi_hat, + "all_M_hat": all_M_hat, + "all_t_hat": all_t_hat, "all_m_hat": all_m_hat, "all_psi_a": all_psi_a, "all_psi_b": all_psi_b, @@ -95,176 +74,125 @@ def fit_selection( return res +def solve_score(M_hat, t_hat, m_hat): + pass def fit_nuisance_selection( - y, - x, - d, - z, - s, - learner_g, - learner_pi, - learner_m, - smpls, - score, - trimming_rule="truncate", - trimming_threshold=1e-2, - g_d0_params=None, - g_d1_params=None, - pi_params=None, - m_params=None, + y, + x, + d, + learner_M, + learner_t, + learner_m, + smpls, + score, + trimming_rule="truncate", + trimming_threshold=1e-2, + M_params=None, + t_params=None, + m_params=None, ): - ml_g_d1 = clone(learner_g) - ml_g_d0 = clone(learner_g) - ml_pi = clone(learner_pi) + # TODO: complete for lplr + n_obs = len(y) + ml_M = clone(learner_M) + ml_t = clone(learner_t) ml_m = clone(learner_m) - if z is None: - dx = np.column_stack((d, x)) - else: - dx = np.column_stack((d, x, z)) - - if score == "missing-at-random": - pi_hat_list = fit_predict_proba(s, dx, ml_pi, pi_params, smpls, trimming_threshold=trimming_threshold) - - m_hat_list = fit_predict_proba(d, x, ml_m, m_params, smpls) - - 
train_cond_d1_s1 = np.intersect1d(np.where(d == 1)[0], np.where(s == 1)[0]) - g_hat_d1_list = fit_predict(y, x, ml_g_d1, g_d1_params, smpls, train_cond=train_cond_d1_s1) - - train_cond_d0_s1 = np.intersect1d(np.where(d == 0)[0], np.where(s == 1)[0]) - g_hat_d0_list = fit_predict(y, x, ml_g_d0, g_d0_params, smpls, train_cond=train_cond_d0_s1) - else: - # initialize empty lists - g_hat_d1_list = [] - g_hat_d0_list = [] - pi_hat_list = [] - m_hat_list = [] - - # create strata for splitting - strata = d.reshape(-1, 1) + 2 * s.reshape(-1, 1) - - # POTENTIAL OUTCOME Y(1) - for i_fold, _ in enumerate(smpls): - ml_g_d1 = clone(learner_g) - ml_pi = clone(learner_pi) - ml_m = clone(learner_m) - - # set the params for the nuisance learners - if g_d1_params is not None: - ml_g_d1.set_params(**g_d1_params[i_fold]) - if g_d0_params is not None: - ml_g_d0.set_params(**g_d0_params[i_fold]) - if pi_params is not None: - ml_pi.set_params(**pi_params[i_fold]) - if m_params is not None: - ml_m.set_params(**m_params[i_fold]) - - train_inds = smpls[i_fold][0] - test_inds = smpls[i_fold][1] - - # start nested crossfitting - train_inds_1, train_inds_2 = train_test_split( - train_inds, test_size=0.5, random_state=42, stratify=strata[train_inds] - ) + dx = np.column_stack((d, x)) + + # initialize empty lists + g_hat_d1_list = [] + g_hat_d0_list = [] + pi_hat_list = [] + m_hat_list = [] + + # create strata for splitting + strata = d.reshape(-1, 1) + 2 * s.reshape(-1, 1) + + # POTENTIAL OUTCOME Y(1) + for i_fold, _ in enumerate(smpls): + ml_g_d1 = clone(learner_g) + ml_pi = clone(learner_pi) + ml_m = clone(learner_m) + + # set the params for the nuisance learners + if g_d1_params is not None: + ml_g_d1.set_params(**g_d1_params[i_fold]) + if g_d0_params is not None: + ml_g_d0.set_params(**g_d0_params[i_fold]) + if pi_params is not None: + ml_pi.set_params(**pi_params[i_fold]) + if m_params is not None: + ml_m.set_params(**m_params[i_fold]) + + train_inds = smpls[i_fold][0] + test_inds = smpls[i_fold][1] + + # start nested crossfitting + train_inds_1, train_inds_2 = train_test_split( + train_inds, test_size=0.5, random_state=42, stratify=strata[train_inds] + ) - s_train_1 = s[train_inds_1] - dx_train_1 = dx[train_inds_1, :] + s_train_1 = s[train_inds_1] + dx_train_1 = dx[train_inds_1, :] - # preliminary propensity score for selection - ml_pi_prelim = clone(ml_pi) - # fit on first part of training set - ml_pi_prelim.fit(dx_train_1, s_train_1) - pi_hat_prelim = _predict_zero_one_propensity(ml_pi_prelim, dx) + # preliminary propensity score for selection + ml_pi_prelim = clone(ml_pi) + # fit on first part of training set + ml_pi_prelim.fit(dx_train_1, s_train_1) + pi_hat_prelim = _predict_zero_one_propensity(ml_pi_prelim, dx) - # predictions for small pi in denominator - pi_hat = pi_hat_prelim[test_inds] + # predictions for small pi in denominator + pi_hat = pi_hat_prelim[test_inds] - # add selection indicator to covariates - xpi = np.column_stack((x, pi_hat_prelim)) + # add selection indicator to covariates + xpi = np.column_stack((x, pi_hat_prelim)) - # estimate propensity score p using the second training sample - xpi_train_2 = xpi[train_inds_2, :] - d_train_2 = d[train_inds_2] - xpi_test = xpi[test_inds, :] + # estimate propensity score p using the second training sample + xpi_train_2 = xpi[train_inds_2, :] + d_train_2 = d[train_inds_2] + xpi_test = xpi[test_inds, :] - ml_m.fit(xpi_train_2, d_train_2) + ml_m.fit(xpi_train_2, d_train_2) - m_hat = _predict_zero_one_propensity(ml_m, xpi_test) + m_hat = 
_predict_zero_one_propensity(ml_m, xpi_test) - # estimate conditional outcome on second training sample -- treatment - s1_d1_train_2_indices = np.intersect1d(np.where(d == 1)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) - xpi_s1_d1_train_2 = xpi[s1_d1_train_2_indices, :] - y_s1_d1_train_2 = y[s1_d1_train_2_indices] + # estimate conditional outcome on second training sample -- treatment + s1_d1_train_2_indices = np.intersect1d(np.where(d == 1)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) + xpi_s1_d1_train_2 = xpi[s1_d1_train_2_indices, :] + y_s1_d1_train_2 = y[s1_d1_train_2_indices] - ml_g_d1.fit(xpi_s1_d1_train_2, y_s1_d1_train_2) + ml_g_d1.fit(xpi_s1_d1_train_2, y_s1_d1_train_2) - # predict conditional outcome - g_hat_d1 = ml_g_d1.predict(xpi_test) + # predict conditional outcome + g_hat_d1 = ml_g_d1.predict(xpi_test) - # estimate conditional outcome on second training sample -- control - s1_d0_train_2_indices = np.intersect1d(np.where(d == 0)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) - xpi_s1_d0_train_2 = xpi[s1_d0_train_2_indices, :] - y_s1_d0_train_2 = y[s1_d0_train_2_indices] + # estimate conditional outcome on second training sample -- control + s1_d0_train_2_indices = np.intersect1d(np.where(d == 0)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) + xpi_s1_d0_train_2 = xpi[s1_d0_train_2_indices, :] + y_s1_d0_train_2 = y[s1_d0_train_2_indices] - ml_g_d0.fit(xpi_s1_d0_train_2, y_s1_d0_train_2) + ml_g_d0.fit(xpi_s1_d0_train_2, y_s1_d0_train_2) - # predict conditional outcome - g_hat_d0 = ml_g_d0.predict(xpi_test) + # predict conditional outcome + g_hat_d0 = ml_g_d0.predict(xpi_test) - m_hat = _trimm(m_hat, trimming_rule, trimming_threshold) + m_hat = _trimm(m_hat, trimming_rule, trimming_threshold) - # append predictions on test sample to final list of predictions - g_hat_d1_list.append(g_hat_d1) - g_hat_d0_list.append(g_hat_d0) - pi_hat_list.append(pi_hat) - m_hat_list.append(m_hat) + # append predictions on test sample to final list of predictions + g_hat_d1_list.append(g_hat_d1) + g_hat_d0_list.append(g_hat_d0) + pi_hat_list.append(pi_hat) + m_hat_list.append(m_hat) - return g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list -def compute_selection(y, g_hat_d1_list, g_hat_d0_list, pi_hat_list, m_hat_list, smpls): - g_hat_d1 = np.full_like(y, np.nan, dtype="float64") - g_hat_d0 = np.full_like(y, np.nan, dtype="float64") - pi_hat = np.full_like(y, np.nan, dtype="float64") m_hat = np.full_like(y, np.nan, dtype="float64") - for idx, (_, test_index) in enumerate(smpls): - g_hat_d1[test_index] = g_hat_d1_list[idx] - g_hat_d0[test_index] = g_hat_d0_list[idx] - pi_hat[test_index] = pi_hat_list[idx] + M_hat[test_index] = M_hat_list[idx] + t_hat[test_index] = t_hat_list[idx] m_hat[test_index] = m_hat_list[idx] - - return g_hat_d1, g_hat_d0, pi_hat, m_hat - - -def selection_score_elements(dtreat, dcontrol, g_d1, g_d0, pi, m, s, y, normalize_ipw): - # psi_a - psi_a = -1 * np.ones_like(y) - - # psi_b - if normalize_ipw: - weight_treat = sum(dtreat) / sum((dtreat * s) / (m * pi)) - weight_control = sum(dcontrol) / sum((dcontrol * s) / ((1 - m) * pi)) - - psi_b1 = weight_treat * ((dtreat * s * (y - g_d1)) / (m * pi)) + g_d1 - psi_b0 = weight_control * ((dcontrol * s * (y - g_d0)) / ((1 - m) * pi)) + g_d0 - - else: - psi_b1 = (dtreat * s * (y - g_d1)) / (m * pi) + g_d1 - psi_b0 = (dcontrol * s * (y - g_d0)) / ((1 - m) * pi) + g_d0 - - psi_b = psi_b1 - psi_b0 - - return psi_a, psi_b - - -def selection_dml2(psi_a, psi_b): - n_obs = len(psi_a) - theta_hat = 
-np.mean(psi_b) / np.mean(psi_a) - se = np.sqrt(var_selection(theta_hat, psi_a, psi_b, n_obs)) - - return theta_hat, se + return M_hat, t_hat, m_hat def var_selection(theta, psi_a, psi_b, n_obs): @@ -273,62 +201,17 @@ def var_selection(theta, psi_a, psi_b, n_obs): return var -def tune_nuisance_ssm_mar(y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, param_grid_g, param_grid_pi, param_grid_m): - d0_s1 = np.intersect1d(np.where(d == 0)[0], np.where(s == 1)[0]) - d1_s1 = np.intersect1d(np.where(d == 1)[0], np.where(s == 1)[0]) - - g0_tune_res = tune_grid_search(y, x, ml_g, smpls, param_grid_g, n_folds_tune, train_cond=d0_s1) - g1_tune_res = tune_grid_search(y, x, ml_g, smpls, param_grid_g, n_folds_tune, train_cond=d1_s1) - +def tune_nuisance(y, x, d, ml_M, ml_t, ml_m, smpls, n_folds_tune, param_grid_M, param_grid_t, param_grid_m): dx = np.column_stack((x, d)) - pi_tune_res = tune_grid_search(s, dx, ml_pi, smpls, param_grid_pi, n_folds_tune) + M_tune_res = tune_grid_search(y, dx, ml_M, smpls, param_grid_M, n_folds_tune) m_tune_res = tune_grid_search(d, x, ml_m, smpls, param_grid_m, n_folds_tune) - g0_best_params = [xx.best_params_ for xx in g0_tune_res] - g1_best_params = [xx.best_params_ for xx in g1_tune_res] - pi_best_params = [xx.best_params_ for xx in pi_tune_res] - m_best_params = [xx.best_params_ for xx in m_tune_res] - - return g0_best_params, g1_best_params, pi_best_params, m_best_params + t_tune_res = tune_grid_search(d, x, ml_t, smpls, param_grid_t, n_folds_tune) + M_best_params = [xx.best_params_ for xx in M_tune_res] + t_best_params = [xx.best_params_ for xx in t_tune_res] + m_best_params = [xx.best_params_ for xx in m_tune_res] -def tune_nuisance_ssm_nonignorable( - y, x, d, z, s, ml_g, ml_pi, ml_m, smpls, n_folds_tune, param_grid_g, param_grid_pi, param_grid_m -): - train_inds = [tr for (tr, _) in smpls] - - inner0_list, inner1_list = [], [] - for tr in train_inds: - i0, i1 = train_test_split(tr, test_size=0.5, stratify=d[tr] + 2 * s[tr], random_state=42) - inner0_list.append(i0) - inner1_list.append(i1) - - X_dz = np.c_[x, d.reshape(-1, 1), z.reshape(-1, 1)] - pi_tune_res = tune_grid_search(s, X_dz, ml_pi, [(i0, np.array([])) for i0 in inner0_list], param_grid_pi, n_folds_tune) - pi_best_params = [gs.best_params_ for gs in pi_tune_res] - - pi_hat_full = np.full_like(s, np.nan, dtype=float) - for i0, i1, gs in zip(inner0_list, inner1_list, pi_tune_res): - ml_pi_temp = clone(ml_pi) - ml_pi_temp.set_params(**gs.best_params_) - ml_pi_temp.fit(X_dz[i0], s[i0]) - ph = _predict_zero_one_propensity(ml_pi_temp, X_dz) - pi_hat_full[i1] = ph[i1] - - X_pi = np.c_[x, pi_hat_full] - m_tune_res = tune_grid_search(d, X_pi, ml_m, [(i1, np.array([])) for i1 in inner1_list], param_grid_m, n_folds_tune) - m_best_params = [gs.best_params_ for gs in m_tune_res] - - X_pi_d = np.c_[x, d.reshape(-1, 1), pi_hat_full.reshape(-1, 1)] - inner1_d0_s1 = [i1[(d[i1] == 0) & (s[i1] == 1)] for i1 in inner1_list] - inner1_d1_s1 = [i1[(d[i1] == 1) & (s[i1] == 1)] for i1 in inner1_list] - - g0_tune_res = tune_grid_search(y, X_pi_d, ml_g, [(idx, np.array([])) for idx in inner1_d0_s1], param_grid_g, n_folds_tune) - g1_tune_res = tune_grid_search(y, X_pi_d, ml_g, [(idx, np.array([])) for idx in inner1_d1_s1], param_grid_g, n_folds_tune) - - g0_best_params = [gs.best_params_ for gs in g0_tune_res] - g1_best_params = [gs.best_params_ for gs in g1_tune_res] - - return g0_best_params, g1_best_params, pi_best_params, m_best_params + t_tune_res = tune_grid_search(t_targets, x, ml_t, smpls, param_grid_t, n_folds_tune) 
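Note on the tune_nuisance helper above: as committed in this patch it tunes ml_t twice (once against d, once against an undefined t_targets) and has no return statement; the whole _utils_lplr_manual.py module is removed again in PATCH 19, where tuning moves into DoubleMLLPLR._nuisance_tuning and ml_t is tuned on fold-specific targets built as scipy.special.logit of the ml_M predictions (W_inner in that code, passed to _dml_tune with fold_specific_target=True). A self-contained sketch of that target construction is given below. The helper name tune_nuisance_lplr, the probability clipping constant eps, the inner cross-fitting of ml_M, and the use of plain GridSearchCV instead of the package-internal tune_grid_search are illustrative assumptions, not part of the patch.

import numpy as np
from scipy.special import logit
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV, KFold


def tune_nuisance_lplr(y, x, d, ml_M, ml_t, ml_m, smpls, n_folds_tune,
                       param_grid_M, param_grid_t, param_grid_m, eps=1e-12):
    # hypothetical completion of the manual tuning helper sketched in the diff above
    dx = np.column_stack((x, d))
    M_best_params, t_best_params, m_best_params = [], [], []
    for train, _ in smpls:
        cv = KFold(n_splits=n_folds_tune, shuffle=True, random_state=42)
        # ml_M models P(y = 1 | d, x); tuned on the outer training fold
        gs_M = GridSearchCV(clone(ml_M), param_grid_M, cv=cv).fit(dx[train], y[train])
        # ml_m models the treatment given the covariates
        gs_m = GridSearchCV(clone(ml_m), param_grid_m, cv=cv).fit(x[train], d[train])
        # pseudo-outcome for ml_t: logit of out-of-sample ml_M probabilities on the training fold
        M_hat = np.full(len(train), np.nan)
        for inner_train, inner_test in cv.split(train):
            ml_M_fold = clone(ml_M).set_params(**gs_M.best_params_)
            ml_M_fold.fit(dx[train][inner_train], y[train][inner_train])
            M_hat[inner_test] = ml_M_fold.predict_proba(dx[train][inner_test])[:, 1]
        w = logit(np.clip(M_hat, eps, 1 - eps))
        gs_t = GridSearchCV(clone(ml_t), param_grid_t, cv=cv).fit(x[train], w)
        M_best_params.append(gs_M.best_params_)
        t_best_params.append(gs_t.best_params_)
        m_best_params.append(gs_m.best_params_)
    return M_best_params, t_best_params, m_best_params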
diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index c561d9fe..8e551cab 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -3,7 +3,8 @@ import numpy as np import pytest from sklearn.base import clone -from sklearn.linear_model import LassoCV, LogisticRegressionCV +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor +from sklearn.linear_model import LassoCV, LogisticRegressionCV, LogisticRegression import doubleml as dml @@ -11,38 +12,36 @@ from ._utils_ssm_manual import fit_selection -@pytest.fixture(scope="module", params=[[LassoCV(), LogisticRegressionCV(penalty="l1", solver="liblinear")]]) -def learner(request): +@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) +def learner_M(request): return request.param - -@pytest.fixture(scope="module", params=["missing-at-random", "nonignorable"]) -def score(request): +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_t(request): return request.param -@pytest.fixture(scope="module", params=[True, False]) -def normalize_ipw(request): +@pytest.fixture(scope="module", params=[LogisticRegression(random_state=42)]) +def learner_m(request): return request.param - -@pytest.fixture(scope="module", params=[0.01]) -def trimming_threshold(request): +@pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) +def score(request): return request.param @pytest.fixture(scope="module") def dml_selection_fixture( - generate_data_selection_mar, generate_data_selection_nonignorable, learner, score, trimming_threshold, normalize_ipw + generate_data_selection, learner, score, learner_M, + learner_t, + learner_m, ): n_folds = 3 # collect data np.random.seed(42) - if score == "missing-at-random": - (x, y, d, z, s) = generate_data_selection_mar - else: - (x, y, d, z, s) = generate_data_selection_nonignorable + (x, y, d, z, s) = generate_data_selection + ml_g = clone(learner[0]) ml_pi = clone(learner[1]) diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index 8a55fe59..cfe9f067 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -11,7 +11,7 @@ np.random.seed(3141) n = 100 # create test data and basic learners -dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=10) +dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=20) ml_M = RandomForestClassifier() ml_t = RandomForestRegressor() ml_m = RandomForestRegressor() @@ -22,13 +22,13 @@ @pytest.mark.ci def test_lplr_exception_data(): msg = ( - r"The data must be of DoubleMLData type\. .* of type " + r"The data must be of DoubleMLData.* type\.[\s\S]* of type " r" was passed\." ) with pytest.raises(TypeError, match=msg): _ = DoubleMLLPLR(pd.DataFrame(), ml_M, ml_t, ml_m) - dml_data_nb = make_lplr_LZZ2020(alpha=0.5, n_obs=50, dim_x=5) + dml_data_nb = make_lplr_LZZ2020(alpha=0.5, n_obs=50, dim_x=20) dml_data_nb.data[dml_data_nb.y_col] = dml_data_nb.data[dml_data_nb.y_col] + 1 dml_data_nb._set_y_z() with pytest.raises(TypeError, match="The outcome variable y must be binary with values 0 and 1."): @@ -41,7 +41,7 @@ def test_lplr_exception_scores(): msg = "Invalid score MAR" with pytest.raises(ValueError, match=msg): _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="MAR") - msg = "score should be string. 0 was passed." + msg = "score should be a string. 0 was passed." 
with pytest.raises(TypeError, match=msg): _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score=0) @@ -71,7 +71,7 @@ def test_ssm_exception_resampling(): @pytest.mark.ci def test_lplr_exception_get_params(): - msg = "Invalid nuisance learner ml_x. Valid nuisance learner ml_M or ml_g_t or ml_m or ml_a." + msg = "Invalid nuisance learner ml_x. Valid nuisance learner ml_m or ml_t or ml_M or ml_a." with pytest.raises(ValueError, match=msg): dml_lplr.get_params("ml_x") @@ -148,7 +148,7 @@ def test_lplr_exception_confint(): @pytest.mark.ci def test_lplr_exception_set_ml_nuisance_params(): # invalid learner name - msg = "Invalid nuisance learner g. Valid nuisance learner ml_M or ml_t or ml_m or ml_a." + msg = "Invalid nuisance learner g. Valid nuisance learner ml_m or ml_t or ml_M or ml_a." with pytest.raises(ValueError, match=msg): dml_lplr.set_ml_nuisance_params("g", "d", {"alpha": 0.1}) # invalid treatment variable @@ -171,7 +171,7 @@ class _DummyNoClassifier(_DummyNoGetParams): def get_params(self): pass - def predict_proba(self): + def predict(self): pass @@ -216,7 +216,7 @@ def test_lplr_exception_learner(): log_reg._estimator_type = None msg = ( r"Learner provided for ml_m is probably invalid: LogisticRegressionManipulatedType\(\) is \(probably\) " - r"no classifier\." + r"neither a regressor nor a classifier. Method predict is used for prediction\." ) with pytest.warns(UserWarning, match=msg): _ = DoubleMLLPLR(dml_data, ml_M, ml_t, log_reg) @@ -284,7 +284,7 @@ def test_double_ml_exception_evaluate_learner(): dml_lplr_obj.evaluate_learners(metric="mse") msg = ( - r"The learners have to be a subset of \['ml_M', 'ml_t', 'ml_m', 'ml_a'\]\. " + r"The learners have to be a subset of \['ml_m', 'ml_t', 'ml_M', 'ml_a'\]\. " r"Learners \['ml_mu', 'ml_p'\] provided." 
) with pytest.raises(ValueError, match=msg): diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 0e0fa7bf..28aa387f 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -3,17 +3,20 @@ import numpy as np import pytest from sklearn.base import clone -from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml from ...tests._utils import draw_smpls -from ._utils_lplr_manual import fit_selection, tune_nuisance_ssm_mar, tune_nuisance_ssm_nonignorable +from ._utils_lplr_manual import fit_selection, tune_nuisance +@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) +def learner_M(request): + return request.param @pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) -def learner_g(request): +def learner_t(request): return request.param @@ -22,84 +25,63 @@ def learner_m(request): return request.param -@pytest.fixture(scope="module", params=["missing-at-random", "nonignorable"]) +@pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) def score(request): return request.param -@pytest.fixture(scope="module", params=[True, False]) -def normalize_ipw(request): - return request.param - - @pytest.fixture(scope="module", params=[True, False]) def tune_on_folds(request): return request.param def get_par_grid(learner): - if learner.__class__ in [RandomForestRegressor]: + if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]: par_grid = {"n_estimators": [5, 10, 20]} else: - assert learner.__class__ in [LogisticRegression] + assert learner.__class__ in [LogisticRegression, Lasso] par_grid = {"C": np.logspace(-2, 2, 10)} return par_grid @pytest.fixture(scope="module") -def dml_ssm_fixture( - generate_data_selection_mar, - generate_data_selection_nonignorable, - learner_g, +def dml_lplr_fixture( + generate_data_selection, + learner_M, + learner_t, learner_m, score, - normalize_ipw, tune_on_folds, ): - par_grid = {"ml_g": get_par_grid(learner_g), "ml_pi": get_par_grid(learner_m), "ml_m": get_par_grid(learner_m)} + par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m)} n_folds_tune = 4 n_folds = 2 # collect data np.random.seed(42) - if score == "missing-at-random": - (x, y, d, z, s) = generate_data_selection_mar - else: - (x, y, d, z, s) = generate_data_selection_nonignorable + x, y, d = generate_data_selection + n_obs = len(y) all_smpls = draw_smpls(n_obs, n_folds) - ml_g = clone(learner_g) - ml_pi = clone(learner_m) + ml_M = clone(learner_M) + ml_t = clone(learner_t) ml_m = clone(learner_m) np.random.seed(42) - if score == "missing-at-random": - obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s) - dml_sel_obj = dml.DoubleMLSSM( - obj_dml_data, - ml_g, - ml_pi, - ml_m, - n_folds=n_folds, - score=score, - normalize_ipw=normalize_ipw, - draw_sample_splitting=False, - ) - else: - assert score == "nonignorable" - obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=z, s=s) - dml_sel_obj = dml.DoubleMLSSM( - obj_dml_data, - ml_g, - ml_pi, - ml_m, - n_folds=n_folds, - score=score, - normalize_ipw=normalize_ipw, - draw_sample_splitting=False, - ) + + obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + dml_sel_obj = dml.DoubleMLLPLR( + obj_dml_data, + ml_M, + ml_t, 
+ ml_m, + n_folds=n_folds, + score=score, + draw_sample_splitting=False, + ) + # synchronize the sample splitting np.random.seed(42) @@ -115,95 +97,54 @@ def dml_ssm_fixture( np.random.seed(42) smpls = all_smpls[0] if tune_on_folds: - if score == "missing-at-random": - g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_mar( - y, - x, - d, - z, - s, - clone(learner_g), - clone(learner_m), - clone(learner_m), - smpls, - n_folds_tune, - par_grid["ml_g"], - par_grid["ml_pi"], - par_grid["ml_m"], - ) - elif score == "nonignorable": - g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_nonignorable( + + M_best_params, t_best_params, m_best_params = tune_nuisance( y, x, d, - z, - s, - clone(learner_g), - clone(learner_m), + clone(learner_M), + clone(learner_t), clone(learner_m), smpls, n_folds_tune, - par_grid["ml_g"], - par_grid["ml_pi"], + par_grid["ml_M"], + par_grid["ml_t"], par_grid["ml_m"], ) else: xx = [(np.arange(len(y)), np.array([]))] - if score == "missing-at-random": - g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_mar( + g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance( y, x, d, - z, - s, - clone(learner_g), - clone(learner_m), + clone(learner_M), + clone(learner_t), clone(learner_m), xx, n_folds_tune, - par_grid["ml_g"], - par_grid["ml_pi"], - par_grid["ml_m"], - ) - elif score == "nonignorable": - g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance_ssm_nonignorable( - y, - x, - d, - z, - s, - clone(learner_g), - clone(learner_m), - clone(learner_m), - xx, - n_folds_tune, - par_grid["ml_g"], - par_grid["ml_pi"], + par_grid["ml_M"], + par_grid["ml_t"], par_grid["ml_m"], ) - g0_best_params = g0_best_params * n_folds - g1_best_params = g1_best_params * n_folds - pi_best_params = pi_best_params * n_folds - m_best_params = m_best_params * n_folds + + M_best_params = M_best_params * n_folds + t_best_params = t_best_params * n_folds + m_best_params = m_best_params * n_folds np.random.seed(42) res_manual = fit_selection( y, x, d, - z, - s, - clone(learner_g), - clone(learner_m), + clone(learner_M), + clone(learner_t), clone(learner_m), all_smpls, score, - normalize_ipw=normalize_ipw, - g_d0_params=g0_best_params, - g_d1_params=g1_best_params, - pi_params=pi_best_params, + M_params=M_best_params, + t_params=t_best_params, m_params=m_best_params, ) @@ -219,9 +160,9 @@ def dml_ssm_fixture( @pytest.mark.ci def test_dml_ssm_coef(dml_ssm_fixture): - assert math.isclose(dml_ssm_fixture["coef"], dml_ssm_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_lplr_fixture["coef"], dml_lplr_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) @pytest.mark.ci def test_dml_ssm_se(dml_ssm_fixture): - assert math.isclose(dml_ssm_fixture["se"], dml_ssm_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) + assert math.isclose(dml_lplr_fixture["se"], dml_lplr_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) From 2c626a011bb2d68f658f2113eaff47a37dcbcd8a Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Mon, 27 Oct 2025 22:07:08 -0700 Subject: [PATCH 18/23] Cleanup --- doubleml/plm/__init__.py | 2 +- doubleml/plm/datasets/__init__.py | 2 +- doubleml/plm/datasets/dgp_lplr_LZZ2020.py | 2 ++ doubleml/plm/tests/_utils_lplr_manual.py | 2 +- doubleml/plm/tests/test_lplr.py | 2 +- doubleml/plm/tests/test_lplr_tune.py | 3 ++- 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py index 
37262ed9..f5e135e3 100644 --- a/doubleml/plm/__init__.py +++ b/doubleml/plm/__init__.py @@ -2,9 +2,9 @@ The :mod:`doubleml.plm` module implements double machine learning estimates based on partially linear models. """ +from .lplr import DoubleMLLPLR from .pliv import DoubleMLPLIV from .plr import DoubleMLPLR -from .lplr import DoubleMLLPLR __all__ = [ "DoubleMLPLR", diff --git a/doubleml/plm/datasets/__init__.py b/doubleml/plm/datasets/__init__.py index 5f433ae7..6e8e9bb5 100644 --- a/doubleml/plm/datasets/__init__.py +++ b/doubleml/plm/datasets/__init__.py @@ -4,11 +4,11 @@ from ._make_pliv_data import _make_pliv_data from .dgp_confounded_plr_data import make_confounded_plr_data +from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020 from .dgp_pliv_CHS2015 import make_pliv_CHS2015 from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021 from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 from .dgp_plr_turrell2018 import make_plr_turrell2018 -from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020 __all__ = [ "make_plr_CCDDHNR2018", diff --git a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py index 3d6d7127..a9b4ece9 100644 --- a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py +++ b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py @@ -131,6 +131,8 @@ def a_0(X): elif treatment == "binary_unbalanced": d_cont = a_0(x) d = np.random.binomial(1, expit(d_cont)) + else: + raise ValueError("Invalid treatment type.") p = expit(alpha * d[:] + r_0(x)) diff --git a/doubleml/plm/tests/_utils_lplr_manual.py b/doubleml/plm/tests/_utils_lplr_manual.py index 69904701..072eb2b5 100644 --- a/doubleml/plm/tests/_utils_lplr_manual.py +++ b/doubleml/plm/tests/_utils_lplr_manual.py @@ -2,7 +2,7 @@ from sklearn.base import clone from sklearn.model_selection import train_test_split -from ...tests._utils import fit_predict, fit_predict_proba, tune_grid_search +from ...tests._utils import tune_grid_search from ...utils._estimation import _predict_zero_one_propensity from ...utils._propensity_score import _trimm diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index 8e551cab..9ef7ec73 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -4,7 +4,7 @@ import pytest from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LassoCV, LogisticRegressionCV, LogisticRegression +from sklearn.linear_model import LogisticRegression import doubleml as dml diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 28aa387f..6d13e5d1 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -3,7 +3,7 @@ import numpy as np import pytest from sklearn.base import clone -from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml @@ -11,6 +11,7 @@ from ...tests._utils import draw_smpls from ._utils_lplr_manual import fit_selection, tune_nuisance + @pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) def learner_M(request): return request.param From 98194367463f0e382726c8a01dbb05a7d5ff9f19 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Wed, 5 Nov 2025 18:41:19 -0800 Subject: [PATCH 19/23] Tests updated --- doubleml/plm/lplr.py | 10 + doubleml/plm/tests/_utils_lplr_manual.py | 217 
--------------------- doubleml/plm/tests/test_lplr.py | 79 ++------ doubleml/plm/tests/test_lplr_exceptions.py | 1 + doubleml/plm/tests/test_lplr_tune.py | 129 ++++-------- 5 files changed, 67 insertions(+), 369 deletions(-) delete mode 100644 doubleml/plm/tests/_utils_lplr_manual.py diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 08a6bbfa..468b9359 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -389,6 +389,8 @@ def _sensitivity_element_est(self, preds): def _nuisance_tuning( self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search ): + if self._i_rep is None: + raise ValueError("tune_on_folds must be True as targets have to be created for ml_t on folds.") # TODO: test x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) @@ -470,6 +472,13 @@ def _nuisance_tuning( w = scipy.special.logit(M_iteration) W_inner.append(w) + # Reshape W_inner into full-length arrays per fold: fill train indices, others are NaN + W_targets = [] + for i, train in enumerate(train_inds): + wt = np.full(x.shape[0], np.nan, dtype=float) + wt[train] = W_inner[i] + W_targets.append(wt) + t_tune_res = _dml_tune( W_inner, x, @@ -481,6 +490,7 @@ def _nuisance_tuning( n_jobs_cv, search_mode, n_iter_randomized_search, + fold_specific_target=True ) t_best_params = [xx.best_params_ for xx in t_tune_res] diff --git a/doubleml/plm/tests/_utils_lplr_manual.py b/doubleml/plm/tests/_utils_lplr_manual.py deleted file mode 100644 index 072eb2b5..00000000 --- a/doubleml/plm/tests/_utils_lplr_manual.py +++ /dev/null @@ -1,217 +0,0 @@ -import numpy as np -from sklearn.base import clone -from sklearn.model_selection import train_test_split - -from ...tests._utils import tune_grid_search -from ...utils._estimation import _predict_zero_one_propensity -from ...utils._propensity_score import _trimm - - -def fit_selection( - y, - x, - d, - learner_M, - learner_t, - learner_m, - all_smpls, - score, - trimming_rule="truncate", - trimming_threshold=1e-2, - n_rep=1, - M_params=None, - t_params=None, - m_params=None, -): - n_obs = len(y) - - thetas = np.zeros(n_rep) - ses = np.zeros(n_rep) - - all_M_hat = list() - all_t_hat = list() - all_m_hat = list() - - for i_rep in range(n_rep): - smpls = all_smpls[i_rep] - - M_hat_list, t_hat_list, m_hat_list = fit_nuisance_selection( - y, - x, - d, - learner_M, - learner_t, - learner_m, - smpls, - score, - trimming_rule=trimming_rule, - trimming_threshold=trimming_threshold, - M_params=M_params, - t_params=t_params, - m_params=m_params, - ) - - all_M_hat.append(M_hat) - all_t_hat.append(t_hat) - all_m_hat.append(m_hat) - - thetas[i_rep], ses[i_rep] = solve_score(M_hat_list, t_hat_list, m_hat_list) - - theta = np.median(thetas) - se = np.sqrt(np.median(np.power(ses, 2) * n_obs + np.power(thetas - theta, 2)) / n_obs) - - res = { - "theta": theta, - "se": se, - "thetas": thetas, - "ses": ses, - "all_M_hat": all_M_hat, - "all_t_hat": all_t_hat, - "all_m_hat": all_m_hat, - "all_psi_a": all_psi_a, - "all_psi_b": all_psi_b, - } - - return res - -def solve_score(M_hat, t_hat, m_hat): - pass - -def fit_nuisance_selection( - y, - x, - d, - learner_M, - learner_t, - learner_m, - smpls, - score, - trimming_rule="truncate", - trimming_threshold=1e-2, - M_params=None, - t_params=None, - m_params=None, -): - # TODO: complete for lplr - n_obs = len(y) - ml_M = clone(learner_M) - ml_t = clone(learner_t) - ml_m = clone(learner_m) - - dx = np.column_stack((d, 
x)) - - # initialize empty lists - g_hat_d1_list = [] - g_hat_d0_list = [] - pi_hat_list = [] - m_hat_list = [] - - # create strata for splitting - strata = d.reshape(-1, 1) + 2 * s.reshape(-1, 1) - - # POTENTIAL OUTCOME Y(1) - for i_fold, _ in enumerate(smpls): - ml_g_d1 = clone(learner_g) - ml_pi = clone(learner_pi) - ml_m = clone(learner_m) - - # set the params for the nuisance learners - if g_d1_params is not None: - ml_g_d1.set_params(**g_d1_params[i_fold]) - if g_d0_params is not None: - ml_g_d0.set_params(**g_d0_params[i_fold]) - if pi_params is not None: - ml_pi.set_params(**pi_params[i_fold]) - if m_params is not None: - ml_m.set_params(**m_params[i_fold]) - - train_inds = smpls[i_fold][0] - test_inds = smpls[i_fold][1] - - # start nested crossfitting - train_inds_1, train_inds_2 = train_test_split( - train_inds, test_size=0.5, random_state=42, stratify=strata[train_inds] - ) - - s_train_1 = s[train_inds_1] - dx_train_1 = dx[train_inds_1, :] - - # preliminary propensity score for selection - ml_pi_prelim = clone(ml_pi) - # fit on first part of training set - ml_pi_prelim.fit(dx_train_1, s_train_1) - pi_hat_prelim = _predict_zero_one_propensity(ml_pi_prelim, dx) - - # predictions for small pi in denominator - pi_hat = pi_hat_prelim[test_inds] - - # add selection indicator to covariates - xpi = np.column_stack((x, pi_hat_prelim)) - - # estimate propensity score p using the second training sample - xpi_train_2 = xpi[train_inds_2, :] - d_train_2 = d[train_inds_2] - xpi_test = xpi[test_inds, :] - - ml_m.fit(xpi_train_2, d_train_2) - - m_hat = _predict_zero_one_propensity(ml_m, xpi_test) - - # estimate conditional outcome on second training sample -- treatment - s1_d1_train_2_indices = np.intersect1d(np.where(d == 1)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) - xpi_s1_d1_train_2 = xpi[s1_d1_train_2_indices, :] - y_s1_d1_train_2 = y[s1_d1_train_2_indices] - - ml_g_d1.fit(xpi_s1_d1_train_2, y_s1_d1_train_2) - - # predict conditional outcome - g_hat_d1 = ml_g_d1.predict(xpi_test) - - # estimate conditional outcome on second training sample -- control - s1_d0_train_2_indices = np.intersect1d(np.where(d == 0)[0], np.intersect1d(np.where(s == 1)[0], train_inds_2)) - xpi_s1_d0_train_2 = xpi[s1_d0_train_2_indices, :] - y_s1_d0_train_2 = y[s1_d0_train_2_indices] - - ml_g_d0.fit(xpi_s1_d0_train_2, y_s1_d0_train_2) - - # predict conditional outcome - g_hat_d0 = ml_g_d0.predict(xpi_test) - - m_hat = _trimm(m_hat, trimming_rule, trimming_threshold) - - # append predictions on test sample to final list of predictions - g_hat_d1_list.append(g_hat_d1) - g_hat_d0_list.append(g_hat_d0) - pi_hat_list.append(pi_hat) - m_hat_list.append(m_hat) - - - - m_hat = np.full_like(y, np.nan, dtype="float64") - for idx, (_, test_index) in enumerate(smpls): - M_hat[test_index] = M_hat_list[idx] - t_hat[test_index] = t_hat_list[idx] - m_hat[test_index] = m_hat_list[idx] - return M_hat, t_hat, m_hat - - -def var_selection(theta, psi_a, psi_b, n_obs): - J = np.mean(psi_a) - var = 1 / n_obs * np.mean(np.power(np.multiply(psi_a, theta) + psi_b, 2)) / np.power(J, 2) - return var - - -def tune_nuisance(y, x, d, ml_M, ml_t, ml_m, smpls, n_folds_tune, param_grid_M, param_grid_t, param_grid_m): - dx = np.column_stack((x, d)) - - M_tune_res = tune_grid_search(y, dx, ml_M, smpls, param_grid_M, n_folds_tune) - - m_tune_res = tune_grid_search(d, x, ml_m, smpls, param_grid_m, n_folds_tune) - - t_tune_res = tune_grid_search(d, x, ml_t, smpls, param_grid_t, n_folds_tune) - - M_best_params = [xx.best_params_ for xx in 
M_tune_res] - t_best_params = [xx.best_params_ for xx in t_tune_res] - m_best_params = [xx.best_params_ for xx in m_tune_res] - - t_tune_res = tune_grid_search(t_targets, x, ml_t, smpls, param_grid_t, n_folds_tune) diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index 9ef7ec73..154c4763 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -1,15 +1,10 @@ -import math - import numpy as np import pytest from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import LogisticRegression import doubleml as dml - -from ...tests._utils import draw_smpls -from ._utils_ssm_manual import fit_selection +from ..datasets import make_lplr_LZZ2020 @pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) @@ -21,7 +16,7 @@ def learner_t(request): return request.param -@pytest.fixture(scope="module", params=[LogisticRegression(random_state=42)]) +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) def learner_m(request): return request.param @@ -31,74 +26,36 @@ def score(request): @pytest.fixture(scope="module") -def dml_selection_fixture( - generate_data_selection, learner, score, learner_M, - learner_t, - learner_m, +def dml_lplr_fixture( + score, learner_M, learner_t, learner_m, ): - n_folds = 3 + n_folds = 5 + alpha = 0.5 # collect data np.random.seed(42) - (x, y, d, z, s) = generate_data_selection - - - ml_g = clone(learner[0]) - ml_pi = clone(learner[1]) - ml_m = clone(learner[1]) - - np.random.seed(42) - n_obs = len(y) - all_smpls = draw_smpls(n_obs, n_folds) + obj_dml_data = make_lplr_LZZ2020(alpha=alpha) - np.random.seed(42) - if score == "missing-at-random": - obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=None, s=s) - dml_sel_obj = dml.DoubleMLSSM(obj_dml_data, ml_g, ml_pi, ml_m, n_folds=n_folds, score=score) - else: - assert score == "nonignorable" - obj_dml_data = dml.DoubleMLSSMData.from_arrays(x, y, d, z=z, s=s) - dml_sel_obj = dml.DoubleMLSSM(obj_dml_data, ml_g, ml_pi, ml_m, n_folds=n_folds, score=score) + ml_M = clone(learner_M) + ml_t = clone(learner_t) + ml_m = clone(learner_m) - np.random.seed(42) - dml_sel_obj.set_sample_splitting(all_smpls=all_smpls) + dml_sel_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m, n_folds=n_folds, score=score) dml_sel_obj.fit() - np.random.seed(42) - res_manual = fit_selection( - y, - x, - d, - z, - s, - clone(learner[0]), - clone(learner[1]), - clone(learner[1]), - all_smpls, - score, - trimming_rule="truncate", - trimming_threshold=trimming_threshold, - normalize_ipw=normalize_ipw, - ) - res_dict = { "coef": dml_sel_obj.coef[0], - "coef_manual": res_manual["theta"], "se": dml_sel_obj.se[0], - "se_manual": res_manual["se"], + "true_coef": alpha, } - # sensitivity tests - # TODO - return res_dict @pytest.mark.ci -def test_dml_selection_coef(dml_selection_fixture): - assert math.isclose(dml_selection_fixture["coef"], dml_selection_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-2) - - -@pytest.mark.ci -def test_dml_selection_se(dml_selection_fixture): - assert math.isclose(dml_selection_fixture["se"], dml_selection_fixture["se_manual"], rel_tol=1e-9, abs_tol=5e-2) +def test_dml_lplr_coef(dml_lplr_fixture): + # true_coef should lie within three standard deviations of the estimate + coef = dml_lplr_fixture["coef"] + se = dml_lplr_fixture["se"] + true_coef = dml_lplr_fixture["true_coef"] + assert abs(coef - true_coef) <= 3.0 * np.sqrt(se) diff --git 
a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index cfe9f067..1be83c12 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -295,3 +295,4 @@ def eval_fct(y_pred, y_true): with pytest.raises(ValueError): dml_lplr_obj.evaluate_learners(metric=eval_fct) + diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 6d13e5d1..2926d755 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -7,10 +7,7 @@ from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml - -from ...tests._utils import draw_smpls -from ._utils_lplr_manual import fit_selection, tune_nuisance - +from ..datasets import make_lplr_LZZ2020 @pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) def learner_M(request): @@ -21,20 +18,19 @@ def learner_t(request): return request.param -@pytest.fixture(scope="module", params=[LogisticRegression(random_state=42)]) +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) def learner_m(request): return request.param +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_a(request): + return request.param @pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) def score(request): return request.param -@pytest.fixture(scope="module", params=[True, False]) -def tune_on_folds(request): - return request.param - def get_par_grid(learner): if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]: @@ -47,123 +43,74 @@ def get_par_grid(learner): @pytest.fixture(scope="module") def dml_lplr_fixture( - generate_data_selection, learner_M, learner_t, learner_m, + learner_a, score, - tune_on_folds, + tune_on_folds=True, ): - par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m)} + par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m), "ml_a": get_par_grid(learner_a)} n_folds_tune = 4 - n_folds = 2 - - # collect data - np.random.seed(42) - x, y, d = generate_data_selection - + n_folds = 5 + alpha = 0.5 - n_obs = len(y) - all_smpls = draw_smpls(n_obs, n_folds) ml_M = clone(learner_M) ml_t = clone(learner_t) ml_m = clone(learner_m) + ml_a = clone(learner_a) - np.random.seed(42) - - obj_dml_data = dml.DoubleMLData.from_arrays(x, y, d) + obj_dml_data = make_lplr_LZZ2020(alpha=alpha) dml_sel_obj = dml.DoubleMLLPLR( obj_dml_data, ml_M, ml_t, ml_m, + ml_a=ml_a, n_folds=n_folds, score=score, - draw_sample_splitting=False, ) - - # synchronize the sample splitting - np.random.seed(42) - dml_sel_obj.set_sample_splitting(all_smpls=all_smpls) - - np.random.seed(42) # tune hyperparameters tune_res = dml_sel_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False) - assert isinstance(tune_res, dml.DoubleMLSSM) + assert isinstance(tune_res, dml.DoubleMLLPLR) dml_sel_obj.fit() - np.random.seed(42) - smpls = all_smpls[0] - if tune_on_folds: - - M_best_params, t_best_params, m_best_params = tune_nuisance( - y, - x, - d, - clone(learner_M), - clone(learner_t), - clone(learner_m), - smpls, - n_folds_tune, - par_grid["ml_M"], - par_grid["ml_t"], - par_grid["ml_m"], - ) - - else: - xx = [(np.arange(len(y)), np.array([]))] - g0_best_params, g1_best_params, pi_best_params, m_best_params = tune_nuisance( - y, - x, - d, - clone(learner_M), - clone(learner_t), - 
clone(learner_m), - xx, - n_folds_tune, - par_grid["ml_M"], - par_grid["ml_t"], - par_grid["ml_m"], - ) - - - M_best_params = M_best_params * n_folds - t_best_params = t_best_params * n_folds - m_best_params = m_best_params * n_folds - - np.random.seed(42) - res_manual = fit_selection( - y, - x, - d, - clone(learner_M), - clone(learner_t), - clone(learner_m), - all_smpls, - score, - M_params=M_best_params, - t_params=t_best_params, - m_params=m_best_params, - ) - res_dict = { "coef": dml_sel_obj.coef[0], - "coef_manual": res_manual["theta"], "se": dml_sel_obj.se[0], - "se_manual": res_manual["se"], + "true_coef": alpha, } return res_dict @pytest.mark.ci -def test_dml_ssm_coef(dml_ssm_fixture): - assert math.isclose(dml_lplr_fixture["coef"], dml_lplr_fixture["coef_manual"], rel_tol=1e-9, abs_tol=1e-4) +def test_dml_selection_coef(dml_lplr_fixture): + # true_coef should lie within three standard deviations of the estimate + coef = dml_lplr_fixture["coef"] + se = dml_lplr_fixture["se"] + true_coef = dml_lplr_fixture["true_coef"] + assert abs(coef - true_coef) <= 3.0 * np.sqrt(se) @pytest.mark.ci -def test_dml_ssm_se(dml_ssm_fixture): - assert math.isclose(dml_lplr_fixture["se"], dml_lplr_fixture["se_manual"], rel_tol=1e-9, abs_tol=1e-4) +def test_lplr_exception_tuning( + learner_M, + learner_t, + learner_m, + learner_a,): + # LPLR valid scores are 'nuisance_space' and 'instrument' + obj_dml_data = make_lplr_LZZ2020(alpha=0.5) + ml_M = clone(learner_M) + ml_t = clone(learner_t) + ml_m = clone(learner_m) + ml_a = clone(learner_a) + dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m) + par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m), + "ml_a": get_par_grid(learner_a)} + msg = "tune_on_folds must be True as targets have to be created for ml_t on folds." 
+ with pytest.raises(ValueError, match=msg): + dml_lplr_obj.tune(par_grid, tune_on_folds=False) \ No newline at end of file From 5a7e2796fb35282e49c8ef23e6db95b6030a6d22 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Wed, 5 Nov 2025 18:45:15 -0800 Subject: [PATCH 20/23] Pre-commit checks --- doubleml/plm/lplr.py | 3 +-- doubleml/plm/tests/test_lplr.py | 8 +++++- doubleml/plm/tests/test_lplr_exceptions.py | 1 - doubleml/plm/tests/test_lplr_tune.py | 31 +++++++++++++++------- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index 468b9359..af545216 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -109,7 +109,6 @@ def __init__( _ = self._check_learner(ml_t, "ml_t", regressor=True, classifier=False) _ = self._check_learner(ml_M, "ml_M", regressor=False, classifier=True) - ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True) self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M} @@ -490,7 +489,7 @@ def _nuisance_tuning( n_jobs_cv, search_mode, n_iter_randomized_search, - fold_specific_target=True + fold_specific_target=True, ) t_best_params = [xx.best_params_ for xx in t_tune_res] diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index 154c4763..4eaf8613 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -4,6 +4,7 @@ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor import doubleml as dml + from ..datasets import make_lplr_LZZ2020 @@ -11,6 +12,7 @@ def learner_M(request): return request.param + @pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) def learner_t(request): return request.param @@ -20,6 +22,7 @@ def learner_t(request): def learner_m(request): return request.param + @pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) def score(request): return request.param @@ -27,7 +30,10 @@ def score(request): @pytest.fixture(scope="module") def dml_lplr_fixture( - score, learner_M, learner_t, learner_m, + score, + learner_M, + learner_t, + learner_m, ): n_folds = 5 alpha = 0.5 diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py index 1be83c12..cfe9f067 100644 --- a/doubleml/plm/tests/test_lplr_exceptions.py +++ b/doubleml/plm/tests/test_lplr_exceptions.py @@ -295,4 +295,3 @@ def eval_fct(y_pred, y_true): with pytest.raises(ValueError): dml_lplr_obj.evaluate_learners(metric=eval_fct) - diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 2926d755..70ea6381 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -1,5 +1,3 @@ -import math - import numpy as np import pytest from sklearn.base import clone @@ -7,12 +5,15 @@ from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml + from ..datasets import make_lplr_LZZ2020 + @pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) def learner_M(request): return request.param + @pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) def learner_t(request): return request.param @@ -22,16 +23,17 @@ def learner_t(request): def learner_m(request): return request.param + @pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) def learner_a(request): return request.param + @pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) def score(request): return 
-
 
 
 def get_par_grid(learner):
     if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]:
         par_grid = {"n_estimators": [5, 10, 20]}
@@ -50,12 +52,16 @@ def dml_lplr_fixture(
     score,
     tune_on_folds=True,
 ):
-    par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m), "ml_a": get_par_grid(learner_a)}
+    par_grid = {
+        "ml_M": get_par_grid(learner_M),
+        "ml_t": get_par_grid(learner_t),
+        "ml_m": get_par_grid(learner_m),
+        "ml_a": get_par_grid(learner_a),
+    }
     n_folds_tune = 4
     n_folds = 5
     alpha = 0.5
-
     ml_M = clone(learner_M)
     ml_t = clone(learner_t)
     ml_m = clone(learner_m)
@@ -101,16 +107,21 @@ def test_lplr_exception_tuning(
     learner_M,
     learner_t,
     learner_m,
-    learner_a,):
+    learner_a,
+):
     # LPLR valid scores are 'nuisance_space' and 'instrument'
     obj_dml_data = make_lplr_LZZ2020(alpha=0.5)
     ml_M = clone(learner_M)
     ml_t = clone(learner_t)
     ml_m = clone(learner_m)
-    ml_a = clone(learner_a)
+
     dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m)
-    par_grid = {"ml_M": get_par_grid(learner_M), "ml_t": get_par_grid(learner_t), "ml_m": get_par_grid(learner_m),
-                "ml_a": get_par_grid(learner_a)}
+    par_grid = {
+        "ml_M": get_par_grid(learner_M),
+        "ml_t": get_par_grid(learner_t),
+        "ml_m": get_par_grid(learner_m),
+        "ml_a": get_par_grid(learner_a),
+    }
     msg = "tune_on_folds must be True as targets have to be created for ml_t on folds."
     with pytest.raises(ValueError, match=msg):
-        dml_lplr_obj.tune(par_grid, tune_on_folds=False)
\ No newline at end of file
+        dml_lplr_obj.tune(par_grid, tune_on_folds=False)

From fc03cc65aaf2f216b8e44d2e5f4aee9adf8727ca Mon Sep 17 00:00:00 2001
From: Julius Herzig
Date: Thu, 6 Nov 2025 11:39:25 -0800
Subject: [PATCH 21/23] Pre-commit checks on all files

---
 doubleml/plm/__init__.py                   | 6 +-----
 doubleml/plm/datasets/dgp_lplr_LZZ2020.py  | 1 +
 doubleml/plm/tests/test_lplr_exceptions.py | 8 ++------
 3 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py
index f5e135e3..283bc91b 100644
--- a/doubleml/plm/__init__.py
+++ b/doubleml/plm/__init__.py
@@ -6,8 +6,4 @@
 from .pliv import DoubleMLPLIV
 from .plr import DoubleMLPLR
 
-__all__ = [
-    "DoubleMLPLR",
-    "DoubleMLPLIV",
-    "DoubleMLLPLR"
-]
+__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR"]
diff --git a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py
index a9b4ece9..284da7d8 100644
--- a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py
+++ b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py
@@ -90,6 +90,7 @@ def r_0(X):
             + 0.25 * np.where(X[:, 10] > 0, 1, 0)
             + -0.25 * np.where(X[:, 12] > 0, 1, 0)
         )
+
     else:
 
         def r_0(X):
diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py
index cfe9f067..c4c57fd9 100644
--- a/doubleml/plm/tests/test_lplr_exceptions.py
+++ b/doubleml/plm/tests/test_lplr_exceptions.py
@@ -21,10 +21,7 @@
 
 @pytest.mark.ci
 def test_lplr_exception_data():
-    msg = (
-        r"The data must be of DoubleMLData.* type\.[\s\S]* of type "
-        r" was passed\."
-    )
+    msg = r"The data must be of DoubleMLData.* type\.[\s\S]* of type " r" was passed\."
     with pytest.raises(TypeError, match=msg):
         _ = DoubleMLLPLR(pd.DataFrame(), ml_M, ml_t, ml_m)
 
@@ -284,8 +281,7 @@ def test_double_ml_exception_evaluate_learner():
         dml_lplr_obj.evaluate_learners(metric="mse")
 
     msg = (
-        r"The learners have to be a subset of \['ml_m', 'ml_t', 'ml_M', 'ml_a'\]\. "
-        r"Learners \['ml_mu', 'ml_p'\] provided."
+ r"The learners have to be a subset of \['ml_m', 'ml_t', 'ml_M', 'ml_a'\]\. " r"Learners \['ml_mu', 'ml_p'\] provided." ) with pytest.raises(ValueError, match=msg): dml_lplr_obj.evaluate_learners(learners=["ml_mu", "ml_p"]) From 5dae65189666090406604cafb3438e04dcfd1ebf Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Thu, 6 Nov 2025 16:06:48 -0800 Subject: [PATCH 22/23] Changed function signature, test --- doubleml/plm/lplr.py | 4 ++-- doubleml/plm/tests/test_lplr.py | 8 +++++++- doubleml/plm/tests/test_lplr_tune.py | 18 ++++++------------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py index af545216..3ef6e496 100644 --- a/doubleml/plm/lplr.py +++ b/doubleml/plm/lplr.py @@ -414,7 +414,7 @@ def _nuisance_tuning( filtered_train_inds = [] if self.score == "nuisance_space": - for train, test in smpls: + for train, _ in smpls: train_filtered = train[y[train] == 0] filtered_train_inds.append(train_filtered) elif self.score == "instrument": @@ -528,7 +528,7 @@ def draw_sample_splitting(self): return self - def set_sample_splitting(self): + def set_sample_splitting(self, all_smpls, all_smpls_cluster=None): raise NotImplementedError("set_sample_splitting is not implemented for DoubleMLLPLR.") def _compute_score(self, psi_elements, coef): diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py index 4eaf8613..9c94a8a4 100644 --- a/doubleml/plm/tests/test_lplr.py +++ b/doubleml/plm/tests/test_lplr.py @@ -28,19 +28,25 @@ def score(request): return request.param +@pytest.fixture(scope="module", params=["continuous", "binary", "binary_unbalanced"]) +def treatment(request): + return request.param + + @pytest.fixture(scope="module") def dml_lplr_fixture( score, learner_M, learner_t, learner_m, + treatment, ): n_folds = 5 alpha = 0.5 # collect data np.random.seed(42) - obj_dml_data = make_lplr_LZZ2020(alpha=alpha) + obj_dml_data = make_lplr_LZZ2020(alpha=alpha, treatment=treatment) ml_M = clone(learner_M) ml_t = clone(learner_t) diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py index 70ea6381..64653f5e 100644 --- a/doubleml/plm/tests/test_lplr_tune.py +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -2,7 +2,6 @@ import pytest from sklearn.base import clone from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor -from sklearn.linear_model import Lasso, LogisticRegression import doubleml as dml @@ -34,13 +33,8 @@ def score(request): return request.param -def get_par_grid(learner): - if learner.__class__ in [RandomForestRegressor, RandomForestClassifier]: - par_grid = {"n_estimators": [5, 10, 20]} - else: - assert learner.__class__ in [LogisticRegression, Lasso] - par_grid = {"C": np.logspace(-2, 2, 10)} - return par_grid +def get_par_grid(): + return {"n_estimators": [5, 10, 20]} @pytest.fixture(scope="module") @@ -53,10 +47,10 @@ def dml_lplr_fixture( tune_on_folds=True, ): par_grid = { - "ml_M": get_par_grid(learner_M), - "ml_t": get_par_grid(learner_t), - "ml_m": get_par_grid(learner_m), - "ml_a": get_par_grid(learner_a), + "ml_M": get_par_grid(), + "ml_t": get_par_grid(), + "ml_m": get_par_grid(), + "ml_a": get_par_grid(), } n_folds_tune = 4 n_folds = 5 From 13fca2f6b166e2550c586e6c548d65ddf67f9b62 Mon Sep 17 00:00:00 2001 From: Julius Herzig Date: Thu, 6 Nov 2025 16:09:35 -0800 Subject: [PATCH 23/23] Argument fix --- doubleml/plm/lplr.py | 2 +- doubleml/plm/tests/test_lplr_tune.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git 
index 3ef6e496..8f609e04 100644
--- a/doubleml/plm/lplr.py
+++ b/doubleml/plm/lplr.py
@@ -465,7 +465,7 @@ def _nuisance_tuning(
         )
 
         W_inner = []
-        for i, (train, test) in enumerate(smpls):
+        for i, (train, _) in enumerate(smpls):
             M_iteration = M_hat["preds_inner"][i][train]
             M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8)
             w = scipy.special.logit(M_iteration)
diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py
index 64653f5e..7c7c4aeb 100644
--- a/doubleml/plm/tests/test_lplr_tune.py
+++ b/doubleml/plm/tests/test_lplr_tune.py
@@ -111,10 +111,10 @@ def test_lplr_exception_tuning(
 
     dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m)
     par_grid = {
-        "ml_M": get_par_grid(learner_M),
-        "ml_t": get_par_grid(learner_t),
-        "ml_m": get_par_grid(learner_m),
-        "ml_a": get_par_grid(learner_a),
+        "ml_M": get_par_grid(),
+        "ml_t": get_par_grid(),
+        "ml_m": get_par_grid(),
+        "ml_a": get_par_grid(),
     }
     msg = "tune_on_folds must be True as targets have to be created for ml_t on folds."
     with pytest.raises(ValueError, match=msg):
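
Editor's note (not part of the patch series): two recurring ideas in the hunks above are the three-standard-error sanity check used in test_dml_selection_coef and the clip-then-logit construction of the working response for tuning ml_t. Below is a minimal standalone sketch of both; the numeric values and array names are illustrative assumptions, not taken from the patches.

import numpy as np
from scipy.special import logit

# (a) Coverage-style sanity check: the estimate should fall within
# three standard errors of the true coefficient of the data-generating process.
coef_hat, se_hat, true_coef = 0.47, 0.05, 0.5  # illustrative values only
assert abs(coef_hat - true_coef) <= 3.0 * se_hat

# (b) Working response for tuning ml_t: clip the ml_M probability predictions
# away from 0 and 1 so the logit transform stays finite.
m_preds = np.array([0.0, 0.2, 0.8, 1.0])    # hypothetical fold predictions from ml_M
m_preds = np.clip(m_preds, 1e-8, 1 - 1e-8)  # avoid logit(0) = -inf and logit(1) = +inf
w = logit(m_preds)                          # logit-scale target, as in _nuisance_tuning
print(w)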