diff --git a/doubleml/__init__.py b/doubleml/__init__.py index 6cf7de96..cb3891ba 100644 --- a/doubleml/__init__.py +++ b/doubleml/__init__.py @@ -13,6 +13,7 @@ from .irm.pq import DoubleMLPQ from .irm.qte import DoubleMLQTE from .irm.ssm import DoubleMLSSM +from .plm.lplr import DoubleMLLPLR from .plm.pliv import DoubleMLPLIV from .plm.plr import DoubleMLPLR from .utils.blp import DoubleMLBLP @@ -42,6 +43,7 @@ "DoubleMLBLP", "DoubleMLPolicyTree", "DoubleMLSSM", + "DoubleMLLPLR", ] __version__ = importlib.metadata.version("doubleml") diff --git a/doubleml/double_ml_score_mixins.py b/doubleml/double_ml_score_mixins.py index 57dd6e62..f1112db9 100644 --- a/doubleml/double_ml_score_mixins.py +++ b/doubleml/double_ml_score_mixins.py @@ -86,6 +86,7 @@ class NonLinearScoreMixin: _score_type = "nonlinear" _coef_start_val = np.nan _coef_bounds = None + _error_on_convergence_failure = False @property @abstractmethod @@ -149,12 +150,16 @@ def score_deriv(theta): theta_hat = root_res.root if not root_res.converged: score_val = score(theta_hat) - warnings.warn( + msg = ( "Could not find a root of the score function.\n " f"Flag: {root_res.flag}.\n" f"Score value found is {score_val} " f"for parameter theta equal to {theta_hat}." ) + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) else: signs_different, bracket_guess = _get_bracket_guess(score, self._coef_start_val, self._coef_bounds) @@ -186,12 +191,16 @@ def score_squared(theta): score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - warnings.warn( + msg = ( "Could not find a root of the score function.\n " f"Minimum score value found is {score_val} " f"for parameter theta equal to {theta_hat}.\n " "No theta found such that the score function evaluates to a negative value." ) + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) else: def neg_score(theta): @@ -202,11 +211,15 @@ def neg_score(theta): neg_score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds] ) theta_hat = theta_hat_array.item() - warnings.warn( + msg = ( "Could not find a root of the score function. " f"Maximum score value found is {-1 * neg_score_val} " f"for parameter theta equal to {theta_hat}. " "No theta found such that the score function evaluates to a positive value." ) + if self._error_on_convergence_failure: + raise ValueError(msg) + else: + warnings.warn(msg) return theta_hat diff --git a/doubleml/plm/__init__.py b/doubleml/plm/__init__.py index e81f00c5..283bc91b 100644 --- a/doubleml/plm/__init__.py +++ b/doubleml/plm/__init__.py @@ -2,10 +2,8 @@ The :mod:`doubleml.plm` module implements double machine learning estimates based on partially linear models. 
""" +from .lplr import DoubleMLLPLR from .pliv import DoubleMLPLIV from .plr import DoubleMLPLR -__all__ = [ - "DoubleMLPLR", - "DoubleMLPLIV", -] +__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR"] diff --git a/doubleml/plm/datasets/__init__.py b/doubleml/plm/datasets/__init__.py index b2bb7df0..6e8e9bb5 100644 --- a/doubleml/plm/datasets/__init__.py +++ b/doubleml/plm/datasets/__init__.py @@ -4,6 +4,7 @@ from ._make_pliv_data import _make_pliv_data from .dgp_confounded_plr_data import make_confounded_plr_data +from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020 from .dgp_pliv_CHS2015 import make_pliv_CHS2015 from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021 from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018 @@ -15,5 +16,6 @@ "make_confounded_plr_data", "make_pliv_CHS2015", "make_pliv_multiway_cluster_CKMS2021", + "make_lplr_LZZ2020", "_make_pliv_data", ] diff --git a/doubleml/plm/datasets/dgp_lplr_LZZ2020.py b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py new file mode 100644 index 00000000..284da7d8 --- /dev/null +++ b/doubleml/plm/datasets/dgp_lplr_LZZ2020.py @@ -0,0 +1,152 @@ +import numpy as np +import pandas as pd +from scipy.special import expit + +from doubleml.data import DoubleMLData +from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias + +_array_alias = _get_array_alias() +_data_frame_alias = _get_data_frame_alias() +_dml_data_alias = _get_dml_data_alias() + + +def make_lplr_LZZ2020( + n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", balanced_r0=True, treatment="continuous", **kwargs +): + r""" + Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021), + designed for use in double/debiased machine learning applications. + + The data generating process is defined as follows: + + - Covariates :math:`x_i \sim \mathcal{N}(0, \Sigma)`, where :math:`\Sigma_{kj} = 0.7^{|j-k|}`. + - Treatment :math:`d_i = a_0(x_i)`. + - Propensity score :math:`p_i = \sigma(\alpha d_i + r_0(x_i))`, where :math:`\sigma(\cdot)` is the logistic function. + - Outcome :math:`y_i \sim \text{Bernoulli}(p_i)`. + + The nuisance functions are defined as: + + .. math:: + \begin{aligned} + a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\ + &\quad + 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2\, x_{i,7} x_{i,8} + - 0.2\, x_{i,9} x_{i,10} \\ + r_0(x_i) &= 0.1\, x_{i,1} x_{i,2} x_{i,3} + 0.1\, x_{i,4} x_{i,5} + 0.1\, x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\ + &\quad + 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\ + &\quad + 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0) + \end{aligned} + + Parameters + ---------- + n_obs : int + Number of observations to simulate. + dim_x : int + Number of covariates. + alpha : float + Value of the causal parameter. + return_type : str + Determines the return format. One of: + + - 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object. + - 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``. + - 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p). + balanced_r0 : bool, default True + If True, uses the "balanced" r_0 specification (smaller magnitude / more balanced + heterogeneity). If False, uses an "unbalanced" r_0 specification with larger + share of Y=0. 
+    treatment : {'continuous', 'binary', 'binary_unbalanced'}, default 'continuous'
+        Determines how the treatment d is generated from a_0(x):
+
+        - 'continuous': d = a_0(x) (continuous treatment).
+        - 'binary': d ~ Bernoulli(sigmoid(a_0(x) - mean(a_0(x)))).
+        - 'binary_unbalanced': d ~ Bernoulli(sigmoid(a_0(x))).
+
+    **kwargs
+        Optional keyword arguments (currently unused in this implementation).
+
+    Returns
+    -------
+    Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]
+        The generated data in the specified format.
+
+    References
+    ----------
+    Liu, Molei, Yi Zhang, and Doudou Zhou. 2021.
+    "Double/Debiased Machine Learning for Logistic Partially Linear Model."
+    The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019.
+
+    """
+
+    if balanced_r0:
+
+        def r_0(X):
+            return (
+                0.1 * X[:, 0] * X[:, 1] * X[:, 2]
+                + 0.1 * X[:, 3] * X[:, 4]
+                + 0.1 * X[:, 5] ** 3
+                + -0.5 * np.sin(X[:, 6]) ** 2
+                + 0.5 * np.cos(X[:, 7])
+                + 1 / (1 + X[:, 8] ** 2)
+                + -1 / (1 + np.exp(X[:, 9]))
+                + 0.25 * np.where(X[:, 10] > 0, 1, 0)
+                + -0.25 * np.where(X[:, 12] > 0, 1, 0)
+            )
+
+    else:
+
+        def r_0(X):
+            return (
+                0.1 * X[:, 0] * X[:, 1] * X[:, 2]
+                + 0.1 * X[:, 3] * X[:, 4]
+                + 0.1 * X[:, 5] ** 3
+                + -0.5 * np.sin(X[:, 6]) ** 2
+                + 0.5 * np.cos(X[:, 7])
+                + 4 / (1 + X[:, 8] ** 2)
+                + -1 / (1 + np.exp(X[:, 9]))
+                + 1.5 * np.where(X[:, 10] > 0, 1, 0)
+                + -0.25 * np.where(X[:, 12] > 0, 1, 0)
+            )
+
+    def a_0(X):
+        return (
+            2 / (1 + np.exp(X[:, 0]))
+            + -2 / (1 + np.exp(X[:, 1]))
+            + 1 * np.sin(X[:, 2])
+            + 1 * np.cos(X[:, 3])
+            + 0.5 * np.where(X[:, 4] > 0, 1, 0)
+            + -0.5 * np.where(X[:, 5] > 0, 1, 0)
+            + 0.2 * X[:, 6] * X[:, 7]
+            + -0.2 * X[:, 8] * X[:, 9]
+        )
+
+    sigma = np.full((dim_x, dim_x), 0.2)
+    np.fill_diagonal(sigma, 1)
+
+    x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs)
+    np.clip(x, -2, 2, out=x)
+
+    if treatment == "continuous":
+        d = a_0(x)
+    elif treatment == "binary":
+        d_cont = a_0(x)
+        d = np.random.binomial(1, expit(d_cont - d_cont.mean()))
+    elif treatment == "binary_unbalanced":
+        d_cont = a_0(x)
+        d = np.random.binomial(1, expit(d_cont))
+    else:
+        raise ValueError("Invalid treatment type.")
+
+    p = expit(alpha * d + r_0(x))
+
+    y = np.random.binomial(1, p)
+
+    if return_type in _array_alias:
+        return x, y, d, p
+    elif return_type in _data_frame_alias + _dml_data_alias:
+        x_cols = [f"X{i + 1}" for i in np.arange(dim_x)]
+        data = pd.DataFrame(np.column_stack((x, y, d, p)), columns=x_cols + ["y", "d", "p"])
+        if return_type in _data_frame_alias:
+            return data
+        else:
+            return DoubleMLData(data, "y", "d", x_cols)
+    else:
+        raise ValueError("Invalid return_type.")
diff --git a/doubleml/plm/lplr.py b/doubleml/plm/lplr.py
new file mode 100644
index 00000000..8f609e04
--- /dev/null
+++ b/doubleml/plm/lplr.py
@@ -0,0 +1,557 @@
+import inspect
+
+import numpy as np
+import scipy
+from sklearn.base import clone
+from sklearn.utils import check_X_y
+from sklearn.utils.multiclass import type_of_target
+
+from doubleml import DoubleMLData
+from doubleml.double_ml import DoubleML
+from doubleml.double_ml_score_mixins import NonLinearScoreMixin
+from doubleml.utils._checks import _check_finite_predictions, _check_is_propensity, _check_score
+from doubleml.utils._estimation import (
+    _dml_cv_predict,
+    _dml_tune,
+)
+from doubleml.utils.resampling import DoubleMLDoubleResampling
+
+
+class DoubleMLLPLR(NonLinearScoreMixin, DoubleML):
+    """Double machine learning for partially logistic models (binary outcomes).
+
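+    Estimates the causal parameter :math:`\\theta_0` in a logistic partially linear model for a
+    binary outcome, following Liu, Zhang and Zhou (2021), using nested cross-fitting for the
+    nuisance functions.
+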
+    Parameters
+    ----------
+    obj_dml_data : DoubleMLData
+        The DoubleMLData object providing the data and variable specification.
+        The outcome variable y must be binary with values {0, 1}.
+    ml_M : estimator
+        Classifier for M_0(D, X) = P[Y = 1 | D, X]. Must implement fit() and predict_proba().
+    ml_t : estimator
+        Regressor for the auxiliary regression used to predict log-odds. Must implement fit() and predict().
+    ml_m : estimator
+        Learner for m_0(X) = E[D | X]. For binary treatments a classifier with predict_proba() is expected;
+        for continuous treatments a regressor with predict() is expected.
+    ml_a : estimator, optional
+        Optional alternative learner for E[D | X]. If not provided, a clone of ml_m is used.
+        Must support the same prediction interface as ml_m.
+    n_folds : int, default=5
+        Number of outer cross-fitting folds.
+    n_folds_inner : int, default=5
+        Number of inner folds for the nested cross-fitting of ``ml_M`` and ``ml_a``.
+    n_rep : int, default=1
+        Number of repetitions for sample splitting.
+    score : {'nuisance_space', 'instrument'}, default='nuisance_space'
+        Score to use. 'nuisance_space' estimates m on subsamples with y=0; 'instrument' uses an instrument-type score.
+    draw_sample_splitting : bool, default=True
+        Whether to draw sample splitting during initialization.
+    error_on_convergence_failure : bool, default=False
+        If True, raise an error if the root search for the score function fails to converge;
+        otherwise only a warning is issued.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> import doubleml as dml
+    >>> from doubleml.plm.datasets import make_lplr_LZZ2020
+    >>> from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+    >>> np.random.seed(3141)
+    >>> ml_t = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
+    >>> ml_m = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
+    >>> ml_M = RandomForestClassifier(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
+    >>> obj_dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=500, dim_x=20)
+    >>> dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m)
+    >>> dml_lplr_obj.fit().summary
+           coef   std err          t         P>|t|     2.5 %    97.5 %
+    d  0.480691  0.040533  11.859129  1.929729e-32  0.401247  0.560135
+
+    Notes
+    -----
+    **Logistic partially linear regression (LPLR)** models take the form
+
+    .. math::
+
+        P[Y = 1 \\mid D, X] = \\text{expit} ( D \\theta_0 + r_0(X)),
+
+    where :math:`Y` is the binary outcome variable and :math:`D` is the policy variable of interest.
+    The high-dimensional vector :math:`X = (X_1, \\ldots, X_p)` consists of other confounding covariates.
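+
+    The parameter :math:`\\theta_0` is estimated as the root of the sample average of the chosen
+    score. In the notation of :meth:`_compute_score`, with cross-fitted nuisance predictions
+    :math:`\\hat{r}(X)` and :math:`\\hat{m}(X)`, the ``'nuisance_space'`` score is
+
+    .. math::
+
+        \\psi(W; \\theta, \\eta) = \\text{expit}(-\\hat{r}(X)) (D - \\hat{m}(X))
+        \\left( Y e^{-\\theta D} - (1 - Y) e^{\\hat{r}(X)} \\right),
+
+    and the ``'instrument'`` score is
+
+    .. math::
+
+        \\psi(W; \\theta, \\eta) = \\left( Y - \\text{expit}(\\theta D + \\hat{r}(X)) \\right) (D - \\hat{m}(X)).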
+    """
+
+    def __init__(
+        self,
+        obj_dml_data,
+        ml_M,
+        ml_t,
+        ml_m,
+        ml_a=None,
+        n_folds=5,
+        n_folds_inner=5,
+        n_rep=1,
+        score="nuisance_space",
+        draw_sample_splitting=True,
+        error_on_convergence_failure=False,
+    ):
+        self.n_folds_inner = n_folds_inner
+        super().__init__(obj_dml_data, n_folds, n_rep, score, draw_sample_splitting)
+
+        # Ensure outcome only contains 0 and 1 (validate early in constructor)
+        if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]):
+            raise TypeError("The outcome variable y must be binary with values 0 and 1.")
+
+        self._error_on_convergence_failure = error_on_convergence_failure
+        # symmetric search bounds for the root of the score function
+        self._coef_bounds = (-1e2, 1e2)
+        self._coef_start_val = 1.0
+
+        self._check_data(self._dml_data)
+        valid_scores = ["nuisance_space", "instrument"]
+        _check_score(self.score, valid_scores, allow_callable=False)
+
+        _ = self._check_learner(ml_t, "ml_t", regressor=True, classifier=False)
+        _ = self._check_learner(ml_M, "ml_M", regressor=False, classifier=True)
+
+        ml_m_is_classifier = self._check_learner(ml_m, "ml_m", regressor=True, classifier=True)
+        self._learner = {"ml_m": ml_m, "ml_t": ml_t, "ml_M": ml_M}
+
+        if ml_a is not None:
+            ml_a_is_classifier = self._check_learner(ml_a, "ml_a", regressor=True, classifier=True)
+            self._learner["ml_a"] = ml_a
+            self._ml_a_provided = True
+        else:
+            self._learner["ml_a"] = clone(ml_m)
+            ml_a_is_classifier = ml_m_is_classifier
+            self._ml_a_provided = False
+
+        self._predict_method = {"ml_t": "predict", "ml_M": "predict_proba"}
+
+        if ml_m_is_classifier:
+            if self._dml_data.binary_treats.all():
+                self._predict_method["ml_m"] = "predict_proba"
+            else:
+                raise ValueError(
+                    f"The ml_m learner {str(ml_m)} was identified as classifier "
+                    "but at least one treatment variable is not binary with values 0 and 1."
+                )
+        else:
+            self._predict_method["ml_m"] = "predict"
+
+        if ml_a_is_classifier:
+            if self._dml_data.binary_treats.all():
+                self._predict_method["ml_a"] = "predict_proba"
+            else:
+                raise ValueError(
+                    f"The ml_a learner {str(ml_a)} was identified as classifier "
+                    "but at least one treatment variable is not binary with values 0 and 1."
+                )
+        else:
+            self._predict_method["ml_a"] = "predict"
+
+        if score == "instrument":
+            sig = inspect.signature(self.learner["ml_a"].fit)
+            if "sample_weight" not in sig.parameters:
+                raise ValueError('A learner "ml_a" that supports sample_weight is required for score type "instrument".')
+
+        self._initialize_ml_nuisance_params()
+        self._external_predictions_implemented = True
+
+    def _initialize_ml_nuisance_params(self):
+        self._params = {learner: {key: [None] * self.n_rep for key in self._dml_data.d_cols} for learner in self._learner}
+
+    def _check_data(self, obj_dml_data):
+        if not isinstance(obj_dml_data, DoubleMLData):
+            raise TypeError(
+                f"The data must be of DoubleMLData type. {str(obj_dml_data)} of type {str(type(obj_dml_data))} was passed."
+ ) + if not np.array_equal(np.unique(obj_dml_data.y), [0, 1]): + raise TypeError("The outcome variable y must be binary with values 0 and 1.") + return + + def _double_dml_cv_predict( + self, + estimator, + estimator_name, + x, + y, + smpls=None, + smpls_inner=None, + n_jobs=None, + est_params=None, + method="predict", + sample_weights=None, + ): + res = {} + res["preds"] = np.zeros(y.shape, dtype=float) + res["preds_inner"] = [] + res["models"] = [] + for smpls_single_split, smpls_double_split in zip(smpls, smpls_inner): + res_inner = _dml_cv_predict( + estimator, + x, + y, + smpls=smpls_double_split, + n_jobs=n_jobs, + est_params=est_params, + method=method, + return_models=True, + smpls_is_partition=True, + sample_weights=sample_weights, + ) + _check_finite_predictions(res_inner["preds"], estimator, estimator_name, smpls_double_split) + + res["preds_inner"].append(res_inner["preds"]) + for model in res_inner["models"]: + res["models"].append(model) + if method == "predict_proba": + res["preds"][smpls_single_split[1]] += model.predict_proba(x[smpls_single_split[1]])[:, 1] + else: + res["preds"][smpls_single_split[1]] += model.predict(x[smpls_single_split[1]]) + res["preds"] /= len(smpls) + res["targets"] = np.copy(y) + return res + + def _nuisance_est(self, smpls, n_jobs_cv, external_predictions, return_models=False): + x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) + x_d_concat = np.hstack((d.reshape(-1, 1), x)) + m_external = external_predictions["ml_m"] is not None + M_external = external_predictions["ml_M"] is not None + t_external = external_predictions["ml_t"] is not None + if "ml_a" in self._learner: + a_external = external_predictions["ml_a"] is not None + else: + a_external = False + + if M_external: + M_hat = {"preds": external_predictions["ml_M"], "targets": None, "models": None} + else: + M_hat = self._double_dml_cv_predict( + self._learner["ml_M"], + "ml_M", + x_d_concat, + y, + smpls=smpls, + smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_M"), + method=self._predict_method["ml_M"], + ) + + # nuisance m + if m_external: + m_hat = {"preds": external_predictions["ml_m"], "targets": None, "models": None} + else: + if self.score == "instrument": + weights = [] + for i, (train, test) in enumerate(smpls): + weights.append(M_hat["preds_inner"][i][train] * (1 - M_hat["preds_inner"][i][train])) + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + sample_weights=weights, + ) + + elif self.score == "nuisance_space": + filtered_smpls = [] + for train, test in smpls: + train_filtered = train[y[train] == 0] + filtered_smpls.append((train_filtered, test)) + m_hat = _dml_cv_predict( + self._learner["ml_m"], + x, + d, + smpls=filtered_smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_m"), + method=self._predict_method["ml_m"], + return_models=return_models, + ) + else: + raise NotImplementedError + _check_finite_predictions(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls) + + if self._check_learner(self._learner["ml_m"], "ml_m", regressor=True, classifier=True): + _check_is_propensity(m_hat["preds"], self._learner["ml_m"], "ml_m", smpls, eps=1e-12) + + if self._dml_data.binary_treats[self._dml_data.d_cols[self._i_treat]]: + binary_preds = type_of_target(m_hat["preds"]) == "binary" + 
zero_one_preds = np.all((np.power(m_hat["preds"], 2) - m_hat["preds"]) == 0) + if binary_preds & zero_one_preds: + raise ValueError( + f"For the binary treatment variable {self._dml_data.d_cols[self._i_treat]}, " + f"predictions obtained with the ml_m learner {str(self._learner['ml_m'])} are also " + "observed to be binary with values 0 and 1. Make sure that for classifiers " + "probabilities and not labels are predicted." + ) + + if a_external: + a_hat = {"preds": external_predictions["ml_a"], "targets": None, "models": None} + else: + a_hat = self._double_dml_cv_predict( + self._learner["ml_a"], + "ml_a", + x, + d, + smpls=smpls, + smpls_inner=self.__smpls__inner, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_a"), + method=self._predict_method["ml_a"], + ) + + W_inner = [] + beta = np.zeros(d.shape, dtype=float) + + for i, (train, test) in enumerate(smpls): + M_iteration = M_hat["preds_inner"][i][train] + M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8) + w = scipy.special.logit(M_iteration) + W_inner.append(w) + d_tilde = (d - a_hat["preds_inner"][i])[train] + beta[test] = np.sum(d_tilde * w) / np.sum(d_tilde**2) + + # nuisance t + if t_external: + t_hat = {"preds": external_predictions["ml_t"], "targets": None, "models": None} + else: + t_hat = _dml_cv_predict( + self._learner["ml_t"], + x, + W_inner, + smpls=smpls, + n_jobs=n_jobs_cv, + est_params=self._get_params("ml_t"), + method=self._predict_method["ml_t"], + return_models=return_models, + ) + _check_finite_predictions(t_hat["preds"], self._learner["ml_t"], "ml_t", smpls) + + r_hat = {} + r_hat["preds"] = t_hat["preds"] - beta * a_hat["preds"] + + psi_elements = self._score_elements(y, d, r_hat["preds"], m_hat["preds"]) + + preds = { + "predictions": { + "ml_r": r_hat["preds"], + "ml_m": m_hat["preds"], + "ml_a": a_hat["preds"], + "ml_t": t_hat["preds"], + "ml_M": M_hat["preds"], + }, + "targets": { + "ml_r": None, + "ml_m": m_hat["targets"], + "ml_a": a_hat["targets"], + "ml_t": t_hat["targets"], + "ml_M": M_hat["targets"], + }, + "models": { + "ml_r": None, + "ml_m": m_hat["models"], + "ml_a": a_hat["models"], + "ml_t": t_hat["models"], + "ml_M": M_hat["models"], + }, + } + + return psi_elements, preds + + def _score_elements(self, y, d, r_hat, m_hat): + # compute residual + d_tilde = d - m_hat + psi_hat = scipy.special.expit(-r_hat) + score_const = d_tilde * (1 - y) * np.exp(r_hat) + psi_elements = { + "y": y, + "d": d, + "d_tilde": d_tilde, + "r_hat": r_hat, + "m_hat": m_hat, + "psi_hat": psi_hat, + "score_const": score_const, + } + + return psi_elements + + @property + def _score_element_names(self): + return ["y", "d", "d_tilde", "r_hat", "m_hat", "psi_hat", "score_const"] + + def _sensitivity_element_est(self, preds): + pass + + def _nuisance_tuning( + self, smpls, param_grids, scoring_methods, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search + ): + if self._i_rep is None: + raise ValueError("tune_on_folds must be True as targets have to be created for ml_t on folds.") + # TODO: test + x, y = check_X_y(self._dml_data.x, self._dml_data.y, force_all_finite=False) + x, d = check_X_y(x, self._dml_data.d, force_all_finite=False) + x_d_concat = np.hstack((d.reshape(-1, 1), x)) + + if scoring_methods is None: + scoring_methods = {"ml_m": None, "ml_M": None, "ml_a": None, "ml_t": None} + + train_inds = [train_index for (train_index, _) in smpls] + M_tune_res = _dml_tune( + y, + x_d_concat, + train_inds, + self._learner["ml_M"], + param_grids["ml_M"], + scoring_methods["ml_M"], + n_folds_tune, + n_jobs_cv, + 
search_mode,
+            n_iter_randomized_search,
+        )
+
+        filtered_train_inds = []
+        if self.score == "nuisance_space":
+            for train, _ in smpls:
+                train_filtered = train[y[train] == 0]
+                filtered_train_inds.append(train_filtered)
+        elif self.score == "instrument":
+            filtered_train_inds = train_inds
+        else:
+            raise NotImplementedError
+        m_tune_res = _dml_tune(
+            d,
+            x,
+            filtered_train_inds,
+            self._learner["ml_m"],
+            param_grids["ml_m"],
+            scoring_methods["ml_m"],
+            n_folds_tune,
+            n_jobs_cv,
+            search_mode,
+            n_iter_randomized_search,
+        )
+
+        a_tune_res = _dml_tune(
+            d,
+            x,
+            train_inds,
+            self._learner["ml_a"],
+            param_grids["ml_a"],
+            scoring_methods["ml_a"],
+            n_folds_tune,
+            n_jobs_cv,
+            search_mode,
+            n_iter_randomized_search,
+        )
+
+        M_best_params = [xx.best_params_ for xx in M_tune_res]
+        m_best_params = [xx.best_params_ for xx in m_tune_res]
+        a_best_params = [xx.best_params_ for xx in a_tune_res]
+
+        # Create targets for tuning ml_t
+        M_hat = self._double_dml_cv_predict(
+            self._learner["ml_M"],
+            "ml_M",
+            x_d_concat,
+            y,
+            smpls=smpls,
+            smpls_inner=self.__smpls__inner,
+            n_jobs=n_jobs_cv,
+            est_params=M_best_params,
+            method=self._predict_method["ml_M"],
+        )
+
+        W_inner = []
+        for i, (train, _) in enumerate(smpls):
+            M_iteration = M_hat["preds_inner"][i][train]
+            M_iteration = np.clip(M_iteration, 1e-8, 1 - 1e-8)
+            w = scipy.special.logit(M_iteration)
+            W_inner.append(w)
+
+        t_tune_res = _dml_tune(
+            W_inner,
+            x,
+            train_inds,
+            self._learner["ml_t"],
+            param_grids["ml_t"],
+            scoring_methods["ml_t"],
+            n_folds_tune,
+            n_jobs_cv,
+            search_mode,
+            n_iter_randomized_search,
+            fold_specific_target=True,
+        )
+        t_best_params = [xx.best_params_ for xx in t_tune_res]
+
+        params = {"ml_M": M_best_params, "ml_m": m_best_params, "ml_a": a_best_params, "ml_t": t_best_params}
+        tune_res = {"M_tune": M_tune_res, "m_tune": m_tune_res, "a_tune": a_tune_res, "t_tune": t_tune_res}
+
+        res = {"params": params, "tune_res": tune_res}
+
+        return res
+
+    @property
+    def __smpls__inner(self):
+        return self._smpls_inner[self._i_rep]
+
+    def draw_sample_splitting(self):
+        """
+        Draw sample splitting for DoubleML models.
+
+        The samples are drawn according to the attributes
+        ``n_folds`` and ``n_rep``.
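+        In addition, inner folds are drawn within each outer training set
+        according to ``n_folds_inner``; they are used for the nested
+        cross-fitting of ``ml_M`` and ``ml_a``.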
+
+        Returns
+        -------
+        self : object
+        """
+
+        obj_dml_resampling = DoubleMLDoubleResampling(
+            n_folds=self.n_folds,
+            n_folds_inner=self.n_folds_inner,
+            n_rep=self.n_rep,
+            n_obs=self._dml_data.n_obs,
+            stratify=self._strata,
+        )
+        self._smpls, self._smpls_inner = obj_dml_resampling.split_samples()
+
+        return self
+
+    def set_sample_splitting(self, all_smpls, all_smpls_cluster=None):
+        raise NotImplementedError("set_sample_splitting is not implemented for DoubleMLLPLR.")
+
+    def _compute_score(self, psi_elements, coef):
+        if self.score == "nuisance_space":
+            score_1 = psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d_tilde"]
+            score = psi_elements["psi_hat"] * (score_1 - psi_elements["score_const"])
+        elif self.score == "instrument":
+            score = (psi_elements["y"] - scipy.special.expit(coef * psi_elements["d"] + psi_elements["r_hat"])) * psi_elements[
+                "d_tilde"
+            ]
+        else:
+            raise NotImplementedError
+
+        return score
+
+    def _compute_score_deriv(self, psi_elements, coef, inds=None):
+        if self.score == "nuisance_space":
+            deriv_1 = -psi_elements["y"] * np.exp(-coef * psi_elements["d"]) * psi_elements["d"]
+            deriv = psi_elements["psi_hat"] * psi_elements["d_tilde"] * deriv_1
+        elif self.score == "instrument":
+            expit = scipy.special.expit(coef * psi_elements["d"] + psi_elements["r_hat"])
+            deriv = -psi_elements["d"] * expit * (1 - expit) * psi_elements["d_tilde"]
+        else:
+            raise NotImplementedError
+
+        return deriv
diff --git a/doubleml/plm/tests/test_lplr.py b/doubleml/plm/tests/test_lplr.py
new file mode 100644
index 00000000..9c94a8a4
--- /dev/null
+++ b/doubleml/plm/tests/test_lplr.py
@@ -0,0 +1,73 @@
+import numpy as np
+import pytest
+from sklearn.base import clone
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+
+import doubleml as dml
+
+from ..datasets import make_lplr_LZZ2020
+
+
+@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)])
+def learner_M(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
+def learner_t(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)])
+def learner_m(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=["nuisance_space", "instrument"])
+def score(request):
+    return request.param
+
+
+@pytest.fixture(scope="module", params=["continuous", "binary", "binary_unbalanced"])
+def treatment(request):
+    return request.param
+
+
+@pytest.fixture(scope="module")
+def dml_lplr_fixture(
+    score,
+    learner_M,
+    learner_t,
+    learner_m,
+    treatment,
+):
+    n_folds = 5
+    alpha = 0.5
+
+    # collect data
+    np.random.seed(42)
+    obj_dml_data = make_lplr_LZZ2020(alpha=alpha, treatment=treatment)
+
+    ml_M = clone(learner_M)
+    ml_t = clone(learner_t)
+    ml_m = clone(learner_m)
+
+    dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m, n_folds=n_folds, score=score)
+    dml_lplr_obj.fit()
+
+    res_dict = {
+        "coef": dml_lplr_obj.coef[0],
+        "se": dml_lplr_obj.se[0],
+        "true_coef": alpha,
+    }
+
+    return res_dict
+
+
+@pytest.mark.ci
+def test_dml_lplr_coef(dml_lplr_fixture):
+    # sanity check: the estimate should be close to the true coefficient;
+    # 3 * sqrt(se) is used as a deliberately loose tolerance
+    coef = dml_lplr_fixture["coef"]
+    se = dml_lplr_fixture["se"]
+    true_coef = dml_lplr_fixture["true_coef"]
+    assert abs(coef - true_coef) <= 3.0 * np.sqrt(se)
diff --git a/doubleml/plm/tests/test_lplr_exceptions.py b/doubleml/plm/tests/test_lplr_exceptions.py
new file mode 100644
index 00000000..c4c57fd9
--- /dev/null
+++ b/doubleml/plm/tests/test_lplr_exceptions.py
@@ -0,0 +1,293 @@
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.base import BaseEstimator
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.linear_model import Lasso, LogisticRegression
+
+from doubleml import DoubleMLLPLR
+from doubleml.plm.datasets import make_lplr_LZZ2020
+
+np.random.seed(3141)
+n = 100
+# create test data and basic learners
+dml_data = make_lplr_LZZ2020(alpha=0.5, n_obs=n, dim_x=20)
+ml_M = RandomForestClassifier()
+ml_t = RandomForestRegressor()
+ml_m = RandomForestRegressor()
+dml_lplr = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m)
+dml_lplr_instrument = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="instrument")
+
+
+@pytest.mark.ci
+def test_lplr_exception_data():
+    msg = r"The data must be of DoubleMLData.* type\.[\s\S]* of type <class 'pandas\.core\.frame\.DataFrame'> was passed\."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(pd.DataFrame(), ml_M, ml_t, ml_m)
+
+    dml_data_nb = make_lplr_LZZ2020(alpha=0.5, n_obs=50, dim_x=20)
+    dml_data_nb.data[dml_data_nb.y_col] = dml_data_nb.data[dml_data_nb.y_col] + 1
+    dml_data_nb._set_y_z()
+    with pytest.raises(TypeError, match="The outcome variable y must be binary with values 0 and 1."):
+        _ = DoubleMLLPLR(dml_data_nb, ml_M, ml_t, ml_m)
+
+
+@pytest.mark.ci
+def test_lplr_exception_scores():
+    # LPLR valid scores are 'nuisance_space' and 'instrument'
+    msg = "Invalid score MAR"
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score="MAR")
+    msg = "score should be a string. 0 was passed."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, score=0)
+
+
+@pytest.mark.ci
+def test_lplr_exception_resampling():
+    msg = "The number of folds must be of int type. 1.5 of type <class 'float'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_folds=1.5)
+
+    msg = "The number of repetitions for the sample splitting must be of int type. 1.5 of type <class 'float'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_rep=1.5)
+
+    msg = "The number of folds must be positive. 0 was passed."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_folds=0)
+
+    msg = "The number of repetitions for the sample splitting must be positive. 0 was passed."
+    with pytest.raises(ValueError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, n_rep=0)
+
+    msg = "draw_sample_splitting must be True or False. Got true."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, draw_sample_splitting="true")
+
+
+@pytest.mark.ci
+def test_lplr_exception_get_params():
+    msg = "Invalid nuisance learner ml_x. Valid nuisance learner ml_m or ml_t or ml_M or ml_a."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr.get_params("ml_x")
+
+
+@pytest.mark.ci
+def test_lplr_exception_smpls():
+    msg = (
+        "Sample splitting not specified. "
+        r"Either draw samples via .draw_sample_splitting\(\) or set external samples via .set_sample_splitting\(\)."
+    )
+    dml_lplr_no_smpls = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m, draw_sample_splitting=False)
+    with pytest.raises(ValueError, match=msg):
+        _ = dml_lplr_no_smpls.smpls
+
+
+@pytest.mark.ci
+def test_lplr_exception_fit():
+    msg = "The number of CPUs used to fit the learners must be of int type. 5 of type <class 'str'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_lplr.fit(n_jobs_cv="5")
+    msg = "store_predictions must be True or False. Got 1."
+    with pytest.raises(TypeError, match=msg):
+        dml_lplr.fit(store_predictions=1)
+    msg = "store_models must be True or False. Got 1."
+    with pytest.raises(TypeError, match=msg):
+        dml_lplr.fit(store_models=1)
+
+
+@pytest.mark.ci
+def test_lplr_exception_bootstrap():
+    dml_lplr_boot = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m)
+    msg = r"Apply fit\(\) before bootstrap\(\)."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_boot.bootstrap()
+
+    dml_lplr_boot.fit()
+    msg = 'Method must be "Bayes", "normal" or "wild". Got Gaussian.'
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_boot.bootstrap(method="Gaussian")
+    msg = "The number of bootstrap replications must be of int type. 500 of type <class 'str'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_lplr_boot.bootstrap(n_rep_boot="500")
+    msg = "The number of bootstrap replications must be positive. 0 was passed."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_boot.bootstrap(n_rep_boot=0)
+
+
+@pytest.mark.ci
+def test_lplr_exception_confint():
+    dml_lplr_conf = DoubleMLLPLR(dml_data, ml_M, ml_t, ml_m)
+    msg = r"Apply fit\(\) before confint\(\)."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_conf.confint()
+    dml_lplr_conf.fit()
+
+    msg = "joint must be True or False. Got 1."
+    with pytest.raises(TypeError, match=msg):
+        dml_lplr_conf.confint(joint=1)
+    msg = "The confidence level must be of float type. 5% of type <class 'str'> was passed."
+    with pytest.raises(TypeError, match=msg):
+        dml_lplr_conf.confint(level="5%")
+    msg = r"The confidence level must be in \(0,1\). 0.0 was passed."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_conf.confint(level=0.0)
+
+    msg = r"Apply bootstrap\(\) before confint\(joint=True\)."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_conf.confint(joint=True)
+    dml_lplr_conf.bootstrap()
+    df_lplr_ci = dml_lplr_conf.confint(joint=True)
+    assert isinstance(df_lplr_ci, pd.DataFrame)
+
+
+@pytest.mark.ci
+def test_lplr_exception_set_ml_nuisance_params():
+    # invalid learner name
+    msg = "Invalid nuisance learner g. Valid nuisance learner ml_m or ml_t or ml_M or ml_a."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr.set_ml_nuisance_params("g", "d", {"alpha": 0.1})
+    # invalid treatment variable
+    msg = "Invalid treatment variable y. Valid treatment variable d."
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr.set_ml_nuisance_params("ml_M", "y", {"alpha": 0.1})
+
+
+class _DummyNoSetParams:
+    def fit(self):
+        pass
+
+
+class _DummyNoGetParams(_DummyNoSetParams):
+    def set_params(self):
+        pass
+
+
+class _DummyNoClassifier(_DummyNoGetParams):
+    def get_params(self):
+        pass
+
+    def predict(self):
+        pass
+
+
+class LogisticRegressionManipulatedType(LogisticRegression):
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.estimator_type = None
+        return tags
+
+
+@pytest.mark.ci
+@pytest.mark.filterwarnings(
+    r"ignore:.*is \(probably\) neither a regressor nor a classifier.*:UserWarning",
+)
+def test_lplr_exception_learner():
+    err_msg_prefix = "Invalid learner provided for ml_t: "
+
+    msg = err_msg_prefix + "provide an instance of a learner instead of a class."
+    with pytest.raises(TypeError, match=msg):
+        _ = DoubleMLLPLR(dml_data, ml_M, Lasso, ml_m)
+    msg = err_msg_prefix + r"BaseEstimator\(\) has no method .fit\(\)."
+    
+ with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, BaseEstimator(), ml_m) + msg = r"has no method .set_params\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, _DummyNoSetParams(), ml_m) + msg = r"has no method .get_params\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, _DummyNoGetParams(), ml_m) + + # ml_m may not be a classifier when treatment is not binary + msg = ( + r"The ml_m learner LogisticRegression\(\) was identified as classifier " + r"but at least one treatment variable is not binary with values 0 and 1\." + ) + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, LogisticRegression()) + + # construct a classifier which is not identifiable as classifier via is_classifier by sklearn + log_reg = LogisticRegressionManipulatedType() + # TODO(0.11) can be removed if the sklearn dependency is bumped to 1.6.0 + log_reg._estimator_type = None + msg = ( + r"Learner provided for ml_m is probably invalid: LogisticRegressionManipulatedType\(\) is \(probably\) " + r"neither a regressor nor a classifier. Method predict is used for prediction\." + ) + with pytest.warns(UserWarning, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, ml_t, log_reg) + + +@pytest.mark.ci +@pytest.mark.filterwarnings( + r"ignore:.*is \(probably\) neither a regressor nor a classifier.*:UserWarning", + r"ignore: Learner provided for ml_m is probably invalid.*is \(probably\) no classifier.*:UserWarning", +) +def test_lplr_exception_and_warning_learner(): + # invalid ml_M (must be a classifier with predict_proba) + with pytest.raises(TypeError): + _ = DoubleMLLPLR(dml_data, _DummyNoClassifier(), ml_t, ml_m) + msg = "Invalid learner provided for ml_M: " + r"Lasso\(\) has no method .predict_proba\(\)." + with pytest.raises(TypeError, match=msg): + _ = DoubleMLLPLR(dml_data, Lasso(), ml_t, ml_m) + + +class LassoWithNanPred(Lasso): + def predict(self, X): + preds = super().predict(X) + n_obs = len(preds) + preds[np.random.randint(0, n_obs, 1)] = np.nan + return preds + + +class LassoWithInfPred(Lasso): + def predict(self, X): + preds = super().predict(X) + n_obs = len(preds) + preds[np.random.randint(0, n_obs, 1)] = np.inf + return preds + + +@pytest.mark.ci +def test_lplr_nan_prediction(): + msg = r"Predictions from learner LassoWithNanPred\(\) for ml_t are not finite." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, LassoWithNanPred(), ml_m).fit() + msg = r"Predictions from learner LassoWithInfPred\(\) for ml_t are not finite." + with pytest.raises(ValueError, match=msg): + _ = DoubleMLLPLR(dml_data, ml_M, LassoWithInfPred(), ml_m).fit() + + +@pytest.mark.ci +def test_double_ml_exception_evaluate_learner(): + dml_lplr_obj = DoubleMLLPLR( + dml_data, + ml_M=LogisticRegression(), + ml_t=Lasso(), + ml_m=RandomForestRegressor(), + n_folds=5, + score="nuisance_space", + ) + + msg = r"Apply fit\(\) before evaluate_learners\(\)." + with pytest.raises(ValueError, match=msg): + dml_lplr_obj.evaluate_learners() + + dml_lplr_obj.fit() + + msg = "metric should be a callable. 'mse' was passed." + with pytest.raises(TypeError, match=msg): + dml_lplr_obj.evaluate_learners(metric="mse") + + msg = ( + r"The learners have to be a subset of \['ml_m', 'ml_t', 'ml_M', 'ml_a'\]\. " r"Learners \['ml_mu', 'ml_p'\] provided." 
+ ) + with pytest.raises(ValueError, match=msg): + dml_lplr_obj.evaluate_learners(learners=["ml_mu", "ml_p"]) + + def eval_fct(y_pred, y_true): + return np.nan + + with pytest.raises(ValueError): + dml_lplr_obj.evaluate_learners(metric=eval_fct) diff --git a/doubleml/plm/tests/test_lplr_tune.py b/doubleml/plm/tests/test_lplr_tune.py new file mode 100644 index 00000000..7c7c4aeb --- /dev/null +++ b/doubleml/plm/tests/test_lplr_tune.py @@ -0,0 +1,121 @@ +import numpy as np +import pytest +from sklearn.base import clone +from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + +import doubleml as dml + +from ..datasets import make_lplr_LZZ2020 + + +@pytest.fixture(scope="module", params=[RandomForestClassifier(random_state=42)]) +def learner_M(request): + return request.param + + +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_t(request): + return request.param + + +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_m(request): + return request.param + + +@pytest.fixture(scope="module", params=[RandomForestRegressor(random_state=42)]) +def learner_a(request): + return request.param + + +@pytest.fixture(scope="module", params=["nuisance_space", "instrument"]) +def score(request): + return request.param + + +def get_par_grid(): + return {"n_estimators": [5, 10, 20]} + + +@pytest.fixture(scope="module") +def dml_lplr_fixture( + learner_M, + learner_t, + learner_m, + learner_a, + score, + tune_on_folds=True, +): + par_grid = { + "ml_M": get_par_grid(), + "ml_t": get_par_grid(), + "ml_m": get_par_grid(), + "ml_a": get_par_grid(), + } + n_folds_tune = 4 + n_folds = 5 + alpha = 0.5 + + ml_M = clone(learner_M) + ml_t = clone(learner_t) + ml_m = clone(learner_m) + ml_a = clone(learner_a) + + obj_dml_data = make_lplr_LZZ2020(alpha=alpha) + dml_sel_obj = dml.DoubleMLLPLR( + obj_dml_data, + ml_M, + ml_t, + ml_m, + ml_a=ml_a, + n_folds=n_folds, + score=score, + ) + + # tune hyperparameters + tune_res = dml_sel_obj.tune(par_grid, tune_on_folds=tune_on_folds, n_folds_tune=n_folds_tune, return_tune_res=False) + assert isinstance(tune_res, dml.DoubleMLLPLR) + + dml_sel_obj.fit() + + res_dict = { + "coef": dml_sel_obj.coef[0], + "se": dml_sel_obj.se[0], + "true_coef": alpha, + } + + return res_dict + + +@pytest.mark.ci +def test_dml_selection_coef(dml_lplr_fixture): + # true_coef should lie within three standard deviations of the estimate + coef = dml_lplr_fixture["coef"] + se = dml_lplr_fixture["se"] + true_coef = dml_lplr_fixture["true_coef"] + assert abs(coef - true_coef) <= 3.0 * np.sqrt(se) + + +@pytest.mark.ci +def test_lplr_exception_tuning( + learner_M, + learner_t, + learner_m, + learner_a, +): + # LPLR valid scores are 'nuisance_space' and 'instrument' + obj_dml_data = make_lplr_LZZ2020(alpha=0.5) + ml_M = clone(learner_M) + ml_t = clone(learner_t) + ml_m = clone(learner_m) + + dml_lplr_obj = dml.DoubleMLLPLR(obj_dml_data, ml_M, ml_t, ml_m) + par_grid = { + "ml_M": get_par_grid(), + "ml_t": get_par_grid(), + "ml_m": get_par_grid(), + "ml_a": get_par_grid(), + } + msg = "tune_on_folds must be True as targets have to be created for ml_t on folds." 
+    with pytest.raises(ValueError, match=msg):
+        dml_lplr_obj.tune(par_grid, tune_on_folds=False)
diff --git a/doubleml/utils/_estimation.py b/doubleml/utils/_estimation.py
index 3d99d93a..d10ae48b 100644
--- a/doubleml/utils/_estimation.py
+++ b/doubleml/utils/_estimation.py
@@ -44,16 +44,34 @@ def _fit(estimator, x, y, train_index, idx=None):
 
 
 def _dml_cv_predict(
-    estimator, x, y, smpls=None, n_jobs=None, est_params=None, method="predict", return_train_preds=False, return_models=False
+    estimator,
+    x,
+    y,
+    smpls=None,
+    n_jobs=None,
+    est_params=None,
+    method="predict",
+    return_train_preds=False,
+    return_models=False,
+    smpls_is_partition=None,
+    sample_weights=None,
 ):
     n_obs = x.shape[0]
-    smpls_is_partition = _check_is_partition(smpls, n_obs)
+    # TODO: Better name for smpls_is_partition
+    if smpls_is_partition is None:
+        smpls_is_partition = _check_is_partition(smpls, n_obs)
     fold_specific_params = (est_params is not None) & (not isinstance(est_params, dict))
     fold_specific_target = isinstance(y, list)
     manual_cv_predict = (
-        (not smpls_is_partition) | return_train_preds | fold_specific_params | fold_specific_target | return_models
+        (not smpls_is_partition)
+        | return_train_preds
+        | fold_specific_params
+        | fold_specific_target
+        | return_models
+        | (sample_weights is not None)
     )
+    # TODO: Check if cross_val_predict supports weights
 
     res = {"models": None}
     if not manual_cv_predict:
@@ -148,10 +166,20 @@
 
 
 def _dml_tune(
-    y, x, train_inds, learner, param_grid, scoring_method, n_folds_tune, n_jobs_cv, search_mode, n_iter_randomized_search
+    y,
+    x,
+    train_inds,
+    learner,
+    param_grid,
+    scoring_method,
+    n_folds_tune,
+    n_jobs_cv,
+    search_mode,
+    n_iter_randomized_search,
+    fold_specific_target=False,
 ):
     tune_res = list()
-    for train_index in train_inds:
+    for i, train_index in enumerate(train_inds):
         tune_resampling = KFold(n_splits=n_folds_tune, shuffle=True)
         if search_mode == "grid_search":
             g_grid_search = GridSearchCV(learner, param_grid, scoring=scoring_method, cv=tune_resampling, n_jobs=n_jobs_cv)
@@ -165,7 +193,10 @@
                 n_jobs=n_jobs_cv,
                 n_iter=n_iter_randomized_search,
             )
-        tune_res.append(g_grid_search.fit(x[train_index, :], y[train_index]))
+        if fold_specific_target:
+            # y is a list with one (possibly fold-specific) target vector per training set
+            tune_res.append(g_grid_search.fit(x[train_index, :], y[i]))
+        else:
+            tune_res.append(g_grid_search.fit(x[train_index, :], y[train_index]))
 
     return tune_res
diff --git a/doubleml/utils/resampling.py b/doubleml/utils/resampling.py
index 188d2f24..38c1ac59 100644
--- a/doubleml/utils/resampling.py
+++ b/doubleml/utils/resampling.py
@@ -25,6 +25,58 @@ def split_samples(self):
         return smpls
 
 
+class DoubleMLDoubleResampling:
+    def __init__(self, n_folds, n_folds_inner, n_rep, n_obs, stratify=None):
+        self.n_folds = n_folds
+        self.n_folds_inner = n_folds_inner
+        self.n_rep = n_rep
+        self.n_obs = n_obs
+        self.stratify = stratify
+
+        if n_folds < 2:
+            raise ValueError("n_folds must be greater than 1.")
+        if n_folds_inner < 2:
+            raise ValueError("n_folds_inner must be greater than 1.")
+
+        if self.stratify is None:
+            self.resampling = RepeatedKFold(n_splits=n_folds, n_repeats=n_rep)
+            self.resampling_inner = RepeatedKFold(n_splits=n_folds_inner, n_repeats=1)
+        else:
+            self.resampling = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=n_rep)
+            self.resampling_inner = RepeatedStratifiedKFold(n_splits=n_folds_inner, n_repeats=1)
+
+    def split_samples(self):
+        all_smpls = [(train, test) for train, test in self.resampling.split(X=np.zeros(self.n_obs), y=self.stratify)]
+        smpls = [all_smpls[(i_repeat * self.n_folds) : ((i_repeat + 1) * self.n_folds)] for i_repeat in range(self.n_rep)]
+        smpls_inner = []
+        for i_repeat in range(self.n_rep):
+            smpls_inner_rep = []
+            # draw inner folds within each outer training set of this repetition
+            for train, test in smpls[i_repeat]:
+                if self.stratify is None:
+                    smpls_inner_rep.append(
+                        [
+                            (train[train_inner], train[test_inner])
+                            for train_inner, test_inner in self.resampling_inner.split(X=train)
+                        ]
+                    )
+                else:
+                    smpls_inner_rep.append(
+                        [
+                            (train[train_inner], train[test_inner])
+                            for train_inner, test_inner in self.resampling_inner.split(
+                                X=np.zeros(len(train)), y=self.stratify[train]
+                            )
+                        ]
+                    )
+            smpls_inner.append(smpls_inner_rep)
+
+        return smpls, smpls_inner
+
+
 class DoubleMLClusterResampling:
     def __init__(self, n_folds, n_rep, n_obs, n_cluster_vars, cluster_vars):
         self.n_folds = n_folds