Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doubleml/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from .irm.pq import DoubleMLPQ
from .irm.qte import DoubleMLQTE
from .irm.ssm import DoubleMLSSM
from .plm.lplr import DoubleMLLPLR
from .plm.pliv import DoubleMLPLIV
from .plm.plr import DoubleMLPLR
from .utils.blp import DoubleMLBLP
Expand Down Expand Up @@ -42,6 +43,7 @@
"DoubleMLBLP",
"DoubleMLPolicyTree",
"DoubleMLSSM",
"DoubleMLLPLR",
]

__version__ = importlib.metadata.version("doubleml")
19 changes: 16 additions & 3 deletions doubleml/double_ml_score_mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ class NonLinearScoreMixin:
_score_type = "nonlinear"
_coef_start_val = np.nan
_coef_bounds = None
_error_on_convergence_failure = False

@property
@abstractmethod
Expand Down Expand Up @@ -149,12 +150,16 @@ def score_deriv(theta):
theta_hat = root_res.root
if not root_res.converged:
score_val = score(theta_hat)
warnings.warn(
msg = (
"Could not find a root of the score function.\n "
f"Flag: {root_res.flag}.\n"
f"Score value found is {score_val} "
f"for parameter theta equal to {theta_hat}."
)
if self._error_on_convergence_failure:
raise ValueError(msg)
else:
warnings.warn(msg)
else:
signs_different, bracket_guess = _get_bracket_guess(score, self._coef_start_val, self._coef_bounds)

Expand Down Expand Up @@ -186,12 +191,16 @@ def score_squared(theta):
score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds]
)
theta_hat = theta_hat_array.item()
warnings.warn(
msg = (
"Could not find a root of the score function.\n "
f"Minimum score value found is {score_val} "
f"for parameter theta equal to {theta_hat}.\n "
"No theta found such that the score function evaluates to a negative value."
)
if self._error_on_convergence_failure:
raise ValueError(msg)
else:
warnings.warn(msg)
else:

def neg_score(theta):
Expand All @@ -202,11 +211,15 @@ def neg_score(theta):
neg_score, self._coef_start_val, approx_grad=True, bounds=[self._coef_bounds]
)
theta_hat = theta_hat_array.item()
warnings.warn(
msg = (
"Could not find a root of the score function. "
f"Maximum score value found is {-1 * neg_score_val} "
f"for parameter theta equal to {theta_hat}. "
"No theta found such that the score function evaluates to a positive value."
)
if self._error_on_convergence_failure:
raise ValueError(msg)
else:
warnings.warn(msg)

return theta_hat
6 changes: 2 additions & 4 deletions doubleml/plm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@
The :mod:`doubleml.plm` module implements double machine learning estimates based on partially linear models.
"""

from .lplr import DoubleMLLPLR
from .pliv import DoubleMLPLIV
from .plr import DoubleMLPLR

__all__ = [
"DoubleMLPLR",
"DoubleMLPLIV",
]
__all__ = ["DoubleMLPLR", "DoubleMLPLIV", "DoubleMLLPLR"]
2 changes: 2 additions & 0 deletions doubleml/plm/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ._make_pliv_data import _make_pliv_data
from .dgp_confounded_plr_data import make_confounded_plr_data
from .dgp_lplr_LZZ2020 import make_lplr_LZZ2020
from .dgp_pliv_CHS2015 import make_pliv_CHS2015
from .dgp_pliv_multiway_cluster_CKMS2021 import make_pliv_multiway_cluster_CKMS2021
from .dgp_plr_CCDDHNR2018 import make_plr_CCDDHNR2018
Expand All @@ -15,5 +16,6 @@
"make_confounded_plr_data",
"make_pliv_CHS2015",
"make_pliv_multiway_cluster_CKMS2021",
"make_lplr_LZZ2020",
"_make_pliv_data",
]
152 changes: 152 additions & 0 deletions doubleml/plm/datasets/dgp_lplr_LZZ2020.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
import numpy as np
import pandas as pd
from scipy.special import expit

from doubleml.data import DoubleMLData
from doubleml.utils._aliases import _get_array_alias, _get_data_frame_alias, _get_dml_data_alias

_array_alias = _get_array_alias()
_data_frame_alias = _get_data_frame_alias()
_dml_data_alias = _get_dml_data_alias()


def make_lplr_LZZ2020(
n_obs=500, dim_x=20, alpha=0.5, return_type="DoubleMLData", balanced_r0=True, treatment="continuous", **kwargs
):
r"""
Generates synthetic data for a logistic partially linear regression model, as in Liu et al. (2021),
designed for use in double/debiased machine learning applications.

The data generating process is defined as follows:

- Covariates :math:`x_i \sim \mathcal{N}(0, \Sigma)`, where :math:`\Sigma_{kj} = 0.7^{|j-k|}`.
- Treatment :math:`d_i = a_0(x_i)`.
- Propensity score :math:`p_i = \sigma(\alpha d_i + r_0(x_i))`, where :math:`\sigma(\cdot)` is the logistic function.
- Outcome :math:`y_i \sim \text{Bernoulli}(p_i)`.

The nuisance functions are defined as:

.. math::
\begin{aligned}
a_0(x_i) &= \frac{2}{1 + \exp(x_{i,1})} - \frac{2}{1 + \exp(x_{i,2})} + \sin(x_{i,3}) + \cos(x_{i,4}) \\
&\quad + 0.5 \cdot \mathbb{1}(x_{i,5} > 0) - 0.5 \cdot \mathbb{1}(x_{i,6} > 0) + 0.2\, x_{i,7} x_{i,8}
- 0.2\, x_{i,9} x_{i,10} \\
r_0(x_i) &= 0.1\, x_{i,1} x_{i,2} x_{i,3} + 0.1\, x_{i,4} x_{i,5} + 0.1\, x_{i,6}^3 - 0.5 \sin^2(x_{i,7}) \\
&\quad + 0.5 \cos(x_{i,8}) + \frac{1}{1 + x_{i,9}^2} - \frac{1}{1 + \exp(x_{i,10})} \\
&\quad + 0.25 \cdot \mathbb{1}(x_{i,11} > 0) - 0.25 \cdot \mathbb{1}(x_{i,13} > 0)
\end{aligned}

Parameters
----------
n_obs : int
Number of observations to simulate.
dim_x : int
Number of covariates.
alpha : float
Value of the causal parameter.
return_type : str
Determines the return format. One of:

- 'DoubleMLData' or DoubleMLData: returns a ``DoubleMLData`` object.
- 'DataFrame', 'pd.DataFrame' or pd.DataFrame: returns a ``pandas.DataFrame``.
- 'array', 'np.ndarray', 'np.array' or np.ndarray: returns tuple of numpy arrays (x, y, d, p).
balanced_r0 : bool, default True
If True, uses the "balanced" r_0 specification (smaller magnitude / more balanced
heterogeneity). If False, uses an "unbalanced" r_0 specification with larger
share of Y=0.
treatment : {'continuous', 'binary', 'binary_unbalanced'}, default 'continuous'
Determines how the treatment d is generated from a_0(x):
- 'continuous': d = a_0(x) (continuous treatment).
- 'binary': d ~ Bernoulli( sigmoid(a_0(x) - mean(a_0(x))) ) .
- 'binary_unbalanced': d ~ Bernoulli( sigmoid(a_0(x)) ).

**kwargs
Optional keyword arguments (currently unused in this implementation).

Returns
-------
Union[DoubleMLData, pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]
The generated data in the specified format.

References
----------
Liu, Molei, Yi Zhang, and Doudou Zhou. 2021.
"Double/Debiased Machine Learning for Logistic Partially Linear Model."
The Econometrics Journal 24 (3): 559–88. https://doi.org/10.1093/ectj/utab019.

"""

if balanced_r0:

def r_0(X):
return (
0.1 * X[:, 0] * X[:, 1] * X[:, 2]
+ 0.1 * X[:, 3] * X[:, 4]
+ 0.1 * X[:, 5] ** 3
+ -0.5 * np.sin(X[:, 6]) ** 2
+ 0.5 * np.cos(X[:, 7])
+ 1 / (1 + X[:, 8] ** 2)
+ -1 / (1 + np.exp(X[:, 9]))
+ 0.25 * np.where(X[:, 10] > 0, 1, 0)
+ -0.25 * np.where(X[:, 12] > 0, 1, 0)
)

else:

def r_0(X):
return (
0.1 * X[:, 0] * X[:, 1] * X[:, 2]
+ 0.1 * X[:, 3] * X[:, 4]
+ 0.1 * X[:, 5] ** 3
+ -0.5 * np.sin(X[:, 6]) ** 2
+ 0.5 * np.cos(X[:, 7])
+ 4 / (1 + X[:, 8] ** 2)
+ -1 / (1 + np.exp(X[:, 9]))
+ 1.5 * np.where(X[:, 10] > 0, 1, 0)
+ -0.25 * np.where(X[:, 12] > 0, 1, 0)
)

def a_0(X):
return (
2 / (1 + np.exp(X[:, 0]))
+ -2 / (1 + np.exp(X[:, 1]))
+ 1 * np.sin(X[:, 2])
+ 1 * np.cos(X[:, 3])
+ 0.5 * np.where(X[:, 4] > 0, 1, 0)
+ -0.5 * np.where(X[:, 5] > 0, 1, 0)
+ 0.2 * X[:, 6] * X[:, 7]
+ -0.2 * X[:, 8] * X[:, 9]
)

sigma = np.full((dim_x, dim_x), 0.2)
np.fill_diagonal(sigma, 1)

x = np.random.multivariate_normal(np.zeros(dim_x), sigma, size=n_obs)
np.clip(x, -2, 2, out=x)

if treatment == "continuous":
d = a_0(x)
elif treatment == "binary":
d_cont = a_0(x)
d = np.random.binomial(1, expit(d_cont - d_cont.mean()))
elif treatment == "binary_unbalanced":
d_cont = a_0(x)
d = np.random.binomial(1, expit(d_cont))
else:
raise ValueError("Invalid treatment type.")

p = expit(alpha * d[:] + r_0(x))

y = np.random.binomial(1, p)

if return_type in _array_alias:
return x, y, d, p
elif return_type in _data_frame_alias + _dml_data_alias:
x_cols = [f"X{i + 1}" for i in np.arange(dim_x)]
data = pd.DataFrame(np.column_stack((x, y, d, p)), columns=x_cols + ["y", "d", "p"])
if return_type in _data_frame_alias:
return data
else:
return DoubleMLData(data, "y", "d", x_cols)
else:
raise ValueError("Invalid return_type.")
Loading