Skip to content

Commit a10e38d

Browse files
authored
Merge pull request #119 from DoubleML/m-cluster-ses
Cluster robust double machine learning
2 parents cce8814 + 7cf1e9b commit a10e38d

24 files changed

+1685
-105
lines changed

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Generated by roxygen2: do not edit by hand
22

3+
export(DoubleMLClusterData)
34
export(DoubleMLData)
45
export(DoubleMLIIVM)
56
export(DoubleMLIRM)
@@ -12,6 +13,7 @@ export(fetch_bonus)
1213
export(make_iivm_data)
1314
export(make_irm_data)
1415
export(make_pliv_CHS2015)
16+
export(make_pliv_multiway_cluster_CKMS2021)
1517
export(make_plr_CCDDHNR2018)
1618
export(make_plr_turrell2018)
1719
import(checkmate)

R/datasets.R

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,3 +744,176 @@ make_iivm_data = function(n_obs = 500, dim_x = 20, theta = 1, alpha_x = 0.2,
744744
}
745745
}
746746
}
747+
748+
749+
#' @title Generates data from a partially linear IV regression model with
750+
#' multiway cluster sample used in Chiang et al. (2021).
751+
#'
752+
#' @description
753+
#' Generates data from a partially linear IV regression model with multiway
754+
#' cluster sample used in Chiang et al. (2021). The data generating process
755+
#' is defined as
756+
#'
757+
#' \eqn{Z_{ij} = X_{ij}' \xi_0 + V_{ij},}
758+
#'
759+
#' \eqn{D_{ij} = Z_{ij}' \pi_{10} + X_{ij}' \pi_{20} + v_{ij},}
760+
#'
761+
#' \eqn{Y_{ij} = D_{ij} \theta + X_{ij}' \zeta_0 + \varepsilon_{ij},}
762+
#'
763+
#' with
764+
#'
765+
#' \eqn{X_{ij} = (1 - \omega_1^X - \omega_2^X) \alpha_{ij}^X
766+
#' + \omega_1^X \alpha_{i}^X + \omega_2^X \alpha_{j}^X,}
767+
#'
768+
#' \eqn{\varepsilon_{ij} = (1 - \omega_1^\varepsilon - \omega_2^\varepsilon) \alpha_{ij}^\varepsilon
769+
#' + \omega_1^\varepsilon \alpha_{i}^\varepsilon + \omega_2^\varepsilon \alpha_{j}^\varepsilon,}
770+
#'
771+
#' \eqn{v_{ij} = (1 - \omega_1^v - \omega_2^v) \alpha_{ij}^v
772+
#' + \omega_1^v \alpha_{i}^v + \omega_2^v \alpha_{j}^v,}
773+
#'
774+
#' \eqn{V_{ij} = (1 - \omega_1^V - \omega_2^V) \alpha_{ij}^V
775+
#' + \omega_1^V \alpha_{i}^V + \omega_2^V \alpha_{j}^V,}
776+
#'
777+
#' and \eqn{\alpha_{ij}^X, \alpha_{i}^X, \alpha_{j}^X \sim \mathcal{N}(0, \Sigma)}
778+
#' where \eqn{\Sigma} is a \eqn{p_x \times p_x} matrix with entries
779+
#' \eqn{\Sigma_{kj} = s_X^{|j-k|}}.
780+
#'
781+
#' Further
782+
#'
783+
#' \eqn{\left(\begin{array}{c} \alpha_{ij}^\varepsilon \\ \alpha_{ij}^v \end{array}\right),
784+
#' \left(\begin{array}{c} \alpha_{i}^\varepsilon \\ \alpha_{i}^v \end{array}\right),
785+
#' \left(\begin{array}{c} \alpha_{j}^\varepsilon \\ \alpha_{j}^v \end{array}\right)
786+
#' \sim \mathcal{N}\left(0, \left(\begin{array}{cc} 1 & s_{\varepsilon v} \\
787+
#' s_{\varepsilon v} & 1 \end{array}\right) \right)}
788+
#'
789+
#' and \eqn{\alpha_{ij}^V, \alpha_{i}^V, \alpha_{j}^V \sim \mathcal{N}(0, 1)}.
790+
#'
791+
#' @references Chiang, H. D., Kato K., Ma, Y. and Sasaki, Y. (2021),
792+
#' Multiway Cluster Robust Double/Debiased Machine Learning,
793+
#' Journal of Business & Economic Statistics,
794+
#' \doi{10.1080/07350015.2021.1895815}, https://arxiv.org/abs/1909.03489.
795+
#'
796+
#' @param N (`integer(1)`) \cr
797+
#' The number of observations (first dimension).
798+
#'
799+
#' @param M (`integer(1)`) \cr
800+
#' The number of observations (second dimension).
801+
#'
802+
#' @param dim_X (`integer(1)`) \cr
803+
#' The number of covariates.
804+
#'
805+
#' @param theta (`numeric(1)`) \cr
806+
#' The value of the causal parameter.
807+
#'
808+
#' @param return_type (`character(1)`) \cr
809+
#' If `"DoubleMLClusterData"`, returns a `DoubleMLClusterData` object.
810+
#' If `"data.frame"` returns a `data.frame()`.
811+
#' If `"data.table"` returns a `data.table()`.
812+
#' If `"matrix"` a named `list()` with entries `X`, `y`, `d`, `z` and
813+
#' `cluster_vars` is returned.
814+
#' Every entry in the list is a `matrix()` object. Default is `"DoubleMLClusterData"`.
815+
#'
816+
#' @param ... \cr
817+
#' Additional keyword arguments to set non-default values for the parameters
818+
#' \eqn{\pi_{10}=1.0},
819+
#' \eqn{\omega_X = \omega_{\varepsilon} = \omega_V = \omega_v = (0.25, 0.25)},
820+
#' \eqn{s_X = s_{\varepsilon v} = 0.25}, or the \eqn{p_x}-vectors
821+
#' \eqn{\zeta_0 = \pi_{20} = \xi_0} with default entries
822+
#' \eqn{\zeta_{0})_j = 0.5^j}.
823+
#'
824+
#' @return A data object according to the choice of `return_type`.
825+
#'
826+
#' @export
827+
make_pliv_multiway_cluster_CKMS2021 = function(N = 25, M = 25, dim_X = 100,
828+
theta = 1.,
829+
return_type = "DoubleMLClusterData",
830+
...) {
831+
832+
assert_choice(
833+
return_type,
834+
c("data.table", "matrix", "data.frame", "DoubleMLClusterData"))
835+
kwargs = list(...)
836+
pi_10 = if ("pi_10" %in% names(kwargs)) kwargs$pi_10 else 1.0
837+
zeta_0 = if ("zeta_0" %in% names(kwargs)) kwargs$zeta_0 else 0.5^(1:dim_X)
838+
pi_20 = if ("pi_20" %in% names(kwargs)) kwargs$pi_20 else 0.5^(1:dim_X)
839+
xi_0 = if ("xi_0" %in% names(kwargs)) kwargs$xi_0 else 0.5^(1:dim_X)
840+
841+
omega_X = if ("omega_X" %in% names(kwargs)) kwargs$omega_X else c(0.25, 0.25)
842+
omega_epsilon = if ("omega_epsilon" %in% names(kwargs)) kwargs$omega_epsilon else c(0.25, 0.25)
843+
omega_v = if ("omega_v" %in% names(kwargs)) kwargs$omega_v else c(0.25, 0.25)
844+
omega_V = if ("omega_V" %in% names(kwargs)) kwargs$omega_V else c(0.25, 0.25)
845+
846+
s_X = if ("s_X" %in% names(kwargs)) kwargs$s_X else 0.25
847+
s_epsilon_v = if ("s_epsilon_v" %in% names(kwargs)) kwargs$s_epsilon_v else 0.25
848+
849+
alpha_V = rnorm(N * M)
850+
alpha_V_i = rep(rnorm(N), each = M)
851+
alpha_V_j = rep(rnorm(M), times = N)
852+
853+
cov_mat = matrix(c(1, s_epsilon_v, s_epsilon_v, 1), nrow = 2)
854+
alpha_eps_v = rmvnorm(n = N * M, mean = rep(0, 2), sigma = cov_mat)
855+
alpha_eps = alpha_eps_v[, 1]
856+
alpha_v = alpha_eps_v[, 2]
857+
858+
alpha_eps_v_i = rmvnorm(n = N, mean = rep(0, 2), sigma = cov_mat)
859+
alpha_eps_i = rep(alpha_eps_v_i[, 1], each = M)
860+
alpha_v_i = rep(alpha_eps_v_i[, 2], each = M)
861+
862+
alpha_eps_v_j = rmvnorm(n = M, mean = rep(0, 2), sigma = cov_mat)
863+
alpha_eps_j = rep(alpha_eps_v_j[, 1], times = N)
864+
alpha_v_j = rep(alpha_eps_v_j[, 2], times = N)
865+
866+
cov_mat = toeplitz(s_X^(0:(dim_X - 1)))
867+
alpha_X = rmvnorm(n = N * M, mean = rep(0, dim_X), sigma = cov_mat)
868+
xx = rmvnorm(N, mean = rep(0, dim_X), sigma = cov_mat)
869+
alpha_X_i = matrix(rep(xx, each = M), ncol = ncol(xx), byrow = FALSE)
870+
xx = rmvnorm(M, mean = rep(0, dim_X), sigma = cov_mat)
871+
alpha_X_j = matrix(rep(xx, each = N), ncol = ncol(xx), byrow = TRUE)
872+
873+
# generate variables
874+
x = (1 - omega_X[1] - omega_X[2]) * alpha_X +
875+
omega_X[1] * alpha_X_i + omega_X[2] * alpha_X_j
876+
877+
eps = (1 - omega_epsilon[1] - omega_epsilon[2]) * alpha_eps +
878+
omega_epsilon[1] * alpha_eps_i + omega_epsilon[2] * alpha_eps_j
879+
880+
v = (1 - omega_v[1] - omega_v[2]) * alpha_v +
881+
omega_v[1] * alpha_v_i + omega_v[2] * alpha_v_j
882+
883+
V = (1 - omega_V[1] - omega_V[2]) * alpha_V +
884+
omega_V[1] * alpha_V_i + omega_V[2] * alpha_V_j
885+
886+
z = x %*% xi_0 + V
887+
d = z * pi_10 + x %*% pi_20 + v
888+
y = d * theta + x %*% zeta_0 + eps
889+
890+
cluster_vars = expand.grid(seq(M), seq(N))[, c(2, 1)]
891+
892+
colnames(x) = paste0("X", 1:dim_X)
893+
colnames(y) = "Y"
894+
colnames(d) = "D"
895+
colnames(z) = "Z"
896+
colnames(cluster_vars) = c("cluster_var_i", "cluster_var_j")
897+
898+
if (return_type == "matrix") {
899+
return(list("X" = x, "y" = y, "d" = d, "cluster_vars" = cluster_vars, "z" = z))
900+
} else {
901+
if (return_type == "data.frame") {
902+
data = data.frame(x, y, d, cluster_vars, z)
903+
return(data)
904+
} else if (return_type == "data.table") {
905+
data = data.table(x, y, d, cluster_vars, z)
906+
return(data)
907+
} else if (return_type == "DoubleMLClusterData") {
908+
dt = data.table(x, y, d, cluster_vars, z)
909+
data = DoubleMLClusterData$new(dt,
910+
y_col = "Y", d_cols = "D",
911+
x_cols = colnames(x),
912+
z_cols = "Z",
913+
cluster_cols = c(
914+
"cluster_var_i",
915+
"cluster_var_j"))
916+
return(data)
917+
}
918+
}
919+
}

0 commit comments

Comments
 (0)