Appendix A: Definitions
We extend the notation used in Hu et al. (2011) to accommodate covariates. Define the following simplified notation:
$$\begin{aligned} \partial _{1}F(t_1,t_2|w)= & {} \frac{\partial F(t_1,t_2|w)}{\partial t_1},\quad \partial _{1}G(t_1,t_2|w)=\frac{\partial G(t_1,t_2|w)}{\partial t_1},\\ \partial _2F(t_1,t_2|w)= & {} \frac{\partial F(t_1,t_2|w)}{\partial t_2}, \quad \partial _2G(t_1,t_2|w)=\frac{\partial G(t_1,t_2|w)}{\partial t_2},\\ \partial _{1,2}F(t_1,t_2|w)= & {} \frac{\partial ^2 F(t_1,t_2|w)}{\partial t_1 \partial t_2},\quad \partial _{1,2}G(t_1,t_2|w)=\frac{\partial ^2 G(t_1,t_2|w)}{\partial t_1 \partial t_2}, \end{aligned}$$
where F and G denote the survival functions of \((T_1, T_2)\) and \((C_1, C_2)\) conditional on \(W=w\), respectively. Then the conditional density function of \((X_1,X_2,\varDelta _1,\varDelta _2)\) given \(W=w\) can be written as
$$\begin{aligned}&q(t_1,t_2,\delta _1,\delta _2|w) \\&\quad = \partial _{1,2}F(t_1,t_2|w)^{\delta _1\delta _2} \{-\partial _{1}F(t_1,t_2|w)\}^{\delta _1(1-\delta _2)} \{ -\partial _2F(t_1,t_2|w)\}^{(1-\delta _1)\delta _2} \\&\qquad F(t_1,t_2|w)^{(1-\delta _1)(1-\delta _2)} \partial _{1,2}G(t_1,t_2|w)^{(1-\delta _1)(1-\delta _2)} \\&\qquad \{-\partial _{1}G(t_1,t_2|w)\}^{(1-\delta _1)\delta _2} \{-\partial _2G(t_1,t_2|w)\}^{\delta _1(1-\delta _2)} G(t_1,t_2|w)^{\delta _1\delta _2}, \end{aligned}$$
and the joint density of \((X_1,X_2,\varDelta _1,\varDelta _2,W)\) is
$$\begin{aligned} p(t_1,t_2,\delta _1,\delta _2,w) = q(t_1,t_2,\delta _1,\delta _2|w)f_{W}(w), \end{aligned}$$
(11)
where \(f_{W}(w)\) denotes the density function of W.
For a discrete covariate W, we introduce the following notation:
$$\begin{aligned}&{\tilde{g}}^d(\varDelta _2,X_{1},X_2,W,\varDelta _1^*,X_1^*,X_2^*,W^*; {\varvec{\xi }})\\&\quad = \frac{I(W=W^*)\varDelta ^*_1\varDelta _2 {{\dot{\beta }}}(X^*_{1},X_2,W^*)I(X_{1} \ge X^*_{1})I(X_2 \le X^*_2)\theta (X^*_1,X_2,W^*,{\varvec{\xi }})}{S(X^*_{1},X_2,W^*)}, \end{aligned}$$
where
$$\begin{aligned} S(t_1,t_2,w)= & {} Pr(X_{1}\ge t_1,X_{2}\ge t_2|W=w)f_W(w). \end{aligned}$$
(12)
By fixing \((\varDelta _1^*,X_1^*,X_2^*,W^*)\) at \((\delta _1, x_1, x_2,w)\), we also define
$$\begin{aligned} {\tilde{h}}^d_Q(\delta _1,x_1,x_2,w; {\varvec{\xi }})= & {} Q{\tilde{g}}^d(\varDelta _2,X_{1},X_2,W,\delta _1,x_1,x_2,w; {\varvec{\xi }}). \end{aligned}$$
Similarly, fixing \((\varDelta _2,X_1,X_2,W)\) at \((\delta _2, x_1, x_2,w)\), define
$$\begin{aligned} {\tilde{h}}^d_P(\delta _2,x_1,x_2,w; {\varvec{\xi }})=P{\tilde{g}}^d(\delta _2,x_{1},x_2,w,\varDelta _1^*,X_1^*,X_2^*,W^*; {\varvec{\xi }}). \end{aligned}$$
For a continuous covariate W, define \(V=(X_1,X_2,\varDelta _1,\varDelta _2,W)\) and
$$\begin{aligned}&b(V_i,V_j;{\varvec{\xi }})= \varDelta _{1i}\varDelta _{2j}I(X_{1j}\ge X_{1i})I(X_{2j}\le X_{2i})e^{\beta (X_{1i},X_{2j},W_i;\xi )} \nonumber \\&\qquad \quad \,\,\qquad \qquad \times \, {{\dot{\beta }}}(X_{1i},X_{2j},W_i), \end{aligned}$$
(13)
$$\begin{aligned}&v(V_i,V_j;{\varvec{\xi }}) =\frac{1}{n}[N(X_{1i},X_{2j},W_i) \nonumber \\&\qquad \qquad \qquad \qquad -{\mathbf {K}}_h(0)I(X_{2j}\le X_{2i})(1-\theta (X_{1i},X_{2j},W_i;{\varvec{\xi }}))],\nonumber \\&g^{(n)}(V_i,V_j;{\varvec{\xi }}) = \frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }})}{v(V_i,V_j;{\varvec{\xi }})},\nonumber \\&g_h^{(n)}(V_i,V_j;{\varvec{\xi }})= \frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }})}{S_h(X_{1i},X_{2j},W_i)}\nonumber \\&{\tilde{g}}^{(n)}(V_i,V_j;{\varvec{\xi }})= \frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }})}{S(X_{1i},X_{2j},W_i)}\nonumber \\&{\tilde{h}}(V_i;{\varvec{\xi }})= E_{X_{1j},X_{2j},\varDelta _{1j},\varDelta _{2j}|W_j=W_i, X_{1i},X_{2i},\varDelta _{1i},\varDelta _{2i}}\left[ \frac{b(V_i,V_j;{\varvec{\xi }})}{S(X_{1i},X_{2j}|W_i)}\right] \nonumber \\&{\tilde{h}}^*(V_j;{\varvec{\xi }})= E_{X_{1i},X_{2i},\varDelta _{1i},\varDelta _{2i}|W_i=W_j, X_{1j},X_{2j},\varDelta _{1j},\varDelta _{2j}}\left[ \frac{b(V_i,V_j;{\varvec{\xi }})}{S(X_{1i},X_{2j}|W_j)}\right] \nonumber \\&u^{(2)}({\varvec{\xi }})=E_{X_{1i},X_{2i},\varDelta _{1i},\varDelta _{2i},W_i}{\tilde{h}}(V_i;{\varvec{\xi }}), \end{aligned}$$
(14)
where
$$\begin{aligned} S_h(t_1,t_2,w)= & {} E\left[ I(X_{1}\ge t_1,X_{2}\ge t_2){\mathbf {K}}_h(W-w)\right] . \end{aligned}$$
Clearly, we have \(S(t_1,t_2,w)=\lim _{h\downarrow 0}S_h(t_1,t_2,w)\).
Appendix B: Proof of Theorem 1
For consistency, we will first show that \({\varvec{U}}^{(k)}_n({\varvec{\xi }})\) converges uniformly to \({\varvec{u}}^{(k)}\), \(k=1,2\), then show that \({\varvec{u}}({\varvec{\xi }})={\varvec{0}}\) has the unique solution at \({\varvec{\xi }}_0\), and finally show the consistency of \({\hat{{\varvec{\xi }}}}_n\) satisfying \({\varvec{U}}_n({\hat{{\varvec{\xi }}}}_n)={\varvec{0}}\).
The uniform convergence of \({\varvec{U}}_n^{(1)}({\varvec{\xi }})\) to \({\varvec{u}}^{(1)}({\varvec{\xi }})\) remains the same for both discrete and continuous covariates. However, for a continuous covariate, \({\varvec{U}}_n^{(2)}({\varvec{\xi }})\) involves the kernel function which is unbounded as the bandwidth goes to 0, so the proof for the uniform convergence of \({\varvec{U}}_n^{(2)}({\varvec{\xi }})\) to \({\varvec{u}}^{(2)}({\varvec{\xi }})\) is treated separately for discrete W with finite levels and continuous W. When W is discrete with finite levels, the proof is similar to that provided in Hu et al. (2011). So we focus on continuous W.
First, let \((X^*_1, X^*_2, \varDelta ^*_1, \varDelta ^*_2, W^*)\) be an independent copy of \((X_1, X_2, \varDelta _1, \varDelta _2,W)\). Define the deterministic function \({\varvec{u}}({\varvec{\xi }}) = {\varvec{u}}^{(1)}({\varvec{\xi }}) - {\varvec{u}}^{(2)}({\varvec{\xi }}) + {\varvec{u}}^{(3)}({\varvec{\xi }}) - {\varvec{u}}^{(4)}({\varvec{\xi }})\), with
$$\begin{aligned} {\varvec{u}}^{(1)}({\varvec{\xi }})= & {} {\varvec{u}}^{(3)}({\varvec{\xi }}) = E\left\{ \varDelta _1 \varDelta _2{{\dot{\beta }}} (X_{1},X_2,W)\right\} ,\\ {\varvec{u}}^{(2)}({\varvec{\xi }})= & {} {\varvec{u}}^{(4)}({\varvec{\xi }}) \\= & {} E\left\{ \varDelta ^*_1\varDelta _2 {{\dot{\beta }}} (X^*_{1},X_2,W)\frac{I(X_{1}\ge X^*_{1})I(X_2 \le X^*_2)\theta (X^*_{1},X_2,W; {\varvec{\xi }})}{S(X^*_{1},X_2|W^*)}\right\} , \end{aligned}$$
where \(S(x_1,x_2|w)=Pr(X_{1}>x_1,X_2>x_2|W=w)\).
Similar to Hu et al. (2011), we use \({\mathbb {P}}_n\) and \({\mathbb {Q}}_n\) to denote the empirical measures of n independent copies of \((X_1^*, X_2^*, \varDelta _1^*, \varDelta _2^*, W^*)\) and \((X_1, X_2, \varDelta _1, \varDelta _2, W)\) that follow the distributions P and Q, respectively, which make the double summations more tractable. For model (3), \({\varvec{U}}_n^{(1)}({\varvec{\xi }}) = {\mathbb {Q}}_n \varDelta _1 \varDelta _2 {{\dot{\beta }}}(X_1,X_2,W)\) is free of \({\varvec{\xi }}\), and \({{\dot{\beta }}}(X_1,X_2,W)\) is bounded from Conditions C1, C2 and C5. Hence by the law of large numbers, we have
$$\begin{aligned} \sup _{\xi } |{\varvec{U}}_n^{(1)}({\varvec{\xi }}) - {\varvec{u}}^{(1)}({\varvec{\xi }})| = |({\mathbb {Q}}_n - Q)\varDelta _1 \varDelta _2 {{\dot{\beta }}}(X_1,X_2,W)| \rightarrow {\varvec{0}} \end{aligned}$$
either almost surely or in probability; convergence in probability suffices for our proof.
By Härdle et al. (1988),
$$\begin{aligned}&\frac{1}{n} N(t_1,t_2,w) \\&\qquad = \frac{1}{n} \sum _{k=1}^nI(X_{1k}\ge t_1,X_{2k}\ge t_2){\mathbf {K}}_h(W_k-w)\\&\qquad =\frac{\sum _{k=1}^nI(X_{1k}\ge t_1,X_{2k}\ge t_2){\mathbf {K}}_h(W_k-w)}{\sum _{k=1}^n{\mathbf {K}}_h(W_k-w)}\times \frac{\sum _{k=1}^n{\mathbf {K}}_h(W_k-w)}{n}\\&\qquad =E(I(X_{1}\ge t_1,X_{2}\ge t_2)|W=w)f(w)+o_p(1)\\&\qquad =S(t_1,t_2,w)+o_p(1). \end{aligned}$$
Also note that the difference between \(g^{(n)}\) and \({\tilde{g}}^{(n)}\) is their denominators wherein we replace the denominator of \(g^{(n)}\) by its limit. We then have the following:
$$\begin{aligned}&\sup _{\xi } \Big |\frac{1}{n^2}\sum _{i=1}^n\sum _{j=1}^n g^{(n)}(V_i,V_j;{\varvec{\xi }})-\frac{1}{n^2}\sum _{i=1}^n\sum _{j=1}^n{\tilde{g}}^{(n)}(V_i,V_j;{\varvec{\xi }})\Big |\\&\qquad \le \frac{1}{n}\sum _{i=1}^n\sup _{\xi } \Big |\frac{1}{n}\sum _{j=1}^n g^{(n)}(V_i,V_j;{\varvec{\xi }})-\frac{1}{n}\sum _{j=1}^n{\tilde{g}}^{(n)}(V_i,V_j;{\varvec{\xi }})\Big |\\&\qquad =\frac{1}{n}\sum _{i=1}^n\sup _{\xi } \Big |\frac{1}{n}\sum _{j=1}^n\frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }})}{v(V_i,V_j;{\varvec{\xi }})S(X_{1i},X_{2j},W_i)} \\&\qquad \qquad \times \,\big (v(V_i,V_j;{\varvec{\xi }}) - S(X_{1i},X_{2j},W_i)\big )\Big |\\&\qquad =\frac{1}{n}\sum _{i=1}^n\frac{1}{n}\sum _{j=1}^n{\mathbf {K}}_h(W_j-W_i)\sup _{\xi } \Big |\frac{b(V_i,V_j;{\varvec{\xi }})}{v(V_i,V_j;{\varvec{\xi }})S(X_{1i},X_{2j},W_i)}\\&\qquad \qquad \times \,\big (v(V_i,V_j;{\varvec{\xi }}) - S(X_{1i},X_{2j},W_i)\big )\Big |\\&\qquad \le \frac{1}{n}\sum _{i=1}^n\frac{\frac{1}{n}\sum _{j=1}^n{\mathbf {K}}_h(W_j-W_i)}{\frac{1}{n} N(X_{1i},X_{2j},W_i)+o_p(1)}\sup _{\xi }\Big |\frac{b(V_i,V_j;{\varvec{\xi }})}{S(X_{1i},X_{2j},W_i)}\Big | \\&\qquad \qquad \times \, \Big (\sup \Big |(n^{-1} N(X_{1i},X_{2j},W_i)-S(X_{1i},X_{2j},W_i))\Big | \\&\qquad \qquad +\,\sup _{\xi } \Big |n^{-1}{\mathbf {K}}_h(0)I(X_{2j}\le X_{2i})(1-\theta (X_{1i},X_{2j},W_i;{\varvec{\xi }}))\Big |\Big )\\&\qquad \le \frac{1}{n}\sum _{i=1}^n O_p(1)O_p(1)\Big (\sup \Big |(n^{-1} N(X_{1i},X_{2j},W_i) - S(X_{1i},X_{2j},W_i))\Big | \\&\qquad \qquad +\,O_p((nh)^{-1})\Big ) \\&\qquad \le O_p(1)\Big (O_p(\max \{(nh/\log n)^{-1/2},h^{\epsilon } \})+O_p((nh)^{-1})\Big ) \\&\qquad =o_p(1). \end{aligned}$$
In the last inequality, we used the result of strong uniform consistency for conditional functional estimators of Härdle et al. (1988).
Next, we want to show that the difference between \(\frac{1}{n^2}\sum _{i=1}^n\sum _{j=1}^n{\tilde{g}}^{(n)}_{ij}({\varvec{\xi }})\) and \(\frac{1}{n}\sum _{i=1}^n{\tilde{h}}(V_i;{\varvec{\xi }})\) is \(o_p(1)\). Again using the result of Härdle et al. (1988) in the following calculation, we have
$$\begin{aligned}&\sup _{\xi }\left| \frac{1}{n^2}\sum _{i=1}^n\sum _{j=1}^n{\tilde{g}}^{(n)}(V_i,V_j;{\varvec{\xi }})-\frac{1}{n}\sum _{i=1}^n{\tilde{h}}(V_i;{\varvec{\xi }})\right| \\&\qquad \le \frac{1}{n}\sum _{i=1}^n\sup _{\xi }\left| \frac{1}{n}\sum _{j=1}^n{\tilde{g}}^{(n)}(V_i,V_j;{\varvec{\xi }})-{\tilde{h}}(V_i;{\varvec{\xi }})\right| \\&\qquad \le \frac{1}{n}\sum _{i=1}^n\sup _{{\varvec{\xi }},V_i}\left| \frac{1}{n}\sum _{j=1}^n{\tilde{g}}^{(n)}(V_i,V_j;{\varvec{\xi }})-{\tilde{h}}(V_i;{\varvec{\xi }})\right| \\&\qquad =\sup _{{\varvec{\xi }},V_i}\left| \frac{1}{n}\sum _{j=1}^n{\tilde{g}}^{(n)}(V_i,V_j;{\varvec{\xi }})-{\tilde{h}}(V_i;{\varvec{\xi }})\right| \\&\qquad =O_p(\max \{(nh/\log n)^{-1/2},h^{\epsilon } \})\\&\qquad =o_p(1). \end{aligned}$$
Last, we want to show that the difference between \(\frac{1}{n}\sum _{i=1}^n{\tilde{h}}(V_i;{\varvec{\xi }})\) and its deterministic limit \(u^{(2)}({\varvec{\xi }})\) is \(o_p(1)\) uniformly in \({\varvec{\xi }}\). For model (3) under C1-C3, it is straightforward to see that all the component functions of \(b(V_i,V_j;{\varvec{\xi }})\) are Donsker. Thus \(b(V_i,V_j;{\varvec{\xi }})\) is Donsker. Then by Theorem 2.10.2 in van der Vaart and Wellner (1996), \({\tilde{h}}(V_i;{\varvec{\xi }})\) is also Donsker. Hence, \({\tilde{h}}(V_i;{\varvec{\xi }})\) is Glivenko–Cantelli. We then have
$$\begin{aligned} \sup _{\xi } \Big |\frac{1}{n}\sum _{i=1}^n{\tilde{h}}(V_i;{\varvec{\xi }})-u^{(2)}({\varvec{\xi }})\Big | = o_p(1). \end{aligned}$$
Thus we have shown that \({\varvec{U}}_n({\varvec{\xi }})\) converges uniformly to \({\varvec{u}}({\varvec{\xi }})\) in probability.
Following a similar calculation in Hu et al. (2011), we can also show that \({\varvec{\xi }}_0\) is the unique solution of \({\varvec{u}}({\varvec{\xi }})=0\). The consistency of \({\hat{{\varvec{\xi }}}}_n\) follows immediately.
Appendix C: Proof of Theorem 2
For asymptotic normality, the goal is to write \({\varvec{U}}_n({\varvec{\xi }}_0)\) as an average of n i.i.d. terms plus an \(o_p(n^{-1/2})\) term. The technical difficulty arises when \({\varvec{U}}_n({\varvec{\xi }}_0)\) involves the kernel function which is unbounded as the bandwidth goes to 0, so that we can no longer rely on the properties of Donsker functions. Here, we briefly give the results for a discrete covariate and then focus mainly on the linearization of \({\varvec{U}}_n({\varvec{\xi }}_0)\) for a continuous covariate.
Define \({{\dot{{\varvec{U}}}}}_n({\varvec{\xi }}) \equiv d {\varvec{U}}_n({\varvec{\xi }})/d{\varvec{\xi }}\). By Taylor expansion of \({\varvec{U}}_n({\hat{{\varvec{\xi }}}}_n)\) around \({\varvec{\xi }}_0\), we have
$$\begin{aligned} n^{1/2}({\hat{{\varvec{\xi }}}}_n - {\varvec{\xi }}_0) = - \left\{ {{\dot{{\varvec{U}}}}}_n({\varvec{\xi }}^*)\right\} ^{-1} n^{1/2}{\varvec{U}}_n({\varvec{\xi }}_0), \end{aligned}$$
(15)
where \({\varvec{\xi }}^*\) lies between \({\hat{{\varvec{\xi }}}}_n\) and \({\varvec{\xi }}_0\). By a similar calculation as in the proof of Theorem 1 showing the uniform consistency of \({\varvec{U}}_n({\varvec{\xi }})\), we can show that \(\sup |{{\dot{{\varvec{U}}}}}_n({\varvec{\xi }}) - {{\dot{{\varvec{u}}}}}({\varvec{\xi }})| = o_p(1)\). Thus by the consistency of \({\hat{{\varvec{\xi }}}}_n\), which implies the consistency of \({\varvec{\xi }}^*\), and the continuity of \({{\dot{{\varvec{u}}}}}({\varvec{\xi }})\), we obtain \({{\dot{{\varvec{U}}}}}_n({\varvec{\xi }}^*) = {{\dot{{\varvec{u}}}}}({\varvec{\xi }}_0) + o_p(1)\), where \({{\dot{{\varvec{u}}}}}({\varvec{\xi }}_0) = - 2 E\left\{ \varDelta _1\varDelta _2{{\dot{\beta }}}(X_1,X_2,W)^{\otimes 2}\right\} = - {\varvec{I}}({\varvec{\xi }}_0)\) is invertible by Condition C4. Hence based on the fact that continuity holds for the inverse operator, (15) can be written as
$$\begin{aligned} n^{1/2}({\hat{{\varvec{\xi }}}}_n - {\varvec{\xi }}_0) = \left\{ {\varvec{I}}({\varvec{\xi }}_0)^{-1} + o_p(1) \right\} n^{1/2}{\varvec{U}}_n({\varvec{\xi }}_0). \end{aligned}$$
(16)
We now need to find the asymptotic representation of \(n^{1/2}{\varvec{U}}_n({\varvec{\xi }}_0)\). We only check it for \({\varvec{U}}^{(1)}_n({\varvec{\xi }}_0) - {\varvec{U}}^{(2)}_n({\varvec{\xi }}_0)\). The calculation for \({\varvec{U}}^{(3)}_n({\varvec{\xi }}_0) - {\varvec{U}}^{(4)}_n({\varvec{\xi }}_0)\) is virtually identical and yields the same asymptotic representation.
It is easily seen that
$$\begin{aligned} n^{1/2}\left( {\varvec{U}}^{(1)}_n({\varvec{\xi }}_0) - {\varvec{u}}^{(1)}({\varvec{\xi }}_0)\right) = {\mathbb {G}}_n \{\varDelta _1\varDelta _2{{\dot{\beta }}}(X_1,X_2,W)\}, \end{aligned}$$
(17)
where \({\mathbb {G}}_n = n^{1/2}({\mathbb {P}}_n - P)\). We then focus on \(n^{1/2}\left( {\varvec{U}}^{(2)}_n({\varvec{\xi }}_0) - {\varvec{u}}^{(2)}({\varvec{\xi }}_0)\right) \), whose linearization differs vastly between a discrete covariate and a continuous covariate, largely because we can no longer rely on the Donsker theorem in the continuous covariate case when kernel functions are involved. Thus the two cases are treated separately in the proof.
C.1. Linearization of \(n^{1/2}\left( {\varvec{U}}^{(2)}_n({\varvec{\xi }}_0) - {\varvec{u}}^{(2)}({\varvec{\xi }}_0)\right) \) for a discrete covariate
Following similar calculation as in Hu et al. (2011), we can show that
$$\begin{aligned}&n^{1/2}\left\{ {\varvec{U}}^{(2)}_n({\varvec{\xi }}_0) - {\varvec{u}}^{(2)}({\varvec{\xi }}_0)\right\} \nonumber \\&\quad = {\mathbb {G}}_n \Bigg \{{{\tilde{h}}}^d_Q(\varDelta _1, X_1, X_2,W; {\varvec{\xi }}_0) + {{\tilde{h}}}^d_P(\varDelta _2, X_1, X_2,W; {\varvec{\xi }}_0)\nonumber \\&\qquad -\int \!\!\!\int I(X_1 \ge x_1^*, X_2 \ge x_2,W=w^*) r(\delta _1, x_1, x_2,w, \delta _2^*, x_1^*, x_2^*,w^*) \nonumber \\&\qquad dP(\delta _1^*, \delta _2^*,x_1^*,x_2^*,w^*)dQ(\delta _1,\delta _2, x_1,x_2,w) \Bigg \} +o_p(1), \end{aligned}$$
(18)
where
$$\begin{aligned}&r(\delta _1, x_1, x_2,w, \delta _2^*, x_1^*, x_2^*,w^*) \\&\qquad = \frac{ I(w=w^*)\delta _1^*\delta _2{\dot{\beta }}(x^*_1, x_2,w^*)I(x_1 \ge x^*_1)I(x_2 \le x^*_2) e^{\beta (x^*_1, x_2,w^*;\xi _0)}}{\{S(x_1^*,x_2,w^*)\}^2}. \end{aligned}$$
Then we obtain
$$\begin{aligned} n^{1/2}{\varvec{U}}_n({\varvec{\xi }}_0)= & {} 2{\mathbb {G}}_n \Bigg \{ \varDelta _1\varDelta _2 {{\dot{\beta }}}(X_1,X_2,W) - {{\tilde{h}}}^d_Q(\varDelta _1, X_1, X_2,W; {\varvec{\xi }}_0) \nonumber \\&\qquad -\, {{\tilde{h}}}^d_P(\varDelta _2, X_1, X_2,W; {\varvec{\xi }}_0) \nonumber \\&\qquad +\, \int \!\!\!\int I(X_1 \ge x_1^*, X_2 \ge x_2,W=w^*) \nonumber \\&\qquad \times \, r(\delta _1, x_1, x_2,w, \delta _2^*, x_1^*, x_2^*,w^*) \nonumber \\&\qquad dP(\delta _1^*, \delta _2^*,x_1^*,x_2^*,w^*)dQ(\delta _1,\delta _2, x_1,x_2,w) \Bigg \} + o_p(1) \nonumber \\&\rightarrow _d N({\varvec{0}}, {\varvec{\varSigma }}({\varvec{\xi }}_0)) . \end{aligned}$$
(19)
Thus from (16) we obtain the desired asymptotic distribution of \(n^{1/2}({\hat{{\varvec{\xi }}}}_n - {\varvec{\xi }}_0)\).
Let \(\mathbf{Z}({\varvec{\xi }}_0)\) denote the expression inside \(\{ \, \}\) in (19) for a generic data point. It is clear that each \(\mathbf{Z}_i({\varvec{\xi }}_0)\) is a function of the i-th observation, hence the \(\mathbf{Z}_i({\varvec{\xi }}_0)\)’s are i.i.d. Then under the regularity conditions we have the weak convergence in (19) with \({\varvec{\varSigma }}({\varvec{\xi }}_0) = 4 E \left\{ \mathbf{Z}({\varvec{\xi }}_0)^{\otimes 2}\right\} \). We estimate the covariance matrix of \(\mathbf{Z}({\varvec{\xi }}_0)\) by its sample covariance matrix with \(\tilde{h}^d_Q(\varDelta _1, X_1, X_2,W; {\varvec{\xi }}_0)\), \({{\tilde{h}}}^d_P(\varDelta _2, X_1, X_2,W; {\varvec{\xi }}_0)\) and the double integral substituted by their sample averages, and \({\varvec{\xi }}_0\) replaced by \({\hat{{\varvec{\xi }}}}_n\). After the approximation/substitution, the quantities are no longer i.i.d. However, it can be shown that \(\mathbf{Z}({\varvec{\xi }}_0)\) and its sample approximation belong to a Glivenko–Cantelli class of functions, which leads to an asymptotically valid covariance estimator. Our simulations have shown that the empirical variance and the variance estimates are very close.
C.2. Linearization of \(n^{1/2}\left( {\varvec{U}}^{(2)}_n({\varvec{\xi }}_0) - {\varvec{u}}^{(2)}({\varvec{\xi }}_0)\right) \) for a continuous covariate
We focus on \(n^{1/2}\left( {\varvec{U}}^{(2)}_n({\varvec{\xi }}_0) - {\varvec{u}}^{(2)}({\varvec{\xi }}_0)\right) \) with the following decomposition:
$$\begin{aligned}&n^{1/2} \left( {\varvec{U}}^{(2)}_n({\varvec{\xi }}_0) - {\varvec{u}}^{(2)}({\varvec{\xi }}_0)\right) \nonumber \\&\quad = n^{1/2} \frac{1}{n^2}\sum _{i=1}^n\sum _{j=1}^n\left( g^{(n)}(V_i,V_j;{\varvec{\xi }}_0)-g_h^{(n)}(V_i,V_j;{\varvec{\xi }}_0)\right) \\&\qquad +\,n^{1/2}\frac{1}{n^2}\sum _{i=1}^n\sum _{j=1}^n\left( g_h^{(n)}(V_i,V_j;{\varvec{\xi }}_0)-{\tilde{g}}^{(n)}(V_i,V_j;{\varvec{\xi }}_0)\right) \\&\qquad +\,\frac{1}{n^{1/2}}\sum _{i=1}^n\left( \frac{1}{n}\sum _{j=1}^n {\tilde{g}}^{(n)}(V_i,V_j;{\varvec{\xi }}_0)-{\tilde{h}}(V_i;{\varvec{\xi }}_0)\right) \\&\qquad +\,\frac{1}{n^{1/2}}\sum _{i=1}^n\left( {\tilde{h}}(V_i;{\varvec{\xi }}_0)- {\varvec{u}}^{(2)}({\varvec{\xi }}_0)\right) \\&\quad =-A-B+C+D. \end{aligned}$$
Now we will look at the four terms separately. Firstly, term D is a sum of i.i.d. terms, and thus \(D={\mathbb {G}}_n\left( {\tilde{h}}(V,{\varvec{\xi }}_0)\right) \).
Secondly, term C can be decomposed as follows:
$$\begin{aligned} C= & {} \frac{1}{n^{1/2}}\sum _{i=1}^n\left( \frac{1}{n}\sum _{j=1}^n {\tilde{g}}^{(n)}(V_i,V_j;{\varvec{\xi }}_0)-{\tilde{h}}(V_i,{\varvec{\xi }}_0)\right) \\= & {} {\mathbb {G}}_n\left( {\mathbb {P}}_n^*{\tilde{g}}^{(n)}(V,V^*,{\varvec{\xi }}_0)-P^*{\tilde{g}}^{(n)}(V,V^*,{\varvec{\xi }}_0)\right) \\&+\,{\mathbb {G}}_n\left( P^*{\tilde{g}}^{(n)}(V,V^*,{\varvec{\xi }}_0)-{\tilde{h}}^*(V,{\varvec{\xi }}_0)\right) \\&+\,{\mathbb {G}}_n\left( {\tilde{h}}^*(V,{\varvec{\xi }}_0)\right) +n^{1/2}{\mathbb {P}}_n^*\left( P{\tilde{g}}^{(n)}(V,V^*,{\varvec{\xi }}_0)-{\tilde{h}}(V^*,{\varvec{\xi }}_0)\right) \\= & {} C_1+C_2+C_3+C_4. \end{aligned}$$
For the last equality of the above equation, we want to show that \(C_1=o_p(1)\), \(C_2=o_p(1)\) and \(C_4=o_p(1)\), so that \(C=C_3+o_p(1)\). First, by Lemma A.2 of Ichimura (1993), \(P{\tilde{g}}^{(n)}(V,V^*,{\varvec{\xi }}_0)-{\tilde{h}}(V^*,{\varvec{\xi }}_0)=O(h^2)\). Thus \(C_4=n^{1/2}O(h^2)=o_p(1)\) for h satisfying C8. Likewise, \(P^*{\tilde{g}}^{(n)}(V,V^*,{\varvec{\xi }}_0)-{\tilde{h}}^*(V,{\varvec{\xi }}_0)=O(h^2)\), and therefore \(C_2=n^{1/2}O(h^2)=o_p(1)\) for h satisfying C8. Finally, we need to show that \(C_1={\mathbb {G}}_n\left( ({\mathbb {P}}_n^*-P^*){\mathbf {K}}_h(W^*-W)\frac{b(V^*,V;{\varvec{\xi }}_0)}{S(X_1^*,X_2,W^*;{\varvec{\xi }}_0)}\right) =o_p(1)\).
First, set
$$\begin{aligned} r_h(V,V^*)= & {} {\mathbf {K}}\left( \frac{W^*-W}{h}\right) \frac{b(V^*,V;{\varvec{\xi }}_0)}{S(X_1^*,X_2,W^*;{\varvec{\xi }}_0)}, \\ {\tilde{r}}_{h}(V,V^*)= & {} r_h(V,V^*)-Pr_h(V,V^*)-P^*r_h(V,V^*)+PP^*r_h(V,V^*), \end{aligned}$$
and
$$\begin{aligned} T_n({\tilde{r}}_{h})=\sum _{1\le i \ne j\le n} {\tilde{r}}_{h}(V_i,V_j). \end{aligned}$$
Then we have
$$\begin{aligned} C_1= & {} h^{-1}{\mathbb {G}}_n\left( ({\mathbb {P}}_n^*-P^*)r_h(V,V^*)\right) \\= & {} h^{-1}\sqrt{n}\left( ({\mathbb {P}}_n-P)({\mathbb {P}}_n^*-P^*)r_h(V,V^*)\right) \\= & {} h^{-1}\sqrt{n}\frac{1}{n^2}\left( T_n({\tilde{r}}_{h})+\sum _{i=1}^n{\tilde{r}}_{h}(V_i,V_i)\right) \\= & {} \frac{1}{\sqrt{n}hn}T_n({\tilde{r}}_{h})+\frac{1}{nh\sqrt{n}}\sum _{i=1}^n{\tilde{r}}_{h}(V_i,V_i)\\= & {} C_{11}+C_{12}. \end{aligned}$$
Applying the central limit theorem, it is easy to see that \(C_{12}=o_p(1)\). To show that \(C_{11}=o_p(1)\), we need the following definition and theorem from Nolan and Pollard (1987). We keep the same numbering for the definition and theorem as in the original paper for the ease of reference.
Definition 8. Call a class of functions \({\mathcal {F}}\) Euclidean for the envelope \(F\) if there exist constants \(A\) and \(V\) such that
$$\begin{aligned} N_1(\epsilon , Q, {\mathcal {F}}, F)\le A\epsilon ^{-V}, \quad \text { for }0<\epsilon \le 1, \end{aligned}$$
whenever \(0<QF<\infty \), where \(N_1\) denotes the covering number with \(L^1\) norm.
Theorem 9. Let \({\mathcal {F}}\) be a Euclidean class of P-degenerate functions with envelope 1. Let \(W(n, x)\) be a bounded weight function that is decreasing in both arguments and satisfies
$$\begin{aligned} \sum _{n=1}^\infty \int _0^1n^{-1}W(n,x)(1+\log (1/x))dx<\infty . \end{aligned}$$
If \(v(\cdot )\) is a function on \({\mathcal {F}}\) for which \(v(f)\ge \sup _xP|f(x,\cdot )|\), then
$$\begin{aligned} n^{-1}||W(n,v(f)^{1/2})T_n(f)||\rightarrow 0. \end{aligned}$$
In our case, each \({\tilde{r}}_{h}\) is P-degenerate; that is, \(P{\tilde{r}}_{h}(V,\cdot )=0\). The class of all \({\tilde{r}}_{h}\) is a candidate for the above theorem. Following Nolan and Pollard (1987) page 795, it is easy to check that there exists a constant C for which
$$\begin{aligned} \sup _{x,y,h}|{\tilde{r}}_{h}(x,y)|\le C \text{ and } \sup _{x}P^*|{\tilde{r}}_{h}(x,\cdot )|\le C(1\wedge h) \end{aligned}$$
for all \(h>0\). We can rescale to make C equal to 1.
If kernel \({\mathbf {K}}\) is of bounded variation, e.g. standard normal density, then \(\{{\tilde{r}}_{h}\}\) is a Euclidean class. For details of establishing Euclidean property in a particular class, please refer to Sect. 5 of Nolan and Pollard (1987).
Invoking Theorem 9 of Nolan and Pollard (1987), we obtain
$$\begin{aligned} n^{-1}||W(n,v(f)^{1/2})T_n(f)||=o_p(1), \end{aligned}$$
where \(v({\tilde{r}}_{h})=1\wedge h\) and \(W(n,x)=(1+nx^{10})^{-1}\). Since W is bounded by 1 and
$$\begin{aligned} \int _0^1W(n,x)(1+\log (1/x))dx=O(n^{-1/10}\log n), \end{aligned}$$
the conditions of Theorem 9 are satisfied.
Returning to the calculation for \(C_{11}\),
$$\begin{aligned} C_{11}= & {} \frac{1}{\sqrt{n}hn}T_n({\tilde{r}}_{h})\\\le & {} \frac{1}{\sqrt{n}hW(n,v(f)^{1/2})}||n^{-1}W(n,v(f)^{1/2})T_n({\tilde{r}}_{h})||\\= & {} \frac{1+n(1\wedge h)^{5}}{\sqrt{n}h}o_p(1)\\\le & {} \frac{1+nh^{5}}{\sqrt{n}h}o_p(1)\\= & {} o_p(1)+\sqrt{n}h^{4}o_p(1). \end{aligned}$$
Thus \(C_{11}=o_p(1)\) for h satisfying C8. Then we obtain \(C_1=C_{11}+C_{12}=o_p(1)\) and thus \(C={\mathbb {G}}_n\left( {\tilde{h}}^*(V,{\varvec{\xi }}_0)\right) +o_p(1)\).
Thirdly, we want to show B is \(o_p(1)\) and hence negligible. Now
$$\begin{aligned} B= & {} n^{\frac{1}{2}}n^{-2}\sum _{i=1}^n\sum _{j=1}^n \frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }}_0)}{S(X_{1i},X_{2j},W_i)S_h(X_{1i},X_{2j},W_i)} \\&\times \left( S_h(X_{1i},X_{2j},W_i) - S(X_{1i},X_{2j},W_i)\right) \\= & {} n^{\frac{1}{2}}n^{-2}\sum _{i=1}^n\sum _{j=1}^n \frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }}_0)}{S(X_{1i},X_{2j},W_i)S_h(X_{1i},X_{2j},W_i)}O(h^2). \end{aligned}$$
The inner summation divided by n is bounded by the density of W at \(W_i\) times \(O(h^2)\), which is seen from the following:
$$\begin{aligned}&n^{-1}\sum _{j=1}^n\frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }}_0)}{S(X_{1i},X_{2j},W_i)S_h(X_{1i},X_{2j},W_i)}O(h^2)\\&\qquad =n^{-1}\sum _{j=1}^n\frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }}_0)}{S(X_{1i},X_{2j},W_i)(S(X_{1i},X_{2j},W_i)+o(1))}O(h^2)\\&\qquad \lesssim O(h^2)n^{-1}\sum _{j=1}^n{\mathbf {K}}_h(W_j-W_i)\\&\qquad \approx f(W_i)O(h^2)\\&\qquad =O(h^2) \end{aligned}$$
where “\(\lesssim \)” denotes “less than up to some constant coefficient”. Therefore, we have \(B=n^{1/2}O(h^2)=o_p(1)\) for h satisfying C8.
Lastly, term A can be decomposed as
$$\begin{aligned} A= & {} n^{\frac{1}{2}} n^{-2}\sum _{i=1}^n\sum _{j=1}^n \frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }}_0)}{v(V_i,V_j;{\varvec{\xi }}_0)S_h(X_{1i},X_{2j},W_i)} \\&\times \, \left( v(V_i,V_j;{\varvec{\xi }}_0) - S_h(X_{1i},X_{2j},W_i)\right) \\= & {} n^{\frac{1}{2}} n^{-2}\sum _{i=1}^n\sum _{j=1}^n \frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }}_0)}{v(V_i,V_j;{\varvec{\xi }}_0)S_h(X_{1i},X_{2j},W_i)} \\&\times \, \left( \frac{1}{n}N(X_{1i},X_{2j},W_i) - S_h(X_{1i},X_{2j},W_i)\right) \\&+\, n^{-2}\sum _{i=1}^n\sum _{j=1}^n \frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }}_0)}{v(V_i,V_j;{\varvec{\xi }}_0)S_h(X_{1i},X_{2j},W_i)}\frac{{\mathbf {K}}_h(0)}{n^{\frac{1}{2}}} \\&\times \, I(X_{2j}\le X_{2i})(1-\theta (X_{1i},X_{2j},W_i;{\varvec{\xi }}_0))\\= & {} n^{\frac{1}{2}}n^{-2}\sum _{i=1}^n\sum _{j=1}^n \frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }}_0)}{S_h(X_{1i},X_{2j},W_i)^2}\\&\times \, \left( \frac{1}{n}N(X_{1i},X_{2j},W_i) - S_h(X_{1i},X_{2j},W_i)\right) + o_p(1)\\= & {} n^{\frac{1}{2}} n^{-3}\sum _{k=1}^n\sum _{i=1}^n\sum _{j=1}^n \frac{{\mathbf {K}}_h(W_j-W_i)b(V_i,V_j;{\varvec{\xi }}_0)}{S_h(X_{1i},X_{2j},W_i)^2}\\&\times \, \left( I(X_{1k}\ge X_{1i},X_{2k} \ge X_{2j}){\mathbf {K}}_h(W_k-W_i)-S_h(X_{1i},X_{2j},W_i)\right) \\&+\, o_p(1)\\= & {} n^{\frac{1}{2}} \Bigg ({\mathbb {P}}_n^\dagger {\mathbb {P}}_n^*{\mathbf {K}}_h(W^\dagger -W^*){\mathbb {P}}_n \frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&-\,P^\dagger {\mathbb {P}}_n^*{\mathbf {K}}_h(W^\dagger -W^*){\mathbb {P}}_n \frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&+\,P^\dagger {\mathbb {P}}_n^*{\mathbf {K}}_h(W^\dagger -W^*){\mathbb {P}}_n \frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&-\, 
{\mathbb {P}}_n^*{\mathbb {P}}_n\frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)}\Bigg )+ o_p(1) \\= & {} {\mathbb {G}}^\dagger _n \Big ( {\mathbb {P}}_n^*{\mathbf {K}}_h(W^*-W^\dagger ){\mathbb {P}}_n{\mathbf {K}}_h(W-W^*)\frac{b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2) \Big )\\&+\, n^{\frac{1}{2}}P^\dagger {\mathbb {P}}_n^*{\mathbf {K}}_h(W^\dagger -W^*){\mathbb {P}}_n \frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&-\, n^{\frac{1}{2}}{\mathbb {P}}_n^*{\mathbb {P}}_n\frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)}+ o_p(1)\\= & {} A_1+A_2-A_3+o_p(1). \end{aligned}$$
Term \(A_1\) can be further decomposed as
$$\begin{aligned} A_1= & {} {\mathbb {G}}^\dagger _n \Big ( {\mathbb {P}}_n^*{\mathbf {K}}_h(W^*-W^\dagger ){\mathbb {P}}_n{\mathbf {K}}_h(W-W^*)\frac{b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\quad \times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2) \Big )\\= & {} {\mathbb {G}}^\dagger _n \Big ( {\mathbb {P}}_n^*{\mathbf {K}}_h(W^*-W^\dagger ){\mathbb {P}}_n{\mathbf {K}}_h(W-W^*)\frac{b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\quad \times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&\quad -\,P^*{\mathbf {K}}_h(W^*-W^\dagger )P{\mathbf {K}}_h(W-W^*)\frac{b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\quad \times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2) \Big )\\&\quad +\, {\mathbb {G}}^\dagger _n \Big ( P^*{\mathbf {K}}_h(W^*-W^\dagger )P{\mathbf {K}}_h(W-W^*)\frac{b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2} \\&\quad \times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&\quad -\,E_{V^*|W^*=W^\dagger }E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2} \\&\quad \times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)f(W^\dagger ) \Big )\\&\quad +\, {\mathbb {G}}^\dagger _n \Big (E_{V^*|W^*=W^\dagger }E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2} \\&\quad \times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)f(W^\dagger ) \Big )\\= & {} A_{11}+ A_{12}+A_{13}. \end{aligned}$$
We will show that \(A_{12}=o_p(1)\) and \(A_{11}=o_p(1)\) separately. First of all,
$$\begin{aligned} A_{12}= & {} {\mathbb {G}}^\dagger _n \Big ( P^*{\mathbf {K}}_h(W^*-W^\dagger )P{\mathbf {K}}_h(W-W^*)\frac{b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2} \\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&-\,E_{V^*|W^*=W^\dagger }E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2}\\&\times I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)f(W^\dagger ) \Big )\\= & {} {\mathbb {G}}^\dagger _n \Big (P^*{\mathbf {K}}_h(W^*-W^\dagger )P{\mathbf {K}}_h(W-W^*)\frac{b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&-\, P^*{\mathbf {K}}_h(W^*-W^\dagger )E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2}\\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&+\, P^*{\mathbf {K}}_h(W^*-W^\dagger )E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2} \\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&-\,E_{V^*|W^*=W^\dagger }E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2}\\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)f(W^\dagger ) \Big )\\= & {} {\mathbb {G}}^\dagger _n \Big (P^*{\mathbf {K}}_h(W^*-W^\dagger )\Big \{P{\mathbf {K}}_h(W-W^*)\frac{b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2} \\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&-\,E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2}I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\Big \}\\&+\, P^*{\mathbf {K}}_h(W^*-W^\dagger )E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2} \\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&-\,E_{V^*|W^*=W^\dagger }E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2} \\&I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)f(W^\dagger ) \Big ). \end{aligned}$$
Note that by Lemma A.2 of Ichimura (1993),
$$\begin{aligned}&P{\mathbf {K}}_h(W-W^*)\frac{b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&\quad = E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2}I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)+O(h^2), \end{aligned}$$
and
$$\begin{aligned}&P^*{\mathbf {K}}_h(W^*-W^\dagger )E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2}I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\&\quad = E_{V^*|W^*=W^\dagger }E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2}I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)f(W^\dagger )\\&\qquad +\,O(h^2). \end{aligned}$$
So term \(A_{12}=n^{1/2}O(h^2)=o_p(1)\) for h satisfying C8.
To show that term \(A_{11}=o_p(1)\), first, for fixed \(V^\dagger \), set
$$\begin{aligned}&m_{h}(V,V^*,V^\dagger )\\&\quad = h^{-1}{\mathbf {K}}\left( \frac{W^*-W^\dagger }{h}\right) {\mathbf {K}}\left( \frac{W-W^*}{h}\right) \frac{b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\qquad \times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2),\\&{\tilde{m}}_{h}(V,V^*,V^\dagger )\\&\quad =m_{h}(V,V^*,V^\dagger )-Pm_{h}(V,V^*,V^\dagger )-P^*m_{h}(V,V^*,V^\dagger ) \\&\qquad +\,PP^*m_{h}(V,V^*,V^\dagger ). \end{aligned}$$
Then term \(A_{11}\) can be decomposed into:
$$\begin{aligned}&{\mathbb {G}}^\dagger _n(h^{-1}{\mathbb {P}}_n^*{\mathbb {P}}_nm_h-h^{-1}{P}^*{P}m_h)\\&\quad = {\mathbb {G}}^\dagger _n\left( h^{-1}{\mathbb {P}}_n^*{\mathbb {P}}_n{\tilde{m}}_h+({\mathbb {P}}_n^*-{P}^*)h^{-1}{P}m_h+({\mathbb {P}}_n-{P})h^{-1}{P^*}m_h\right) . \end{aligned}$$
Note that \({\mathbb {P}}_n^*{\mathbb {P}}_n{\tilde{m}}_h\) is again a U-process. Using a proof similar to the one that shows \(C_1=o_p(1)\), we have \({\mathbb {G}}^\dagger _n\left( h^{-1}{\mathbb {P}}_n^*{\mathbb {P}}_n{\tilde{m}}_h\right) =o_p(1)\), \({\mathbb {G}}^\dagger _n({\mathbb {P}}_n^*-{P}^*)h^{-1}{P}m_h=o_p(1)\), and \({\mathbb {G}}^\dagger _n({\mathbb {P}}_n-{P})h^{-1}{P^*}m_h=o_p(1)\) for h satisfying C8. Thus,
$$\begin{aligned} A_1= & {} A_{13}+o_p(1)\\= & {} {\mathbb {G}}^\dagger _n \Big (E_{V^*|W^*=W^\dagger }E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2} \\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)f(W^\dagger ) \Big ) +o_p(1). \end{aligned}$$
Now focusing on \(X_1^\dagger ,X_2^\dagger ,W^\dagger \) and their probability measure \(P^\dagger \), we have
$$\begin{aligned} A_2= & {} P^\dagger {\mathbb {P}}_n^*{\mathbf {K}}_h(W^\dagger -W^*){\mathbb {P}}_n \frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)\\= & {} {\mathbb {P}}_n^*{\mathbb {P}}_n \frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}\times (S_h(X_1^*,X_2,W^*)+O(h^2))\\= & {} {\mathbb {P}}_n^*{\mathbb {P}}_n \frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)} \\&+\,{\mathbb {P}}_n^*{\mathbb {P}}_n \frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)^2}O(h^2)\\= & {} {\mathbb {P}}_n^*{\mathbb {P}}_n \frac{{\mathbf {K}}_h(W-W^*)b(V^*,V;{\varvec{\xi }}_0)}{S_h(X_1^*,X_2,W^*)}+O_p(1)O(h^2)\\= & {} A_3+o_p(1). \end{aligned}$$
Putting everything together, we obtain
$$\begin{aligned} n^{1/2}{\varvec{U}}_n({\varvec{\xi }}_0)= & {} n^{1/2}\{{\varvec{U}}_n({\varvec{\xi }}_0) - {\varvec{u}}({\varvec{\xi }}_0) \} \nonumber \\= & {} n^{1/2}\{{\varvec{U}}_n^{(1)}({\varvec{\xi }}_0) - {\varvec{u}}^{(1)}({\varvec{\xi }}_0) \} - n^{1/2}\{{\varvec{U}}_n^{(2)}({\varvec{\xi }}_0) - {\varvec{u}}^{(2)}({\varvec{\xi }}_0) \} \nonumber \\&+\, n^{1/2}\{{\varvec{U}}_n^{(3)}({\varvec{\xi }}_0) - {\varvec{u}}^{(3)}({\varvec{\xi }}_0) \} - n^{1/2}\{{\varvec{U}}_n^{(4)}({\varvec{\xi }}_0) - {\varvec{u}}^{(4)}({\varvec{\xi }}_0) \} \nonumber \\= & {} 2{\mathbb {G}}_n \Bigg \{ \varDelta _1\varDelta _2 {\dot{\beta }}(V) - {{\tilde{h}}}^\star (V;{\varvec{\xi }}_0) - {{\tilde{h}}}(V;{\varvec{\xi }}_0) \nonumber \\&+\, E_{V^*|W^*=W^\dagger }E_{V|W=W^*}\frac{b(V^*,V;{\varvec{\xi }}_0)f(W^*)}{S_h(X_1^*,X_2,W^*)^2} \nonumber \\&\times \, I(X_1^\dagger \ge X_1^*,X_2^\dagger \ge X_2)f(W^\dagger )\Bigg \} + o_p(1) \nonumber \\&\rightarrow _d N({\varvec{0}}, {\varvec{\varSigma }}({\varvec{\xi }}_0)) . \end{aligned}$$
(20)
Thus, from (16), we obtain the desired asymptotic distribution of \(n^{1/2}({\hat{{\varvec{\xi }}}}_n - {\varvec{\xi }}_0)\). The estimator of \({\varvec{\varSigma }}({\varvec{\xi }}_0)\) can be obtained similarly to the case of a discrete covariate, with the conditional expectations evaluated using kernel smoothing.