This is an example to illustrate the situation one might encountered when selecting variables using CID. Let \(Y\) be a one-dimensional target variable and \(X_i\)’s (\(i = 1, 2,\dots, 6\)) be the one-dimensional candidate explanatory variables identically and independently distributed as Uniform(0, 1). In fact, \[Y=10\sin(\pi X_1X_2)+30(X_3-0.5)^2+10X_4+5X_5+\epsilon,\] where \(\varepsilon\) is the random disturbance distributed as normal with zero mean and unit variance. Note that the explanatory variable \(X_6\) is independent of the target variable \(Y\) according to the model.
rm(list=ls())
##########################################
# data import
##########################################
x = read.csv("http://homepage.ntu.edu.tw/~lyliu/pCID/data_x_demo.csv",colClasses="numeric")
y = read.csv("http://homepage.ntu.edu.tw/~lyliu/pCID/data_y_demo.csv",colClasses="numeric")
##########################################
# CID functions
##########################################
library(np)
cid = function(x.mat,y){
if (is.null(dim(x.mat))) x.mat = as.matrix(x.mat)
bwx = npcdistbw(xdat=x.mat, ydat=y, bwmethod="normal-reference")
bwy = npudistbw(dat=y, bwmethod="normal-reference")
disty=npudist(bwy)$dist # estimate F(Y)
n=nrow(x.mat)
num=0
for(k in 1:n){
ex.mat = data.frame(x.mat[k,],y,row.names=rownames(y))
testfhat=fitted(npcdist(bws=bwx,txdat=x.mat,tydat=y,exdat=ex.mat[,1:ncol(x.mat)],eydat=ex.mat[,ncol(ex.mat)])) # estimate F(Y|X=xi)
expy=sum((testfhat-disty)^2)/n # sum_Y((F(Y|X=xi)-F(Y))^2)/N
num=num+expy # sum_X(sum_Y((F(Y|X=xi)-F(Y))^2)/N)
}
den=sum(disty*(1-disty)) # denominator: sum_Y(F(Y)(1-F(Y)))
return(num/den)
}
cid.perm = function(x.mat,y,perm.column=1){
if (is.null(dim(x.mat))) x.mat = as.matrix(x.mat)
return(cid(data.frame(x.mat[sample(1:nrow(y)),1],x.mat[,-1]),y))
}
Sequential variable selection for this example takes about 20 min on Mac OS X 10.9.4, 1.8 GHz Intel Core i5, 4GB 1600 MHz RAM
t1 = proc.time()
pcid = cid1v = apply(x,2,cid,y=y)
cid1v.null = replicate(100,apply(x,2,cid.perm,y=y))
pcid.pval = cid1v.pval = apply(apply(cbind(cid1v,cid1v.null),2,function(x){x>=cid1v}),1,mean)
t2 = proc.time()
i = 0
sel.var = sel.pcid = sel.pcid.pval = c()
while(any(pcid.pval <= 0.05) & length(sel.var) < (ncol(x)-1)){
i = i+1
sig.var = which(pcid.pval <= 0.05)
ii = which.max(cid1v[sig.var])
sel.var = c(sel.var,names(sig.var)[ii])
sel.pcid = c(sel.pcid,pcid[ii])
sel.pcid.pval = c(sel.pcid.pval,pcid.pval[ii])
cat("Step ",i,". Select ",tail(sel.var,1L),ifelse(i==1," (CID = "," (pCID = "),round(tail(sel.pcid,1L),4),"; p-value = ",round(tail(sel.pcid.pval,1L),4),
") in ",as.numeric(t2-t1)[3]," sec.\n",sep="")
t1 = t2
given.id = which(names(x) %in% sel.var)
if(length(given.id)==(ncol(x)-1)) {
cid2v = cid(data.frame(x[,-given.id],x[,given.id]),y)
cid2v.null = replicate(100,cid.perm(data.frame(x[,-given.id],x[,given.id]),y))
pcid = (cid2v-cid1v[ii])/(1-cid1v[ii])
pcid.null = (cid2v.null-cid1v[ii])/(1-cid1v[ii])
pcid.pval = mean(c(pcid,pcid.null) >= pcid)
}
else {
cid2v = apply(x[,-given.id],2,function(x,t,y){cid(data.frame(x,t),y)},t=x[,given.id],y=y)
cid2v.null = replicate(100,apply(x[,-given.id],2,function(x,t,y){cid.perm(data.frame(x,t),y)},t=x[,given.id],y=y))
pcid = (cid2v-cid1v[ii])/(1-cid1v[ii])
pcid.null = (cid2v.null-cid1v[ii])/(1-cid1v[ii])
pcid.pval = apply(apply(cbind(pcid,pcid.null),2,function(x){x>=pcid}),1,mean)
}
cid1v = cid2v
t2 = proc.time()
#print(sel.var)
}
This ‘last check’ is to ensure all significant variables according to CID/pCID are included.