Supplementary Table and Figures

Example of sequential variable selection by CID/pCID

This is an example to illustrate the situation one might encountered when selecting variables using CID. Let \(Y\) be a one-dimensional target variable and \(X_i\)’s (\(i = 1, 2,\dots, 6\)) be the one-dimensional candidate explanatory variables identically and independently distributed as Uniform(0, 1). In fact, \[Y=10\sin(\pi X_1X_2)+30(X_3-0.5)^2+10X_4+5X_5+\epsilon,\] where \(\varepsilon\) is the random disturbance distributed as normal with zero mean and unit variance. Note that the explanatory variable \(X_6\) is independent of the target variable \(Y\) according to the model.

rm(list=ls())

##########################################
# data import
##########################################
x = read.csv("http://homepage.ntu.edu.tw/~lyliu/pCID/data_x_demo.csv",colClasses="numeric")
y = read.csv("http://homepage.ntu.edu.tw/~lyliu/pCID/data_y_demo.csv",colClasses="numeric")

##########################################
# CID functions
##########################################
library(np)
cid = function(x.mat,y){
      if (is.null(dim(x.mat))) x.mat = as.matrix(x.mat)
      bwx = npcdistbw(xdat=x.mat, ydat=y, bwmethod="normal-reference")
      bwy = npudistbw(dat=y, bwmethod="normal-reference")
      disty=npudist(bwy)$dist     # estimate F(Y)

      n=nrow(x.mat)
      num=0
      for(k in 1:n){ 
         ex.mat = data.frame(x.mat[k,],y,row.names=rownames(y))
         testfhat=fitted(npcdist(bws=bwx,txdat=x.mat,tydat=y,exdat=ex.mat[,1:ncol(x.mat)],eydat=ex.mat[,ncol(ex.mat)])) # estimate F(Y|X=xi) 
         expy=sum((testfhat-disty)^2)/n    # sum_Y((F(Y|X=xi)-F(Y))^2)/N
         num=num+expy                      # sum_X(sum_Y((F(Y|X=xi)-F(Y))^2)/N)
      }

      den=sum(disty*(1-disty))       # denominator: sum_Y(F(Y)(1-F(Y)))
      return(num/den)
}

cid.perm = function(x.mat,y,perm.column=1){
      if (is.null(dim(x.mat))) x.mat = as.matrix(x.mat)
      return(cid(data.frame(x.mat[sample(1:nrow(y)),1],x.mat[,-1]),y))
}

Sequential variable selection for this example takes about 20 min on Mac OS X 10.9.4, 1.8 GHz Intel Core i5, 4GB 1600 MHz RAM

t1 = proc.time()
pcid = cid1v = apply(x,2,cid,y=y)
cid1v.null = replicate(100,apply(x,2,cid.perm,y=y))
pcid.pval = cid1v.pval = apply(apply(cbind(cid1v,cid1v.null),2,function(x){x>=cid1v}),1,mean)
t2 = proc.time()

i = 0

sel.var = sel.pcid = sel.pcid.pval = c()
while(any(pcid.pval <= 0.05) & length(sel.var) < (ncol(x)-1)){
   i = i+1
   sig.var = which(pcid.pval <= 0.05)
   ii = which.max(cid1v[sig.var])
   sel.var = c(sel.var,names(sig.var)[ii])
   sel.pcid = c(sel.pcid,pcid[ii])
   sel.pcid.pval = c(sel.pcid.pval,pcid.pval[ii])
   cat("Step ",i,". Select ",tail(sel.var,1L),ifelse(i==1," (CID = "," (pCID = "),round(tail(sel.pcid,1L),4),"; p-value = ",round(tail(sel.pcid.pval,1L),4),
       ") in ",as.numeric(t2-t1)[3]," sec.\n",sep="")
   
   t1 = t2
   given.id = which(names(x) %in% sel.var)
   
   if(length(given.id)==(ncol(x)-1)) {
      cid2v = cid(data.frame(x[,-given.id],x[,given.id]),y)
      cid2v.null = replicate(100,cid.perm(data.frame(x[,-given.id],x[,given.id]),y))
      pcid = (cid2v-cid1v[ii])/(1-cid1v[ii])
      pcid.null = (cid2v.null-cid1v[ii])/(1-cid1v[ii])
      pcid.pval = mean(c(pcid,pcid.null) >= pcid)
   }
   else {
      cid2v = apply(x[,-given.id],2,function(x,t,y){cid(data.frame(x,t),y)},t=x[,given.id],y=y)
      cid2v.null = replicate(100,apply(x[,-given.id],2,function(x,t,y){cid.perm(data.frame(x,t),y)},t=x[,given.id],y=y))
      pcid = (cid2v-cid1v[ii])/(1-cid1v[ii])
      pcid.null = (cid2v.null-cid1v[ii])/(1-cid1v[ii])
      pcid.pval = apply(apply(cbind(pcid,pcid.null),2,function(x){x>=pcid}),1,mean)
   }
   cid1v = cid2v
   t2 = proc.time()

   #print(sel.var)
}

This ‘last check’ is to ensure all significant variables according to CID/pCID are included.