# Simulation program to generate sequence-based data with dichotomous traits ########################################################################################## # If you use this program to generate your simulation data, please cite the following two references: # 1. Lin W-Y, Lou XY, Gao G, Liu N (2014). Rare Variant Association Testing by Adaptive Combination of P-values. PLoS ONE, 9: e85728. [PMID: 24454922]. # 2. Lin W-Y (2014). Association Testing of Clustered Rare Causal Variants in Case-Control Studies. PLoS ONE, 9: e94337. [PMID: 24736372]. ########################################################################################## # Developer: Wan-Yu Lin, Institute of Epidemiology and Preventive Medicine, National Taiwan University # Email: linwy@ntu.edu.tw # Copyright 2014 ########################################################################################## rm(list=ls()) n.CS <- 500 # no. of cases n.CN <- 500 # no. of controls PAR <- 3/1000 n.Cau <- 20 # no. of causal variants n.r <- c(1,4,10,16,20) # no. of risk variants # You can revise this according to your simulation setting for the no. of risk variants f0 <- 0.01 # baseline penetrance aa <- as.matrix(read.table("out.hap-1",header=F)) # read the data from the Cosi program haplo.all12 <- aa[,2:ncol(aa)] n.haplo <- nrow(haplo.all12) haplo.all <- (haplo.all12-1) MAF <- c() for(i in 1:ncol(haplo.all)){ MAF[i] <- mean(haplo.all[,i]) if(MAF[i]>0.5){ haplo.all[,i] <- (1-haplo.all[,i]) MAF[i] <- mean(haplo.all[,i]) } } potential.DSL.pre <- which(MAF<=0.01 & MAF>=0.001) # We randomly specified 25% of the variants with population MAF < 1% to be causal variants. potential.DSL <- sort(sample(potential.DSL.pre, floor(length(potential.DSL.pre)*0.25))) population <- haplo.all keep1 <- seq(1,length(MAF),1) ind <- round(length(keep1)/100) DSL <- potential.DSL[(potential.DSL>ind)][1:n.Cau] keep.snp1 <- c(ind:DSL[n.Cau]) keep.snp <- sort(keep.snp1) population.keep <- population[,keep.snp] DSL.in <- c() for(dd in 1:length(DSL)){ DSL.in[dd] <- which(keep.snp==DSL[dd]) } Y <- append(rep(0,n.CN),rep(1,n.CS)) nr.loop <- 5 # You can revise this according to your simulation setting for the no. of risk variants #for(nr.loop in 1:length(n.r)){ ran <- runif(n.Cau,0,1) rank.ran <- rank(ran) pro.pow <- ((rank.ran<=n.r[nr.loop])-0.5)*2 GRRp <- ((PAR/((1-PAR)*MAF[DSL]))+1)**pro.pow # select samples jc <- 0 jd <- 0 Copy.se <- matrix(NA,length(Y),ncol(population.keep)) while(((jc1){ prob.aff <- 0.999 } affected <- sample(c(0,1),1,c(1-prob.aff,prob.aff),replace=T) if((jc<=(n.CN)) && (affected==0)){ jc <- (jc+1) Copy.se[jc,] <- col.sum } if((jd<=(n.CS)) && (affected==1)){ jd <- (jd+1) Copy.se[n.CN+jd,] <- col.sum } } Copy <- Copy.se write.table(cbind(Y,Copy),'ADAfile',sep=' ',row.names=FALSE,col.names=FALSE,quote=FALSE,na='x',append=FALSE) #} # 'ADAfile' is the data file, in which the first column is the disease status (1: case; 0: control), # and column 2 to the last column are the numbers of minor alleles.