# Simulation program to generate sequence-based data with dichotomous traits
##########################################################################################
# If you use this program to generate your simulation data, please cite the following two references: 
# 1. Lin W-Y, Lou XY, Gao G, Liu N (2014). Rare Variant Association Testing by Adaptive Combination of P-values. PLoS ONE, 9: e85728. [PMID: 24454922]. 
# 2. Lin W-Y (2014). Association Testing of Clustered Rare Causal Variants in Case-Control Studies. PLoS ONE, 9: e94337. [PMID: 24736372].
##########################################################################################
# Developer: Wan-Yu Lin, Institute of Epidemiology and Preventive Medicine, National Taiwan University
# Email: linwy@ntu.edu.tw
# Copyright 2014
##########################################################################################

rm(list=ls())
n.CS <- 500     # no. of cases
n.CN <- 500     # no. of controls
PAR <- 3/1000   
n.Cau <- 20     # no. of causal variants

n.r <- c(1,4,10,16,20)   # no. of risk variants
# You can revise this according to your simulation setting for the no. of risk variants

f0 <- 0.01      # baseline penetrance


aa <- as.matrix(read.table("out.hap-1",header=F))    # read the data from the Cosi program
haplo.all12 <- aa[,2:ncol(aa)]
n.haplo <- nrow(haplo.all12)
haplo.all <- (haplo.all12-1)

MAF <- c()
for(i in 1:ncol(haplo.all)){
  MAF[i] <- mean(haplo.all[,i])
  if(MAF[i]>0.5){
    haplo.all[,i] <- (1-haplo.all[,i])
    MAF[i] <- mean(haplo.all[,i])
  }
}


potential.DSL.pre <- which(MAF<=0.01 & MAF>=0.001)
# We randomly specified 25% of the variants with population MAF < 1% to be causal variants. 
potential.DSL <- sort(sample(potential.DSL.pre, floor(length(potential.DSL.pre)*0.25)))


population <- haplo.all

keep1 <- seq(1,length(MAF),1)
ind <- round(length(keep1)/100)   
DSL <- potential.DSL[(potential.DSL>ind)][1:n.Cau]
keep.snp1 <- c(ind:DSL[n.Cau])
keep.snp <- sort(keep.snp1)
population.keep <- population[,keep.snp]

DSL.in <- c()
for(dd in 1:length(DSL)){
  DSL.in[dd] <- which(keep.snp==DSL[dd])
}



Y <- append(rep(0,n.CN),rep(1,n.CS))


nr.loop <- 5      # You can revise this according to your simulation setting for the no. of risk variants
#for(nr.loop in 1:length(n.r)){

    ran <- runif(n.Cau,0,1)
    rank.ran <- rank(ran) 
    pro.pow <- ((rank.ran<=n.r[nr.loop])-0.5)*2

    GRRp <- ((PAR/((1-PAR)*MAF[DSL]))+1)**pro.pow
  
    # select samples 

    jc <- 0
    jd <- 0
    Copy.se <- matrix(NA,length(Y),ncol(population.keep))

    while(((jc<n.CN) || (jd<n.CS))){
      sel <- sample(1:nrow(population.keep), 2, replace=F)
      sel.chro <- population.keep[sel,]
      col.sum <- c(apply(sel.chro, 2, sum))
      prob.aff <- f0*prod(GRRp**(col.sum[DSL.in]))
      if(prob.aff>1){
        prob.aff <- 0.999
      }
      affected <- sample(c(0,1),1,c(1-prob.aff,prob.aff),replace=T)            
      if((jc<=(n.CN)) && (affected==0)){
        jc <- (jc+1)      
        Copy.se[jc,] <- col.sum
      }
      if((jd<=(n.CS)) && (affected==1)){
        jd <- (jd+1)            
        Copy.se[n.CN+jd,] <- col.sum
      }      
    }
 

Copy <- Copy.se

write.table(cbind(Y,Copy),'ADAfile',sep=' ',row.names=FALSE,col.names=FALSE,quote=FALSE,na='x',append=FALSE)

#}

# 'ADAfile' is the data file, in which the first column is the disease status (1: case; 0: control), 
# and column 2 to the last column are the numbers of minor alleles.