# Name: Feng-Li Lian, Reg. No. B01921001
# Unit: U12 - Data Preprocessing
# Date: 12/20, 2016

# Packages used by this script: lattice, MASS, nnet and mice.
required_pkgs <- c("lattice", "MASS", "nnet", "mice")

# Install only the packages that are missing, so that re-running the script
# does not download and reinstall every package each time.
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

# Attach the packages: lattice, MASS, nnet, mice
library(lattice)
library(MASS)
library(nnet)
library(mice)

# Load the nhanes2 dataset into the workspace
data("nhanes2")

# Size of the dataset: number of rows, then number of columns
nrow(nhanes2)
ncol(nhanes2)

# Per-column summary
summary(nhanes2)

# Show the whole table, then only its first rows
print(nhanes2)
head(nhanes2)

# Missing values: cell-by-cell NA mask, then the total number of NA cells
na_mask <- is.na(nhanes2)
na_mask
sum(na_mask)

# Complete cases: number of rows that contain no NA at all
sum(complete.cases(nhanes2))
# Missing-data pattern of the dataset
md.pattern(nhanes2)

# Multiple imputation: generate 4 completed versions of nhanes2.
# The fixed seed makes the imputed values, and every result derived from
# them, reproducible from one run of the script to the next.
imp <- mice(nhanes2, m = 4, seed = 1)

# Fit the same linear regression model on each of the imputed datasets
fit <- with(imp, lm(chl ~ age + hyp + bmi))
fit

# Pool the per-dataset models into one set of estimates
pooled <- pool(fit)
summary(pooled)

# Imputation methods
# Inspect the 4th column (chl) and locate its NA entries
nhanes2[, 4]
chl_missing <- is.na(nhanes2[, 4])
sub <- which(chl_missing)

# Rows whose 4th column is observed (non-NA).
# A logical mask is used rather than nhanes2[-sub, ]: with an empty `sub`,
# negative indexing would select zero rows instead of all of them.
dataTR <- nhanes2[!chl_missing, ]

# Rows whose 4th column is NA
dataTE <- nhanes2[chl_missing, ]

# Simple random imputation: fill every NA with a value drawn, with
# replacement, from the observed values of the same column
dataTE[, 4] <- sample(dataTR[, 4], length(dataTE[, 4]), replace = TRUE)
dataTE

# Mean imputation: replace each NA in the 4th column with the mean of the
# observed values of that column
chl_missing <- is.na(nhanes2[, 4])
sub <- which(chl_missing)
# Logical masks instead of nhanes2[-sub, ]: negative indexing with an empty
# `sub` would select zero rows instead of all of them.
dataTR <- nhanes2[!chl_missing, ]
dataTE <- nhanes2[chl_missing, ]
dataTE
dataTE[, 4] <- mean(dataTR[, 4])
dataTE

# Regression imputation: predict the NA values of the 4th column (chl)
# with a linear model fitted on the rows where it is observed
chl_missing <- is.na(nhanes2[, 4])
sub <- which(chl_missing)
# Logical masks instead of nhanes2[-sub, ]: negative indexing with an empty
# `sub` would select zero rows instead of all of them.
dataTR <- nhanes2[!chl_missing, ]
dataTE <- nhanes2[chl_missing, ]
dataTE

# Fit chl as a linear function of age on the observed rows
lmout <- lm(chl ~ age, data = dataTR)

# Predict chl for the rows where it is missing, rounded to whole numbers
dataTE[, 4] <- round(predict(lmout, newdata = dataTE))
dataTE



# Hot-deck imputation: fill a missing value of one row (the recipient) with
# the value of a similar, fully observed row (the donor)
has_na <- !complete.cases(nhanes2)

# Recipients: rows with at least one NA
accept <- nhanes2[has_na, ]
accept

# Donors: rows without any NA
donate <- nhanes2[!has_na, ]
donate

accept[1, ]
donate[1, ]

# Find the donors that match the 2nd recipient on columns 1, 3 (hyp) and
# 4 (chl). Each column must be compared explicitly against the donor rows:
# a bare `accept[2, 4]` in the condition is a non-zero number that is
# coerced to TRUE, so it would silently drop the column-4 criterion.
sa <- donate[which(
  donate[, 1] == accept[2, 1] &
    donate[, 3] == accept[2, 3] &
    donate[, 4] == accept[2, 4]
), ]
sa

# Copy column 2 of the first matching donor into the recipient
# (stays NA when no donor matches)
accept[2, 2] <- sa[1, 2]
accept
accept[2, ]

# Cold-deck imputation
# Divide the dataset by hyp (3rd column): keep the rows where hyp is "yes".
# which() drops the rows where the comparison is NA.
levelhyp <- nhanes2[which(nhanes2[, 3] == "yes"), ]
levelhyp

# Replace every NA in the 4th column with the mean of the observed values of
# the same hyp level. The NA rows are located with is.na() instead of being
# addressed by hard-coded row numbers.
chl_na <- is.na(levelhyp[, 4])
levelhyp[chl_na, 4] <- mean(levelhyp[, 4], na.rm = TRUE)
levelhyp

# Find outliers and process them
# Install the outliers package only when it is missing, then attach it
if (!requireNamespace("outliers", quietly = TRUE)) {
  install.packages("outliers")
}
library(outliers)

# Fix the random seed so the generated sample is reproducible
set.seed(1)
s1 <- .Random.seed
s1
y <- rnorm(100)

# Value with the largest difference from the sample mean
outlier(y)

# Same search at the opposite end of the sample
outlier(y, opposite = TRUE)

# Plot the sample
dotchart(y)

# Binning: arrange 12 sorted random values in a 3 x 4 matrix and smooth each
# row by replacing its values with the row mean
# NOTE(review): dim<- fills the matrix column by column, so a row holds every
# 3rd sorted value rather than a run of consecutive values -- confirm that
# rows (not columns) are the intended bins.
set.seed(1)
s1 <- .Random.seed
x <- sort(rnorm(12))
dim(x) <- c(3, 4)
x

# Replace the 1st, 2nd and 3rd row in turn with that row's mean, showing the
# matrix after every step
for (row_id in seq_len(nrow(x))) {
  x[row_id, ] <- mean(x[row_id, ])
  print(x)
}


# Change data attributes or content

# Normalize: centre and scale the 2nd column of iris
a <- iris[[2]]
plot(a)

# scale() subtracts the mean and divides by the standard deviation
b <- scale(a, center = TRUE, scale = TRUE)
plot(b)

# Discretize the 2nd column of iris into 4 levels
a <- iris[[2]]
plot(a)
n <- length(a)

# First level only: 1 where a < 2.5, 0 elsewhere
below_first_cut <- a < 2.5
which(below_first_cut)
anew <- as.numeric(below_first_cut)
plot(anew)

# All levels: 1 for a < 2.5, 2 for [2.5, 3.0), 3 for [3.0, 3.5), 4 for >= 3.5.
# findInterval() counts how many cut points are <= each value.
anew <- findInterval(a, c(2.5, 3.0, 3.5)) + 1

plot(anew)

# Level setting by city, street, etc.: collapse detailed codes into coarser
# levels

city <- c(6, 7, 2, 3, 1, 5, 4, 2, 8, 9, 2, 3, 8, 1, 2, 8, 8, 6)

# One level per city code. The result is sized from `city` itself, so the two
# vectors cannot get out of step when codes are added or removed.
citytype <- rep(0, length(city))
citytype[which(city <= 5)] <- 1  # codes 1-5 -> level 1
citytype[which(city >= 6)] <- 2  # codes 6 and above -> level 2
citytype