# Name: Feng-Li Lian, Reg. No. B01921001
# Unit: U11 - Data Analysis: overall case
# Date: 12/13, 2016

# Data set ----
# Load the MASS package and the `Insurance` data set that ships with it.

# Install MASS only when it is missing, so re-running the script does not
# trigger a fresh download every time.
if (!requireNamespace("MASS", quietly = TRUE)) {
  install.packages("MASS")
}
library(MASS)
data(Insurance, package = "MASS")

# First look at the data set: its size, then its first and last rows.
nrow(Insurance)
ncol(Insurance)
dim(Insurance)
head(Insurance)
tail(Insurance)

# Numeric exploration - variable overview ----

# Attributes attached to the data frame.
attributes(Insurance)

# Internal structure (type and leading values of every column), followed by
# a per-column summary. The structure is printed once.
str(Insurance)
summary(Insurance)

# Numeric exploration - variable details ----

# describe() from Hmisc reports each variable in more detail.
# Install Hmisc only when it is not already available.
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)

# Columns 1-3 and columns 4-5 are described in two separate calls.
describe(Insurance[, 1:3])

describe(Insurance[, 4:5])

# basicStats() from fBasics prints a richer set of summary indicators.
if (!requireNamespace("fBasics", quietly = TRUE)) {
  install.packages("fBasics")
}
library(fBasics)
basicStats(Insurance$Holders)


# Numeric exploration - distribution indicators ----

# skewness() and kurtosis() from timeDate give more detailed shape measures.
# Install timeDate only when it is not already available.
if (!requireNamespace("timeDate", quietly = TRUE)) {
  install.packages("timeDate")
}
library(timeDate)

# Skewness: how asymmetric (lopsided) the data are.
skewness(Insurance[, 4:5])

# Kurtosis: how peaked or flat the distribution is, i.e. how concentrated
# or spread out the data are.
kurtosis(Insurance[, 4:5])

# Numeric exploration - correlation ----

# This section uses the `weather` data set loaded through rattle.
if (!requireNamespace("rattle", quietly = TRUE)) {
  install.packages("rattle")
}
library(rattle)
data(weather)

head(weather[, 12:21])

# Correlation matrix of columns 12 to 21.
# The index vector gets its own name so it does not shadow stats::var(), and
# the `use` option is spelled out in full instead of relying on abbreviation
# matching.
weather_cols <- 12:21
cor_matrix <- cor(weather[weather_cols], use = "pairwise.complete.obs")
cor_matrix

# plotcorr() from ellipse draws the correlation matrix as a grid of ellipses.
if (!requireNamespace("ellipse", quietly = TRUE)) {
  install.packages("ellipse")
}
library(ellipse)

# Alternating white/black fill colours, recycled across the ten variables.
ellipse_cols <- rep(c("white", "black"), 5)

# Full matrix first, then the lower triangle only.
plotcorr(cor_matrix, col = ellipse_cols)

plotcorr(cor_matrix, col = ellipse_cols, type = "lower")

# Visual exploration - histograms ----

# Title shared by the histograms below.
claims_hist_title <- "Histogram of Freq of Insurance$Claims"

# Histogram of counts.
hist(Insurance$Claims, main = claims_hist_title)

# Density-scaled histogram (freq = FALSE), first without shading and then
# with shading lines at 20, 10 and 50 lines per inch.
hist(Insurance$Claims, freq = FALSE, main = claims_hist_title)
for (shading in c(20, 10, 50)) {
  hist(
    Insurance$Claims,
    freq = FALSE, density = shading, main = claims_hist_title
  )
}

# Overlay the kernel density estimate on the histogram drawn last.
lines(density(Insurance$Claims))

# Same overlay on the histogram with 20-lines-per-inch shading.
hist(Insurance$Claims, freq = FALSE, density = 20, main = claims_hist_title)
lines(density(Insurance$Claims))

# Histogram with roughly 20 cells and the count printed above every bar.
# hist() returns the computed bins invisibly; keep that object so its
# structure can be inspected without drawing the same plot a second time.
claims_hist <- hist(
  Insurance$Claims,
  breaks = 20, labels = TRUE, col = "black", border = "white",
  main = "Histogram of Freq of Insurance$Claims with 20 bars"
)
str(claims_hist)

# Visual exploration - cumulative distribution plots ----

# Empirical cumulative distribution of Claims with base R.
dt <- ecdf(Insurance$Claims)
plot(
  dt,
  xlab = "Claims", ylab = "Proportion <= x",
  main = "Cumulative Distribution of Claims"
)

# Ecdf() comes from Hmisc. The package is only attached here, not installed
# again (library() is a no-op when it is already attached).
library(Hmisc)

Ecdf(
  Insurance$Claims,
  xlab = "Claims", ylab = "Proportion <= x",
  main = "Cumulative Distribution of Claims"
)

# Long-format helper table: var1 holds the Claims values and var2 the age
# group they belong to, with rows stacked one age group after another.
age_groups <- c("<25", "25-29", "30-35", ">35")
data_plot <- do.call(
  rbind,
  lapply(age_groups, function(age) {
    data.frame(var1 = Insurance$Claims[Insurance$Age == age], var2 = age)
  })
)

# One dashed curve per age group, labelled 1 to 4 ...
Ecdf(
  data_plot$var1,
  group = data_plot$var2, lty = 2, label.curves = 1:4,
  xlab = "Claims", main = "Cumulative Distribution of Claims by Age"
)

# ... plus the overall curve added to the same plot.
Ecdf(Insurance$Claims, add = TRUE)

# Visual exploration - box plot ----

# Draw the box plot and keep its return value. The object is named
# `Claims_bp` because every later statement refers to it by that name
# (R is case-sensitive, so `claims_bp` would be a different object).
Claims_bp <- boxplot(Insurance$Claims, main = "Distribution of Claims")

# Lower whisker, lower hinge, median, upper hinge and upper whisker.
Claims_bp$stats

# Mark data points ----

# Star marker for the mean value.
points(x = 1, y = mean(Insurance$Claims), pch = 8)

# Observations above the upper whisker. The cut-off is read from the box
# plot statistics instead of being hard-coded.
upper_whisker <- Claims_bp$stats[5, 1]
Claims_points <- as.matrix(Insurance$Claims[Insurance$Claims > upper_whisker])

# One-column matrix of every value to annotate: the five box statistics,
# the mean, and the points above the upper whisker.
Claims_text <- rbind(
  Claims_bp$stats, mean(Insurance$Claims), Claims_points,
  deparse.level = 0
)

# text() is vectorised, so all labels are written in one call, each placed
# just to the right of the box at the height of its own value.
text(x = 1.1, y = Claims_text[, 1], labels = Claims_text[, 1])

# Several box plots in one figure ----

# Horizontal box plots of Claims (var1), one per age group (var2).
boxplot(
  var1 ~ var2,
  data = data_plot,
  horizontal = TRUE,
  main = "Distribution of Claims by Age",
  xlab = "Claims",
  ylab = "Age"
)

# Box-percentile plot ----

# Age group labels, in the order the groups are plotted.
bp_age_groups <- c("<25", "25-29", "30-35", ">35")

# Unnamed list holding one vector of Claims values per age group.
data_bp <- lapply(bp_age_groups, function(age) {
  data_plot$var1[which(data_plot$var2 == age)]
})

data_bp

# bpplot() draws one box-percentile plot per list element.
bpplot(data_bp, name = bp_age_groups, ylab = "Claims", xlab = "Age")

# Visual exploration - bar charts ----

# Age group labels, in plotting order.
bar_age_groups <- c("<25", "25-29", "30-35", ">35")

# Total of `values` within each age group, returned as an unnamed vector in
# the order of `bar_age_groups`.
sum_by_age <- function(values, ages) {
  unlist(lapply(bar_age_groups, function(age) sum(values[which(ages == age)])))
}

# Total number of claims per age group.
Claims_Age <- sum_by_age(Insurance$Claims, Insurance$Age)
Claims_Age

barplot(
  Claims_Age,
  names.arg = bar_age_groups,
  density = rep(20, 4),
  main = "Distribution of Age by Claims",
  ylab = "Claims",
  xlab = "Age"
)


# Grouped bar chart ----

# Total number of policy holders per age group.
Holders_Age <- sum_by_age(Insurance$Holders, Insurance$Age)
Holders_Age

# Two-row matrix; the row names come from the two object names.
data_bar <- rbind(Claims_Age, Holders_Age)
data_bar

# Claims and Holders side by side for every age group, with shaded bars.
barplot(
  data_bar,
  names.arg = bar_age_groups,
  beside = TRUE,
  density = rep(20, 4),
  main = "Age Distribution by Claims and Holders",
  ylab = "Claims & Holders",
  xlab = "Age",
  col = c("black", "darkgrey")
)

legend(
  x = "topleft",
  legend = rownames(data_bar),
  fill = c("black", "darkgrey")
)

# Grouped bar chart (solid fill) ----

# `beside = TRUE` puts the Claims and Holders bars next to each other within
# each age group. `data_bar` already holds both rows, so nothing needs to be
# recomputed before plotting.
barplot(
  data_bar,
  names.arg = c("<25", "25-29", "30-35", ">35"),
  beside = TRUE,
  main = "Age Distribution by Claims and Holders",
  ylab = "Claims & Holders",
  xlab = "Age",
  col = c("black", "darkgrey")
)

legend(
  x = "topleft",
  legend = rownames(data_bar),
  fill = c("black", "darkgrey")
)

# Stacked bar chart ----

# Without `beside`, barplot() stacks the rows of the matrix on top of each
# other within every bar.
barplot(
  data_bar,
  names.arg = c("<25", "25-29", "30-35", ">35"),
  main = "Age Distribution by Claims and Holders",
  ylab = "Claims & Holders",
  xlab = "Age",
  col = c("black", "darkgrey")
)
legend(
  x = "topleft",
  legend = rownames(data_bar),
  fill = c("black", "darkgrey")
)

# Visual exploration - dot chart ----

# For a matrix, dotchart() treats every column as a group and labels the
# groups with the column names. Name the columns of a copy after the age
# groups so the group labels are drawn by dotchart() itself, instead of
# being placed by hand at fixed plot coordinates.
data_dot <- data_bar
colnames(data_dot) <- c("<25", "25-29", "30-35", ">35")

dotchart(
  data_dot,
  main = "Age Distribution by Claims and Holders",
  xlab = "Claims & Holders",
  pch = 1:2
)

# Visual exploration - pie charts ----

# Slice labels and fill colours shared by every pie chart below.
pie_age_groups <- c("<25", "25-29", "30-35", ">35")
pie_cols <- c("white", "lightgrey", "darkgrey", "black")

# Plain pie chart of total claims per age group.
pie(
  Claims_Age,
  labels = pie_age_groups,
  main = "Pie Chart of Age by Claims",
  col = pie_cols
)

# Same chart with each slice's share (rounded to a whole percent) appended
# to its label.
percent <- round(Claims_Age / sum(Claims_Age) * 100)
label <- paste0(pie_age_groups, ":", percent, "%")

pie(
  Claims_Age,
  labels = label,
  main = "Pie Chart of Age by Claims",
  col = pie_cols
)


# Pie chart - 3D ----

# pie3D() comes from plotrix; install the package only when it is missing.
if (!requireNamespace("plotrix", quietly = TRUE)) {
  install.packages("plotrix")
}
library(plotrix)

pie3D(
  Claims_Age,
  labels = pie_age_groups,
  explode = 0.05,
  main = "3D Pie Chart of Age by Claims",
  labelcex = 0.8,
  col = pie_cols
)

