R Basics: Data Manipulation, Statistics and Graphics

Loading Sample Data

setwd("D:/_Lab_Data/R/R_Labs/")
# Sample Data Source: Department of Geography, University of Colorado
data1<- "Colorado_Geog_Data/CO2.csv"
Table1<-read.table(data1, header=TRUE, sep=",")
data2<- "Colorado_Geog_Data/thames.csv"
Thames <-read.table(data2, header=TRUE, sep=",")
Asthma1<-"Colorado_Geog_Data/asthma.csv"
Asthma.Data<-read.table(Asthma1, header=TRUE, sep=",")

Variables and Data Structures

# 1. Variables 
# Numners
x<- 1+5
x

## [1] 6

class(x)

## [1] "numeric"

is.numeric(x)

## [1] TRUE

# Characters
y<-"MYNTU"
nchar(y)

## [1] 5

# Date
z<-as.Date ("2015-02-01") 
class(z)

## [1] "Date"

NN<-as.numeric(z) # since 1970,Jan, 1
NN

## [1] 16467

# 2. Data Structures
# Vectors
VEC<-c(1,2,3,4,5,6) # c means "combine"
VEC2 <- sqrt(VEC)
VEC3<-c(1:10)
# name-value pair 命名
score<- c(80,70,60)
names(score)<-c("Chn", "Math", "Eng")
score

##  Chn Math  Eng 
##   80   70   60

# Data frame (資料表)
x1<- 2:11
y1<- -4:5
z1<- c("A", "B","C", "D","E", "F","G", "H","I", "J")
DF1<- data.frame(x1,y1,z1)
# assigned names 命名欄位名稱
DF1<- data.frame(Field1 = x1, Field2=y1, Char1=z1)
DF1

##    Field1 Field2 Char1
## 1       2     -4     A
## 2       3     -3     B
## 3       4     -2     C
## 4       5     -1     D
## 5       6      0     E
## 6       7      1     F
## 7       8      2     G
## 8       9      3     H
## 9      10      4     I
## 10     11      5     J

names(DF1)  #欄位名稱

## [1] "Field1" "Field2" "Char1"

rownames(DF1) #資料名稱

##  [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"

head(DF1) # 檢視資料

##   Field1 Field2 Char1
## 1      2     -4     A
## 2      3     -3     B
## 3      4     -2     C
## 4      5     -1     D
## 5      6      0     E
## 6      7      1     F

class(DF1)

## [1] "data.frame"

# 選取資料
V1<-DF1$Field1
V2<-DF1[3,2]
V3<-DF1[1:5,2]
V4<-DF1[,3]
V5<-DF1[2,]

# Factor
PatientID<-c(1,2,3,4)
age<-c(25,34,28,52)
diabetes<-c("Type1", "Type2", "Type1", "Type1")
status<-c("Poor","Excellent", "Improved", "Poor")

diabetes2<-factor(diabetes)
status2<-factor(status, level=c("Poor", "Improved","Excellent"), order=TRUE)

PatientData<-data.frame(PatientID, age,diabetes2, status2)
str(PatientData)

## 'data.frame':    4 obs. of  4 variables:
##  $ PatientID: num  1 2 3 4
##  $ age      : num  25 34 28 52
##  $ diabetes2: Factor w/ 2 levels "Type1","Type2": 1 2 1 1
##  $ status2  : Ord.factor w/ 3 levels "Poor"<"Improved"<..: 1 3 2 1

# LIST
LIST1<- list(c(-5:5), 3:7)
LIST2<-list(TheDataFrame=DF1, TheVECTOR=1:10, TheLIST=LIST1)
LIST2$TheDataFrame$Char1

##  [1] A B C D E F G H I J
## Levels: A B C D E F G H I J

LIST2["TheDataFrame"]

## $TheDataFrame
##    Field1 Field2 Char1
## 1       2     -4     A
## 2       3     -3     B
## 3       4     -2     C
## 4       5     -1     D
## 5       6      0     E
## 6       7      1     F
## 7       8      2     G
## 8       9      3     H
## 9      10      4     I
## 10     11      5     J

names(LIST2)

## [1] "TheDataFrame" "TheVECTOR"    "TheLIST"

# Matrix
AA<-matrix(1:12, nrow=2)
rownames(AA)<- c("UP", "DOWN")
colnames(AA)<- c("F1","F2","F3","F4","F5", "F6")
AA1<-AA[2,5]
AA2<-AA[,3]

# Array
TheArray<-array(1:12, dim=c(2,3,2))

Import and Save Data

getwd()

## [1] "D:/_Lab_Data/R/R_Labs/RMD"

head(Table1)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2

class(Table1)

## [1] "data.frame"

save(Table1, TheArray, V1, file = "Multi.Rdata")
rm(Table1)

load("Multi.Rdata")
head(Table1)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2

#ctl+L 清除console畫面

Working with Tables (Data.frame)

# Table1 
rm()  #清除所有物件 
getwd()

## [1] "D:/_Lab_Data/R/R_Labs/RMD"

setwd("D:/_Lab_Data/R/R_Labs/Colorado_Geog_Data")
data1<- "CO2.csv"
Table1<-read.table(data1, header=TRUE, sep=",")

CO2con <- Table1$conc
CO2.Que<-subset(Table1, Type=="Quebec")
?subset
CO2.L1 <- subset(Table1, CO2con <= 200)
CO2.L2 <- subset(Table1, CO2con > 200 & CO2con <= 600)
CO2.L3 <- subset(Table1, CO2con > 600)

Graphics

# Graphics
Uptake<-Table1$uptake
Type<-Table1$Type
Treatment<-Table1$Treatment
hist(Uptake)

plot of chunk unnamed-chunk-5

hist(Uptake, col="blue", breaks=seq(0,60,by=2), xlab="Rate (umol/m^2 sec)", main = "CO2 Uptake Rates")

plot of chunk unnamed-chunk-5

boxplot(Uptake,col="blue", xlab="Rate (umol/m^2 sec)", main = "CO2 Uptake Rates" )

plot of chunk unnamed-chunk-5

boxplot(Uptake~Type,col=rainbow(2), xlab="Rate (umol/m^2 sec)", main = "CO2 Uptake Rates" )

plot of chunk unnamed-chunk-5

boxplot(Uptake~Treatment+Type , xlab="Type/Treatment", ylab="Rate (umol/m^2 sec)", col=c("orange","green"), main = "CO2 Uptake Rates" )
legend(0.5,45,c("chilled","nochilled"),fill=c("orange","green"))

plot of chunk unnamed-chunk-5

# box plot vs. scatter plot
plot(CO2con, Uptake, xlab="CO2 Concentration", ylab="Rate (umol/m^2 sec)", pch=1, col="red" )

plot of chunk unnamed-chunk-5

boxplot(Uptake~CO2con , xlab="CO2 Concentration", ylab="Rate (umol/m^2 sec)", col="red" )

plot of chunk unnamed-chunk-5

# Plotting time-series data
names(Thames)

## [1] "Day"       "Month"     "Year"      "Date"      "Feildes"   "Redbridge"

t<- 1: nrow(Thames)
plot(t, Thames$Feildes,xlab="Days", ylab="Flow Rate", main = "Flow Rate for Thames", type="l")
# ?plot
lines(t, Thames$Redbridge, lty=2, col="red" )

# adding legend
legend(1800, 80, c('Feildes','Redbridge'), lty=c(1,2), col=c("black","red"), )

plot of chunk unnamed-chunk-5

# Paneling Graphics
par(mfrow=c(2,2))
hist(Uptake, col="blue", breaks=seq(0,60,by=2), xlab="Rate (umol/m^2 sec)", main = "CO2 Uptake Rates")
boxplot(Uptake ~ Type,col=rainbow(2), xlab="Rate (umol/m^2 sec)", main = "CO2 Uptake Rates" )
plot(CO2con, Uptake, xlab="CO2 Concentration", ylab="Rate (umol/m^2 sec)", pch=1, col="red" )
plot(t, Thames$Feildes,xlab="Days", ylab="Flow Rate", main = "Flow Rate for Thames", type="l")
lines(t, Thames$Redbridge, lty=2, col="red" )

plot of chunk unnamed-chunk-5

Baisc Statistics

Descriptive statistics

# Descriptive statistics 
summary(Thames$Feildes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.35    2.14    3.70    5.55    5.84   87.80

mean(Thames$Feildes)

## [1] 5.551

var(Thames$Feildes)

## [1] 50.03

sd(Thames$Feildes)

## [1] 7.073

IQR(Thames$Feildes)

## [1] 3.7

hist(Thames$Feildes, col="red", breaks=seq(0,100,by=5), xlab="Flow Rate")

plot of chunk unnamed-chunk-6

boxplot(Thames$Feildes, col="blue", ylab="Flow Rate")

plot of chunk unnamed-chunk-6

# 常態分佈的檢定與繪圖
qqnorm(Thames$Feildes)
qqline(Thames$Feildes,col="red" )

plot of chunk unnamed-chunk-6

shapiro.test(Thames$Feildes)  #Shapiro-Wilk test of normality

## 
##  Shapiro-Wilk normality test
## 
## data:  Thames$Feildes
## W = 0.5575, p-value < 2.2e-16

ks.test(Thames$Feildes,"pnorm") #Kolmogorov-Smirnov Tests

## Warning: ties should not be present for the Kolmogorov-Smirnov test

## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  Thames$Feildes
## D = 0.7892, p-value < 2.2e-16
## alternative hypothesis: two-sided

data(swiss) #R內建提供的資料
class(swiss)

## [1] "data.frame"

Inf.Mor <- swiss$Infant.Mortality
summary(Inf.Mor)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    10.8    18.2    20.0    19.9    21.7    26.6

sd(Inf.Mor)

## [1] 2.913

shapiro.test(Inf.Mor)

## 
##  Shapiro-Wilk normality test
## 
## data:  Inf.Mor
## W = 0.9776, p-value = 0.4978

ks.test(Inf.Mor,"pnorm")

## Warning: ties should not be present for the Kolmogorov-Smirnov test

## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  Inf.Mor
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided

hist(Inf.Mor, col="grey", breaks=seq(0,30,by=2), freq=FALSE,xlab="Mortality Rate")
curve(dnorm(x, mean=19.94, sd=2.912), add=TRUE, col="red", lwd=2)
Den.Inf.Mor <- density(Inf.Mor)
lines(x=Den.Inf.Mor$x, y=Den.Inf.Mor$y,col="blue",  lwd=2, lty=2 )

plot of chunk unnamed-chunk-6

Inference statistics

# one sample t-test
mean(Thames$Redbridge)

## [1] 2.019

t.test(Thames$Redbridge,mu=2, var.equal=T)

## 
##  One Sample t-test
## 
## data:  Thames$Redbridge
## t = 0.2582, df = 2556, p-value = 0.7963
## alternative hypothesis: true mean is not equal to 2
## 95 percent confidence interval:
##  1.876 2.161
## sample estimates:
## mean of x 
##     2.019

# comparing with two stations: Feildes vs. Redbridge
t.test(Thames$Feildes,Thames$Redbridge)

## 
##  Welch Two Sample t-test
## 
## data:  Thames$Feildes and Thames$Redbridge
## t = 22.4, df = 3844, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3.223 3.841
## sample estimates:
## mean of x mean of y 
##     5.551     2.019

var.test(Thames$Feildes,Thames$Redbridge)

## 
##  F test to compare two variances
## 
## data:  Thames$Feildes and Thames$Redbridge
## F = 3.697, num df = 2556, denom df = 2556, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  3.421 3.995
## sample estimates:
## ratio of variances 
##              3.697

# comparing with two years(repeated measures): paired T-TEST
Year<-Thames$Year
Thames.2001 <- subset(Thames$Feildes, Year=="2001")
Thames.2003 <- subset(Thames$Feildes, Year=="2003")
t.test(Thames.2001, Thames.2003, paired=TRUE)

## 
##  Paired t-test
## 
## data:  Thames.2001 and Thames.2003
## t = 10.13, df = 364, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  4.443 6.584
## sample estimates:
## mean of the differences 
##                   5.513

wilcox.test(Thames.2001, Thames.2003) #non-parametric

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Thames.2001 and Thames.2003
## W = 109518, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

# comparing among years
boxplot(Thames$Feildes~ Thames$Year, col="red", main = "Flow Rate in Feildes", xlab="Year", ylab="Flow Rate")

plot of chunk unnamed-chunk-7

Year<-Thames$Year
aggregate(Thames$Feildes, by=list(Year), FUN=mean)

##   Group.1      x
## 1    2000  7.564
## 2    2001 10.923
## 3    2002  6.863
## 4    2003  5.409
## 5    2004  3.861
## 6    2005  2.024
## 7    2006  2.209

aggregate(Thames$Feildes, by=list(Year), FUN=sd)

##   Group.1      x
## 1    2000  9.058
## 2    2001 10.024
## 3    2002  6.445
## 4    2003  7.347
## 5    2004  2.008
## 6    2005  1.075
## 7    2006  2.555

fit <-aov(Thames$Feildes~ Thames$Year)
summary(fit)

##               Df Sum Sq Mean Sq F value Pr(>F)    
## Thames$Year    1  17721   17721     411 <2e-16 ***
## Residuals   2555 110150      43                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

library(gplots)
plotmeans(Thames$Feildes~Thames$Year)

## Warning: zero-length arrow is of indeterminate angle and so skipped
## Warning: zero-length arrow is of indeterminate angle and so skipped
## Warning: zero-length arrow is of indeterminate angle and so skipped
## Warning: zero-length arrow is of indeterminate angle and so skipped
## Warning: zero-length arrow is of indeterminate angle and so skipped
## Warning: zero-length arrow is of indeterminate angle and so skipped

plot of chunk unnamed-chunk-7

Correlation and Regression

data(swiss)
Birth <-swiss$Fertility
Edu <- swiss$Education
Ca<-swiss$Catholic
plot(Birth~Edu)

plot of chunk unnamed-chunk-8

cor.test(Birth, Edu, use="complete obs")

## 
##  Pearson's product-moment correlation
## 
## data:  Birth and Edu
## t = -5.954, df = 45, p-value = 3.659e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7987 -0.4653
## sample estimates:
##     cor 
## -0.6638

cor.test(Birth, Edu, method="spearman",use="complete obs")

## Warning: Cannot compute exact p-value with ties

## 
##  Spearman's rank correlation rho
## 
## data:  Birth and Edu
## S = 24963, p-value = 0.001806
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##     rho 
## -0.4433

summary(lm(Birth~Edu+Ca))

## 
## Call:
## lm(formula = Birth ~ Edu + Ca)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.04  -6.58  -1.43   6.12  14.32 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  74.2337     2.3520   31.56  < 2e-16 ***
## Edu          -0.7883     0.1293   -6.10  2.4e-07 ***
## Ca            0.1109     0.0298    3.72  0.00056 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.33 on 44 degrees of freedom
## Multiple R-squared:  0.575,  Adjusted R-squared:  0.555 
## F-statistic: 29.7 on 2 and 44 DF,  p-value: 6.85e-09

# Model comparisons
model1 <-lm(Birth~Edu)
summary(model1)

## 
## Call:
## lm(formula = Birth ~ Edu)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17.04  -6.71  -1.01   9.53  19.69 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   79.610      2.104   37.84  < 2e-16 ***
## Edu           -0.862      0.145   -5.95  3.7e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.45 on 45 degrees of freedom
## Multiple R-squared:  0.441,  Adjusted R-squared:  0.428 
## F-statistic: 35.4 on 1 and 45 DF,  p-value: 3.66e-07

model2 <-lm(Birth~Edu+Ca)
summary(model2)

## 
## Call:
## lm(formula = Birth ~ Edu + Ca)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -15.04  -6.58  -1.43   6.12  14.32 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  74.2337     2.3520   31.56  < 2e-16 ***
## Edu          -0.7883     0.1293   -6.10  2.4e-07 ***
## Ca            0.1109     0.0298    3.72  0.00056 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.33 on 44 degrees of freedom
## Multiple R-squared:  0.575,  Adjusted R-squared:  0.555 
## F-statistic: 29.7 on 2 and 44 DF,  p-value: 6.85e-09

anova(model1, model2)

## Analysis of Variance Table
## 
## Model 1: Birth ~ Edu
## Model 2: Birth ~ Edu + Ca
##   Res.Df  RSS Df Sum of Sq    F  Pr(>F)    
## 1     45 4015                              
## 2     44 3054  1       961 13.8 0.00056 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Contingency table analysis (categorical variables)
head(Asthma.Data)

##   Nsmokers  Asthma HayFever Age Gender roaddist2 d2source1 d2source2
## 1        2 control        0 8.9      1    886552         0      0.00
## 2        2 control        0 8.5      2    889338         0      0.01
## 3        2 control        1 8.5      2    827783         0      0.01
## 4        1 control        0 8.9      2    889338         0      0.01
## 5        1 control        0 6.9      1    889338         0      0.01
## 6        1    case        0 7.1      2   1213531         0      0.01
##   d2source3
## 1      0.02
## 2      0.02
## 3      0.02
## 4      0.02
## 5      0.02
## 6      0.02

HayFever<-Asthma.Data$HayFever
Disease<-Asthma.Data$Asthma
AsthmaTable<-xtabs(~ HayFever+Disease, data=Asthma.Data)
AsthmaTable

##         Disease
## HayFever case control
##        0  156     956
##        1   59     120

addmargins(prop.table(AsthmaTable,1),2)

##         Disease
## HayFever   case control    Sum
##        0 0.1403  0.8597 1.0000
##        1 0.3296  0.6704 1.0000

chisq.test(AsthmaTable) # Test of Independence

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  AsthmaTable
## X-squared = 38.46, df = 1, p-value = 5.585e-10

fisher.test(AsthmaTable)# Test of Independence

## 
##  Fisher's Exact Test for Count Data
## 
## data:  AsthmaTable
## p-value = 4.837e-09
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.2300 0.4829
## sample estimates:
## odds ratio 
##     0.3323

library(vcd)
assocstats(AsthmaTable) # Test of Association

##                     X^2 df   P(> X^2)
## Likelihood Ratio 34.076  1 5.3003e-09
## Pearson          39.814  1 2.7939e-10
## 
## Phi-Coefficient   : 0.176 
## Contingency Coeff.: 0.173 
## Cramer's V        : 0.176

# Multi-dimensional Correlation
Age<-Asthma.Data$Age
Gender<-Asthma.Data$Gender
RdDist<-Asthma.Data$roaddist2
Src1<-Asthma.Data$d2source1
Src2<-Asthma.Data$d2source2

# Correlation Matrix
DEMO<-data.frame(Age,Gender)
DIST<--data.frame(RdDist,Src1,Src2)
cor(DEMO, DIST)

##          RdDist     Src1     Src2
## Age    0.001301 -0.12894 -0.12616
## Gender 0.028306  0.07225  0.00447

cor.test(Src1,Src2)

## 
##  Pearson's product-moment correlation
## 
## data:  Src1 and Src2
## t = 0.708, df = 1289, p-value = 0.4791
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03488  0.07419
## sample estimates:
##     cor 
## 0.01972

stat<-data.frame(Age,RdDist,Src1,Src2)
library(psych)
corr.test(x=stat, use="complete")

## Call:corr.test(x = stat, use = "complete")
## Correlation matrix 
##         Age RdDist Src1  Src2
## Age    1.00   0.00 0.13  0.13
## RdDist 0.00   1.00 0.43 -0.12
## Src1   0.13   0.43 1.00  0.02
## Src2   0.13  -0.12 0.02  1.00
## Sample Size 
## [1] 1291
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##         Age RdDist Src1 Src2
## Age    0.00   0.96 0.00 0.00
## RdDist 0.96   0.00 0.00 0.00
## Src1   0.00   0.00 0.00 0.96
## Src2   0.00   0.00 0.48 0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

COV1<-cov(DIST) # Covariance Matrix
class(COV1)

## [1] "matrix"

Writing User-defined Functions and If-then Statements

MySummary <- function(x,npar=TRUE,print=TRUE) 
  {
  if (!npar) { center <- mean(x); spread <- sd(x) } 
  else {center <- median(x); spread <- mad(x) }
  if (print & !npar) { cat("Mean=", center, "\n", "SD=", spread, "\n")} 
  else if (print & npar) { cat("Median=", center, "\n", "MAD=", spread, "\n") }
  result <- list(center=center,spread=spread)
  return(result)
  }

set.seed(123456)
x <- rnorm(100, 20, 5) 
y <- MySummary(x)

## Median= 20.24 
##  MAD= 5.815

y1 <- MySummary(x, npar=FALSE, print=FALSE)
y1; y1$center

## $center
## [1] 20.08
## 
## $spread
## [1] 4.967

## [1] 20.08

R Basics: Data Manipulation, Statistics and Graphics

WEN, Tzai-Hung (NTU Geography)

Loading Sample Data

Variables and Data Structures

Import and Save Data

Working with Tables (Data.frame)

Graphics

Baisc Statistics

Writing User-defined Functions and If-then Statements