setwd("D:/_Lab_Data/R/R_Labs/")
# Sample Data Source: Department of Geography, University of Colorado
data1<- "Colorado_Geog_Data/CO2.csv"
Table1<-read.table(data1, header=TRUE, sep=",")
data2<- "Colorado_Geog_Data/thames.csv"
Thames <-read.table(data2, header=TRUE, sep=",")
Asthma1<-"Colorado_Geog_Data/asthma.csv"
Asthma.Data<-read.table(Asthma1, header=TRUE, sep=",")
# 1. Variables
# Numners
x<- 1+5
x
## [1] 6
class(x)
## [1] "numeric"
is.numeric(x)
## [1] TRUE
# Characters
y<-"MYNTU"
nchar(y)
## [1] 5
# Date
z<-as.Date ("2015-02-01")
class(z)
## [1] "Date"
NN<-as.numeric(z) # since 1970,Jan, 1
NN
## [1] 16467
# 2. Data Structures
# Vectors
VEC<-c(1,2,3,4,5,6) # c means "combine"
VEC2 <- sqrt(VEC)
VEC3<-c(1:10)
# name-value pair 命名
score<- c(80,70,60)
names(score)<-c("Chn", "Math", "Eng")
score
## Chn Math Eng
## 80 70 60
# Data frame (資料表)
x1<- 2:11
y1<- -4:5
z1<- c("A", "B","C", "D","E", "F","G", "H","I", "J")
DF1<- data.frame(x1,y1,z1)
# assigned names 命名欄位名稱
DF1<- data.frame(Field1 = x1, Field2=y1, Char1=z1)
DF1
## Field1 Field2 Char1
## 1 2 -4 A
## 2 3 -3 B
## 3 4 -2 C
## 4 5 -1 D
## 5 6 0 E
## 6 7 1 F
## 7 8 2 G
## 8 9 3 H
## 9 10 4 I
## 10 11 5 J
names(DF1) #欄位名稱
## [1] "Field1" "Field2" "Char1"
rownames(DF1) #資料名稱
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
head(DF1) # 檢視資料
## Field1 Field2 Char1
## 1 2 -4 A
## 2 3 -3 B
## 3 4 -2 C
## 4 5 -1 D
## 5 6 0 E
## 6 7 1 F
class(DF1)
## [1] "data.frame"
# 選取資料
V1<-DF1$Field1
V2<-DF1[3,2]
V3<-DF1[1:5,2]
V4<-DF1[,3]
V5<-DF1[2,]
# Factor
PatientID<-c(1,2,3,4)
age<-c(25,34,28,52)
diabetes<-c("Type1", "Type2", "Type1", "Type1")
status<-c("Poor","Excellent", "Improved", "Poor")
diabetes2<-factor(diabetes)
status2<-factor(status, level=c("Poor", "Improved","Excellent"), order=TRUE)
PatientData<-data.frame(PatientID, age,diabetes2, status2)
str(PatientData)
## 'data.frame': 4 obs. of 4 variables:
## $ PatientID: num 1 2 3 4
## $ age : num 25 34 28 52
## $ diabetes2: Factor w/ 2 levels "Type1","Type2": 1 2 1 1
## $ status2 : Ord.factor w/ 3 levels "Poor"<"Improved"<..: 1 3 2 1
# LIST
LIST1<- list(c(-5:5), 3:7)
LIST2<-list(TheDataFrame=DF1, TheVECTOR=1:10, TheLIST=LIST1)
LIST2$TheDataFrame$Char1
## [1] A B C D E F G H I J
## Levels: A B C D E F G H I J
LIST2["TheDataFrame"]
## $TheDataFrame
## Field1 Field2 Char1
## 1 2 -4 A
## 2 3 -3 B
## 3 4 -2 C
## 4 5 -1 D
## 5 6 0 E
## 6 7 1 F
## 7 8 2 G
## 8 9 3 H
## 9 10 4 I
## 10 11 5 J
names(LIST2)
## [1] "TheDataFrame" "TheVECTOR" "TheLIST"
# Matrix
AA<-matrix(1:12, nrow=2)
rownames(AA)<- c("UP", "DOWN")
colnames(AA)<- c("F1","F2","F3","F4","F5", "F6")
AA1<-AA[2,5]
AA2<-AA[,3]
# Array
TheArray<-array(1:12, dim=c(2,3,2))
getwd()
## [1] "D:/_Lab_Data/R/R_Labs/RMD"
head(Table1)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
class(Table1)
## [1] "data.frame"
save(Table1, TheArray, V1, file = "Multi.Rdata")
rm(Table1)
load("Multi.Rdata")
head(Table1)
## Plant Type Treatment conc uptake
## 1 Qn1 Quebec nonchilled 95 16.0
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
#ctl+L 清除console畫面
# Table1
rm() #清除所有物件
getwd()
## [1] "D:/_Lab_Data/R/R_Labs/RMD"
setwd("D:/_Lab_Data/R/R_Labs/Colorado_Geog_Data")
data1<- "CO2.csv"
Table1<-read.table(data1, header=TRUE, sep=",")
CO2con <- Table1$conc
CO2.Que<-subset(Table1, Type=="Quebec")
?subset
CO2.L1 <- subset(Table1, CO2con <= 200)
CO2.L2 <- subset(Table1, CO2con > 200 & CO2con <= 600)
CO2.L3 <- subset(Table1, CO2con > 600)
# Graphics
Uptake<-Table1$uptake
Type<-Table1$Type
Treatment<-Table1$Treatment
hist(Uptake)
hist(Uptake, col="blue", breaks=seq(0,60,by=2), xlab="Rate (umol/m^2 sec)", main = "CO2 Uptake Rates")
boxplot(Uptake,col="blue", xlab="Rate (umol/m^2 sec)", main = "CO2 Uptake Rates" )
boxplot(Uptake~Type,col=rainbow(2), xlab="Rate (umol/m^2 sec)", main = "CO2 Uptake Rates" )
boxplot(Uptake~Treatment+Type , xlab="Type/Treatment", ylab="Rate (umol/m^2 sec)", col=c("orange","green"), main = "CO2 Uptake Rates" )
legend(0.5,45,c("chilled","nochilled"),fill=c("orange","green"))
# box plot vs. scatter plot
plot(CO2con, Uptake, xlab="CO2 Concentration", ylab="Rate (umol/m^2 sec)", pch=1, col="red" )
boxplot(Uptake~CO2con , xlab="CO2 Concentration", ylab="Rate (umol/m^2 sec)", col="red" )
# Plotting time-series data
names(Thames)
## [1] "Day" "Month" "Year" "Date" "Feildes" "Redbridge"
t<- 1: nrow(Thames)
plot(t, Thames$Feildes,xlab="Days", ylab="Flow Rate", main = "Flow Rate for Thames", type="l")
# ?plot
lines(t, Thames$Redbridge, lty=2, col="red" )
# adding legend
legend(1800, 80, c('Feildes','Redbridge'), lty=c(1,2), col=c("black","red"), )
# Paneling Graphics
par(mfrow=c(2,2))
hist(Uptake, col="blue", breaks=seq(0,60,by=2), xlab="Rate (umol/m^2 sec)", main = "CO2 Uptake Rates")
boxplot(Uptake ~ Type,col=rainbow(2), xlab="Rate (umol/m^2 sec)", main = "CO2 Uptake Rates" )
plot(CO2con, Uptake, xlab="CO2 Concentration", ylab="Rate (umol/m^2 sec)", pch=1, col="red" )
plot(t, Thames$Feildes,xlab="Days", ylab="Flow Rate", main = "Flow Rate for Thames", type="l")
lines(t, Thames$Redbridge, lty=2, col="red" )
Descriptive statistics
# Descriptive statistics
summary(Thames$Feildes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.35 2.14 3.70 5.55 5.84 87.80
mean(Thames$Feildes)
## [1] 5.551
var(Thames$Feildes)
## [1] 50.03
sd(Thames$Feildes)
## [1] 7.073
IQR(Thames$Feildes)
## [1] 3.7
hist(Thames$Feildes, col="red", breaks=seq(0,100,by=5), xlab="Flow Rate")
boxplot(Thames$Feildes, col="blue", ylab="Flow Rate")
# 常態分佈的檢定與繪圖
qqnorm(Thames$Feildes)
qqline(Thames$Feildes,col="red" )
shapiro.test(Thames$Feildes) #Shapiro-Wilk test of normality
##
## Shapiro-Wilk normality test
##
## data: Thames$Feildes
## W = 0.5575, p-value < 2.2e-16
ks.test(Thames$Feildes,"pnorm") #Kolmogorov-Smirnov Tests
## Warning: ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: Thames$Feildes
## D = 0.7892, p-value < 2.2e-16
## alternative hypothesis: two-sided
data(swiss) #R內建提供的資料
class(swiss)
## [1] "data.frame"
Inf.Mor <- swiss$Infant.Mortality
summary(Inf.Mor)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.8 18.2 20.0 19.9 21.7 26.6
sd(Inf.Mor)
## [1] 2.913
shapiro.test(Inf.Mor)
##
## Shapiro-Wilk normality test
##
## data: Inf.Mor
## W = 0.9776, p-value = 0.4978
ks.test(Inf.Mor,"pnorm")
## Warning: ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: Inf.Mor
## D = 1, p-value < 2.2e-16
## alternative hypothesis: two-sided
hist(Inf.Mor, col="grey", breaks=seq(0,30,by=2), freq=FALSE,xlab="Mortality Rate")
curve(dnorm(x, mean=19.94, sd=2.912), add=TRUE, col="red", lwd=2)
Den.Inf.Mor <- density(Inf.Mor)
lines(x=Den.Inf.Mor$x, y=Den.Inf.Mor$y,col="blue", lwd=2, lty=2 )
Inference statistics
# one sample t-test
mean(Thames$Redbridge)
## [1] 2.019
t.test(Thames$Redbridge,mu=2, var.equal=T)
##
## One Sample t-test
##
## data: Thames$Redbridge
## t = 0.2582, df = 2556, p-value = 0.7963
## alternative hypothesis: true mean is not equal to 2
## 95 percent confidence interval:
## 1.876 2.161
## sample estimates:
## mean of x
## 2.019
# comparing with two stations: Feildes vs. Redbridge
t.test(Thames$Feildes,Thames$Redbridge)
##
## Welch Two Sample t-test
##
## data: Thames$Feildes and Thames$Redbridge
## t = 22.4, df = 3844, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.223 3.841
## sample estimates:
## mean of x mean of y
## 5.551 2.019
var.test(Thames$Feildes,Thames$Redbridge)
##
## F test to compare two variances
##
## data: Thames$Feildes and Thames$Redbridge
## F = 3.697, num df = 2556, denom df = 2556, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 3.421 3.995
## sample estimates:
## ratio of variances
## 3.697
# comparing with two years(repeated measures): paired T-TEST
Year<-Thames$Year
Thames.2001 <- subset(Thames$Feildes, Year=="2001")
Thames.2003 <- subset(Thames$Feildes, Year=="2003")
t.test(Thames.2001, Thames.2003, paired=TRUE)
##
## Paired t-test
##
## data: Thames.2001 and Thames.2003
## t = 10.13, df = 364, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 4.443 6.584
## sample estimates:
## mean of the differences
## 5.513
wilcox.test(Thames.2001, Thames.2003) #non-parametric
##
## Wilcoxon rank sum test with continuity correction
##
## data: Thames.2001 and Thames.2003
## W = 109518, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# comparing among years
boxplot(Thames$Feildes~ Thames$Year, col="red", main = "Flow Rate in Feildes", xlab="Year", ylab="Flow Rate")
Year<-Thames$Year
aggregate(Thames$Feildes, by=list(Year), FUN=mean)
## Group.1 x
## 1 2000 7.564
## 2 2001 10.923
## 3 2002 6.863
## 4 2003 5.409
## 5 2004 3.861
## 6 2005 2.024
## 7 2006 2.209
aggregate(Thames$Feildes, by=list(Year), FUN=sd)
## Group.1 x
## 1 2000 9.058
## 2 2001 10.024
## 3 2002 6.445
## 4 2003 7.347
## 5 2004 2.008
## 6 2005 1.075
## 7 2006 2.555
fit <-aov(Thames$Feildes~ Thames$Year)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## Thames$Year 1 17721 17721 411 <2e-16 ***
## Residuals 2555 110150 43
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(gplots)
plotmeans(Thames$Feildes~Thames$Year)
## Warning: zero-length arrow is of indeterminate angle and so skipped
## Warning: zero-length arrow is of indeterminate angle and so skipped
## Warning: zero-length arrow is of indeterminate angle and so skipped
## Warning: zero-length arrow is of indeterminate angle and so skipped
## Warning: zero-length arrow is of indeterminate angle and so skipped
## Warning: zero-length arrow is of indeterminate angle and so skipped
Correlation and Regression
data(swiss)
Birth <-swiss$Fertility
Edu <- swiss$Education
Ca<-swiss$Catholic
plot(Birth~Edu)
cor.test(Birth, Edu, use="complete obs")
##
## Pearson's product-moment correlation
##
## data: Birth and Edu
## t = -5.954, df = 45, p-value = 3.659e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7987 -0.4653
## sample estimates:
## cor
## -0.6638
cor.test(Birth, Edu, method="spearman",use="complete obs")
## Warning: Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: Birth and Edu
## S = 24963, p-value = 0.001806
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.4433
summary(lm(Birth~Edu+Ca))
##
## Call:
## lm(formula = Birth ~ Edu + Ca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.04 -6.58 -1.43 6.12 14.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.2337 2.3520 31.56 < 2e-16 ***
## Edu -0.7883 0.1293 -6.10 2.4e-07 ***
## Ca 0.1109 0.0298 3.72 0.00056 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.33 on 44 degrees of freedom
## Multiple R-squared: 0.575, Adjusted R-squared: 0.555
## F-statistic: 29.7 on 2 and 44 DF, p-value: 6.85e-09
# Model comparisons
model1 <-lm(Birth~Edu)
summary(model1)
##
## Call:
## lm(formula = Birth ~ Edu)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.04 -6.71 -1.01 9.53 19.69
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.610 2.104 37.84 < 2e-16 ***
## Edu -0.862 0.145 -5.95 3.7e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.45 on 45 degrees of freedom
## Multiple R-squared: 0.441, Adjusted R-squared: 0.428
## F-statistic: 35.4 on 1 and 45 DF, p-value: 3.66e-07
model2 <-lm(Birth~Edu+Ca)
summary(model2)
##
## Call:
## lm(formula = Birth ~ Edu + Ca)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.04 -6.58 -1.43 6.12 14.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.2337 2.3520 31.56 < 2e-16 ***
## Edu -0.7883 0.1293 -6.10 2.4e-07 ***
## Ca 0.1109 0.0298 3.72 0.00056 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.33 on 44 degrees of freedom
## Multiple R-squared: 0.575, Adjusted R-squared: 0.555
## F-statistic: 29.7 on 2 and 44 DF, p-value: 6.85e-09
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: Birth ~ Edu
## Model 2: Birth ~ Edu + Ca
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 45 4015
## 2 44 3054 1 961 13.8 0.00056 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Contingency table analysis (categorical variables)
head(Asthma.Data)
## Nsmokers Asthma HayFever Age Gender roaddist2 d2source1 d2source2
## 1 2 control 0 8.9 1 886552 0 0.00
## 2 2 control 0 8.5 2 889338 0 0.01
## 3 2 control 1 8.5 2 827783 0 0.01
## 4 1 control 0 8.9 2 889338 0 0.01
## 5 1 control 0 6.9 1 889338 0 0.01
## 6 1 case 0 7.1 2 1213531 0 0.01
## d2source3
## 1 0.02
## 2 0.02
## 3 0.02
## 4 0.02
## 5 0.02
## 6 0.02
HayFever<-Asthma.Data$HayFever
Disease<-Asthma.Data$Asthma
AsthmaTable<-xtabs(~ HayFever+Disease, data=Asthma.Data)
AsthmaTable
## Disease
## HayFever case control
## 0 156 956
## 1 59 120
addmargins(prop.table(AsthmaTable,1),2)
## Disease
## HayFever case control Sum
## 0 0.1403 0.8597 1.0000
## 1 0.3296 0.6704 1.0000
chisq.test(AsthmaTable) # Test of Independence
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: AsthmaTable
## X-squared = 38.46, df = 1, p-value = 5.585e-10
fisher.test(AsthmaTable)# Test of Independence
##
## Fisher's Exact Test for Count Data
##
## data: AsthmaTable
## p-value = 4.837e-09
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.2300 0.4829
## sample estimates:
## odds ratio
## 0.3323
library(vcd)
assocstats(AsthmaTable) # Test of Association
## X^2 df P(> X^2)
## Likelihood Ratio 34.076 1 5.3003e-09
## Pearson 39.814 1 2.7939e-10
##
## Phi-Coefficient : 0.176
## Contingency Coeff.: 0.173
## Cramer's V : 0.176
# Multi-dimensional Correlation
Age<-Asthma.Data$Age
Gender<-Asthma.Data$Gender
RdDist<-Asthma.Data$roaddist2
Src1<-Asthma.Data$d2source1
Src2<-Asthma.Data$d2source2
# Correlation Matrix
DEMO<-data.frame(Age,Gender)
DIST<--data.frame(RdDist,Src1,Src2)
cor(DEMO, DIST)
## RdDist Src1 Src2
## Age 0.001301 -0.12894 -0.12616
## Gender 0.028306 0.07225 0.00447
cor.test(Src1,Src2)
##
## Pearson's product-moment correlation
##
## data: Src1 and Src2
## t = 0.708, df = 1289, p-value = 0.4791
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03488 0.07419
## sample estimates:
## cor
## 0.01972
stat<-data.frame(Age,RdDist,Src1,Src2)
library(psych)
corr.test(x=stat, use="complete")
## Call:corr.test(x = stat, use = "complete")
## Correlation matrix
## Age RdDist Src1 Src2
## Age 1.00 0.00 0.13 0.13
## RdDist 0.00 1.00 0.43 -0.12
## Src1 0.13 0.43 1.00 0.02
## Src2 0.13 -0.12 0.02 1.00
## Sample Size
## [1] 1291
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## Age RdDist Src1 Src2
## Age 0.00 0.96 0.00 0.00
## RdDist 0.96 0.00 0.00 0.00
## Src1 0.00 0.00 0.00 0.96
## Src2 0.00 0.00 0.48 0.00
##
## To see confidence intervals of the correlations, print with the short=FALSE option
COV1<-cov(DIST) # Covariance Matrix
class(COV1)
## [1] "matrix"
MySummary <- function(x,npar=TRUE,print=TRUE)
{
if (!npar) { center <- mean(x); spread <- sd(x) }
else {center <- median(x); spread <- mad(x) }
if (print & !npar) { cat("Mean=", center, "\n", "SD=", spread, "\n")}
else if (print & npar) { cat("Median=", center, "\n", "MAD=", spread, "\n") }
result <- list(center=center,spread=spread)
return(result)
}
set.seed(123456)
x <- rnorm(100, 20, 5)
y <- MySummary(x)
## Median= 20.24
## MAD= 5.815
y1 <- MySummary(x, npar=FALSE, print=FALSE)
y1; y1$center
## $center
## [1] 20.08
##
## $spread
## [1] 4.967
## [1] 20.08