In [86]:
#设置工作的路径
In [87]:
getwd()
Out[87]:
In [88]:
setwd('/Users/zhoudan/algae blooms')
In [89]:
#读入数据集algae
In [90]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))
In [91]:
head(algae)
Out[91]:
In [92]:
algae[1:5,]
Out[92]:
In [93]:
summary(algae)
Out[93]:
In [94]:
str(algae)
In [96]:
hist(algae$mxPH,prob=T)
In [97]:
hist(algae$mxPH,prob=T,xlab='',main='Histogram of maximum PH value',ylim=0:1)
lines(density(algae$mxPH,na.rm=T),col='blue')#密度曲线图
rug(jitter(algae$mxPH))#轴须图
In [ ]:
In [53]:
boxplot(algae$OPO4,boxwex=0.15,ylab='Orthophosphate (oPO4)')#箱线图
rug(jitter(algae$OPO4),side=2)
abline(h=mean(algae$OPO4,na.rm=T),lty=2,col='blue')
In [98]:
plot(algae$NH4,xlab='')
abline(h=mean(algae$NH4,na.rm=T),lty=1)
abline(h=mean(algae$NH4,na.rm=T)+sd(algae$NH4,na.rm=T),lty=2)
abline(h=median(algae$NH4,na.rm=T),lty=3)
identify(algae$NH4)
Out[98]:
In [99]:
library(lattice)
In [100]:
bwplot(size~a1,data=algae,ylab='River Size',xlab='Algae A1')#盒形图
In [101]:
mino2<-equal.count(na.omit(algae$mno2),number=4,overlap=1/5)#将数据等分划分成四个区间
In [102]:
stripplot(season~a3|mino2,data=algae[!is.na(algae$mno2),])#带形图
In [103]:
#缺失值的处理
In [104]:
#Removing the cases with unknowns
In [105]:
algae[!complete.cases(algae),]
Out[105]:
In [106]:
nrow(algae[!complete.cases(algae),])
Out[106]:
In [108]:
algae<-na.omit(algae)#删除缺失值
In [117]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))
In [118]:
algae[!complete.cases(algae),]
Out[118]:
In [119]:
algae<-algae[-c(62,199),]
In [120]:
algae[!complete.cases(algae),]
Out[120]:
In [121]:
#Filling the unknowns with the most frequent values
In [123]:
algae[48,'mxPH']<-mean(algae$mxPH,na.rm=T)
In [125]:
algae[48,'mxPH']
Out[125]:
In [133]:
hist(algae$Chla,prob=T)
In [134]:
algae[is.na(algae$Chla),'Chla']<-median(algae$Chla,na.rm=T)#利用中值来填充
In [136]:
#Filling in the unknown values by exploring correlations
In [137]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))
In [138]:
algae[!complete.cases(algae),]#查看缺失值
Out[138]:
In [139]:
algae<-algae[-c(62,199),]
In [140]:
cor(algae[,4:18],use="complete.obs")
Out[140]:
In [141]:
symnum(cor(algae[,4:18],use="complete.obs"))
Out[141]:
In [142]:
lm(OPO4~PO4,data=algae)#线性模型 Linear Models
Out[142]:
In [143]:
algae[!complete.cases(algae),]
Out[143]:
In [144]:
algae[28,'PO4']<-(algae[28,'OPO4']+15.6142)/0.6466
In [146]:
#Correlation with nominal variables
In [155]:
histogram(~mxPH|season,data=algae)
In [148]:
histogram(~mxPH|size,data=algae)
In [149]:
histogram(~mxPH|size*speed,data=algae)
In [152]:
stripplot(size~mxPH|speed,data=algae,jitter=T)
In [156]:
#Filling in the unknown values by exploring similarities between cases
In [157]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))
In [158]:
algae<-algae[-c(62,199),]
In [159]:
library(cluster)
In [160]:
dist.mtx<-as.matrix(daisy(algae,stand=T))
In [161]:
which(!complete.cases(algae))
Out[161]:
In [163]:
sort(dist.mtx[38,])[1:10]
Out[163]:
In [164]:
as.integer(names(sort(dist.mtx[38,])[2:11]))
Out[164]:
In [165]:
algae[38,]
Out[165]:
In [168]:
median(algae[c(as.integer(names(sort(dist.mtx[38,])[2:11]))),'mno2'])
Out[168]:
In [169]:
algae[38,'mno2']<-median(algae[c(as.integer(names(sort(dist.mtx[38,])[2:11]))),'mno2'],na.rm=T)
In [170]:
algae[38,'mno2']
Out[170]:
In [171]:
algae[55,]
Out[171]:
In [172]:
apply(algae[c(as.integer(names(sort(dist.mtx[55,])[2:11]))),which(is.na(algae[55,]))],2,median,na.rm=T)
Out[172]:
In [179]:
algae[55,which(is.na(algae[55,]))]<-apply(algae[c(as.integer(names(sort(dist.mtx[55,])[2:11]))),which(is.na(algae[55,]))],2,median,na.rm=T)
In [180]:
algae[55,]
Out[180]:
In [181]:
central.value<-function(x){
if(is.numeric(x)) median(x,na.rm=T)
else if (is.factor(x)) levels(x)[which.max(table(x))]
else {
f<-as.factor(x)
levels(f)[which.max(table(f))]
}
}
In [188]:
for(r in which(!complete.cases(algae)))
algae[r,which(is.na(algae[r,]))]<-apply(data.frame(algae[as.integer(names(sort(dist.mtx[r,])[2:11])),which(is.na(algae[r,]))]),2,central.value)
In [191]:
nrow(algae[!complete.cases(algae),])
Out[191]:
In [192]:
#Obtaining prediction models
In [193]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))
In [194]:
algae<-algae[-c(62,199),]
In [195]:
clean.algae<-algae
In [197]:
for(r in which(!complete.cases(algae)))
clean.algae[r,which(is.na(algae[r,]))]<-
apply(data.frame(algae[c(as.integer(names(sort(dist.mtx[r,])[2:11]))),which(is.na(algae[r,]))]),2,central.value)
In [198]:
lm.a1<-lm(a1~.,data=clean.algae[,1:12])
In [199]:
summary(lm.a1)
Out[199]:
In [200]:
anova(lm.a1)
Out[200]:
In [208]:
lm2.a1<-update(lm.a1,.~.-season)
In [209]:
summary(lm2.a1)
Out[209]:
In [210]:
anova(lm.a1,lm2.a1)
Out[210]:
In [211]:
final.lm<-step(lm.a1)
In [212]:
summary(final.lm)
Out[212]:
In [213]:
library(rpart)
In [214]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))
In [215]:
algae<-algae[-c(62,199),]
In [217]:
rt.a1<-rpart(a1~.,data=algae[,1:12])
In [218]:
rt.a1
Out[218]:
In [220]:
plot(rt.a1,uniform=T,branch=1,margin=0.1,cex=0.9)
text(rt.a1,cex=0.9)
In [222]:
printcp(rt.a1)
In [223]:
rt2.a1<-prune(rt.a1,cp=0.08)
In [224]:
rt2.a1
Out[224]:
In [225]:
first.tree<-rpart(a1~.,data=algae[,1:12])
In [226]:
my.tree<-snip.rpart(first.tree,c(4,7))
In [227]:
my.tree
Out[227]:
In [231]:
plot(first.tree)
text(first.tree)
snip.rpart(first.tree)
Out[231]:
In [233]:
lm.prediction.a1<-predict(final.lm,clean.algae)
In [248]:
rt1.prediction.al<-predict(rt.a1,algae)
In [258]:
#Mean Absolute Error
In [237]:
(mae.a1.lm<-mean(abs(lm.prediction.a1-algae[,'a1'])))
Out[237]:
In [257]:
(mae.a1.rt<-mean(abs(predict(rt.a1,algae)-algae[,'a1'])))
Out[257]:
In [259]:
#Mean Squared Error
In [260]:
(mse.a1.lm<-mean((lm.prediction.a1-algae[,'a1'])^2))
Out[260]:
In [261]:
(mse.a1.rt<-mean((predict(rt.a1,algae)-algae[,'a1'])^2))
Out[261]:
In [262]:
#Normalized mean squared error
In [265]:
(nmse.a1.lm<-mean((lm.prediction.a1-algae[,'a1'])^2)/mean((mean(algae[,'a1'])-algae[,'a1'])^2))
Out[265]:
In [266]:
(nmse.a1.rt<-mean((predict(rt.a1,algae)-algae[,'a1'])^2)/mean((mean(algae[,'a1'])-algae[,'a1'])^2))
Out[266]:
In [267]:
#Visual inspection of the predictions
In [271]:
old.par<-par(mfrow=c(2,1))
In [276]:
plot(lm.prediction.a1,algae[,'a1'],main='Linear Model',xlab="Predictions",ylab="True Values")
abline(0,1,lty=2)
plot(predict(rt.a1,algae),algae[,'a1'],main='Regression Model',xlab='Predictions',ylab='True Values')
abline(0,1,lty=2)
In [ ]:
In [ ]: