In [86]:
#设置工作的路径

In [87]:
getwd()


Out[87]:
'/Users/zhoudan/algae blooms'

In [88]:
setwd('/Users/zhoudan/algae blooms')

In [89]:
#读入数据集algae

In [90]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))

In [91]:
head(algae)


Out[91]:
seasonsizespeedmxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
1wintersmallmedium89.860.86.23857810517050000034.28.30
2springsmallmedium8.35857.751.288370428.75558.751.31.47.64.81.96.702.1
3autumnsmallmedium8.111.440.025.33346.667125.667187.05715.63.353.61.90009.7
4springsmallmedium8.074.877.3642.30298.18261.182138.71.43.14118.901.401.4
5autumnsmallmedium8.06955.3510.416233.758.22297.5810.59.22.97.507.54.11
6wintersmallhigh8.2513.165.759.24843018.2556.66728.415.114.61.4022.512.62.9

In [92]:
algae[1:5,]


Out[92]:
seasonsizespeedmxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
1wintersmallmedium89.860.86.23857810517050000034.28.30
2springsmallmedium8.35857.751.288370428.75558.751.31.47.64.81.96.702.1
3autumnsmallmedium8.111.440.025.33346.667125.667187.05715.63.353.61.90009.7
4springsmallmedium8.074.877.3642.30298.18261.182138.71.43.14118.901.401.4
5autumnsmallmedium8.06955.3510.416233.758.22297.5810.59.22.97.507.54.11

In [93]:
summary(algae)


Out[93]:
    season       size       speed         mxPH            mno2       
 autumn:40   large :45   high  :84   Min.   :5.600   Min.   : 1.500  
 spring:53   medium:84   low   :33   1st Qu.:7.700   1st Qu.: 7.725  
 summer:45   small :71   medium:83   Median :8.060   Median : 9.800  
 winter:62                           Mean   :8.012   Mean   : 9.118  
                                     3rd Qu.:8.400   3rd Qu.:10.800  
                                     Max.   :9.700   Max.   :13.400  
                                     NA's   :1       NA's   :2       
       C1               NO3              NH4                OPO4       
 Min.   :  0.222   Min.   : 0.050   Min.   :    5.00   Min.   :  1.00  
 1st Qu.: 10.981   1st Qu.: 1.296   1st Qu.:   38.33   1st Qu.: 15.70  
 Median : 32.730   Median : 2.675   Median :  103.17   Median : 40.15  
 Mean   : 43.636   Mean   : 3.282   Mean   :  501.30   Mean   : 73.59  
 3rd Qu.: 57.824   3rd Qu.: 4.446   3rd Qu.:  226.95   3rd Qu.: 99.33  
 Max.   :391.500   Max.   :45.650   Max.   :24064.00   Max.   :564.60  
 NA's   :10        NA's   :2        NA's   :2          NA's   :2       
      PO4              Chla               a1              a2        
 Min.   :  1.00   Min.   :  0.200   Min.   : 0.00   Min.   : 0.000  
 1st Qu.: 41.38   1st Qu.:  2.000   1st Qu.: 1.50   1st Qu.: 0.000  
 Median :103.29   Median :  5.475   Median : 6.95   Median : 3.000  
 Mean   :137.88   Mean   : 13.971   Mean   :16.92   Mean   : 7.458  
 3rd Qu.:213.75   3rd Qu.: 18.308   3rd Qu.:24.80   3rd Qu.:11.375  
 Max.   :771.60   Max.   :110.456   Max.   :89.80   Max.   :72.600  
 NA's   :2        NA's   :12                                        
       a3               a4               a5               a6        
 Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
 1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.: 0.000  
 Median : 1.550   Median : 0.000   Median : 1.900   Median : 0.000  
 Mean   : 4.309   Mean   : 1.992   Mean   : 5.064   Mean   : 5.964  
 3rd Qu.: 4.925   3rd Qu.: 2.400   3rd Qu.: 7.500   3rd Qu.: 6.925  
 Max.   :42.800   Max.   :44.600   Max.   :44.400   Max.   :77.600  
                                                                    
       a7        
 Min.   : 0.000  
 1st Qu.: 0.000  
 Median : 1.000  
 Mean   : 2.495  
 3rd Qu.: 2.400  
 Max.   :31.600  
                 

In [94]:
str(algae)


'data.frame':	200 obs. of  18 variables:
 $ season: Factor w/ 4 levels "autumn","spring",..: 4 2 1 2 1 4 3 1 4 4 ...
 $ size  : Factor w/ 3 levels "large","medium",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ speed : Factor w/ 3 levels "high","low","medium": 3 3 3 3 3 1 1 1 3 1 ...
 $ mxPH  : num  8 8.35 8.1 8.07 8.06 8.25 8.15 8.05 8.7 7.93 ...
 $ mno2  : num  9.8 8 11.4 4.8 9 13.1 10.3 10.6 3.4 9.9 ...
 $ C1    : num  60.8 57.8 40 77.4 55.4 ...
 $ NO3   : num  6.24 1.29 5.33 2.3 10.42 ...
 $ NH4   : num  578 370 346.7 98.2 233.7 ...
 $ OPO4  : num  105 428.8 125.7 61.2 58.2 ...
 $ PO4   : num  170 558.8 187.1 138.7 97.6 ...
 $ Chla  : num  50 1.3 15.6 1.4 10.5 ...
 $ a1    : num  0 1.4 3.3 3.1 9.2 15.1 2.4 18.2 25.4 17 ...
 $ a2    : num  0 7.6 53.6 41 2.9 14.6 1.2 1.6 5.4 0 ...
 $ a3    : num  0 4.8 1.9 18.9 7.5 1.4 3.2 0 2.5 0 ...
 $ a4    : num  0 1.9 0 0 0 0 3.9 0 0 2.9 ...
 $ a5    : num  34.2 6.7 0 1.4 7.5 22.5 5.8 5.5 0 0 ...
 $ a6    : num  8.3 0 0 0 4.1 12.6 6.8 8.7 0 0 ...
 $ a7    : num  0 2.1 9.7 1.4 1 2.9 0 0 0 1.7 ...

In [96]:
hist(algae$mxPH,prob=T)



In [97]:
hist(algae$mxPH,prob=T,xlab='',main='Histogram of maximum PH value',ylim=0:1)
lines(density(algae$mxPH,na.rm=T),col='blue')#密度曲线图
rug(jitter(algae$mxPH))#轴须图



In [ ]:


In [53]:
boxplot(algae$OPO4,boxwex=0.15,ylab='Orthophosphate (oPO4)')#箱线图
rug(jitter(algae$OPO4),side=2)
abline(h=mean(algae$OPO4,na.rm=T),lty=2,col='blue')



In [98]:
plot(algae$NH4,xlab='')
abline(h=mean(algae$NH4,na.rm=T),lty=1)
abline(h=mean(algae$NH4,na.rm=T)+sd(algae$NH4,na.rm=T),lty=2)
abline(h=median(algae$NH4,na.rm=T),lty=3)
identify(algae$NH4)


Out[98]:

In [99]:
library(lattice)

In [100]:
bwplot(size~a1,data=algae,ylab='River Size',xlab='Algae A1')#盒形图



In [101]:
mino2<-equal.count(na.omit(algae$mno2),number=4,overlap=1/5)#将数据等分划分成四个区间

In [102]:
stripplot(season~a3|mino2,data=algae[!is.na(algae$mno2),])#带形图



In [103]:
#缺失值的处理

In [104]:
#Removing the cases with unknowns

In [105]:
algae[!complete.cases(algae),]


Out[105]:
seasonsizespeedmxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
28autumnsmallhigh6.811.190.63204NA2.730.31.9002.11.42.1
38springsmallhigh8NA1.450.81102.530.375.8000000
48wintersmalllowNA12.690.2310561.135.5000000
55wintersmallhigh6.610.8NA3.2451016.5NA24.3000000
56springsmallmedium5.611.8NA2.22511NA82.7000000
57autumnsmallmedium5.710.8NA2.551014NA16.84.63.911.5000
58springsmallhigh6.69.5NA1.322016NA46.80028.8000
59summersmallhigh6.610.8NA2.6410211NA46.90013.4000
60autumnsmallmedium6.611.3NA4.171016NA47.100001.20
61springsmallmedium6.510.4NA5.9710214NA66.9000000
62summersmallmedium6.4NANANANANA14NA19.400203.91.7
63autumnsmallhigh7.8311.74.0831.328183.3336.667NA14.4000000
116wintermediumhigh9.710.80.2220.4061022.44410.111NA411.500000
161springlargelow95.8NA0.914210218668.051.720.61.52.2000
184winterlargehigh810.99.0550.8254021.08356.091NA16.819.640000
199winterlargemedium87.6NANANANANANA012.53.71004.9

In [106]:
nrow(algae[!complete.cases(algae),])


Out[106]:
16

In [108]:
algae<-na.omit(algae)#删除缺失值

In [117]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))

In [118]:
algae[!complete.cases(algae),]


Out[118]:
seasonsizespeedmxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
28autumnsmallhigh6.811.190.63204NA2.730.31.9002.11.42.1
38springsmallhigh8NA1.450.81102.530.375.8000000
48wintersmalllowNA12.690.2310561.135.5000000
55wintersmallhigh6.610.8NA3.2451016.5NA24.3000000
56springsmallmedium5.611.8NA2.22511NA82.7000000
57autumnsmallmedium5.710.8NA2.551014NA16.84.63.911.5000
58springsmallhigh6.69.5NA1.322016NA46.80028.8000
59summersmallhigh6.610.8NA2.6410211NA46.90013.4000
60autumnsmallmedium6.611.3NA4.171016NA47.100001.20
61springsmallmedium6.510.4NA5.9710214NA66.9000000
62summersmallmedium6.4NANANANANA14NA19.400203.91.7
63autumnsmallhigh7.8311.74.0831.328183.3336.667NA14.4000000
116wintermediumhigh9.710.80.2220.4061022.44410.111NA411.500000
161springlargelow95.8NA0.914210218668.051.720.61.52.2000
184winterlargehigh810.99.0550.8254021.08356.091NA16.819.640000
199winterlargemedium87.6NANANANANANA012.53.71004.9

In [119]:
algae<-algae[-c(62,199),]

In [120]:
algae[!complete.cases(algae),]


Out[120]:
seasonsizespeedmxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
28autumnsmallhigh6.811.190.63204NA2.730.31.9002.11.42.1
38springsmallhigh8NA1.450.81102.530.375.8000000
48wintersmalllowNA12.690.2310561.135.5000000
55wintersmallhigh6.610.8NA3.2451016.5NA24.3000000
56springsmallmedium5.611.8NA2.22511NA82.7000000
57autumnsmallmedium5.710.8NA2.551014NA16.84.63.911.5000
58springsmallhigh6.69.5NA1.322016NA46.80028.8000
59summersmallhigh6.610.8NA2.6410211NA46.90013.4000
60autumnsmallmedium6.611.3NA4.171016NA47.100001.20
61springsmallmedium6.510.4NA5.9710214NA66.9000000
63autumnsmallhigh7.8311.74.0831.328183.3336.667NA14.4000000
116wintermediumhigh9.710.80.2220.4061022.44410.111NA411.500000
161springlargelow95.8NA0.914210218668.051.720.61.52.2000
184winterlargehigh810.99.0550.8254021.08356.091NA16.819.640000

In [121]:
#Filling the unknowns with the most frequent values

In [123]:
algae[48,'mxPH']<-mean(algae$mxPH,na.rm=T)

In [125]:
algae[48,'mxPH']


Out[125]:
8.01997461928934

In [133]:
hist(algae$Chla,prob=T)



In [134]:
algae[is.na(algae$Chla),'Chla']<-median(algae$Chla,na.rm=T)#利用中值来填充

In [136]:
#Filling in the unknown values by exploring correlations

In [137]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))

In [138]:
algae[!complete.cases(algae),]#查看缺失值


Out[138]:
seasonsizespeedmxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
28autumnsmallhigh6.811.190.63204NA2.730.31.9002.11.42.1
38springsmallhigh8NA1.450.81102.530.375.8000000
48wintersmalllowNA12.690.2310561.135.5000000
55wintersmallhigh6.610.8NA3.2451016.5NA24.3000000
56springsmallmedium5.611.8NA2.22511NA82.7000000
57autumnsmallmedium5.710.8NA2.551014NA16.84.63.911.5000
58springsmallhigh6.69.5NA1.322016NA46.80028.8000
59summersmallhigh6.610.8NA2.6410211NA46.90013.4000
60autumnsmallmedium6.611.3NA4.171016NA47.100001.20
61springsmallmedium6.510.4NA5.9710214NA66.9000000
62summersmallmedium6.4NANANANANA14NA19.400203.91.7
63autumnsmallhigh7.8311.74.0831.328183.3336.667NA14.4000000
116wintermediumhigh9.710.80.2220.4061022.44410.111NA411.500000
161springlargelow95.8NA0.914210218668.051.720.61.52.2000
184winterlargehigh810.99.0550.8254021.08356.091NA16.819.640000
199winterlargemedium87.6NANANANANANA012.53.71004.9

In [139]:
algae<-algae[-c(62,199),]

In [140]:
cor(algae[,4:18],use="complete.obs")


Out[140]:
mxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
mxPH 1.00000000-0.10269374 0.14709539-0.17213024-0.15429757 0.09022909 0.10132957 0.43182377-0.16262986 0.33501740-0.02716034-0.18435348-0.10731230-0.17273795-0.17027088
mno2-0.10269374 1.00000000-0.26324536 0.11790769-0.07826816-0.39375269-0.46396073-0.13121671 0.24998372-0.06848199-0.23522831-0.37982999 0.21001174 0.18862656-0.10455106
C1 0.14709539-0.26324536 1.00000000 0.21095831 0.06598336 0.37925596 0.44519118 0.14295776-0.35923946 0.07845402 0.07653027 0.14147281 0.14534877 0.16904394-0.04494524
NO3-0.17213024 0.11790769 0.21095831 1.00000000 0.72467766 0.13301452 0.15702971 0.14549290-0.24723921 0.01997079-0.09182236-0.01448875 0.21213579 0.54404455 0.07505030
NH4-0.15429757-0.07826816 0.06598336 0.72467766 1.00000000 0.21931121 0.19939575 0.09120406-0.12360578-0.03790296-0.11290467 0.27452000 0.01544458 0.40119275-0.02539279
OPO4 0.090229085-0.393752688 0.379255958 0.133014517 0.219311206 1.000000000 0.911964602 0.106914784-0.394574479 0.123811068 0.005704557 0.382481433 0.122027482 0.003340366 0.026150420
PO4 0.10132957-0.46396073 0.44519118 0.15702971 0.19939575 0.91196460 1.00000000 0.24849223-0.45816781 0.13266789 0.03219398 0.40883951 0.15548900 0.05320294 0.07978353
Chla 0.43182377-0.13121671 0.14295776 0.14549290 0.09120406 0.10691478 0.24849223 1.00000000-0.26601088 0.36672465-0.06330113-0.08600540-0.07342837 0.01032550 0.01760782
a1-0.16262986 0.24998372-0.35923946-0.24723921-0.12360578-0.39457448-0.45816781-0.26601088 1.00000000-0.26266549-0.10817758-0.09338072-0.26972709-0.26156402-0.19306384
a2 0.335017401-0.068481989 0.078454019 0.019970786-0.037902958 0.123811068 0.132667891 0.366724647-0.262665485 1.000000000 0.009759915-0.176287038-0.186758940-0.133518480 0.036206205
a3-0.027160336-0.235228307 0.076530269-0.091822358-0.112904666 0.005704557 0.032193981-0.063301128-0.108177581 0.009759915 1.000000000 0.033369102-0.141610948-0.196900051 0.039060248
a4-0.18435348-0.37982999 0.14147281-0.01448875 0.27452000 0.38248143 0.40883951-0.08600540-0.09338072-0.17628704 0.03336910 1.00000000-0.10131827-0.08488426 0.07114638
a5-0.10731230 0.21001174 0.14534877 0.21213579 0.01544458 0.12202748 0.15548900-0.07342837-0.26972709-0.18675894-0.14161095-0.10131827 1.00000000 0.38860896-0.05149346
a6-0.172737947 0.188626555 0.169043945 0.544044553 0.401192749 0.003340366 0.053202942 0.010325497-0.261564023-0.133518480-0.196900051-0.084884259 0.388608955 1.000000000-0.030334277
a7-0.17027088-0.10455106-0.04494524 0.07505030-0.02539279 0.02615042 0.07978353 0.01760782-0.19306384 0.03620621 0.03906025 0.07114638-0.05149346-0.03033428 1.00000000

In [141]:
symnum(cor(algae[,4:18],use="complete.obs"))


Out[141]:
     mP m2 C1 NO NH O P Ch a1 a2 a3 a4 a5 a6 a7
mxPH 1                                         
mno2    1                                      
C1         1                                   
NO3           1                                
NH4           ,  1                             
OPO4    .  .        1                          
PO4     .  .        * 1                        
Chla .                  1                      
a1         .        . .    1                   
a2   .                  .     1                
a3                               1             
a4      .           . .             1          
a5                                     1       
a6            .  .                     .  1    
a7                                           1 
attr(,"legend")
[1] 0 ‘ ’ 0.3 ‘.’ 0.6 ‘,’ 0.8 ‘+’ 0.9 ‘*’ 0.95 ‘B’ 1

In [142]:
lm(OPO4~PO4,data=algae)#线性模型 Linear Models


Out[142]:
Call:
lm(formula = OPO4 ~ PO4, data = algae)

Coefficients:
(Intercept)          PO4  
   -15.6142       0.6466  

In [143]:
algae[!complete.cases(algae),]


Out[143]:
seasonsizespeedmxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
28autumnsmallhigh6.811.190.63204NA2.730.31.9002.11.42.1
38springsmallhigh8NA1.450.81102.530.375.8000000
48wintersmalllowNA12.690.2310561.135.5000000
55wintersmallhigh6.610.8NA3.2451016.5NA24.3000000
56springsmallmedium5.611.8NA2.22511NA82.7000000
57autumnsmallmedium5.710.8NA2.551014NA16.84.63.911.5000
58springsmallhigh6.69.5NA1.322016NA46.80028.8000
59summersmallhigh6.610.8NA2.6410211NA46.90013.4000
60autumnsmallmedium6.611.3NA4.171016NA47.100001.20
61springsmallmedium6.510.4NA5.9710214NA66.9000000
63autumnsmallhigh7.8311.74.0831.328183.3336.667NA14.4000000
116wintermediumhigh9.710.80.2220.4061022.44410.111NA411.500000
161springlargelow95.8NA0.914210218668.051.720.61.52.2000
184winterlargehigh810.99.0550.8254021.08356.091NA16.819.640000

In [144]:
algae[28,'PO4']<-(algae[28,'OPO4']+15.6142)/0.6466

In [146]:
#Correlation with nominal variables

In [155]:
histogram(~mxPH|season,data=algae)



In [148]:
histogram(~mxPH|size,data=algae)



In [149]:
histogram(~mxPH|size*speed,data=algae)



In [152]:
stripplot(size~mxPH|speed,data=algae,jitter=T)



In [156]:
#Filling in the unknown values by exploring similarities between cases

In [157]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))

In [158]:
algae<-algae[-c(62,199),]

In [159]:
library(cluster)

In [160]:
dist.mtx<-as.matrix(daisy(algae,stand=T))

In [161]:
which(!complete.cases(algae))


Out[161]:
  1. 28
  2. 38
  3. 48
  4. 55
  5. 56
  6. 57
  7. 58
  8. 59
  9. 60
  10. 61
  11. 62
  12. 115
  13. 160
  14. 183

In [163]:
sort(dist.mtx[38,])[1:10]


Out[163]:
38
0
54
0.0212600281194402
22
0.0571178190140975
64
0.0579052976322972
11
0.0604714199773158
30
0.0642723555138938
25
0.0666881059103993
53
0.0667769438831517
37
0.0698392618120022
24
0.0760912552588425

In [164]:
as.integer(names(sort(dist.mtx[38,])[2:11]))


Out[164]:
  1. 54
  2. 22
  3. 64
  4. 11
  5. 30
  6. 25
  7. 53
  8. 37
  9. 24
  10. 18

In [165]:
algae[38,]


Out[165]:
seasonsizespeedmxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
38springsmallhigh8NA1.450.81102.530.375.8000000

In [168]:
median(algae[c(as.integer(names(sort(dist.mtx[38,])[2:11]))),'mno2'])


Out[168]:
10

In [169]:
algae[38,'mno2']<-median(algae[c(as.integer(names(sort(dist.mtx[38,])[2:11]))),'mno2'],na.rm=T)

In [170]:
algae[38,'mno2']


Out[170]:
10

In [171]:
algae[55,]


Out[171]:
seasonsizespeedmxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
55wintersmallhigh6.610.8NA3.2451016.5NA24.3000000

In [172]:
apply(algae[c(as.integer(names(sort(dist.mtx[55,])[2:11]))),which(is.na(algae[55,]))],2,median,na.rm=T)


Out[172]:
C1
6.5835
Chla
0.8

In [179]:
algae[55,which(is.na(algae[55,]))]<-apply(algae[c(as.integer(names(sort(dist.mtx[55,])[2:11]))),which(is.na(algae[55,]))],2,median,na.rm=T)

In [180]:
algae[55,]


Out[180]:
seasonsizespeedmxPHmno2C1NO3NH4OPO4PO4Chlaa1a2a3a4a5a6a7
55wintersmallhigh6.610.86.58353.2451016.50.824.3000000

In [181]:
central.value<-function(x){
    if(is.numeric(x)) median(x,na.rm=T)
    else if (is.factor(x)) levels(x)[which.max(table(x))]
    else {
        f<-as.factor(x)
        levels(f)[which.max(table(f))]
        }
    }

In [188]:
for(r in which(!complete.cases(algae)))
    algae[r,which(is.na(algae[r,]))]<-apply(data.frame(algae[as.integer(names(sort(dist.mtx[r,])[2:11])),which(is.na(algae[r,]))]),2,central.value)

In [191]:
nrow(algae[!complete.cases(algae),])


Out[191]:
0

In [192]:
#Obtaining  prediction models

In [193]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))

In [194]:
algae<-algae[-c(62,199),]

In [195]:
clean.algae<-algae

In [197]:
for(r in which(!complete.cases(algae)))
    clean.algae[r,which(is.na(algae[r,]))]<-
    apply(data.frame(algae[c(as.integer(names(sort(dist.mtx[r,])[2:11]))),which(is.na(algae[r,]))]),2,central.value)

In [198]:
lm.a1<-lm(a1~.,data=clean.algae[,1:12])

In [199]:
summary(lm.a1)


Out[199]:
Call:
lm(formula = a1 ~ ., data = clean.algae[, 1:12])

Residuals:
    Min      1Q  Median      3Q     Max 
-37.582 -11.882  -2.741   7.090  62.143 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)   
(Intercept)  43.210622  24.042849   1.797  0.07396 . 
seasonspring  3.575474   4.135308   0.865  0.38838   
seasonsummer  0.645459   4.020423   0.161  0.87263   
seasonwinter  3.572084   3.863941   0.924  0.35647   
sizemedium    3.321935   3.797755   0.875  0.38288   
sizesmall     9.732162   4.175616   2.331  0.02086 * 
speedlow      3.965153   4.709314   0.842  0.40090   
speedmedium   0.304232   3.243204   0.094  0.92537   
mxPH         -3.570995   2.706612  -1.319  0.18871   
mno2          1.018514   0.704875   1.445  0.15019   
C1           -0.042551   0.033646  -1.265  0.20761   
NO3          -1.494145   0.551200  -2.711  0.00736 **
NH4           0.001608   0.001003   1.603  0.11072   
OPO4         -0.005235   0.039864  -0.131  0.89566   
PO4          -0.052247   0.030737  -1.700  0.09087 . 
Chla         -0.090800   0.080015  -1.135  0.25796   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 17.64 on 182 degrees of freedom
Multiple R-squared:  0.3737,	Adjusted R-squared:  0.3221 
F-statistic:  7.24 on 15 and 182 DF,  p-value: 2.273e-12

In [200]:
anova(lm.a1)


Out[200]:
DfSum SqMean SqF valuePr(>F)
season384.5419228.180640.090588320.9651499
size211401.455700.72418.325315.612817e-08
speed23934.3771967.1886.3236430.002212567
mxPH11322.0821322.0824.249910.04067399
mno212218.4082218.4087.1312020.008261421
C114450.7774450.77714.307290.0002104707
NO313398.9943398.99410.926260.001142013
NH41385.0006385.00061.2376070.2674
OPO414764.8124764.81215.316770.0001283269
PO411422.8341422.8344.5737840.03379805
Chla1400.5985400.59851.2877470.2579558
Residuals18256617.41311.0847NANA

In [208]:
lm2.a1<-update(lm.a1,.~.-season)

In [209]:
summary(lm2.a1)


Out[209]:
Call:
lm(formula = a1 ~ size + speed + mxPH + mno2 + C1 + NO3 + NH4 + 
    OPO4 + PO4 + Chla, data = clean.algae[, 1:12])

Residuals:
    Min      1Q  Median      3Q     Max 
-36.386 -11.899  -2.941   7.338  63.611 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)   
(Intercept) 44.9587170 23.2659336   1.932  0.05484 . 
sizemedium   3.3636189  3.7773655   0.890  0.37437   
sizesmall   10.3092317  4.1173665   2.504  0.01315 * 
speedlow     3.1460847  4.6155216   0.682  0.49632   
speedmedium -0.2146428  3.1839011  -0.067  0.94632   
mxPH        -3.2377235  2.6587542  -1.218  0.22487   
mno2         0.7741679  0.6578931   1.177  0.24081   
C1          -0.0409303  0.0333812  -1.226  0.22170   
NO3         -1.5126458  0.5475832  -2.762  0.00632 **
NH4          0.0015525  0.0009946   1.561  0.12027   
OPO4        -0.0061577  0.0394710  -0.156  0.87620   
PO4         -0.0508845  0.0304911  -1.669  0.09684 . 
Chla        -0.0879751  0.0794655  -1.107  0.26969   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 17.56 on 185 degrees of freedom
Multiple R-squared:  0.369,	Adjusted R-squared:  0.3281 
F-statistic: 9.016 on 12 and 185 DF,  p-value: 1.581e-13

In [210]:
anova(lm.a1,lm2.a1)


Out[210]:
Res.DfRSSDfSum of SqFPr(>F)
118256617.41NANANANA
218557042.9-3-425.48570.45591620.713437

In [211]:
final.lm<-step(lm.a1)


Start:  AIC=1151.85
a1 ~ season + size + speed + mxPH + mno2 + C1 + NO3 + NH4 + OPO4 + 
    PO4 + Chla

         Df Sum of Sq   RSS    AIC
- season  3    425.49 57043 1147.3
- speed   2    269.97 56887 1148.8
- OPO4    1      5.37 56623 1149.9
- Chla    1    400.60 57018 1151.2
- C1      1    497.54 57115 1151.6
- mxPH    1    541.51 57159 1151.7
<none>                56617 1151.8
- mno2    1    649.51 57267 1152.1
- NH4     1    799.12 57417 1152.6
- PO4     1    898.87 57516 1153.0
- size    2   1870.97 58488 1154.3
- NO3     1   2285.84 58903 1157.7

Step:  AIC=1147.33
a1 ~ size + speed + mxPH + mno2 + C1 + NO3 + NH4 + OPO4 + PO4 + 
    Chla

        Df Sum of Sq   RSS    AIC
- speed  2    212.80 57256 1144.1
- OPO4   1      7.50 57050 1145.4
- Chla   1    377.91 57421 1146.6
- mno2   1    426.96 57470 1146.8
- mxPH   1    457.25 57500 1146.9
- C1     1    463.57 57506 1146.9
<none>               57043 1147.3
- NH4    1    751.20 57794 1147.9
- PO4    1    858.72 57902 1148.3
- size   2   2184.02 59227 1150.8
- NO3    1   2352.90 59396 1153.3

Step:  AIC=1144.07
a1 ~ size + mxPH + mno2 + C1 + NO3 + NH4 + OPO4 + PO4 + Chla

       Df Sum of Sq   RSS    AIC
- OPO4  1     16.33 57272 1142.1
- Chla  1    248.64 57504 1142.9
- mno2  1    381.45 57637 1143.4
- mxPH  1    474.75 57730 1143.7
- C1    1    539.36 57795 1143.9
<none>              57256 1144.1
- NH4   1    694.85 57951 1144.5
- PO4   1    801.67 58057 1144.8
- size  2   2059.66 59315 1147.1
- NO3   1   2310.43 59566 1149.9

Step:  AIC=1142.13
a1 ~ size + mxPH + mno2 + C1 + NO3 + NH4 + PO4 + Chla

       Df Sum of Sq   RSS    AIC
- Chla  1     233.6 57506 1140.9
- mno2  1     370.7 57643 1141.4
- mxPH  1     511.8 57784 1141.9
- C1    1     537.2 57809 1142.0
<none>              57272 1142.1
- NH4   1     679.2 57951 1142.5
- size  2    2049.0 59321 1145.1
- NO3   1    2301.7 59574 1147.9
- PO4   1    5767.3 63039 1159.1

Step:  AIC=1140.93
a1 ~ size + mxPH + mno2 + C1 + NO3 + NH4 + PO4

       Df Sum of Sq   RSS    AIC
- mno2  1     405.2 57911 1140.3
- C1    1     497.8 58003 1140.6
<none>              57506 1140.9
- NH4   1     725.2 58231 1141.4
- mxPH  1     845.4 58351 1141.8
- size  2    2218.6 59724 1144.4
- NO3   1    2615.4 60121 1147.7
- PO4   1    6294.0 63800 1159.5

Step:  AIC=1140.32
a1 ~ size + mxPH + C1 + NO3 + NH4 + PO4

       Df Sum of Sq   RSS    AIC
- NH4   1     520.8 58432 1140.1
<none>              57911 1140.3
- C1    1     642.6 58553 1140.5
- mxPH  1     834.5 58745 1141.2
- size  2    2469.3 60380 1144.6
- NO3   1    2225.3 60136 1145.8
- PO4   1    9027.7 66939 1167.0

Step:  AIC=1140.09
a1 ~ size + mxPH + C1 + NO3 + PO4

       Df Sum of Sq   RSS    AIC
<none>              58432 1140.1
- mxPH  1     801.0 59233 1140.8
- C1    1     905.8 59338 1141.1
- NO3   1    1973.7 60405 1144.7
- size  2    2652.3 61084 1144.9
- PO4   1    8514.5 66946 1165.0

In [212]:
summary(final.lm)


Out[212]:
Call:
lm(formula = a1 ~ size + mxPH + C1 + NO3 + PO4, data = clean.algae[, 
    1:12])

Residuals:
    Min      1Q  Median      3Q     Max 
-28.876 -12.681  -3.688   8.393  62.875 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 57.63859   20.93604   2.753  0.00647 ** 
sizemedium   2.82560    3.39950   0.831  0.40691    
sizesmall   10.39431    3.81809   2.722  0.00708 ** 
mxPH        -4.00980    2.47801  -1.618  0.10728    
C1          -0.05438    0.03160  -1.721  0.08692 .  
NO3         -0.89215    0.35124  -2.540  0.01188 *  
PO4         -0.05887    0.01116  -5.276 3.57e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 17.49 on 191 degrees of freedom
Multiple R-squared:  0.3536,	Adjusted R-squared:  0.3333 
F-statistic: 17.42 on 6 and 191 DF,  p-value: 4.857e-16

In [213]:
library(rpart)

In [214]:
algae<-read.table('Analysis.txt',header=F,dec='.',col.names=c('season','size','speed','mxPH','mno2','C1','NO3','NH4',
'OPO4','PO4','Chla','a1','a2','a3','a4','a5','a6','a7'),na.strings=c('XXXXXXX'))

In [215]:
algae<-algae[-c(62,199),]

In [217]:
rt.a1<-rpart(a1~.,data=algae[,1:12])

In [218]:
rt.a1


Out[218]:
n= 198 

node), split, n, deviance, yval
      * denotes terminal node

 1) root 198 90401.290 16.996460  
   2) PO4>=43.818 147 31279.120  8.979592  
     4) C1>=7.8065 140 21622.830  7.492857  
       8) OPO4>=51.118 84  3441.149  3.846429 *
       9) OPO4< 51.118 56 15389.430 12.962500  
        18) mno2>=10.05 24  1248.673  6.716667 *
        19) mno2< 10.05 32 12502.320 17.646870  
          38) NO3>=3.1875 9   257.080  7.866667 *
          39) NO3< 3.1875 23 11047.500 21.473910  
            78) mno2< 8 13  2919.549 13.807690 *
            79) mno2>=8 10  6370.704 31.440000 *
     5) C1< 7.8065 7  3157.769 38.714290 *
   3) PO4< 43.818 51 22442.760 40.103920  
     6) mxPH< 7.87 28 11452.770 33.450000  
      12) mxPH>=7.045 18  5146.169 26.394440 *
      13) mxPH< 7.045 10  3797.645 46.150000 *
     7) mxPH>=7.87 23  8241.110 48.204350  
      14) PO4>=15.177 12  3047.517 38.183330 *
      15) PO4< 15.177 11  2673.945 59.136360 *

In [220]:
plot(rt.a1,uniform=T,branch=1,margin=0.1,cex=0.9)
text(rt.a1,cex=0.9)



In [222]:
printcp(rt.a1)


Regression tree:
rpart(formula = a1 ~ ., data = algae[, 1:12])

Variables actually used in tree construction:
[1] C1   mno2 mxPH NO3  OPO4 PO4 

Root node error: 90401/198 = 456.57

n= 198 

        CP nsplit rel error  xerror    xstd
1 0.405740      0   1.00000 1.01054 0.13139
2 0.071885      1   0.59426 0.62715 0.10876
3 0.030887      2   0.52237 0.65965 0.10973
4 0.030408      3   0.49149 0.69746 0.11191
5 0.027872      4   0.46108 0.69389 0.11202
6 0.027754      5   0.43321 0.69405 0.10676
7 0.018124      6   0.40545 0.66001 0.10557
8 0.016344      7   0.38733 0.67852 0.10467
9 0.010000      9   0.35464 0.68088 0.10741

In [223]:
rt2.a1<-prune(rt.a1,cp=0.08)

In [224]:
rt2.a1


Out[224]:
n= 198 

node), split, n, deviance, yval
      * denotes terminal node

1) root 198 90401.29 16.996460  
  2) PO4>=43.818 147 31279.12  8.979592 *
  3) PO4< 43.818 51 22442.76 40.103920 *

In [225]:
first.tree<-rpart(a1~.,data=algae[,1:12])

In [226]:
my.tree<-snip.rpart(first.tree,c(4,7))

In [227]:
my.tree


Out[227]:
n= 198 

node), split, n, deviance, yval
      * denotes terminal node

 1) root 198 90401.290 16.996460  
   2) PO4>=43.818 147 31279.120  8.979592  
     4) C1>=7.8065 140 21622.830  7.492857 *
     5) C1< 7.8065 7  3157.769 38.714290 *
   3) PO4< 43.818 51 22442.760 40.103920  
     6) mxPH< 7.87 28 11452.770 33.450000  
      12) mxPH>=7.045 18  5146.169 26.394440 *
      13) mxPH< 7.045 10  3797.645 46.150000 *
     7) mxPH>=7.87 23  8241.110 48.204350 *

In [231]:
plot(first.tree)
text(first.tree)
snip.rpart(first.tree)


Out[231]:
n= 198 

node), split, n, deviance, yval
      * denotes terminal node

 1) root 198 90401.290 16.996460  
   2) PO4>=43.818 147 31279.120  8.979592  
     4) C1>=7.8065 140 21622.830  7.492857  
       8) OPO4>=51.118 84  3441.149  3.846429 *
       9) OPO4< 51.118 56 15389.430 12.962500  
        18) mno2>=10.05 24  1248.673  6.716667 *
        19) mno2< 10.05 32 12502.320 17.646870  
          38) NO3>=3.1875 9   257.080  7.866667 *
          39) NO3< 3.1875 23 11047.500 21.473910  
            78) mno2< 8 13  2919.549 13.807690 *
            79) mno2>=8 10  6370.704 31.440000 *
     5) C1< 7.8065 7  3157.769 38.714290 *
   3) PO4< 43.818 51 22442.760 40.103920  
     6) mxPH< 7.87 28 11452.770 33.450000  
      12) mxPH>=7.045 18  5146.169 26.394440 *
      13) mxPH< 7.045 10  3797.645 46.150000 *
     7) mxPH>=7.87 23  8241.110 48.204350  
      14) PO4>=15.177 12  3047.517 38.183330 *
      15) PO4< 15.177 11  2673.945 59.136360 *

In [233]:
lm.prediction.a1<-predict(final.lm,clean.algae)

In [248]:
rt1.prediction.al<-predict(rt.a1,algae)

In [258]:
#Mean Absolute Error

In [237]:
(mae.a1.lm<-mean(abs(lm.prediction.a1-algae[,'a1'])))


Out[237]:
13.1027885152035

In [257]:
(mae.a1.rt<-mean(abs(predict(rt.a1,algae)-algae[,'a1'])))


Out[257]:
8.48061947592251

In [259]:
#Mean Squared Error

In [260]:
(mse.a1.lm<-mean((lm.prediction.a1-algae[,'a1'])^2))


Out[260]:
295.109710635886

In [261]:
(mse.a1.rt<-mean((predict(rt.a1,algae)-algae[,'a1'])^2))


Out[261]:
161.920205200804

In [262]:
#Normalized mean squared error

In [265]:
(nmse.a1.lm<-mean((lm.prediction.a1-algae[,'a1'])^2)/mean((mean(algae[,'a1'])-algae[,'a1'])^2))


Out[265]:
0.646359408206252

In [266]:
(nmse.a1.rt<-mean((predict(rt.a1,algae)-algae[,'a1'])^2)/mean((mean(algae[,'a1'])-algae[,'a1'])^2))


Out[266]:
0.354643186036521

In [267]:
#Visual inspection of the predictions

In [271]:
old.par<-par(mfrow=c(2,1))

In [276]:
plot(lm.prediction.a1,algae[,'a1'],main='Linear Model',xlab="Predictions",ylab="True Values")
abline(0,1,lty=2)
plot(predict(rt.a1,algae),algae[,'a1'],main='Regression Model',xlab='Predictions',ylab='True Values')
abline(0,1,lty=2)



In [ ]:


In [ ]: