In [1]:

    
# Load Biospytial modules and etc.
%matplotlib inline
import sys
sys.path.append('/apps')
import django
django.setup()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## Use the ggplot style
plt.style.use('ggplot')



In [2]:

    
from external_plugins.spystats import tools



In [3]:

    
# My mac
data = pd.read_csv("/RawDataCSV/plotsClimateData_11092017.csv")
# My Linux desktop
#data = pd.read_csv("/RawDataCSV/idiv_share/plotsClimateData_11092017.csv")









    



/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (24) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [4]:

    
data.SppN.mean()









    Out[4]:





5.1824128104220382



In [5]:

    
%time new_data = tools.toGeoDataFrame(pandas_dataframe=data,xcoord_name='LON',ycoord_name='LAT')









    



CPU times: user 3.59 s, sys: 40 ms, total: 3.63 s
Wall time: 3.68 s

from django.contrib.gis import geos import geopandas as gpd from shapely.geometry import Point %time data['geometry'] = data.apply(lambda z: Point(z.LON, z.LAT), axis=1) %time new_data = gpd.GeoDataFrame(data)

Let´s reproject to Alberts or something with distance

Uncomment to reproject

proj string taken from: http://spatialreference.org/



In [6]:

    
new_data =  new_data.to_crs("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ")

Add log of the Biomass



In [7]:

    
new_data['logBiomass'] = new_data.apply(lambda x : np.log(x.plotBiomass),axis=1)



In [8]:

    
new_data['newLon'] = new_data.apply(lambda c : c.geometry.x, axis=1)
new_data['newLat'] = new_data.apply(lambda c : c.geometry.y, axis=1)



In [9]:

    
new_data.plot(column='SppN')









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6f82ae5550>



In [10]:

    
new_data['logBiomass'] = np.log(new_data.plotBiomass)



In [11]:

    
new_data.logBiomass.plot.hist()









    Out[11]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6f82805c90>

Linear Regression



In [12]:

    
### Now with statsmodels.api

#xx = X.SppN.values.reshape(-1,1)
#xx = sm.add_constant(xx)
#model = sm.OLS(Y.values.reshape(-1,1),xx)
import statsmodels.formula.api as smf
model = smf.ols(formula='logBiomass ~ SppN',data=new_data)
results = model.fit()
param_model = results.params
results.summary()









    Out[12]:





OLS Regression Results

  Dep. Variable:        logBiomass       R-squared:             0.102 


  Model:                    OLS          Adj. R-squared:        0.102 


  Method:              Least Squares     F-statistic:           4187. 


  Date:              Wed, 01 Nov 2017    Prob (F-statistic):     0.00  


  Time:                  19:43:53        Log-Likelihood:      -36924. 


  No. Observations:        36845         AIC:                7.385e+04


  Df Residuals:            36843         BIC:                7.387e+04


  Df Model:                    1                                      


  Covariance Type:       nonrobust                                    




               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept      8.6217      0.007   1178.320   0.000      8.607     8.636


  SppN           0.0807      0.001     64.709   0.000      0.078     0.083




  Omnibus:        1709.757    Durbin-Watson:         1.700


  Prob(Omnibus):    0.000     Jarque-Bera (JB):   2473.616


  Skew:            -0.437     Prob(JB):               0.00


  Kurtosis:         3.921     Cond. No.               12.8



In [13]:

    
new_data['residuals1'] = results.resid

The area is very big -> 35000 points.

We need to make a subset of this



In [14]:

    
# COnsider the the following subregion
section = new_data[lambda x:  (x.LON > -90) & (x.LON < -85) & (x.LAT > 30) & (x.LAT < 35) ]



In [15]:

    
section.plot(column='residuals1')









    Out[15]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6f7d40f810>



In [16]:

    
section.plot(column='plotBiomass')









    Out[16]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6f81ede190>



In [17]:

    
section.shape









    Out[17]:





(1841, 46)

Model Fitting Using a GLM

The general model will have the form: $$ Biomass(x,y) = \beta_1 AET + \beta_2 Age + Z(x,y) + \epsilon $$ Where: $\beta_1$ and $\beta_2$ are model parameters, $Z(x,y)$ is the Spatial Autocorrelation process and $\epsilon \sim N(0,\sigma^2)$



In [18]:

    
len(data.lon)
#X = data[['AET','StandAge','lon','lat']]
X = section[['SppN','lon','lat']]
X = section[['SppN','newLon','newLat']]
#X = data[['lon','lat']]
Y = section['plotBiomass']
logY = section['logBiomass']
#Y = data[['SppN']]
## First step in spatial autocorrelation
#Y = pd.DataFrame(np.zeros(len(Y)))
## Let´s take a small sample only for the spatial autocorrelation
import numpy as np
sample_size = 2000
randindx = np.random.randint(0,X.shape[0],sample_size)
nX = X.loc[randindx]
nY = Y.loc[randindx]



In [19]:

    
plt.scatter(X.SppN,Y)









    Out[19]:





<matplotlib.collections.PathCollection at 0x7f6f7d20a0d0>



In [20]:

    
Y.plot.hist(title="Biomass histogram")









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6f7d2160d0>



In [21]:

    
logY.plot.hist(title="log-Biomass hist")









    Out[21]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6f7d14ba50>



In [22]:

    
## Let´s make a simple linear trend here.
import statsmodels.api as sm
import statsmodels.formula.api as smf



In [23]:

    
### Now with statsmodels.api

#xx = X.SppN.values.reshape(-1,1)
#xx = sm.add_constant(xx)
#model = sm.OLS(Y.values.reshape(-1,1),xx)

model = smf.ols(formula='logBiomass ~ SppN',data=section)
results = model.fit()
param_model = results.params
results.summary()









    Out[23]:





OLS Regression Results

  Dep. Variable:        logBiomass       R-squared:             0.149


  Model:                    OLS          Adj. R-squared:        0.148


  Method:              Least Squares     F-statistic:           320.9


  Date:              Wed, 01 Nov 2017    Prob (F-statistic):  2.87e-66


  Time:                  19:43:56        Log-Likelihood:      -1752.0


  No. Observations:         1841         AIC:                   3508.


  Df Residuals:             1839         BIC:                   3519.


  Df Model:                    1                                     


  Covariance Type:       nonrobust                                   




               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept      8.4869      0.033    261.094   0.000      8.423     8.551


  SppN           0.0830      0.005     17.913   0.000      0.074     0.092




  Omnibus:        75.080    Durbin-Watson:         1.964


  Prob(Omnibus):   0.000    Jarque-Bera (JB):    105.571


  Skew:           -0.392    Prob(JB):           1.19e-23


  Kurtosis:        3.873    Cond. No.               15.9

Residuals



In [24]:

    
Y_hat = results.predict(section)



In [25]:

    
ress = (section.logBiomass - Y_hat)



In [26]:

    
## They are eq. test this ress - results.resid



In [27]:

    
param_model.Intercept









    Out[27]:





8.4869335851853158



In [28]:

    
conf_int = results.conf_int(alpha=0.05)
plt.scatter(section.SppN,section.plotBiomass)
#plt.plot(section.SppN,param_model.Intercept + param_model.SppN * section.SppN)

plt.plot(section.SppN,Y_hat)
#plt.fill_between(Y_hat,Y_hat + conf_int , Y_hat - conf_int)
conf_int



In [29]:

    
plt.scatter(section.newLon,section.newLat,c=results.resid)
plt.colorbar()









    Out[29]:





<matplotlib.colorbar.Colorbar at 0x7f6f6f1dfbd0>



In [30]:

    
X = section[['newLon','newLat']].as_matrix()
Y = section[['logBiomass']].as_matrix()



In [31]:

    
dMM = tools._getDistanceMatrix(section)
dYY = tools._getDistResponseVariable(section,response_variable_name='logBiomass')
y = section['logBiomass']
yy = y.as_matrix()
#y2 = section[['logBiomass']]
#yy = y.values.reshape(-1,1)
#y2.shape

#dYY = sp.distance_matrix(yy,yy,p=2.0)



In [32]:

    
import scipy.spatial as sp



In [33]:

    
dM = sp.distance_matrix(X,X,p=2.0)
dY = sp.distance_matrix(Y,Y,p=2.0)



In [34]:

    
plt.imshow(dM)









    Out[34]:





<matplotlib.image.AxesImage at 0x7f6f6f0f2d50>



In [35]:

    
plt.imshow(dM)









    Out[35]:





<matplotlib.image.AxesImage at 0x7f6f6d5fc4d0>



In [36]:

    
plt.imshow(dY)









    Out[36]:





<matplotlib.image.AxesImage at 0x7f6f6d5a3510>



In [37]:

    
#d = pd.DataFrame({ 'dist' : dM.flatten(), 'ydif' : dY.flatten()})
dist = dMM.flatten()
ydif = dYY.flatten()
d = pd.DataFrame({'dist': dist,'ydif':ydif})

Calculate the semivariogram



In [38]:

    
vg = tools.Variogram(section,'logBiomass')



In [39]:

    
vg.calculate_empirical()
plt.scatter(vg.lags,vg.empirical)









    Out[39]:





<matplotlib.collections.PathCollection at 0x7f6f7c03a290>



In [ ]:

    
%time env = vg.calculateEnvelope()



In [40]:

    
ml = vg.distance_coordinates.flatten()
yl = vg.distance_responses.flatten()
X = np.vstack([ml,yl])
X.shape
n = np.random.shuffle(X)



In [41]:

    
titi = tools.montecarloEnvelope(ml,yl,num_iterations=99)



In [43]:

    
titi









    Out[43]:






  
    
      
      lags
      q_0.025
      q_0.975
    
  
  
    
      0
      0.000000
      0.364187
      0.380492
    
    
      1
      14096.245682
      0.370937
      0.378122
    
    
      2
      28192.491365
      0.371642
      0.376632
    
    
      3
      42288.737047
      0.371667
      0.376700
    
    
      4
      56384.982730
      0.372483
      0.376608
    
    
      5
      70481.228412
      0.372585
      0.376875
    
    
      6
      84577.474094
      0.371998
      0.376019
    
    
      7
      98673.719777
      0.372826
      0.376329
    
    
      8
      112769.965459
      0.372552
      0.376172
    
    
      9
      126866.211141
      0.373070
      0.376009
    
    
      10
      140962.456824
      0.372733
      0.376046
    
    
      11
      155058.702506
      0.372488
      0.375913
    
    
      12
      169154.948189
      0.372817
      0.375984
    
    
      13
      183251.193871
      0.372668
      0.375812
    
    
      14
      197347.439553
      0.372835
      0.376082
    
    
      15
      211443.685236
      0.373094
      0.375695
    
    
      16
      225539.930918
      0.373140
      0.375942
    
    
      17
      239636.176600
      0.372601
      0.375706
    
    
      18
      253732.422283
      0.372817
      0.375736
    
    
      19
      267828.667965
      0.372827
      0.375823
    
    
      20
      281924.913648
      0.372874
      0.376073
    
    
      21
      296021.159330
      0.372823
      0.376033
    
    
      22
      310117.405012
      0.372863
      0.376027
    
    
      23
      324213.650695
      0.372907
      0.376696
    
    
      24
      338309.896377
      0.371583
      0.376289
    
    
      25
      352406.142059
      0.372676
      0.376925
    
    
      26
      366502.387742
      0.372331
      0.376902
    
    
      27
      380598.633424
      0.372578
      0.377307
    
    
      28
      394694.879107
      0.372560
      0.377469
    
    
      29
      408791.124789
      0.372320
      0.377609
    
    
      30
      422887.370471
      0.372151
      0.377350
    
    
      31
      436983.616154
      0.371923
      0.377380
    
    
      32
      451079.861836
      0.371699
      0.377469
    
    
      33
      465176.107518
      0.371385
      0.377603
    
    
      34
      479272.353201
      0.370163
      0.377665
    
    
      35
      493368.598883
      0.370404
      0.379247
    
    
      36
      507464.844566
      0.370025
      0.379116
    
    
      37
      521561.090248
      0.369264
      0.382841
    
    
      38
      535657.335930
      0.368305
      0.381831
    
    
      39
      549753.581613
      0.369197
      0.381952
    
    
      40
      563849.827295
      0.367050
      0.383855
    
    
      41
      577946.072977
      0.364268
      0.383939
    
    
      42
      592042.318660
      0.362052
      0.385362
    
    
      43
      606138.564342
      0.361012
      0.389512
    
    
      44
      620234.810025
      0.356799
      0.394619
    
    
      45
      634331.055707
      0.349967
      0.399960
    
    
      46
      648427.301389
      0.348577
      0.404488
    
    
      47
      662523.547072
      0.327533
      0.450720
    
    
      48
      676619.792754
      0.240711
      0.545197



In [ ]:

    
np.random.shuffle(X)



In [ ]:

    
plt.plot(X)



In [ ]:

    
n_bins = 100
distances = dMM.flatten()
mdist = min(distances)
Mdist = max(distances)
partitions = np.linspace(mdist,Mdist,n_bins)



In [ ]:



In [ ]:

    
cosas =  map(lambda (i,x) : 0.5* (d[ ( d.dist < partitions[i+1]) & (d.dist>partitions[i])].ydif.mean()),enumerate(partitions[:len(partitions) - 1]))



In [ ]:

    
plt.scatter(partitions[:len(partitions) -1] ,cosas)



In [ ]:

    
c.ydif.mean()



In [ ]:

    
# Import GPFlow
import GPflow as gf
k = gf.kernels.Matern12(2, active_dims = [0,1] )



In [ ]:

    
results.resid.plot.hist()



In [ ]:

    
model = gf.gpr.GPR(section[['newLon','newLat']].as_matrix(),results.resid.values.reshape(-1,1),k)



In [ ]:

    
%time model.optimize()



In [ ]:

    
model.get_parameter_dict()



In [ ]:

    
import numpy as np
Nn = 300
dsc = section
predicted_x = np.linspace(min(dsc.newLon),max(dsc.newLon),Nn)
predicted_y = np.linspace(min(dsc.newLat),max(dsc.newLat),Nn)
Xx, Yy = np.meshgrid(predicted_x,predicted_y)
## Fake richness
fake_sp_rich = np.ones(len(Xx.ravel()))
predicted_coordinates = np.vstack([ Xx.ravel(), Yy.ravel()]).transpose()
#predicted_coordinates = np.vstack([section.SppN, section.newLon,section.newLat]).transpose()



In [ ]:

    
predicted_coordinates.shape



In [ ]:

    
means,variances = model.predict_y(predicted_coordinates)



In [ ]:

    
means = map(lambda x : np.exp(x),means)
variances = map(lambda x : np.exp(x),variances)



In [ ]:

    
sum(means)



In [ ]:

    
fig = plt.figure(figsize=(16,10), dpi= 80, facecolor='w', edgecolor='w')
#plt.pcolor(Xx,Yy,np.sqrt(variances.reshape(Nn,Nn))) #,cmap=plt.cm.Greens)
plt.pcolormesh(Xx,Yy,np.sqrt(variances.reshape(Nn,Nn)))
plt.colorbar()
plt.scatter(dsc.newLon,dsc.newLat,c=dsc.SppN,edgecolors='')
plt.title("VAriance Biomass")
plt.colorbar()



In [ ]:

    
import cartopy
plt.figure(figsize=(17,11))

proj = cartopy.crs.PlateCarree()
ax = plt.subplot(111, projection=proj)


ax = plt.axes(projection=proj)
#algo = new_data.plot(column='SppN',ax=ax,cmap=colormap,edgecolors='')


#ax.set_extent([-93, -70, 30, 50])
#ax.set_extent([-100, -60, 20, 50])
ax.set_extent([-95, -70, 25, 45])

#ax.add_feature(cartopy.feature.LAND)
ax.add_feature(cartopy.feature.OCEAN)
ax.add_feature(cartopy.feature.COASTLINE)
ax.add_feature(cartopy.feature.BORDERS, linestyle=':')
ax.add_feature(cartopy.feature.LAKES, alpha=0.9)
ax.stock_img()
#ax.add_geometries(new_data.geometry,crs=cartopy.crs.PlateCarree())
#ax.add_feature(cartopy.feature.RIVERS)
mm = ax.pcolormesh(Xx,Yy,means.reshape(Nn,Nn),transform=proj )
#cs = plt.contour(Xx,Yy,np.sqrt(variances).reshape(Nn,Nn),linewidths=2,cmap=plt.cm.Greys_r,linestyles='dotted')
cs = plt.contour(Xx,Yy,means.reshape(Nn,Nn),linewidths=2,colors='k',linestyles='dotted',levels=[4.0,5.0,6.0,7.0,8.0])
plt.clabel(cs, fontsize=16,inline=True,fmt='%1.1f')
#ax.scatter(new_data.lon,new_data.lat,c=new_data.error,edgecolors='',transform=proj,cmap=plt.cm.Greys,alpha=0.2)
plt.colorbar(mm)
plt.title("Predicted Species Richness")


#(x.LON > -90) & (x.LON < -80) & (x.LAT > 40) & (x.LAT < 50)

Residuals of $ Biomass ~ SppRich + Z(x,y) + \epsilon $

Using all data

Model Analysis



In [ ]:

    
model.kern



In [ ]:

    
model.likelihood

Let's calculate the residuals



In [ ]:

    
X_ = section[['LON','LAT']]
%time resid_hat, resid_hat_variance = model.predict_y(X_)



In [ ]:

    
resid_hat.shape



In [ ]:

    
qr_errors = qr_errors.values.reshape(len(qr_errors),1)



In [ ]:

    
#new_data= new_data.assign(error=lambda y : (y.SppN - y.pred_y)**2 )
sqr_error_errors = pd.DataFrame((qr_errors - resid_hat)**2)



In [ ]:

    
print(sqr_error_errors[0].mean())
print(sqr_error_errors[0].std())



In [ ]:

    
true_error



In [ ]:

    
sqr_error_errors.mean()



In [ ]:

    
k.ARD?



In [ ]:

	0	1
Intercept	8.423182	8.550685
SppN	0.073933	0.092113

Dep. Variable:	logBiomass	R-squared:	0.102
Model:	OLS	Adj. R-squared:	0.102
Method:	Least Squares	F-statistic:	4187.
Date:	Wed, 01 Nov 2017	Prob (F-statistic):	0.00
Time:	19:43:53	Log-Likelihood:	-36924.
No. Observations:	36845	AIC:	7.385e+04
Df Residuals:	36843	BIC:	7.387e+04
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	8.6217	0.007	1178.320	0.000	8.607 8.636
SppN	0.0807	0.001	64.709	0.000	0.078 0.083

Omnibus:	1709.757	Durbin-Watson:	1.700
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2473.616
Skew:	-0.437	Prob(JB):	0.00
Kurtosis:	3.921	Cond. No.	12.8

Omnibus:	75.080	Durbin-Watson:	1.964
Prob(Omnibus):	0.000	Jarque-Bera (JB):	105.571
Skew:	-0.392	Prob(JB):	1.19e-23
Kurtosis:	3.873	Cond. No.	15.9

	lags	q_0.025	q_0.975
0	0.000000	0.364187	0.380492
1	14096.245682	0.370937	0.378122
2	28192.491365	0.371642	0.376632
3	42288.737047	0.371667	0.376700
4	56384.982730	0.372483	0.376608
5	70481.228412	0.372585	0.376875
6	84577.474094	0.371998	0.376019
7	98673.719777	0.372826	0.376329
8	112769.965459	0.372552	0.376172
9	126866.211141	0.373070	0.376009
10	140962.456824	0.372733	0.376046
11	155058.702506	0.372488	0.375913
12	169154.948189	0.372817	0.375984
13	183251.193871	0.372668	0.375812
14	197347.439553	0.372835	0.376082
15	211443.685236	0.373094	0.375695
16	225539.930918	0.373140	0.375942
17	239636.176600	0.372601	0.375706
18	253732.422283	0.372817	0.375736
19	267828.667965	0.372827	0.375823
20	281924.913648	0.372874	0.376073
21	296021.159330	0.372823	0.376033
22	310117.405012	0.372863	0.376027
23	324213.650695	0.372907	0.376696
24	338309.896377	0.371583	0.376289
25	352406.142059	0.372676	0.376925
26	366502.387742	0.372331	0.376902
27	380598.633424	0.372578	0.377307
28	394694.879107	0.372560	0.377469
29	408791.124789	0.372320	0.377609
30	422887.370471	0.372151	0.377350
31	436983.616154	0.371923	0.377380
32	451079.861836	0.371699	0.377469
33	465176.107518	0.371385	0.377603
34	479272.353201	0.370163	0.377665
35	493368.598883	0.370404	0.379247
36	507464.844566	0.370025	0.379116
37	521561.090248	0.369264	0.382841
38	535657.335930	0.368305	0.381831
39	549753.581613	0.369197	0.381952
40	563849.827295	0.367050	0.383855
41	577946.072977	0.364268	0.383939
42	592042.318660	0.362052	0.385362
43	606138.564342	0.361012	0.389512
44	620234.810025	0.356799	0.394619
45	634331.055707	0.349967	0.399960
46	648427.301389	0.348577	0.404488
47	662523.547072	0.327533	0.450720
48	676619.792754	0.240711	0.545197