In [1]:

    
# Load Biospytial modules and etc.
%matplotlib inline
import sys
sys.path.append('/apps')
import django
django.setup()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## Use the ggplot style
plt.style.use('ggplot')



In [2]:

    
from external_plugins.spystats import tools



In [3]:

    
# My mac
#data = pd.read_csv("/RawDataCSV/plotsClimateData_11092017.csv")
# My Linux desktop
data = pd.read_csv("/RawDataCSV/idiv_share/plotsClimateData_11092017.csv")









    



/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (24) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [4]:

    
%time new_data = tools.toGeoDataFrame(pandas_dataframe=data,xcoord_name='LON',ycoord_name='LAT')









    



CPU times: user 1.58 s, sys: 12 ms, total: 1.59 s
Wall time: 1.6 s

from django.contrib.gis import geos import geopandas as gpd from shapely.geometry import Point %time data['geometry'] = data.apply(lambda z: Point(z.LON, z.LAT), axis=1) %time new_data = gpd.GeoDataFrame(data)

Let´s reproject to Alberts or something with distance

Uncomment to reproject

proj string taken from: http://spatialreference.org/



In [5]:

    
new_data =  new_data.to_crs("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ")

Add log of the Biomass



In [6]:

    
new_data['logBiomass'] = new_data.apply(lambda x : np.log(x.plotBiomass),axis=1)



In [7]:

    
new_data['newLon'] = new_data.apply(lambda c : c.geometry.x, axis=1)
new_data['newLat'] = new_data.apply(lambda c : c.geometry.y, axis=1)



In [8]:

    
new_data.plot(column='SppN')









    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f2b4a4e4b50>



In [9]:

    
new_data['logBiomass'] = np.log(new_data.plotBiomass)
new_data['logSppn'] = np.log(new_data.SppN)



In [10]:

    
new_data.logBiomass.plot.hist()









    Out[10]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f2b4a4e49d0>

Linear Regression



In [11]:

    
### Now with statsmodels.api

#xx = X.SppN.values.reshape(-1,1)
#xx = sm.add_constant(xx)
#model = sm.OLS(Y.values.reshape(-1,1),xx)
import statsmodels.formula.api as smf
model = smf.ols(formula='logBiomass ~ SppN',data=new_data)
results = model.fit()
param_model = results.params
results.summary()









    Out[11]:





OLS Regression Results

  Dep. Variable:        logBiomass       R-squared:             0.102 


  Model:                    OLS          Adj. R-squared:        0.102 


  Method:              Least Squares     F-statistic:           4187. 


  Date:              Tue, 21 Nov 2017    Prob (F-statistic):     0.00  


  Time:                  17:34:02        Log-Likelihood:      -36924. 


  No. Observations:        36845         AIC:                7.385e+04


  Df Residuals:            36843         BIC:                7.387e+04


  Df Model:                    1                                      


  Covariance Type:       nonrobust                                    




               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept      8.6217      0.007   1178.320   0.000      8.607     8.636


  SppN           0.0807      0.001     64.709   0.000      0.078     0.083




  Omnibus:        1709.757    Durbin-Watson:         1.700


  Prob(Omnibus):    0.000     Jarque-Bera (JB):   2473.616


  Skew:            -0.437     Prob(JB):               0.00


  Kurtosis:         3.921     Cond. No.               12.8



In [12]:

    
### Now with statsmodels.api

#xx = X.SppN.values.reshape(-1,1)
#xx = sm.add_constant(xx)
#model = sm.OLS(Y.values.reshape(-1,1),xx)
import statsmodels.formula.api as smf
model = smf.ols(formula='logBiomass ~ logSppn',data=new_data)
results = model.fit()
param_model = results.params
results.summary()









    Out[12]:





OLS Regression Results

  Dep. Variable:        logBiomass       R-squared:             0.114 


  Model:                    OLS          Adj. R-squared:        0.114 


  Method:              Least Squares     F-statistic:           4757. 


  Date:              Tue, 21 Nov 2017    Prob (F-statistic):     0.00  


  Time:                  17:34:02        Log-Likelihood:      -36670. 


  No. Observations:        36845         AIC:                7.334e+04


  Df Residuals:            36843         BIC:                7.336e+04


  Df Model:                    1                                      


  Covariance Type:       nonrobust                                    




               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept      8.4882      0.009    976.347   0.000      8.471     8.505


  logSppn        0.3740      0.005     68.974   0.000      0.363     0.385




  Omnibus:        1627.980    Durbin-Watson:         1.701


  Prob(Omnibus):    0.000     Jarque-Bera (JB):   2356.149


  Skew:            -0.421     Prob(JB):               0.00


  Kurtosis:         3.908     Cond. No.               5.50



In [13]:

    
new_data['residuals1'] = results.resid



In [14]:

    
fig, axes = plt.subplots(nrows=2, ncols=2)

#plt.scatter(new_data.newLon,new_data.residuals1,subplots=True, layout=(1,2))
#plt.scatter(new_data.newLat,new_data.residuals1,subplots=True, layout=(1,2))

new_data.residuals1.hist(ax=axes[0][0])
#axes[1][0] = 
plt.scatter(new_data.newLat,new_data.residuals1)
#axes[1][1] = 
plt.scatter(new_data.newLon,new_data.residuals1)


plt.set_title=("Residuals of: $log(Biomass) = Spp_richnees$")

STOPPPP!!



In [ ]:

The area is very big -> 35000 points.

We need to make a subset of this



In [ ]:

    
# COnsider the the following subregion
section = new_data[lambda x:  (x.LON > -90) & (x.LON < -80) & (x.LAT > 30) & (x.LAT < 32) ]
section.plot(column='residuals1')
section.plot(column='plotBiomass')



In [ ]:

    
plt.scatter(section.newLon,section.residuals1)



In [ ]:

    
plt.scatter(section.newLat,section.residuals1)



In [ ]:

    
vg = tools.Variogram(section,'residuals1')
#vg.calculate_empirical(n_bins=50)
%time vg.plot(num_iterations=90,n_bins=10,refresh=True)



In [ ]:

    
# COnsider the the following subregion
section = new_data[lambda x:  (x.LON > -90) & (x.LON < -88) & (x.LAT > 30) & (x.LAT < 40) ]
section.plot(column='residuals1')
section.plot(column='plotBiomass')



In [ ]:

    
vg = tools.Variogram(section,'residuals1')
#vg.calculate_empirical(n_bins=50)
%time vg.plot(num_iterations=90,n_bins=30,refresh=True)



In [ ]:

    
xx = pd.DataFrame({'dist' : vg.distance_coordinates.flatten() , 'y' : vg.distance_responses.flatten()})

Now with distance restriction (experimental!)



In [ ]:

    
vg = tools.Variogram(section,'residuals1',using_distance_threshold=3000)
#vg.calculate_empirical(n_bins=50)
%time vg.plot(num_iterations=80,n_bins=20,refresh=True,with_envelope=True)

Model Fitting Using a GLM

The general model will have the form: $$ Biomass(x,y) = \beta_1 AET + \beta_2 Age + Z(x,y) + \epsilon $$ Where: $\beta_1$ and $\beta_2$ are model parameters, $Z(x,y)$ is the Spatial Autocorrelation process and $\epsilon \sim N(0,\sigma^2)$



In [ ]:

    
len(data.lon)
#X = data[['AET','StandAge','lon','lat']]
#X = section[['SppN','lon','lat']]
#X = section[['SppN','newLon','newLat']]
X = section[['newLon','newLat']]
#Y = section['plotBiomass']
logY = section['logBiomass']
Y = section[['residuals1']]
#Y = data[['SppN']]
## First step in spatial autocorrelation
#Y = pd.DataFrame(np.zeros(len(Y)))
## Let´s take a small sample only for the spatial autocorrelation
import numpy as np
sample_size = 2000
randindx = np.random.randint(0,X.shape[0],sample_size)
nX = X.loc[randindx]
nY = Y.loc[randindx]



In [ ]:

    
# Import GPFlow
import GPflow as gf
k = gf.kernels.Matern12(2, active_dims = [0,1],lengthscales=1000 )
#k = gf.kernels.Matern12(2,lengthscales=700000) + gf.kernels.Constant?



In [ ]:

    
#k = gf.kernels.RBF(2,variance=2.0,lengthscales=4000) + gf.kernels.RBF(2,variance=2.0,lengthscales=700000) + gf.kernels.Constant(2,variance=1.0,active_dims=[0,1])



In [ ]:

    
model = gf.gpr.GPR(section[['newLon','newLat']].as_matrix(),section.residuals1.values.reshape(-1,1),k)



In [ ]:

    
%time model.optimize()



In [ ]:

    
model.get_parameter_dict()

Hasta Aqui!, Lo dem'as son tonterias y retazos



In [ ]:

    
import numpy as np
Nn = 300
dsc = section
predicted_x = np.linspace(min(dsc.newLon),max(dsc.newLon),Nn)
predicted_y = np.linspace(min(dsc.newLat),max(dsc.newLat),Nn)
Xx, Yy = np.meshgrid(predicted_x,predicted_y)
## Fake richness
fake_sp_rich = np.ones(len(Xx.ravel()))
predicted_coordinates = np.vstack([ Xx.ravel(), Yy.ravel()]).transpose()
#predicted_coordinates = np.vstack([section.SppN, section.newLon,section.newLat]).transpose()



In [ ]:

    
predicted_coordinates.shape



In [ ]:

    
means,variances = model.predict_y(predicted_coordinates)



In [ ]:

    
variances = np.array(variances)
means = np.array(means)



In [ ]:

    
fig = plt.figure(figsize=(16,10), dpi= 80, facecolor='w', edgecolor='w')
#plt.pcolor(Xx,Yy,np.sqrt(variances.reshape(Nn,Nn))) #,cmap=plt.cm.Greens)
plt.pcolormesh(Xx,Yy,np.sqrt(variances.reshape(Nn,Nn)))
plt.colorbar()
plt.scatter(dsc.newLon,dsc.newLat,c=dsc.SppN,edgecolors='')
plt.title("VAriance Biomass")
plt.colorbar()



In [ ]:

    
import cartopy
plt.figure(figsize=(17,11))

proj = cartopy.crs.PlateCarree()
ax = plt.subplot(111, projection=proj)


ax = plt.axes(projection=proj)
#algo = new_data.plot(column='SppN',ax=ax,cmap=colormap,edgecolors='')


#ax.set_extent([-93, -70, 30, 50])
#ax.set_extent([-100, -60, 20, 50])
#ax.set_extent([-95, -70, 25, 45])

#ax.add_feature(cartopy.feature.LAND)
ax.add_feature(cartopy.feature.OCEAN)
ax.add_feature(cartopy.feature.COASTLINE)
ax.add_feature(cartopy.feature.BORDERS, linestyle=':')
ax.add_feature(cartopy.feature.LAKES, alpha=0.9)
ax.stock_img()
#ax.add_geometries(new_data.geometry,crs=cartopy.crs.PlateCarree())
#ax.add_feature(cartopy.feature.RIVERS)
mm = ax.pcolormesh(Xx,Yy,means.reshape(Nn,Nn),transform=proj )
#cs = plt.contour(Xx,Yy,np.sqrt(variances).reshape(Nn,Nn),linewidths=2,cmap=plt.cm.Greys_r,linestyles='dotted')
cs = plt.contour(Xx,Yy,means.reshape(Nn,Nn),linewidths=2,colors='k',linestyles='dotted',levels=[4.0,5.0,6.0,7.0,8.0])
plt.clabel(cs, fontsize=16,inline=True,fmt='%1.1f')
#ax.scatter(new_data.lon,new_data.lat,c=new_data.error,edgecolors='',transform=proj,cmap=plt.cm.Greys,alpha=0.2)
plt.colorbar(mm)
plt.title("Predicted Species Richness")


#(x.LON > -90) & (x.LON < -80) & (x.LAT > 40) & (x.LAT < 50)



In [1]:

    
#### test to check duplicates
new_data









    




NameErrorTraceback (most recent call last)
<ipython-input-1-1a843d50b858> in <module>()
      1 #### test to check duplicates
----> 2 new_data

NameError: name 'new_data' is not defined



In [ ]:

Dep. Variable:	logBiomass	R-squared:	0.102
Model:	OLS	Adj. R-squared:	0.102
Method:	Least Squares	F-statistic:	4187.
Date:	Tue, 21 Nov 2017	Prob (F-statistic):	0.00
Time:	17:34:02	Log-Likelihood:	-36924.
No. Observations:	36845	AIC:	7.385e+04
Df Residuals:	36843	BIC:	7.387e+04
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	8.6217	0.007	1178.320	0.000	8.607 8.636
SppN	0.0807	0.001	64.709	0.000	0.078 0.083

Omnibus:	1709.757	Durbin-Watson:	1.700
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2473.616
Skew:	-0.437	Prob(JB):	0.00
Kurtosis:	3.921	Cond. No.	12.8

Omnibus:	1627.980	Durbin-Watson:	1.701
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2356.149
Skew:	-0.421	Prob(JB):	0.00
Kurtosis:	3.908	Cond. No.	5.50