notebook.community

Edit and run



In [13]:

    
# Load Biospytial modules and etc.
%matplotlib inline
import sys
sys.path.append('/apps')
import django
django.setup()
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
## Use the ggplot style
plt.style.use('ggplot')



In [14]:

    
# My mac
#data = pd.read_csv("/RawDataCSV/plotsClimateData_11092017.csv")
# My Linux desktop
data = pd.read_csv("/RawDataCSV/idiv_share/plotsClimateData_11092017.csv")









    



/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (24) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [15]:

    
data.SppN.mean()









    Out[15]:





5.1824128104220382



In [16]:

    
import geopandas as gpd



In [17]:

    
from django.contrib.gis import geos
from shapely.geometry import Point



In [18]:

    
data['geometry'] = data.apply(lambda z: Point(z.LON, z.LAT), axis=1)
new_data = gpd.GeoDataFrame(data)

Let´s reproject to Alberts or something with distance



In [19]:

    
new_data.crs = {'init':'epsg:4326'}

Uncomment to reproject

proj string taken from: http://spatialreference.org/



In [20]:

    
new_data =  new_data.to_crs("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ")
new_data['newLon'] = new_data.apply(lambda c : c.geometry.x, axis=1)
new_data['newLat'] = new_data.apply(lambda c : c.geometry.y, axis=1)



In [21]:

    
new_data['logBiomass'] = np.log(new_data.plotBiomass)
new_data['logSppN'] = np.log(new_data.SppN)



In [22]:

    
## Let´s make a simple linear trend here.
import statsmodels.api as sm
import statsmodels.formula.api as smf



In [23]:

    
## All data
### Now with statsmodels.api

#xx = X.SppN.values.reshape(-1,1)
#xx = sm.add_constant(xx)
#model = sm.OLS(Y.values.reshape(-1,1),xx)

model = smf.ols(formula='logBiomass ~ logSppN',data=new_data)
results = model.fit()
param_model = results.params
results.summary()









    Out[23]:





OLS Regression Results

  Dep. Variable:        logBiomass       R-squared:             0.114 


  Model:                    OLS          Adj. R-squared:        0.114 


  Method:              Least Squares     F-statistic:           4757. 


  Date:              Thu, 23 Nov 2017    Prob (F-statistic):     0.00  


  Time:                  00:54:54        Log-Likelihood:      -36670. 


  No. Observations:        36845         AIC:                7.334e+04


  Df Residuals:            36843         BIC:                7.336e+04


  Df Model:                    1                                      


  Covariance Type:       nonrobust                                    




               coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept      8.4882      0.009    976.347   0.000      8.471     8.505


  logSppN        0.3740      0.005     68.974   0.000      0.363     0.385




  Omnibus:        1627.980    Durbin-Watson:         1.701


  Prob(Omnibus):    0.000     Jarque-Bera (JB):   2356.149


  Skew:            -0.421     Prob(JB):               0.00


  Kurtosis:         3.908     Cond. No.               5.50



In [24]:

    
new_data['residuals1'] = results.resid

The area is very big -> 35000 points.

We need to make a subset of this



In [25]:

    
# COnsider the the following subregion
section = new_data[lambda x:  (x.LON > -100) & (x.LON < -85) & (x.LAT > 30) & (x.LAT < 35) ]
section.plot(column='SppN')
section.plot(column='plotBiomass')
section.shape









    Out[25]:





(4657, 47)

Residuals



In [14]:

    
Y_hat = results.predict(section)



In [15]:

    
ress = (section.logBiomass - Y_hat)



In [16]:

    
param_model.Intercept









    Out[16]:





8.488195471616315



In [17]:

    
conf_int = results.conf_int(alpha=0.05)
plt.scatter(section.SppN,section.plotBiomass)
#plt.plot(section.SppN,param_model.Intercept + param_model.SppN * section.SppN)

plt.plot(section.SppN,Y_hat)
#plt.fill_between(Y_hat,Y_hat + conf_int , Y_hat - conf_int)
conf_int



In [20]:

    
plt.scatter(section.logSppN,section.residuals1)









    Out[20]:





<matplotlib.collections.PathCollection at 0x7f6ee62b68d0>



In [23]:

    
plt.scatter(section.newLon,section.newLat,c=section.residuals1)
plt.colorbar()









    Out[23]:





<matplotlib.colorbar.Colorbar at 0x7f6ee60d3fd0>



In [24]:

    
# Import GPFlow
import GPflow as gf
k = gf.kernels.Matern12(2, lengthscales=0.2, active_dims = [0,1] ) + gf.kernels.Constant(2,active_dims=[0,1])



In [25]:

    
results.resid.plot.hist()









    Out[25]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f6ee63e0550>



In [28]:

    
model = gf.gpr.GPR(section[['newLon','newLat']].as_matrix(),section.residuals1.values.reshape(-1,1),k)



In [29]:

    
%time model.optimize()









    



CPU times: user 28min 25s, sys: 30.9 s, total: 28min 56s
Wall time: 10min 32s






    Out[29]:





      fun: 4417.2589165690879
 hess_inv: <4x4 LbfgsInvHessProduct with dtype=float64>
      jac: array([  1.31733145e-05,  -0.00000000e+00,  -2.29640029e-05,
        -2.29639988e-05])
  message: 'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 24
      nit: 18
   status: 0
  success: True
        x: array([-5.23780247, -1.50777732, -1.53581048, -1.53581191])



In [30]:

    
k.get_parameter_dict()









    Out[30]:





{'name.kern.constant.variance': array([ 0.00529886]),
 'name.kern.matern12.lengthscales': array([ 0.2]),
 'name.kern.matern12.variance': array([ 0.19497644])}



In [ ]:

    
model.get_parameter_dict()



In [ ]:

    
import numpy as np
Nn = 300
dsc = section
predicted_x = np.linspace(min(dsc.newLon),max(dsc.newLon),Nn)
predicted_y = np.linspace(min(dsc.newLat),max(dsc.newLat),Nn)
Xx, Yy = np.meshgrid(predicted_x,predicted_y)
## Fake richness
fake_sp_rich = np.ones(len(Xx.ravel()))
predicted_coordinates = np.vstack([ Xx.ravel(), Yy.ravel()]).transpose()
#predicted_coordinates = np.vstack([section.SppN, section.newLon,section.newLat]).transpose()



In [ ]:

    
predicted_coordinates.shape



In [ ]:

    
means,variances = model.predict_y(predicted_coordinates)



In [ ]:

    
sum(means)



In [ ]:

    
fig = plt.figure(figsize=(16,10), dpi= 80, facecolor='w', edgecolor='w')
#plt.pcolor(Xx,Yy,np.sqrt(variances.reshape(Nn,Nn))) #,cmap=plt.cm.Greens)
plt.pcolormesh(Xx,Yy,np.sqrt(variances.reshape(Nn,Nn)))
plt.colorbar()
plt.scatter(dsc.newLon,dsc.newLat,c=dsc.SppN,edgecolors='')
plt.title("VAriance Biomass")
plt.colorbar()



In [ ]:

    
import cartopy
plt.figure(figsize=(17,11))

proj = cartopy.crs.PlateCarree()
ax = plt.subplot(111, projection=proj)


ax = plt.axes(projection=proj)
#algo = new_data.plot(column='SppN',ax=ax,cmap=colormap,edgecolors='')


#ax.set_extent([-93, -70, 30, 50])
#ax.set_extent([-100, -60, 20, 50])
ax.set_extent([-95, -70, 25, 45])

#ax.add_feature(cartopy.feature.LAND)
ax.add_feature(cartopy.feature.OCEAN)
ax.add_feature(cartopy.feature.COASTLINE)
ax.add_feature(cartopy.feature.BORDERS, linestyle=':')
ax.add_feature(cartopy.feature.LAKES, alpha=0.9)
ax.stock_img()
#ax.add_geometries(new_data.geometry,crs=cartopy.crs.PlateCarree())
#ax.add_feature(cartopy.feature.RIVERS)
mm = ax.pcolormesh(Xx,Yy,means.reshape(Nn,Nn),transform=proj )
#cs = plt.contour(Xx,Yy,np.sqrt(variances).reshape(Nn,Nn),linewidths=2,cmap=plt.cm.Greys_r,linestyles='dotted')
cs = plt.contour(Xx,Yy,means.reshape(Nn,Nn),linewidths=2,colors='k',linestyles='dotted',levels=[4.0,5.0,6.0,7.0,8.0])
plt.clabel(cs, fontsize=16,inline=True,fmt='%1.1f')
#ax.scatter(new_data.lon,new_data.lat,c=new_data.error,edgecolors='',transform=proj,cmap=plt.cm.Greys,alpha=0.2)
plt.colorbar(mm)
plt.title("Predicted Species Richness")


#(x.LON > -90) & (x.LON < -80) & (x.LAT > 40) & (x.LAT < 50)

Residuals of $ Biomass ~ SppRich + Z(x,y) + \epsilon $

Using all data

Model Analysis with the empirical variogram



In [48]:

    
from external_plugins.spystats import tools



In [53]:

    
filename = "../HEC_runs/results/low_q/data_envelope.csv"
envelope_data = pd.read_csv(filename)
gvg = tools.Variogram(new_data,'logBiomass',using_distance_threshold=600000)
gvg.envelope = envelope_data
gvg.empirical = gvg.envelope.variogram
gvg.lags = gvg.envelope.lags
vdata = gvg.envelope.dropna()



In [54]:

    
gvg.plot(refresh=False)

Fitting the empirical variogram to into a theoretical model



In [59]:









    Out[59]:





0          0.000000
1      13469.387755
2      26938.775510
3      40408.163265
4      53877.551020
5      67346.938776
6      80816.326531
7      94285.714286
8     107755.102041
9     121224.489796
10    134693.877551
11    148163.265306
12    161632.653061
13    175102.040816
14    188571.428571
15    202040.816327
16    215510.204082
17    228979.591837
18    242448.979592
19    255918.367347
20    269387.755102
21    282857.142857
22    296326.530612
23    309795.918367
24    323265.306122
25    336734.693878
26    350204.081633
27    363673.469388
28    377142.857143
29    390612.244898
30    404081.632653
31    417551.020408
32    431020.408163
33    444489.795918
34    457959.183673
35    471428.571429
36    484897.959184
37    498367.346939
38    511836.734694
39    525306.122449
40    538775.510204
41    552244.897959
42    565714.285714
43    579183.673469
44    592653.061224
Name: lags, dtype: float64



In [83]:

    
from scipy.optimize import curve_fit
s = 0.345
r = 100000.0
nugget = 0.33
init_vals = [0.34, 50000, 0.33]     # for [amp, cen, wid]
best_vals_gaussian, covar_gaussian = curve_fit(exponentialVariogram, xdata=vdata.lags.values, ydata=vdata.variogram.values, p0=init_vals)
#best_vals_gaussian, covar_gaussian = curve_fit(exponentialVariogram, xdata=vdata.lags, ydata=vdata.variogram, p0=init_vals)
#best_vals_gaussian, covar_gaussian = curve_fit(sphericalVariogram, xdata=vdata.lags, ydata=vdata.variogram, p0=init_vals)
v









    




TypeErrorTraceback (most recent call last)
<ipython-input-83-50966a4e2b74> in <module>()
      5 nugget = 0.33
      6 init_vals = [0.34, 50000, 0.33]     # for [amp, cen, wid]
----> 7 best_vals_gaussian, covar_gaussian = curve_fit(exponentialVariogram, xdata=vdata.lags.values, ydata=vdata.variogram.values, p0=init_vals)
      8 #best_vals_gaussian, covar_gaussian = curve_fit(exponentialVariogram, xdata=vdata.lags, ydata=vdata.variogram, p0=init_vals)
      9 #best_vals_gaussian, covar_gaussian = curve_fit(sphericalVariogram, xdata=vdata.lags, ydata=vdata.variogram, p0=init_vals)

/opt/conda/envs/biospytial/lib/python2.7/site-packages/scipy/optimize/minpack.pyc in curve_fit(f, xdata, ydata, p0, sigma, absolute_sigma, check_finite, bounds, method, **kwargs)
    649         # Remove full_output from kwargs, otherwise we're passing it in twice.
    650         return_full = kwargs.pop('full_output', False)
--> 651         res = leastsq(func, p0, args=args, full_output=1, **kwargs)
    652         popt, pcov, infodict, errmsg, ier = res
    653         cost = np.sum(infodict['fvec'] ** 2)

/opt/conda/envs/biospytial/lib/python2.7/site-packages/scipy/optimize/minpack.pyc in leastsq(func, x0, args, Dfun, full_output, col_deriv, ftol, xtol, gtol, maxfev, epsfcn, factor, diag)
    375     if not isinstance(args, tuple):
    376         args = (args,)
--> 377     shape, dtype = _check_func('leastsq', 'func', func, x0, args, n)
    378     m = shape[0]
    379     if n > m:

/opt/conda/envs/biospytial/lib/python2.7/site-packages/scipy/optimize/minpack.pyc in _check_func(checker, argname, thefunc, x0, args, numinputs, output_shape)
     24 def _check_func(checker, argname, thefunc, x0, args, numinputs,
     25                 output_shape=None):
---> 26     res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
     27     if (output_shape is not None) and (shape(res) != output_shape):
     28         if (output_shape[0] != 1):

/opt/conda/envs/biospytial/lib/python2.7/site-packages/scipy/optimize/minpack.pyc in _general_function(params, xdata, ydata, function)
    451 
    452 def _general_function(params, xdata, ydata, function):
--> 453     return function(xdata, *params) - ydata
    454 
    455 

<ipython-input-81-ddb384f3a18b> in exponentialVariogram(h, sill, range_a, nugget)
      1 def exponentialVariogram(h,sill=0,range_a=0,nugget=0):
----> 2     if isinstance(h,np.array):
      3         Ih = [1.0 if hx >= 0.0 else 0.0 for hx in h]
      4     else:
      5         Ih = 1.0 if h >= 0 else 0.0

TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types



In [63]:

    
gaussianVariogram(hx)









    




ValueErrorTraceback (most recent call last)
<ipython-input-63-980bf2de2f3a> in <module>()
----> 1 gaussianVariogram(hx)

<ipython-input-60-bc546c382462> in gaussianVariogram(h, sill, range_a, nugget)
      1 def gaussianVariogram(h,sill=0,range_a=0,nugget=0):
----> 2     Ih = 1.0 if h >= 0 else 0.0
      3     g_h = ((sill - nugget)*(1 - np.exp(-(h**2 / range_a**2)))) + nugget*Ih
      4     return g_h

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()



In [79]:

    
s =best_vals[0]
r = best_vals[1]
nugget = best_vals[2]
fitted_gaussianVariogram = lambda x : exponentialVariogram(x,sill=s,range_a=r,nugget=nugget)
gammas = pd.DataFrame(map(fitted_gaussianVariogram,hx))
import functools
fitted_gaussian2 = functools.partial(gaussianVariogram,s,r,nugget)









    




NameErrorTraceback (most recent call last)
<ipython-input-79-0e76bf7e7d37> in <module>()
----> 1 s =best_vals[0]
      2 r = best_vals[1]
      3 nugget = best_vals[2]
      4 fitted_gaussianVariogram = lambda x : exponentialVariogram(x,sill=s,range_a=r,nugget=nugget)
      5 gammas = pd.DataFrame(map(fitted_gaussianVariogram,hx))

NameError: name 'best_vals' is not defined



In [ ]:



In [33]:

    
hx = np.linspace(0,600000,100)
vg = tools.Variogram(section,'residuals1',using_distance_threshold=500000)
## already fitted previously
s = 0.345255240992
r = 65857.797111
nugget = 0.332850902482

Valid parametric empirical variogram

That, covariance matrix always postive semi-definite.

Gaussian Model

$$\gamma (h)=(s-n)\left(1-\exp \left(-{\frac {h^{2}}{r^{2}a}}\right)\right)+n1_{{(0,\infty )}}(h)$$



In [60]:

    
def gaussianVariogram(h,sill=0,range_a=0,nugget=0):
    Ih = 1.0 if h >= 0 else 0.0    
    g_h = ((sill - nugget)*(1 - np.exp(-(h**2 / range_a**2)))) + nugget*Ih
    return g_h

Exponential Model

$$\gamma (h)=(s-n)(1-\exp(-h/(ra)))+n1_{{(0,\infty )}}(h)$$



In [81]:

    
def exponentialVariogram(h,sill=0,range_a=0,nugget=0):
    if isinstance(h,np.array):
        Ih = [1.0 if hx >= 0.0 else 0.0 for hx in h]
    else:
        Ih = 1.0 if h >= 0 else 0.0
    g_h = (sill - nugget)*(1 - np.exp(-h/range_a)) + (nugget*Ih)
    return g_h



In [82]:

    
h = 2
[1.0 if hx >= 0.0 else 0.0 for hx in i









    



  File "<ipython-input-82-4a8a4d03b6f2>", line 2
    [1.0 if hx >= 0.0 else 0.0 for hx in i
                                          ^
SyntaxError: invalid syntax

Spherical Variogram

$$\gamma (h)=(s-n)\left(\left({\frac {3h}{2r}}-{\frac {h^{3}}{2r^{3}}}\right)1_{{(0,r)}}(h)+1_{{[r,\infty )}}(h)\right)+n1_{{(0,\infty )}}(h))$$



In [32]:

    
def sphericalVariogram(h,sill=0,range_a=0,nugget=0):
    Ih = 1.0 if h >= 0 else 0.0
    I0r = 1.0 if h <= range_a else 0.0
    Irinf = 1.0 if h > range_a else 0.0
    g_h = (sill - nugget)((3*h / float(2*range_a))*I0r + Irinf) - (h**3 / float(2*range_a)) + (nugget*Ih)
    return g_h



In [35]:

    
def theoreticalVariogram(model_function,sill,range_a,nugget):
    return lambda x : model_function(x,sill,range_a,nugget)



In [36]:

    
tvariogram = theoreticalVariogram(gaussianVariogram,s,r,nugget)
tvariogram = theoreticalVariogram(,s,r,nugget)



In [37]:

    
%time gs = np.array(map(tvariogram,hx))









    



CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.18 ms



In [38]:

    
x = vg.plot(with_envelope=True,num_iterations=30,refresh=False)
plt.plot(hx,gs,color='blue')









    Out[38]:





[<matplotlib.lines.Line2D at 0x7f23e857bf50>]



In [39]:

    
import statsmodels.regression.linear_model as lm



In [42]:

    
Mdist = vg.distance_coordinates.flatten()
%time vars = np.array(map(tvariogram,Mdist))









    



CPU times: user 1min 25s, sys: 570 ms, total: 1min 25s
Wall time: 1min 26s



In [43]:

    
CovMat = vars.reshape(len(section),len(section))
X = section.logSppN.values
Y = section.logBiomass.values



In [44]:

    
plt.imshow(CovMat)









    Out[44]:





<matplotlib.image.AxesImage at 0x7f23e84967d0>



In [46]:

    
%time model = lm.GLS(Y,X,sigma=CovMat)









    




LinAlgErrorTraceback (most recent call last)
<ipython-input-46-709b45fceee0> in <module>()
----> 1 get_ipython().magic(u'time model = lm.GLS(Y,X,sigma=CovMat)')

/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2156         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2157         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2158         return self.run_line_magic(magic_name, magic_arg_s)
   2159 
   2160     #-------------------------------------------------------------------------

/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2077                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2078             with self.builtin_trap:
-> 2079                 result = fn(*args,**kwargs)
   2080             return result
   2081 

<decorator-gen-59> in time(self, line, cell, local_ns)

/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1178         else:
   1179             st = clock2()
-> 1180             exec(code, glob, local_ns)
   1181             end = clock2()
   1182             out = None

<timed exec> in <module>()

/opt/conda/envs/biospytial/lib/python2.7/site-packages/statsmodels/regression/linear_model.pyc in __init__(self, endog, exog, sigma, missing, hasconst, **kwargs)
    438     #TODO: add options igls, for iterative fgls if sigma is None
    439     #TODO: default if sigma is none should be two-step GLS
--> 440         sigma, cholsigmainv = _get_sigma(sigma, len(endog))
    441 
    442         super(GLS, self).__init__(endog, exog, missing=missing,

/opt/conda/envs/biospytial/lib/python2.7/site-packages/statsmodels/regression/linear_model.pyc in _get_sigma(sigma, nobs)
     77             raise ValueError("Sigma must be a scalar, 1d of length %s or a 2d "
     78                              "array of shape %s x %s" % (nobs, nobs, nobs))
---> 79         cholsigmainv = np.linalg.cholesky(np.linalg.pinv(sigma)).T
     80 
     81     return sigma, cholsigmainv

/opt/conda/envs/biospytial/lib/python2.7/site-packages/numpy/linalg/linalg.pyc in cholesky(a)
    610     t, result_t = _commonType(a)
    611     signature = 'D->D' if isComplexType(t) else 'd->d'
--> 612     r = gufunc(a, signature=signature, extobj=extobj)
    613     return wrap(r.astype(result_t, copy=False))
    614 

/opt/conda/envs/biospytial/lib/python2.7/site-packages/numpy/linalg/linalg.pyc in _raise_linalgerror_nonposdef(err, flag)
     91 
     92 def _raise_linalgerror_nonposdef(err, flag):
---> 93     raise LinAlgError("Matrix is not positive definite")
     94 
     95 def _raise_linalgerror_eigenvalues_nonconvergence(err, flag):

LinAlgError: Matrix is not positive definite



In [ ]:

    
%time results = model.fit()



In [ ]:

	0	1
Intercept	8.471155	8.505236
logSppN	0.363325	0.384578

Dep. Variable:	logBiomass	R-squared:	0.114
Model:	OLS	Adj. R-squared:	0.114
Method:	Least Squares	F-statistic:	4757.
Date:	Thu, 23 Nov 2017	Prob (F-statistic):	0.00
Time:	00:54:54	Log-Likelihood:	-36670.
No. Observations:	36845	AIC:	7.334e+04
Df Residuals:	36843	BIC:	7.336e+04
Df Model:	1
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	8.4882	0.009	976.347	0.000	8.471 8.505
logSppN	0.3740	0.005	68.974	0.000	0.363 0.385

Omnibus:	1627.980	Durbin-Watson:	1.701
Prob(Omnibus):	0.000	Jarque-Bera (JB):	2356.149
Skew:	-0.421	Prob(JB):	0.00
Kurtosis:	3.908	Cond. No.	5.50