In [13]:
# Load Biospytial modules and etc.
%matplotlib inline
import sys
sys.path.append('/apps')
import django
django.setup()
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
## Use the ggplot style
plt.style.use('ggplot')

In [14]:
# My mac
#data = pd.read_csv("/RawDataCSV/plotsClimateData_11092017.csv")
# My Linux desktop
data = pd.read_csv("/RawDataCSV/idiv_share/plotsClimateData_11092017.csv")


/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (24) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [15]:
data.SppN.mean()


Out[15]:
5.1824128104220382

In [16]:
import geopandas as gpd

In [17]:
from django.contrib.gis import geos
from shapely.geometry import Point

In [18]:
data['geometry'] = data.apply(lambda z: Point(z.LON, z.LAT), axis=1)
new_data = gpd.GeoDataFrame(data)

Let´s reproject to Alberts or something with distance


In [19]:
new_data.crs = {'init':'epsg:4326'}

Uncomment to reproject

proj string taken from: http://spatialreference.org/


In [20]:
new_data =  new_data.to_crs("+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m +no_defs ")
new_data['newLon'] = new_data.apply(lambda c : c.geometry.x, axis=1)
new_data['newLat'] = new_data.apply(lambda c : c.geometry.y, axis=1)

In [21]:
new_data['logBiomass'] = np.log(new_data.plotBiomass)
new_data['logSppN'] = np.log(new_data.SppN)

In [22]:
## Let´s make a simple linear trend here.
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [23]:
## All data
### Now with statsmodels.api

#xx = X.SppN.values.reshape(-1,1)
#xx = sm.add_constant(xx)
#model = sm.OLS(Y.values.reshape(-1,1),xx)

model = smf.ols(formula='logBiomass ~ logSppN',data=new_data)
results = model.fit()
param_model = results.params
results.summary()


Out[23]:
OLS Regression Results
Dep. Variable: logBiomass R-squared: 0.114
Model: OLS Adj. R-squared: 0.114
Method: Least Squares F-statistic: 4757.
Date: Thu, 23 Nov 2017 Prob (F-statistic): 0.00
Time: 00:54:54 Log-Likelihood: -36670.
No. Observations: 36845 AIC: 7.334e+04
Df Residuals: 36843 BIC: 7.336e+04
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 8.4882 0.009 976.347 0.000 8.471 8.505
logSppN 0.3740 0.005 68.974 0.000 0.363 0.385
Omnibus: 1627.980 Durbin-Watson: 1.701
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2356.149
Skew: -0.421 Prob(JB): 0.00
Kurtosis: 3.908 Cond. No. 5.50

In [24]:
new_data['residuals1'] = results.resid

The area is very big -> 35000 points.

We need to make a subset of this


In [25]:
# COnsider the the following subregion
section = new_data[lambda x:  (x.LON > -100) & (x.LON < -85) & (x.LAT > 30) & (x.LAT < 35) ]
section.plot(column='SppN')
section.plot(column='plotBiomass')
section.shape


Out[25]:
(4657, 47)

Residuals


In [14]:
Y_hat = results.predict(section)

In [15]:
ress = (section.logBiomass - Y_hat)

In [16]:
param_model.Intercept


Out[16]:
8.488195471616315

In [17]:
conf_int = results.conf_int(alpha=0.05)
plt.scatter(section.SppN,section.plotBiomass)
#plt.plot(section.SppN,param_model.Intercept + param_model.SppN * section.SppN)

plt.plot(section.SppN,Y_hat)
#plt.fill_between(Y_hat,Y_hat + conf_int , Y_hat - conf_int)
conf_int


Out[17]:
0 1
Intercept 8.471155 8.505236
logSppN 0.363325 0.384578

In [20]:
plt.scatter(section.logSppN,section.residuals1)


Out[20]:
<matplotlib.collections.PathCollection at 0x7f6ee62b68d0>

In [23]:
plt.scatter(section.newLon,section.newLat,c=section.residuals1)
plt.colorbar()


Out[23]:
<matplotlib.colorbar.Colorbar at 0x7f6ee60d3fd0>

In [24]:
# Import GPFlow
import GPflow as gf
k = gf.kernels.Matern12(2, lengthscales=0.2, active_dims = [0,1] ) + gf.kernels.Constant(2,active_dims=[0,1])

In [25]:
results.resid.plot.hist()


Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6ee63e0550>

In [28]:
model = gf.gpr.GPR(section[['newLon','newLat']].as_matrix(),section.residuals1.values.reshape(-1,1),k)

In [29]:
%time model.optimize()


CPU times: user 28min 25s, sys: 30.9 s, total: 28min 56s
Wall time: 10min 32s
Out[29]:
      fun: 4417.2589165690879
 hess_inv: <4x4 LbfgsInvHessProduct with dtype=float64>
      jac: array([  1.31733145e-05,  -0.00000000e+00,  -2.29640029e-05,
        -2.29639988e-05])
  message: 'CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH'
     nfev: 24
      nit: 18
   status: 0
  success: True
        x: array([-5.23780247, -1.50777732, -1.53581048, -1.53581191])

In [30]:
k.get_parameter_dict()


Out[30]:
{'name.kern.constant.variance': array([ 0.00529886]),
 'name.kern.matern12.lengthscales': array([ 0.2]),
 'name.kern.matern12.variance': array([ 0.19497644])}

In [ ]:
model.get_parameter_dict()

In [ ]:
import numpy as np
Nn = 300
dsc = section
predicted_x = np.linspace(min(dsc.newLon),max(dsc.newLon),Nn)
predicted_y = np.linspace(min(dsc.newLat),max(dsc.newLat),Nn)
Xx, Yy = np.meshgrid(predicted_x,predicted_y)
## Fake richness
fake_sp_rich = np.ones(len(Xx.ravel()))
predicted_coordinates = np.vstack([ Xx.ravel(), Yy.ravel()]).transpose()
#predicted_coordinates = np.vstack([section.SppN, section.newLon,section.newLat]).transpose()

In [ ]:
predicted_coordinates.shape

In [ ]:
means,variances = model.predict_y(predicted_coordinates)

In [ ]:
sum(means)

In [ ]:
fig = plt.figure(figsize=(16,10), dpi= 80, facecolor='w', edgecolor='w')
#plt.pcolor(Xx,Yy,np.sqrt(variances.reshape(Nn,Nn))) #,cmap=plt.cm.Greens)
plt.pcolormesh(Xx,Yy,np.sqrt(variances.reshape(Nn,Nn)))
plt.colorbar()
plt.scatter(dsc.newLon,dsc.newLat,c=dsc.SppN,edgecolors='')
plt.title("VAriance Biomass")
plt.colorbar()

In [ ]:
import cartopy
plt.figure(figsize=(17,11))

proj = cartopy.crs.PlateCarree()
ax = plt.subplot(111, projection=proj)


ax = plt.axes(projection=proj)
#algo = new_data.plot(column='SppN',ax=ax,cmap=colormap,edgecolors='')


#ax.set_extent([-93, -70, 30, 50])
#ax.set_extent([-100, -60, 20, 50])
ax.set_extent([-95, -70, 25, 45])

#ax.add_feature(cartopy.feature.LAND)
ax.add_feature(cartopy.feature.OCEAN)
ax.add_feature(cartopy.feature.COASTLINE)
ax.add_feature(cartopy.feature.BORDERS, linestyle=':')
ax.add_feature(cartopy.feature.LAKES, alpha=0.9)
ax.stock_img()
#ax.add_geometries(new_data.geometry,crs=cartopy.crs.PlateCarree())
#ax.add_feature(cartopy.feature.RIVERS)
mm = ax.pcolormesh(Xx,Yy,means.reshape(Nn,Nn),transform=proj )
#cs = plt.contour(Xx,Yy,np.sqrt(variances).reshape(Nn,Nn),linewidths=2,cmap=plt.cm.Greys_r,linestyles='dotted')
cs = plt.contour(Xx,Yy,means.reshape(Nn,Nn),linewidths=2,colors='k',linestyles='dotted',levels=[4.0,5.0,6.0,7.0,8.0])
plt.clabel(cs, fontsize=16,inline=True,fmt='%1.1f')
#ax.scatter(new_data.lon,new_data.lat,c=new_data.error,edgecolors='',transform=proj,cmap=plt.cm.Greys,alpha=0.2)
plt.colorbar(mm)
plt.title("Predicted Species Richness")


#(x.LON > -90) & (x.LON < -80) & (x.LAT > 40) & (x.LAT < 50)

Residuals of $ Biomass ~ SppRich + Z(x,y) + \epsilon $

Using all data

Model Analysis with the empirical variogram


In [48]:
from external_plugins.spystats import tools

In [53]:
filename = "../HEC_runs/results/low_q/data_envelope.csv"
envelope_data = pd.read_csv(filename)
gvg = tools.Variogram(new_data,'logBiomass',using_distance_threshold=600000)
gvg.envelope = envelope_data
gvg.empirical = gvg.envelope.variogram
gvg.lags = gvg.envelope.lags
vdata = gvg.envelope.dropna()

In [54]:
gvg.plot(refresh=False)


Fitting the empirical variogram to into a theoretical model


In [59]:



Out[59]:
0          0.000000
1      13469.387755
2      26938.775510
3      40408.163265
4      53877.551020
5      67346.938776
6      80816.326531
7      94285.714286
8     107755.102041
9     121224.489796
10    134693.877551
11    148163.265306
12    161632.653061
13    175102.040816
14    188571.428571
15    202040.816327
16    215510.204082
17    228979.591837
18    242448.979592
19    255918.367347
20    269387.755102
21    282857.142857
22    296326.530612
23    309795.918367
24    323265.306122
25    336734.693878
26    350204.081633
27    363673.469388
28    377142.857143
29    390612.244898
30    404081.632653
31    417551.020408
32    431020.408163
33    444489.795918
34    457959.183673
35    471428.571429
36    484897.959184
37    498367.346939
38    511836.734694
39    525306.122449
40    538775.510204
41    552244.897959
42    565714.285714
43    579183.673469
44    592653.061224
Name: lags, dtype: float64

In [83]:
from scipy.optimize import curve_fit
s = 0.345
r = 100000.0
nugget = 0.33
init_vals = [0.34, 50000, 0.33]     # for [amp, cen, wid]
best_vals_gaussian, covar_gaussian = curve_fit(exponentialVariogram, xdata=vdata.lags.values, ydata=vdata.variogram.values, p0=init_vals)
#best_vals_gaussian, covar_gaussian = curve_fit(exponentialVariogram, xdata=vdata.lags, ydata=vdata.variogram, p0=init_vals)
#best_vals_gaussian, covar_gaussian = curve_fit(sphericalVariogram, xdata=vdata.lags, ydata=vdata.variogram, p0=init_vals)
v



TypeErrorTraceback (most recent call last)
<ipython-input-83-50966a4e2b74> in <module>()
      5 nugget = 0.33
      6 init_vals = [0.34, 50000, 0.33]     # for [amp, cen, wid]
----> 7 best_vals_gaussian, covar_gaussian = curve_fit(exponentialVariogram, xdata=vdata.lags.values, ydata=vdata.variogram.values, p0=init_vals)
      8 #best_vals_gaussian, covar_gaussian = curve_fit(exponentialVariogram, xdata=vdata.lags, ydata=vdata.variogram, p0=init_vals)
      9 #best_vals_gaussian, covar_gaussian = curve_fit(sphericalVariogram, xdata=vdata.lags, ydata=vdata.variogram, p0=init_vals)

/opt/conda/envs/biospytial/lib/python2.7/site-packages/scipy/optimize/minpack.pyc in curve_fit(f, xdata, ydata, p0, sigma, absolute_sigma, check_finite, bounds, method, **kwargs)
    649         # Remove full_output from kwargs, otherwise we're passing it in twice.
    650         return_full = kwargs.pop('full_output', False)
--> 651         res = leastsq(func, p0, args=args, full_output=1, **kwargs)
    652         popt, pcov, infodict, errmsg, ier = res
    653         cost = np.sum(infodict['fvec'] ** 2)

/opt/conda/envs/biospytial/lib/python2.7/site-packages/scipy/optimize/minpack.pyc in leastsq(func, x0, args, Dfun, full_output, col_deriv, ftol, xtol, gtol, maxfev, epsfcn, factor, diag)
    375     if not isinstance(args, tuple):
    376         args = (args,)
--> 377     shape, dtype = _check_func('leastsq', 'func', func, x0, args, n)
    378     m = shape[0]
    379     if n > m:

/opt/conda/envs/biospytial/lib/python2.7/site-packages/scipy/optimize/minpack.pyc in _check_func(checker, argname, thefunc, x0, args, numinputs, output_shape)
     24 def _check_func(checker, argname, thefunc, x0, args, numinputs,
     25                 output_shape=None):
---> 26     res = atleast_1d(thefunc(*((x0[:numinputs],) + args)))
     27     if (output_shape is not None) and (shape(res) != output_shape):
     28         if (output_shape[0] != 1):

/opt/conda/envs/biospytial/lib/python2.7/site-packages/scipy/optimize/minpack.pyc in _general_function(params, xdata, ydata, function)
    451 
    452 def _general_function(params, xdata, ydata, function):
--> 453     return function(xdata, *params) - ydata
    454 
    455 

<ipython-input-81-ddb384f3a18b> in exponentialVariogram(h, sill, range_a, nugget)
      1 def exponentialVariogram(h,sill=0,range_a=0,nugget=0):
----> 2     if isinstance(h,np.array):
      3         Ih = [1.0 if hx >= 0.0 else 0.0 for hx in h]
      4     else:
      5         Ih = 1.0 if h >= 0 else 0.0

TypeError: isinstance() arg 2 must be a class, type, or tuple of classes and types

In [63]:
gaussianVariogram(hx)



ValueErrorTraceback (most recent call last)
<ipython-input-63-980bf2de2f3a> in <module>()
----> 1 gaussianVariogram(hx)

<ipython-input-60-bc546c382462> in gaussianVariogram(h, sill, range_a, nugget)
      1 def gaussianVariogram(h,sill=0,range_a=0,nugget=0):
----> 2     Ih = 1.0 if h >= 0 else 0.0
      3     g_h = ((sill - nugget)*(1 - np.exp(-(h**2 / range_a**2)))) + nugget*Ih
      4     return g_h

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [79]:
s =best_vals[0]
r = best_vals[1]
nugget = best_vals[2]
fitted_gaussianVariogram = lambda x : exponentialVariogram(x,sill=s,range_a=r,nugget=nugget)
gammas = pd.DataFrame(map(fitted_gaussianVariogram,hx))
import functools
fitted_gaussian2 = functools.partial(gaussianVariogram,s,r,nugget)



NameErrorTraceback (most recent call last)
<ipython-input-79-0e76bf7e7d37> in <module>()
----> 1 s =best_vals[0]
      2 r = best_vals[1]
      3 nugget = best_vals[2]
      4 fitted_gaussianVariogram = lambda x : exponentialVariogram(x,sill=s,range_a=r,nugget=nugget)
      5 gammas = pd.DataFrame(map(fitted_gaussianVariogram,hx))

NameError: name 'best_vals' is not defined

In [ ]:


In [33]:
hx = np.linspace(0,600000,100)
vg = tools.Variogram(section,'residuals1',using_distance_threshold=500000)
## already fitted previously
s = 0.345255240992
r = 65857.797111
nugget = 0.332850902482

Valid parametric empirical variogram

That, covariance matrix always postive semi-definite.

Gaussian Model

$$\gamma (h)=(s-n)\left(1-\exp \left(-{\frac {h^{2}}{r^{2}a}}\right)\right)+n1_{{(0,\infty )}}(h)$$

In [60]:
def gaussianVariogram(h,sill=0,range_a=0,nugget=0):
    Ih = 1.0 if h >= 0 else 0.0    
    g_h = ((sill - nugget)*(1 - np.exp(-(h**2 / range_a**2)))) + nugget*Ih
    return g_h

Exponential Model

$$\gamma (h)=(s-n)(1-\exp(-h/(ra)))+n1_{{(0,\infty )}}(h)$$

In [81]:
def exponentialVariogram(h,sill=0,range_a=0,nugget=0):
    if isinstance(h,np.array):
        Ih = [1.0 if hx >= 0.0 else 0.0 for hx in h]
    else:
        Ih = 1.0 if h >= 0 else 0.0
    g_h = (sill - nugget)*(1 - np.exp(-h/range_a)) + (nugget*Ih)
    return g_h

In [82]:
h = 2
[1.0 if hx >= 0.0 else 0.0 for hx in i


  File "<ipython-input-82-4a8a4d03b6f2>", line 2
    [1.0 if hx >= 0.0 else 0.0 for hx in i
                                          ^
SyntaxError: invalid syntax

Spherical Variogram

$$\gamma (h)=(s-n)\left(\left({\frac {3h}{2r}}-{\frac {h^{3}}{2r^{3}}}\right)1_{{(0,r)}}(h)+1_{{[r,\infty )}}(h)\right)+n1_{{(0,\infty )}}(h))$$

In [32]:
def sphericalVariogram(h,sill=0,range_a=0,nugget=0):
    Ih = 1.0 if h >= 0 else 0.0
    I0r = 1.0 if h <= range_a else 0.0
    Irinf = 1.0 if h > range_a else 0.0
    g_h = (sill - nugget)((3*h / float(2*range_a))*I0r + Irinf) - (h**3 / float(2*range_a)) + (nugget*Ih)
    return g_h

In [35]:
def theoreticalVariogram(model_function,sill,range_a,nugget):
    return lambda x : model_function(x,sill,range_a,nugget)

In [36]:
tvariogram = theoreticalVariogram(gaussianVariogram,s,r,nugget)
tvariogram = theoreticalVariogram(,s,r,nugget)

In [37]:
%time gs = np.array(map(tvariogram,hx))


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.18 ms

In [38]:
x = vg.plot(with_envelope=True,num_iterations=30,refresh=False)
plt.plot(hx,gs,color='blue')


Out[38]:
[<matplotlib.lines.Line2D at 0x7f23e857bf50>]

In [39]:
import statsmodels.regression.linear_model as lm

In [42]:
Mdist = vg.distance_coordinates.flatten()
%time vars = np.array(map(tvariogram,Mdist))


CPU times: user 1min 25s, sys: 570 ms, total: 1min 25s
Wall time: 1min 26s

In [43]:
CovMat = vars.reshape(len(section),len(section))
X = section.logSppN.values
Y = section.logBiomass.values

In [44]:
plt.imshow(CovMat)


Out[44]:
<matplotlib.image.AxesImage at 0x7f23e84967d0>

In [46]:
%time model = lm.GLS(Y,X,sigma=CovMat)



LinAlgErrorTraceback (most recent call last)
<ipython-input-46-709b45fceee0> in <module>()
----> 1 get_ipython().magic(u'time model = lm.GLS(Y,X,sigma=CovMat)')

/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2156         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2157         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2158         return self.run_line_magic(magic_name, magic_arg_s)
   2159 
   2160     #-------------------------------------------------------------------------

/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2077                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2078             with self.builtin_trap:
-> 2079                 result = fn(*args,**kwargs)
   2080             return result
   2081 

<decorator-gen-59> in time(self, line, cell, local_ns)

/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    186     # but it's overkill for just that one bit of state.
    187     def magic_deco(arg):
--> 188         call = lambda f, *a, **k: f(*a, **k)
    189 
    190         if callable(arg):

/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1178         else:
   1179             st = clock2()
-> 1180             exec(code, glob, local_ns)
   1181             end = clock2()
   1182             out = None

<timed exec> in <module>()

/opt/conda/envs/biospytial/lib/python2.7/site-packages/statsmodels/regression/linear_model.pyc in __init__(self, endog, exog, sigma, missing, hasconst, **kwargs)
    438     #TODO: add options igls, for iterative fgls if sigma is None
    439     #TODO: default if sigma is none should be two-step GLS
--> 440         sigma, cholsigmainv = _get_sigma(sigma, len(endog))
    441 
    442         super(GLS, self).__init__(endog, exog, missing=missing,

/opt/conda/envs/biospytial/lib/python2.7/site-packages/statsmodels/regression/linear_model.pyc in _get_sigma(sigma, nobs)
     77             raise ValueError("Sigma must be a scalar, 1d of length %s or a 2d "
     78                              "array of shape %s x %s" % (nobs, nobs, nobs))
---> 79         cholsigmainv = np.linalg.cholesky(np.linalg.pinv(sigma)).T
     80 
     81     return sigma, cholsigmainv

/opt/conda/envs/biospytial/lib/python2.7/site-packages/numpy/linalg/linalg.pyc in cholesky(a)
    610     t, result_t = _commonType(a)
    611     signature = 'D->D' if isComplexType(t) else 'd->d'
--> 612     r = gufunc(a, signature=signature, extobj=extobj)
    613     return wrap(r.astype(result_t, copy=False))
    614 

/opt/conda/envs/biospytial/lib/python2.7/site-packages/numpy/linalg/linalg.pyc in _raise_linalgerror_nonposdef(err, flag)
     91 
     92 def _raise_linalgerror_nonposdef(err, flag):
---> 93     raise LinAlgError("Matrix is not positive definite")
     94 
     95 def _raise_linalgerror_eigenvalues_nonconvergence(err, flag):

LinAlgError: Matrix is not positive definite

In [ ]:
%time results = model.fit()

In [ ]: