Process the global variogram


In [2]:
# Load Biospytial modules and etc.
%matplotlib inline
import sys
sys.path.append('/apps')
import django
django.setup()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
## Use the ggplot style
plt.style.use('ggplot')

In [2]:
from external_plugins.spystats import tools
%run ../testvariogram.py


/opt/conda/envs/biospytial/lib/python2.7/site-packages/IPython/core/pylabtools.py:168: DtypeWarning: Columns (24) have mixed types. Specify dtype option on import or set low_memory=False.
  safe_execfile(fname,*where,**kw)

In [3]:
%time vg = tools.Variogram(new_data,'residuals1',using_distance_threshold=500000)


CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 594 µs

In [4]:
### Test creation of chunks
chunks = tools.PartitionDataSet(new_data,namecolumnx='newLon',namecolumny='newLat',n_chunks=3)

In [5]:
sizes = map(lambda c : c.shape[0],chunks)

In [6]:
vg0 = tools.Variogram(chunks[0],response_variable_name='residuals1',using_distance_threshold=500000)
vg1 = tools.Variogram(chunks[1],response_variable_name='residuals1',using_distance_threshold=500000)
vg2 = tools.Variogram(chunks[2],response_variable_name='residuals1',using_distance_threshold=500000)
vg3 = tools.Variogram(chunks[3],response_variable_name='residuals1',using_distance_threshold=500000)

STOP HERE This will calculate the variogram with chunks


In [ ]:
%time vg0.plot(num_iterations=50,with_envelope=True)

In [ ]:
chunks[0].plot(column='residuals1')

In [ ]:
%time vg1.plot(num_iterations=50,with_envelope=True)

In [ ]:
chunks[1].plot(column='residuals1')

In [ ]:
%time vg2.plot(num_iterations=50,with_envelope=True)

In [ ]:
chunks[2].plot(column='residuals1')

In [ ]:
%time vg3.plot(num_iterations=50,with_envelope=True)

In [ ]:
chunks[3].plot(column='residuals1')

In [ ]:
envelopes = map(lambda c : c.envelope,chunks)

In [ ]:
c = chunks[0]

In [ ]:
variograms = [vg0,vg1,vg2,vg3]

In [ ]:
envelopes = map(lambda v : v.envelope,variograms)

In [ ]:
colors = plt.rcParams['axes.prop_cycle']
colors = ['red','green','grey','orange']

In [ ]:
plt.figure(figsize=(12, 6))
for i,envelope in enumerate(envelopes):
    plt.plot(envelope.lags,envelope.envhigh,'k--')
    plt.plot(envelope.lags,envelope.envlow,'k--')
    plt.fill_between(envelope.lags,envelope.envlow,envelope.envhigh,alpha=0.5,color=colors[i])
    plt.plot(envelope.lags,envelope.variogram,'o--',lw=2.0,color=colors[i])
    
plt.legend(labels=['97.5%','emp. varig','2.5%'])

Now the global variogram

For doing this I need to take a weighted average. Or.. you can run it in the HEC machine! (as me did)


In [1]:
filename = "../HEC_runs/results/low_q/data_envelope.csv"
envelope_data = pd.read_csv(filename)



NameErrorTraceback (most recent call last)
<ipython-input-1-f8586b6394dd> in <module>()
      1 filename = "../HEC_runs/results/low_q/data_envelope.csv"
----> 2 envelope_data = pd.read_csv(filename)

NameError: name 'pd' is not defined

Gaussian semivariogram

$\gamma (h)=(s-n)\left(1-\exp \left(-{\frac {h^{2}}{r^{2}a}}\right)\right)+n1_{{(0,\infty )}}(h)$


In [8]:
def gaussianVariogram(h,sill=0,range_a=0,nugget=0):
    g_h = ((sill - nugget)*(1 - np.exp(-(h**2 / range_a**2)))) + nugget
    return g_h

In [12]:
hx = np.linspace(0,600000,100)
vg = tools.Variogram(new_data,'residuals1',using_distance_threshold=500000)
vg.envelope = envelope_data
vg.empirical = vg.envelope.variogram
vg.lags = vg.envelope.lags
vdata = vg.envelope.dropna()

In [13]:
from scipy.optimize import curve_fit
s = 0.345
r = 100000.0
nugget = 0.33
init_vals = [0.34, 50000, 0.33]     # for [amp, cen, wid]
best_vals, covar = curve_fit(gaussianVariogram, xdata=vdata.lags, ydata=vdata.variogram, p0=init_vals)
s =best_vals[0]
r = best_vals[1]
nugget = best_vals[2]
fitted_gaussianVariogram = lambda x : gaussianVariogram(x,sill=s,range_a=r,nugget=nugget)
gammas = pd.DataFrame(map(fitted_gaussianVariogram,hx))
import functools
fitted_gaussian2 = functools.partial(gaussianVariogram,s,r,nugget)

In [14]:
print(s)
print(r)
print(nugget)


0.345255240992
65857.797111
0.332850902482

In [ ]:
vg.plot(refresh=False)
plt.plot(hx,gammas,'green',lw=2)

In [ ]:
Mdist = vg.distance_coordinates.flatten()

In [ ]:
## Let's do a small subset
ch = Mdist[0:10000000]

In [ ]:
#%time covariance_matrix = map(fitted_gaussianVariogram,Mdist)
%time vars = np.array(map(fitted_gaussianVariogram,ch))

In [ ]:
plt.imshow(vars.reshape(1000,1000))

In [ ]:
## Save it in redis
import redis
con = redis.StrictRedis(host='redis')

In [ ]:
con.set('small_dist_mat1',vars)

In [ ]:
import multiprocessing as multi
from multiprocessing import Manager

In [ ]:
manager = Manager()

p=multi.Pool(processes=4)

In [ ]:
%time vars = p.map(fitted_gaussian2,ch,chunksize=len(ch)/3)

In [ ]:
%time vars = np.array(map(fitted_gaussianVariogram,ch))

In [17]:
88.36*30


Out[17]:
2650.8

In [ ]: