In [1]:
# Load Biospytial modules and etc.
%matplotlib inline
import sys
sys.path.append('/apps')
import django
django.setup()
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
## Use the ggplot style
plt.style.use('ggplot')
sys.path.append('..')
import tools
import geopandas as gpd
from HEC_runs.fit_fia_logbiomass_logspp_GLS import prepareDataFrame, createVariogram, buildSpatialStructure,calculateGLS, bundleToGLS, fitLinearLogLogModel
new_data['residuals1'] = results.resid
In [2]:
# Load the FIA plots dataset; prepareDataFrame (project code in HEC_runs)
# reads the CSV and presumably derives columns such as logBiomass -- confirm there.
#new_data = prepareDataFrame("/RawDataCSV/idiv_share/plotsClimateData_11092017.csv")
## On the HEC cluster
#new_data = prepareDataFrame("/home/hpc/28/escamill/csv_data/idiv/plotsClimateData_11092017.csv")
## New "official" dataset
new_data = prepareDataFrame("/RawDataCSV/idiv_share/FIA_Plots_Biomass_11092017.csv")
# On HEC
#new_data = prepareDataFrame("/home/hpc/28/escamill/csv_data/idiv/FIA_Plots_Biomass_11092017.csv")
In [3]:
def systSelection(dataframe, k):
    """
    Draw a systematic sample from `dataframe`: keep every k-th row.

    Parameters
    ----------
    dataframe : pandas.DataFrame or geopandas.GeoDataFrame
        Data to sample from.
    k : int
        Sampling interval; positions 0, k, 2k, ... are retained.

    Returns
    -------
    Positional subselection of `dataframe` (same type as the input).
    """
    kept_positions = range(0, len(dataframe), k)
    return dataframe.iloc[kept_positions]
##################
k = 10 # Systematic-sampling interval: keep every k-th row
In [4]:
# Draw the systematic sample from the full dataset
systematic_sample = systSelection(new_data,k)
In [5]:
# Map of the systematic sample coloured by logBiomass
ax= systematic_sample.plot(column='logBiomass',figsize=(16,10),cmap=plt.cm.Blues,edgecolors='')
In [6]:
def randomSelection(dataframe, p):
    """
    Draw a simple random sample of `p` rows, without replacement.

    Parameters
    ----------
    dataframe : pandas.DataFrame or geopandas.GeoDataFrame
        Data to sample from.
    p : int
        Number of rows to draw (must not exceed len(dataframe)).

    Returns
    -------
    Positional subselection with `p` distinct rows, in draw order.
    """
    chosen = np.random.choice(len(dataframe), p, replace=False)
    return dataframe.iloc[chosen]
#################
n = len(new_data)  # total number of plots (informational; randomSelection recomputes it)
p = 3000 # The amount of samples taken (without replacement)
In [7]:
def subselectDataFrameByCoordinates(dataframe, namecolumnx, namecolumny, minx, maxx, miny, maxy):
    """
    Return the rows whose coordinates fall strictly inside a bounding box.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data with one column per coordinate axis.
    namecolumnx, namecolumny : str
        Names of the x (e.g. longitude) and y (e.g. latitude) columns.
    minx, maxx, miny, maxy : numeric
        Box limits; coerced to float. Boundaries are EXCLUSIVE.

    Returns
    -------
    The subselection of `dataframe` inside the box.
    """
    minx, maxx = float(minx), float(maxx)
    miny, maxy = float(miny), float(maxy)
    xs = dataframe[namecolumnx]
    ys = dataframe[namecolumny]
    inside_box = (xs > minx) & (xs < maxx) & (ys > miny) & (ys < maxy)
    return dataframe[inside_box]
In [8]:
# Consider the following subregion (south-central US bounding box, degrees)
minx = -100
maxx = -85
miny = 30
maxy = 35
section = subselectDataFrameByCoordinates(new_data,'LON','LAT',minx,maxx,miny,maxy)
#section = new_data[lambda x: (x.LON > minx) & (x.LON < maxx) & (x.LAT > miny) & (x.LAT < maxy) ]
# Quick map of the subregion coloured by logBiomass
section.plot(column='logBiomass')
Out[8]:
In [9]:
# old variogram (using all data sets)
#gvg,tt = createVariogram("/apps/external_plugins/spystats/HEC_runs/results/logbiomas_logsppn_res.csv",new_data)
## New variogram for new data: gvg is the fitted variogram object;
## tt is presumably the accompanying table returned by createVariogram -- verify in HEC_runs.
gvg,tt = createVariogram("/apps/external_plugins/spystats/HEC_runs/results/variogram/data_envelope.csv",new_data)
# For HEC
#gvg,tt = createVariogram("/home/hpc/28/escamill/spystats/HEC_runs/results/variogram/data_envelope.csv",new_data)
In [10]:
# numpy is already imported at the top of the notebook; the duplicate
# `import numpy as np` that lived here has been removed (scattered imports
# hide dependencies and break the one-imports-cell convention).
xx = np.linspace(0,1000000,1000)  # lag distances at which to draw the fitted model curve
gvg.plot(refresh=False)
plt.plot(xx,gvg.model.f(xx),lw=2.0,c='k')
plt.title("Empirical Variogram with fitted Whittle Model")
Out[10]:
In [11]:
# Display the fitted variogram model (rich repr shows its parameters)
gvg.model
Out[11]:
In [12]:
# NOTE(review): the original first assignment was dead code -- it was
# immediately overwritten by the next line -- so it is commented out.
# samples = map(lambda i : systSelection(new_data,i), range(20,2,-1))
# Materialize as a list: later cells index it (samples[0], samples[8], samples[22])
# and iterate it more than once, neither of which a bare Python 3 `map` object supports.
samples = list(map(lambda i : randomSelection(new_data,3000),range(100)))
In [13]:
# Build a variogram for the first random sample using the already-fitted model.
s = samples[0]
# Fixed column-name typo: 'logBiomas' -> 'logBiomass' (the column name used
# everywhere else in this notebook, e.g. the plots above and secvg below).
vs = tools.Variogram(s,'logBiomass',model=gvg.model)
In [14]:
# Benchmark access to the pairwise distance matrix of the sample coordinates
%timeit vs.distance_coordinates
In [15]:
# Time one evaluation of the fitted variogram function over all pairwise distances
%time vs.model.f(vs.distance_coordinates.flatten())
Out[15]:
In [16]:
## let's try to use a better model
# Same evaluation as the timed cell above, untimed, so the output can be inspected
vs.model.f(vs.distance_coordinates.flatten())
Out[16]:
In [17]:
# Time the correlation function over all pairwise distances, reshaped back to matrix form
%time vs.model.corr_f(vs.distance_coordinates.flatten()).reshape(vs.distance_coordinates.shape)
Out[17]:
In [18]:
# Candidate covariance models; sill/range/nugget values were taken from earlier
# fits (magic numbers -- provenance not shown in this notebook).
# NOTE(review): the spherical model's sill (~2.7e11) and range (~3.9e19) look
# degenerate, suggesting that fit did not converge -- treat its results with caution.
matern_model = tools.MaternVariogram(sill=0.340125401705,range_a=5577.83789733, nugget=0.33, kappa=4)
whittle_model = tools.WhittleVariogram(sill=0.340288288241, range_a=40963.3203528, nugget=0.329830410223, alpha=1.12279978135)
exp_model = tools.ExponentialVariogram(sill=0.340294258738, range_a=38507.8253768, nugget=0.329629457808)
gaussian_model = tools.GaussianVariogram(sill=0.340237044718, range_a=44828.0323827, nugget=0.330734960804)
spherical_model = tools.SphericalVariogram(sill=266491706445.0, range_a=3.85462485193e+19, nugget=0.3378323178453)
In [19]:
# Compare the cost of evaluating the semivariance function f() for each candidate model
%time matern_model.f(vs.distance_coordinates.flatten())
%time whittle_model.f(vs.distance_coordinates.flatten())
%time exp_model.f(vs.distance_coordinates.flatten())
%time gaussian_model.f(vs.distance_coordinates.flatten())
%time spherical_model.f(vs.distance_coordinates.flatten())
Out[19]:
In [27]:
# Time the new corr_f implementation against corr_f_old for each model;
# the resulting arrays are compared for equality a few cells below.
%time mcf = matern_model.corr_f(vs.distance_coordinates.flatten())
%time wcf = whittle_model.corr_f(vs.distance_coordinates.flatten())
%time ecf = exp_model.corr_f(vs.distance_coordinates.flatten())
%time gcf = gaussian_model.corr_f(vs.distance_coordinates.flatten())
%time scf = spherical_model.corr_f(vs.distance_coordinates.flatten())
%time mcf0 = matern_model.corr_f_old(vs.distance_coordinates.flatten())
%time wcf0 = whittle_model.corr_f_old(vs.distance_coordinates.flatten())
%time ecf0 = exp_model.corr_f_old(vs.distance_coordinates.flatten())
%time gcf0 = gaussian_model.corr_f_old(vs.distance_coordinates.flatten())
%time scf0 = spherical_model.corr_f_old(vs.distance_coordinates.flatten())
In [33]:
# Recompute the Matern correlations with both implementations
# (w / w2 appear to be scratch variables for interactive inspection)
w = matern_model.corr_f(vs.distance_coordinates.flatten())
w2 = matern_model.corr_f_old(vs.distance_coordinates.flatten())
In [ ]:
In [32]:
# Verify the new corr_f implementations reproduce the old ones exactly
print(np.array_equal(mcf,mcf0))
print(np.array_equal(wcf,wcf0))
print(np.array_equal(ecf,ecf0))
print(np.array_equal(gcf,gcf0))
# Spherical comparison left disabled (its fitted parameters look degenerate, see above)
#np.array_equal(scf,scf0)
In [26]:
# NOTE(review): `np.array_equal()` requires two array arguments; calling it
# with none raises TypeError on every run. The actual comparisons are done in
# the cell above, so this scratch call is disabled.
# np.array_equal()
Out[26]:
In [ ]:
# Time the full covariance-matrix construction for the sample variogram
%time vs.calculateCovarianceMatrix()
In [ ]:
### read csv files
# GLS result tables written by the HEC batch runs (absolute container paths)
conf_ints = pd.read_csv("/outputs/gls_confidence_int.csv")
params = pd.read_csv("/outputs/params_gls.csv")
params2 = pd.read_csv("/outputs/params2_gls.csv")
pvals = pd.read_csv("/outputs/pvalues_gls.csv")
pnobs = pd.read_csv("/outputs/n_obs.csv")
prsqs = pd.read_csv("/outputs/rsqs.csv")
In [ ]:
# Display the GLS parameter estimates
params
In [ ]:
# Display the confidence intervals
conf_ints
In [ ]:
# Display the p-values
pvals
In [ ]:
# R^2 of the GLS fits as a function of sample size
plt.plot(pnobs.n_obs,prsqs.rsq)
plt.title("$R^2$ statistic for GLS on logBiomass ~ logSppn using Sp.autocor")
plt.xlabel("Number of observations")
In [ ]:
# NOTE(review): this rebinds `tt`, which previously held the createVariogram output
tt = params.transpose()
In [ ]:
# Promote the first row (parameter names) to column labels
tt.columns = tt.iloc[0]
In [ ]:
# Drop the header row now that it has become the column index
tt = tt.drop(tt.index[0])
In [ ]:
# Intercept estimate as a function of sample size
plt.plot(pnobs.n_obs,tt.Intercept)
plt.title("Intercept parameter")
In [ ]:
# logSppN slope estimate as a function of sample size
plt.plot(pnobs.n_obs,tt.logSppN)
plt.title("logSppn parameter")
In [ ]:
# Run the GLS bundle over every sample.
# NOTE(review): under Python 3 this builds a lazy map object and nothing
# executes until it is iterated -- wrap in list() if execution is intended here.
ccs = map(lambda s : bundleToGLS(s,gvg.model),samples)
In [ ]:
#bundleToGLS(samples[22],gvg.model)
# Spatial covariance structure for sample 8, built from the fitted variogram model
covMat = buildSpatialStructure(samples[8],gvg.model)
In [ ]:
#np.linalg.pinv(covMat)
# GLS fit for sample 8 using the covariance matrix built above
calculateGLS(samples[8],covMat)
#tt = covMat.flatten()
In [ ]:
# Variogram object for sample 8 (response column: logBiomass)
secvg = tools.Variogram(samples[8],'logBiomass',model=gvg.model)
In [ ]:
# Pairwise distance matrix between the sample's coordinates
DM = secvg.distance_coordinates
In [ ]:
# Flatten to a 1-D vector of pairwise distances
dm = DM.flatten()
In [ ]:
# In-place ascending sort of the distance vector
dm.sort()
In [ ]:
# Wrap the sorted distances in a DataFrame for filtering below
pdm = pd.DataFrame(dm)
In [ ]:
# Keep only the strictly positive pairwise distances, sorted ascending.
# Fixed: `DataFrame.sort()` was deprecated and then removed from pandas;
# `sort_values(by=0)` sorts by the single distance column (`dm` was already
# sorted above, so the order is unchanged in practice).
xxx = pdm.loc[pdm[0] > 0].sort_values(by=0)
In [ ]:
# Number of strictly positive pairwise distances
xxx.shape
In [ ]:
# Sanity arithmetic: positive distances + presumed n diagonal zeros vs the n*n
# total pairs -- a non-zero result suggests duplicated coordinates (confirm)
8996780 + 3000 - (3000 * 3000)
In [ ]:
# Shape of the full (unfiltered) distance table
pdm.shape
In [ ]:
# Drop rows with duplicated coordinates -- presumably because identical
# locations make the spatial covariance matrix singular (TODO confirm)
dd = samples[22].drop_duplicates(subset=['newLon','newLat'])
In [ ]:
# Variogram for the de-duplicated sample
secvg2 = tools.Variogram(dd,'logBiomass',model=gvg.model)
In [ ]:
# Covariance structure for the de-duplicated sample
covMat = buildSpatialStructure(dd,gvg.model)
In [ ]:
# GLS fit on the de-duplicated sample
calculateGLS(dd,covMat)
In [ ]:
In [ ]:
In [ ]:
# Dimensions of the original (pre-dedup) sample, for comparison with dd
samples[22].shape
In [ ]:
# Evaluate the model correlation at every positive pairwise distance.
# Fixed: `DataFrame.values` is a property, not a method -- `xxx.values()`
# raised "TypeError: 'numpy.ndarray' object is not callable".
gvg.model.corr_f(xxx.values)
In [ ]:
# NOTE(review): `kk` is never defined anywhere in this notebook; evaluating it
# raised NameError on every fresh run. Disabled leftover scratch cell.
# kk
In [ ]:
# Model correlation at a lag of 100 (units are those of the distance matrix -- confirm)
gvg.model.corr_f([100])
In [ ]:
# Model correlation at a lag of 10
gvg.model.corr_f([10])
In [ ]: