In [1]:
%pylab inline
import geopandas as gpd
import pandas as pd
from OpticalRS import *
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split  # renamed to sklearn.model_selection in later scikit-learn versions
import itertools
import statsmodels.formula.api as smf
from collections import OrderedDict
style.use('ggplot')
In [2]:
cd ../data
First I'll load the glint corrected image and the resampled multibeam depths. The glint correction happened in a previous notebook.
In [3]:
imrds = RasterDS('glint_corrected.tif')
imarr = imrds.band_array
deprds = RasterDS('Leigh_Depth_atAcq_Resampled.tif')
darr = -1 * deprds.band_array.squeeze()
In [4]:
darr = np.ma.masked_greater( darr, 20.0 )
In [5]:
imarr = ArrayUtils.mask3D_with_2D( imarr, darr.mask )
darr = np.ma.masked_where( imarr[...,0].mask, darr )
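For anyone without OpticalRS installed: mask3D_with_2D just broadcasts a 2D mask across every band of a (rows, cols, bands) masked array. Here's a minimal sketch of the idea (a hypothetical stand-in, not the library's actual implementation):

import numpy as np

def mask3d_with_2d_sketch( arr3d, mask2d ):
    # hypothetical stand-in for OpticalRS.ArrayUtils.mask3D_with_2D:
    # OR a 2D mask into every band of a (rows, cols, bands) masked array
    combined = np.ma.getmaskarray( arr3d ) | mask2d[...,np.newaxis]
    return np.ma.masked_array( arr3d, mask=combined )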
I need to calculate a modified version of $X_i = \ln(L_i - L_{si})$. To do that, I'll first load the deep water means and standard deviations I calculated in a previous notebook.
In [6]:
dwmeans = np.load('darkmeans.pkl')
dwstds = np.load('darkstds.pkl')
I applied the same modification as Armstrong (1993), subtracting 2 standard deviations from $L_{si}$ before differencing, to avoid getting too many negative values; negative values can't be log transformed.
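Written out, the modified transform is

$$X_i = \ln\left( L_i - ( L_{si} - 2\sigma_{si} ) \right)$$

where $L_{si}$ and $\sigma_{si}$ are the deep water mean and standard deviation for band $i$. Anything still negative after the subtraction gets masked in the next cell so the log stays defined.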
In [7]:
dpsub = ArrayUtils.equalize_band_masks( \
    np.ma.masked_less( imarr - (dwmeans - 2 * dwstds), 0.0 ) )
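equalize_band_masks handles the related bookkeeping: if a pixel is masked in any band, it gets masked in all bands, so every band retains the same set of pixels. Again, a rough sketch of the idea under the same assumptions (not the library's actual code):

def equalize_band_masks_sketch( arr3d ):
    # hypothetical stand-in for OpticalRS.ArrayUtils.equalize_band_masks:
    # mask a pixel in every band if it is masked in any band
    anyband = np.ma.getmaskarray( arr3d ).any( axis=-1 )
    return np.ma.masked_array( arr3d, mask=anyband[...,np.newaxis] | np.ma.getmaskarray( arr3d ) )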
In [8]:
print "After that I still retain %.1f%% of my pixels." % ( 100 * dpsub.count() / float( imarr.count() ) )
In [9]:
X = np.log( dpsub )
In [11]:
# imrds.new_image_from_array(X.astype('float32'),'LyzengaX.tif')
I'll need to equalize the masks again. I'll call the depths $h$ in reference to Lyzenga et al. 2006 (e.g. equation 14).
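For reference, that model (Lyzenga et al. 2006, equation 14) estimates depth as a linear combination of the transformed bands:

$$h = h_0 + \sum_{j=1}^{N} h_j X_j$$

The regression fit below estimates the intercept $h_0$ and the coefficients $h_j$.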
In [10]:
h = np.ma.masked_where( X[...,0].mask, darr )
In [11]:
imshow( X[...,1] )
In [12]:
df = ArrayUtils.band_df( X )
df['depth'] = h.compressed()
In [13]:
x_train, x_test, y_train, y_test = train_test_split( \
    df[imrds.band_names], df.depth, train_size=300000, random_state=5 )
traindf = ArrayUtils.band_df( x_train )
traindf['depth'] = y_train.ravel()
testdf = ArrayUtils.band_df( x_test )
testdf['depth'] = y_test.ravel()
In [14]:
def get_fit( ind, x_train, y_train ):
    # fit an OLS regression of depth on the selected band indices
    skols = LinearRegression()
    skolsfit = skols.fit( x_train[...,ind], y_train )
    return skolsfit
In [15]:
def get_selfscore( ind, x_train, y_train ):
    # R^2 of the fit evaluated against its own training data
    fit = get_fit( ind, x_train, y_train )
    return fit.score( x_train[...,ind], y_train )
In [16]:
od = OrderedDict()
for comb in itertools.combinations( range(8), 2 ):
    # key: training R^2, value: the 1-based band numbers of the pair
    od[ get_selfscore(comb,x_train,y_train) ] = [ c+1 for c in comb ]
od_sort = sorted( od.items(), key=lambda t: t[0], reverse=True )
od_sort
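The same pattern extends to larger subsets if you want to check whether a third band helps; for example (an illustrative variation, not run as part of the original analysis):

od3 = OrderedDict()
for comb in itertools.combinations( range(8), 3 ):
    od3[ get_selfscore(comb,x_train,y_train) ] = [ c+1 for c in comb ]
sorted( od3.items(), key=lambda t: t[0], reverse=True )[:5]  # the five best triples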
In [17]:
# od_sort[0][1] holds the best pair's 1-based band numbers; convert to 0-based indices
best_ind = np.array( od_sort[0][1] ) - 1
best_ind
In [18]:
skols = LinearRegression()
skolsfit = skols.fit(x_train[...,best_ind],y_train)
In [21]:
print "h0 = %.2f, h2 = %.2f, h3 = %.2f" % \
(skolsfit.intercept_,skolsfit.coef_[0],skolsfit.coef_[1])
In [26]:
print "R^2 = %.6f" % skolsfit.score(x_test[...,best_ind],y_test)
In [27]:
pred = skolsfit.predict(x_test[...,best_ind])
In [28]:
fig,ax = plt.subplots(1,1,figsize=(8,6))
mapa = ax.hexbin(pred,y_test,mincnt=1,bins='log',gridsize=500,cmap=plt.cm.hot)
# ax.scatter(pred,y_test,alpha=0.008,edgecolor='none')
ax.set_ylabel('MB Depth')
ax.set_xlabel('Predicted Depth')
rmse = np.sqrt( mean_squared_error( y_test, pred ) )
n = x_train.shape[0]
tit = "RMSE: %.4f, n=%i" % (rmse,n)
ax.set_title(tit)
ax.set_aspect('equal')
ax.axis([-5,25,-5,25])
ax.plot([-5,25],[-5,25],c='white')
cb = plt.colorbar(mapa)
cb.set_label("Log10(N)")
In [29]:
LyzPredVsMB = pd.DataFrame({'prediction':pred,'mb_depth':y_test})
LyzPredVsMB.to_pickle('LyzPredVsMB.pkl')
In [30]:
fullim = imrds.band_array
fulldep = -1 * deprds.band_array.squeeze()
fullim = ArrayUtils.mask3D_with_2D( fullim, fulldep.mask )
fulldep = np.ma.masked_where( fullim[...,0].mask, fulldep )
In [31]:
dlims = arange(5,31,2.5)
drmses,meanerrs,stderrs = [],[],[]
for dl in dlims:
    # mask out everything deeper than the current depth limit
    dlarr = np.ma.masked_greater( fulldep, dl )
    iml = ArrayUtils.mask3D_with_2D( fullim, dlarr.mask )
    imldsub = ArrayUtils.equalize_band_masks( \
        np.ma.masked_less( iml - (dwmeans - 2 * dwstds), 0.0 ) )
    imlX = np.log( imldsub )
    dlarr = np.ma.masked_where( imlX[...,0].mask, dlarr )
    # refit on pixels no deeper than the limit and score on a held-out split
    xl_train, xl_test, yl_train, yl_test = train_test_split( \
        imlX.compressed().reshape(-1,8), dlarr.compressed(), train_size=1500, random_state=5 )
    linr = LinearRegression()
    predl = linr.fit( xl_train[...,best_ind], yl_train ).predict( xl_test[...,best_ind] )
    drmses.append( sqrt( mean_squared_error( yl_test, predl ) ) )
    meanerrs.append( (yl_test - predl).mean() )
    stderrs.append( (yl_test - predl).std() )
In [32]:
fig,(ax1,ax2) = subplots(1,2,figsize=(12,6))
ax1.plot(dlims,np.array(drmses),marker='o',c='b')
ax1.set_xlabel("Data Depth Limit (m)")
ax1.set_ylabel("Model RMSE (m)")
em,es = np.array(meanerrs), np.array(stderrs)
ax2.plot(dlims,em,marker='o',c='b')
ax2.plot(dlims,em+es,linestyle='--',c='k')
ax2.plot(dlims,em-es,linestyle='--',c='k')
ax2.set_xlabel("Data Depth Limit (m)")
ax2.set_ylabel("Model Mean Error (m)")
In [33]:
deplimdf = pd.DataFrame({'depth_lim':dlims, 'rmse':drmses,
    'mean_error':meanerrs, 'standard_error':stderrs})
In [34]:
deplimdf.to_pickle('LyzengaDepthLimitDF.pkl')
In [112]:
ns = np.logspace( 1, log10(0.80*df.depth.count()), 15 )
ltdf = pd.DataFrame({'train_size':ns})
# repeat the learning-curve experiment for 10 different random train/test splits
for rs in range(10):
    nrmses = []
    for n in ns:
        xn_train,xn_test,yn_train,yn_test = train_test_split( \
            df[imrds.band_names], df.depth, train_size=int(n), random_state=rs+100 )
        thisols = LinearRegression()
        npred = thisols.fit( xn_train[...,best_ind], yn_train ).predict( xn_test[...,best_ind] )
        nrmses.append( sqrt( mean_squared_error( yn_test, npred ) ) )
    dflabel = 'rand_state_%i' % rs
    ltdf[dflabel] = nrmses
print "min points: %i, max points: %i" % ( int(ns.min()), int(ns.max()) )
In [113]:
fig,ax = subplots(1,1,figsize=(10,6))
for rs in range(10):
    dflabel = 'rand_state_%i' % rs
    ax.plot( ltdf['train_size'], ltdf[dflabel] )
ax.set_xlabel("Number of Training Points")
ax.set_ylabel("Model RMSE (m)")
# ax.set_xlim(0,5000)
ax.set_xscale('log')
ax.set_title("Rapidly Increasing Accuracy With More Training Data")
In [114]:
ltdf.to_pickle('LyzengaAccuracyDF.pkl')
In [46]:
full_pred = skolsfit.predict(X[...,best_ind])
full_pred = np.ma.masked_where( h.mask, full_pred )
In [47]:
full_errs = full_pred - h
In [48]:
blah = hist( full_errs.compressed(), 100 )
In [49]:
figure(figsize=(12,11))
vmin,vmax = np.percentile(full_errs.compressed(),0.1),np.percentile(full_errs.compressed(),99.9)
imshow( full_errs, vmin=vmin, vmax=vmax )
ax = gca()
ax.set_axis_off()
ax.set_title("Depth Errors (m)")
colorbar()
In [50]:
full_pred.dump('LyzDepthPred.pkl')
full_errs.dump('LyzDepthPredErrs.pkl')
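Since masked array dump files are just pickles, they can be reloaded later with np.load (true for numpy versions of this era; newer versions require allow_pickle=True):

full_pred = np.load('LyzDepthPred.pkl')
full_errs = np.load('LyzDepthPredErrs.pkl')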