In [103]:
from gplearn.genetic import SymbolicRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.random import check_random_state
import numpy as np
import pandas as pd
import seaborn as sb
from matplotlib import pyplot as pl
from matplotlib import rcParams
from IPython.display import Image
from scipy.io import loadmat
import pydotplus
from sklearn.metrics import mean_squared_error
from math import sqrt
%matplotlib inline
In [2]:
rcParams['axes.formatter.limits'] = (-2, 3)
rcParams['xtick.labelsize'] = 14
rcParams['ytick.labelsize'] = 14
rcParams['font.size'] = 16
In [3]:
fpath = '/accounts/ekarakoy/DATA/OWT/nomad_rrs4clustering.mat'
In [4]:
matlabData = loadmat(fpath)
In [5]:
wavelength = matlabData['wl'][0]
rrsBelow = matlabData['rrs_below']
chl = matlabData['chl_nomad'] # I might use this as an additional feature in clustering
In [6]:
labels = list(wavelength) + ['chl']
df = pd.DataFrame(data=np.hstack((rrsBelow, chl)), columns=labels)
In [7]:
df.head()
Out[7]:
Next, I'll make a new dataframe with just the SeaWiFS-wavelength Rrs and chl, and delete this superset to save space.
In [8]:
swfWvl = [411, 443, 489, 510, 555, 670]
In [12]:
dfSwf = df.loc[:, swfWvl + ['chl']]
del df
In [15]:
dfSwf.head()
Out[15]:
In [16]:
dfSwf.describe()
Out[16]:
What is this '0-chl' business?
In [17]:
f, axs = pl.subplots(nrows=2, ncols=2, figsize=(12, 6), sharex=True)
for band, ax in zip([443, 489, 510, 555], axs.ravel()):
    ax.plot(dfSwf.loc[dfSwf.chl==0, 'chl'], dfSwf.loc[dfSwf.chl==0, band],
            'ko', alpha=0.5, label=r'$chl=0$')
    ax.plot(dfSwf.loc[dfSwf.chl!=0, 'chl'], dfSwf.loc[dfSwf.chl!=0, band],
            'go', alpha=0.3, label=r'$chl \neq 0$')
    ax.legend(loc='best', fontsize=14)
    ax.set_xlabel('$chl_a$', fontsize=16)
    ax.set_ylabel('$Rrs(%d)$' % band, fontsize=16)
The rows where chl is exactly 0 look funky to me: nothing in the corresponding Rrs data suggests zero chlorophyll, so these are most likely missing values rather than real measurements. For now I'll drop that part of the data so as to avoid giving the algorithm a hard time.
In [18]:
dfSwf['chl'] = dfSwf['chl'].replace(0, np.nan)
In [19]:
dfSwf.info()
In [21]:
dfSwf.dropna(inplace=True)
In [22]:
dfSwf.info()
In [23]:
dfSwf['maxBlue'] = dfSwf.loc[:, [443, 489, 510]].max(axis=1)
In [24]:
dfSwf.head()
Out[24]:
In [25]:
dfSwf.info()
In [26]:
dfSwf.describe()
Out[26]:
No cleanup necessary, but we need to bring everything to a similar scale. I'll try two ways. One is a log transform (chl is roughly log-normally distributed, e.g. Campbell et al., '94(?), and this agrees with the OC4 formulation anyway). The other is to standardize the data. I'll try both and see what I come up with. First, I'll do the log transform and run the model.
In [31]:
Xlog = np.log10(dfSwf.loc[:, swfWvl])
ylog = np.log10(dfSwf.loc[:, 'chl'] + 1e-7)
In [32]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(Xlog, ylog,
test_size=0.33)
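For completeness, here is a minimal sketch of the second option mentioned above, standardization, using the StandardScaler already imported at the top. The variable names are just illustrative and this path isn't used below; the scaler is fit on the training split only so that test statistics don't leak in.
# sketch only: standardize the log-transformed reflectances instead of using them raw
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train_log)  # fit on training data only
X_test_std = scaler.transform(X_test_log)        # apply the same scaling to the test split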
In [33]:
def CI(df):
    # assumes df has Rrs data at the specific wavelengths used below
    blue, green, red = 443, 555, 670
    ci = df[green] - (df[blue] + (green - blue) / (red - blue) * (df[red] - df[blue]))
    return ci
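The CI helper isn't actually used further down; if it were, applying it to the SeaWiFS-band dataframe built above would just be:
# hypothetical usage: color index for every station in dfSwf
ciVals = CI(dfSwf)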
In [36]:
def OC4(rrsMaxBlue, rrsGreen, log=True):
    # rrsMaxBlue: max of the blue-band Rrs values; rrsGreen: Rrs at the green band
    # the log option specifies whether the inputs have already been log-transformed
    a = [0.3272, -2.9940, 2.7218, -1.2259, -0.5683]
    if log:
        poly = np.sum([a[i] * np.power((rrsMaxBlue - rrsGreen), i)
                       for i in range(1, 5)], axis=0)
    else:
        poly = np.sum([a[i] * np.power(np.log10(rrsMaxBlue / rrsGreen), i)
                       for i in range(1, 5)], axis=0)
    poly += a[0]
    chl = np.power(10, poly)
    return chl
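For reference, the function above is the standard OC4 maximum-band-ratio form, $\log_{10}(chl) = a_0 + \sum_{i=1}^{4} a_i R^i$ with $R = \log_{10}\left(\max(Rrs_{443}, Rrs_{489}, Rrs_{510}) / Rrs_{555}\right)$; the log option just lets you pass reflectances that have already been log-transformed, in which case the band ratio becomes a band difference.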
In [39]:
# making sure the log option in OC4 works
green = 555
logchlMdl = OC4(np.log10(dfSwf.maxBlue.values), np.log10(dfSwf[green].values),
                log=True)
chlMdl = OC4(dfSwf.maxBlue.values, dfSwf[green].values, log=False)
pl.plot(logchlMdl, chlMdl);
In [89]:
#X_train comes from X, which was log transformed, so...
green = 555
maxBlueTrainLog = X_train_log[[443, 489, 510]].max(axis=1)
chlOC4_train = OC4(maxBlueTrainLog.values, X_train_log[green].values, log=True)
In [43]:
pl.figure(figsize=(6,6))
pl.plot(chlOC4_train, np.power(10, y_train_log), 'ko', alpha=0.5)
pl.yscale('log')
pl.ylim(1e-2,1e2)
pl.xlim(1e-2,1e2)
pl.xscale('log')
pl.plot([1e-2,1e2],[1e-2,1e2],'k')
pl.xlabel('chl estimated by OC4', fontsize=16)
pl.ylabel('original chl', fontsize=16);
In [55]:
est_gp = SymbolicRegressor(population_size=5000, const_range=(-5, 5),
                           generations=30, transformer=True,
                           trigonometric=False,
                           p_crossover=0.6, p_subtree_mutation=0.1,
                           p_hoist_mutation=0.05, p_point_mutation=0.2,
                           max_samples=0.9, verbose=1, comparison=True,
                           parsimony_coefficient=0.01, n_jobs=3)
est_gp.fit(X_train_log, y_train_log)
Out[55]:
In [98]:
chlGP_train_log = est_gp.predict(X_train_log)
In [56]:
graph = pydotplus.graphviz.graph_from_dot_data(est_gp._program.export_graphviz())
Image(graph.create_png())
Out[56]:
Above is the best formula that the GP came up with after 30 generations with a population of 5000. Remember, when you are looking at this, that you're in log space. For reference, the features map to wavelengths as X0: 411, X1: 443, X2: 489, X3: 510, X4: 555, X5: 670. The algorithm has clearly tuned in to the fact that some kind of blue-green ratio matters. Interestingly, it is the inverse of the OCx formulation.
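If the tree above is hard to read, gplearn also keeps the best evolved program on the fitted regressor, and printing it gives the same expression as text in prefix notation:
# the best program found during evolution, printed as a plain expression
print(est_gp._program)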
In [109]:
def PlotCompRegs(fieldChl, oc4Chl, gpChl):
    oc4Chl = np.log10(oc4Chl)
    rmse0 = sqrt(mean_squared_error(fieldChl, gpChl))
    rmse1 = sqrt(mean_squared_error(fieldChl, oc4Chl))
    f, axs = pl.subplots(ncols=2, figsize=(10, 6), sharey=True)
    sb.regplot(gpChl, fieldChl, ax=axs[0])
    axs[0].set_xlim((-2, 2)), axs[0].set_ylim((-2, 2))
    axs[0].set_xlabel('log(chl) from GP fit', fontsize=16)
    axs[0].set_ylabel('log(chl) from data', fontsize=16)
    axs[0].plot([-2, 2], [-2, 2], 'k--', linewidth=2)
    axs[0].plot(0, 0, alpha=0, label='rmse=%.2f' % rmse0)
    axs[0].legend(loc='best', fontsize=16)
    sb.regplot(oc4Chl, fieldChl, ax=axs[1])
    axs[1].set_ylim((-2, 2)), axs[1].set_xlim((-2, 2))
    axs[1].set_ylabel('')
    axs[1].set_xlabel('log(chl) from OC4', fontsize=16)
    axs[1].plot([-2, 2], [-2, 2], 'k--', linewidth=2)
    axs[1].plot(0, 0, alpha=0, label='rmse=%.2f' % rmse1)
    axs[1].legend(loc='best', fontsize=16)
In [106]:
def PlotCompHists(fieldChl, oc4Chl, gpChl):
    f, axs = pl.subplots(ncols=2, figsize=(12, 6), sharey=True)
    axs[0].hist(fieldChl, bins=50, range=(-2, 2), label='field chl', color='gray',
                normed=True)
    axs[0].hist(np.log10(oc4Chl), bins=50, range=(-2, 2), label='OC4 chl',
                color='green', normed=True, alpha=0.3)
    axs[0].legend(fontsize=16)
    axs[1].hist(fieldChl, bins=50, range=(-2, 2), label='field chl', color='gray',
                normed=True)
    axs[1].hist(gpChl, bins=50, range=(-2, 2), label='GP chl', color='orange', alpha=0.3,
                normed=True)
    axs[1].legend(fontsize=16)
    axs[0].set_xlabel('log(chl)', fontsize=16)
    axs[0].set_ylabel('freq.')
    axs[1].set_xlabel('log(chl)', fontsize=16);
In [110]:
PlotCompRegs(fieldChl=y_train_log, oc4Chl=chlOC4_train, gpChl=chlGP_train_log)
In [97]:
PlotCompHists(fieldChl=y_train_log, oc4Chl=chlOC4_train, gpChl=chlGP_train_log)
Now this is very interesting. The plot on the left suggests that the GP algorithm converged to a solution that fits the most frequently observed data. As a result it mimics the field data quite well at the values where observations are densest, and so it seems to handle central values better than OC4. A quick check is to compare the RMSE of the two algorithms relative to the field data, first over the entire data set and then over just the central values.
In [ ]:
oc4Msk = np.where((np.log10(chlOC4_train) <= 0.5) & (np.log10(chlOC4_train) >= -0.5))
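That cell was left unfinished; here is a minimal sketch of the check described above, using the quantities already computed for the training split. I subset on the field chl itself (between -0.5 and 0.5 in log10 space) as one reasonable definition of "central values"; the variable names are just illustrative.
# RMSE over the full training set, in log space
rmseGP_all = sqrt(mean_squared_error(y_train_log, chlGP_train_log))
rmseOC4_all = sqrt(mean_squared_error(y_train_log, np.log10(chlOC4_train)))

# RMSE restricted to central values of the field chl
centralMsk = ((y_train_log >= -0.5) & (y_train_log <= 0.5)).values
rmseGP_central = sqrt(mean_squared_error(y_train_log[centralMsk], chlGP_train_log[centralMsk]))
rmseOC4_central = sqrt(mean_squared_error(y_train_log[centralMsk], np.log10(chlOC4_train[centralMsk])))
print(rmseGP_all, rmseOC4_all, rmseGP_central, rmseOC4_central)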
In [50]:
est_gp.score(X_test_log, y_test_log)
Out[50]:
In [67]:
#X_train comes from X, which was log transformed, so...
green = 555
maxBlueTestLog = X_test_log[[443, 489, 510]].max(axis=1)
chlOC4_test = OC4(maxBlueTestLog.values, X_test_log[green].values, log=True)
In [99]:
chlGP_test_log = est_gp.predict(X_test_log)
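Before plotting, a quick numeric check on the test split. The GP score above is already an R^2; for an apples-to-apples comparison, r2_score from sklearn (assumed here, it isn't in the imports at the top) can be applied to both models in log space:
from sklearn.metrics import r2_score  # not imported at the top of the notebook

# R^2 of each model against the field chl on the held-out test set, in log space
print('GP  R^2: %.3f' % r2_score(y_test_log, chlGP_test_log))
print('OC4 R^2: %.3f' % r2_score(y_test_log, np.log10(chlOC4_test)))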
In [111]:
PlotCompRegs(fieldChl=y_test_log, oc4Chl=chlOC4_test, gpChl=chlGP_test_log)
In [112]:
PlotCompHists(fieldChl=y_test_log, oc4Chl=chlOC4_test, gpChl=chlGP_test_log)