Write your post here.


In [103]:
from gplearn.genetic import SymbolicRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.random import check_random_state
import numpy as np
import pandas as pd
import seaborn as sb
from matplotlib import pyplot as pl
from matplotlib import rcParams
from IPython.display import Image
from scipy.io import loadmat
import pydotplus
from sklearn.metrics import mean_squared_error
from math import sqrt
% matplotlib inline

In [2]:
rcParams['axes.formatter.limits'] = (-2, 3)
rcParams['xtick.labelsize'] = 14
rcParams['ytick.labelsize'] = 14
rcParams['font.size'] = 16

In [3]:
fpath ='/accounts/ekarakoy/DATA/OWT/nomad_rrs4clustering.mat'

In [4]:
matlabData = loadmat(fpath)

In [5]:
wavelength = matlabData['wl'][0]
rrsBelow = matlabData['rrs_below']
chl = matlabData['chl_nomad'] # I might use this as an additional feature in clustering

In [6]:
labels = list(wavelength) + ['chl']
df = pd.DataFrame(data=np.hstack((rrsBelow, chl)), columns=labels)

In [7]:
df.head()


Out[7]:
411 443 469 489 510 520 547 550 555 560 565 670 chl
0 0.002307 0.003225 0.004674 0.006265 0.007661 0.008944 0.012935 0.013410 0.014040 0.014715 0.015325 0.006588 38.19
1 0.002035 0.002650 0.003273 0.004149 0.004767 0.005416 0.007338 0.007570 0.007878 0.008208 0.008549 0.003241 35.01
2 0.001862 0.002271 0.002654 0.003523 0.004367 0.005147 0.007423 0.007691 0.008053 0.008434 0.008822 0.003084 26.91
3 0.002817 0.003330 0.004096 0.005482 0.006963 0.008243 0.012075 0.012525 0.013127 0.013770 0.014361 0.006154 47.96
4 0.001735 0.001959 0.002129 0.002882 0.003638 0.004115 0.005079 0.005191 0.005337 0.005507 0.005716 0.003423 23.55

Next, I'll make a new dataframe with just SeaWiFS-wavelength Rrs and chl and delete this superset to save space


In [8]:
swfWvl = [411, 443, 489, 510, 555, 670]

In [12]:
dfSwf = df.loc[:, swfWvl + ['chl']]
del df

In [15]:
dfSwf.head()


Out[15]:
411 443 489 510 555 670 chl
0 0.002307 0.003225 0.006265 0.007661 0.014040 0.006588 38.19
1 0.002035 0.002650 0.004149 0.004767 0.007878 0.003241 35.01
2 0.001862 0.002271 0.003523 0.004367 0.008053 0.003084 26.91
3 0.002817 0.003330 0.005482 0.006963 0.013127 0.006154 47.96
4 0.001735 0.001959 0.002882 0.003638 0.005337 0.003423 23.55

In [16]:
dfSwf.describe()


Out[16]:
411 443 489 510 555 670 chl
count 2404.000000 2404.000000 2404.000000 2404.000000 2404.000000 2404.000000 2404.000000
mean 0.009913 0.009351 0.009086 0.007638 0.006105 0.001289 1.544312
std 0.006548 0.005329 0.004598 0.004656 0.006091 0.002629 4.002408
min 0.000098 0.000365 0.000546 0.000502 0.000398 0.000023 0.000000
25% 0.005000 0.005335 0.006135 0.005511 0.003029 0.000196 0.054000
50% 0.007967 0.008120 0.008528 0.006562 0.003970 0.000385 0.301200
75% 0.013895 0.012699 0.011118 0.007965 0.006047 0.001101 1.218750
max 0.039485 0.031489 0.046573 0.050136 0.060087 0.031832 72.120000

What is this '0-chl' business?


In [17]:
f,axs = pl.subplots(nrows=2, ncols=2, figsize=(12,6), sharex=True)
for band, ax in zip([443, 489, 510, 555], axs.ravel()):
    ax.plot(dfSwf.loc[dfSwf.chl==0, 'chl'], dfSwf.loc[dfSwf.chl==0, band],
            'ko',alpha=0.5, label=r'$chl=0$')
    ax.plot(dfSwf.loc[dfSwf.chl!=0, 'chl'], dfSwf.loc[dfSwf.chl!=0, band],
            'go', alpha=0.3,label=r'$chl \neq 0$')
    ax.legend(loc='best', fontsize=14);
    ax.set_xlabel('$chl_a$', fontsize=16)
    ax.set_ylabel('$Rrs(%d)$' % band, fontsize=16)


So the data where chl is 0 looks funky to me as this does not seem to pop up in the Rrs data. So for now I'll drop that part of the data so as to avoid giving the algorithm a hard time.


In [18]:
dfSwf['chl'].replace(0,np.NaN, inplace=True)

In [19]:
dfSwf.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2404 entries, 0 to 2403
Data columns (total 7 columns):
411    2404 non-null float64
443    2404 non-null float64
489    2404 non-null float64
510    2404 non-null float64
555    2404 non-null float64
670    2404 non-null float64
chl    1874 non-null float64
dtypes: float64(7)
memory usage: 131.5 KB

In [21]:
dfSwf.dropna(inplace=True)

In [22]:
dfSwf.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1874 entries, 0 to 2403
Data columns (total 7 columns):
411    1874 non-null float64
443    1874 non-null float64
489    1874 non-null float64
510    1874 non-null float64
555    1874 non-null float64
670    1874 non-null float64
chl    1874 non-null float64
dtypes: float64(7)
memory usage: 117.1 KB

In [23]:
dfSwf['maxBlue'] = dfSwf.loc[:,[443, 490, 510]].max(axis=1)

In [24]:
dfSwf.head()


Out[24]:
411 443 489 510 555 670 chl maxBlue
0 0.002307 0.003225 0.006265 0.007661 0.014040 0.006588 38.19 0.007661
1 0.002035 0.002650 0.004149 0.004767 0.007878 0.003241 35.01 0.004767
2 0.001862 0.002271 0.003523 0.004367 0.008053 0.003084 26.91 0.004367
3 0.002817 0.003330 0.005482 0.006963 0.013127 0.006154 47.96 0.006963
4 0.001735 0.001959 0.002882 0.003638 0.005337 0.003423 23.55 0.003638

In [25]:
dfSwf.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1874 entries, 0 to 2403
Data columns (total 8 columns):
411        1874 non-null float64
443        1874 non-null float64
489        1874 non-null float64
510        1874 non-null float64
555        1874 non-null float64
670        1874 non-null float64
chl        1874 non-null float64
maxBlue    1874 non-null float64
dtypes: float64(8)
memory usage: 131.8 KB

In [26]:
dfSwf.describe()


Out[26]:
411 443 489 510 555 670 chl maxBlue
count 1874.000000 1874.000000 1874.000000 1874.000000 1874.000000 1874.000000 1874.000000 1874.000000
mean 0.009530 0.009077 0.009043 0.007704 0.006241 0.001313 1.981071 0.010267
std 0.006295 0.005162 0.004693 0.004811 0.006245 0.002590 4.436948 0.005792
min 0.000098 0.000365 0.000546 0.000502 0.000398 0.000023 0.012000 0.000502
25% 0.004957 0.005256 0.006091 0.005467 0.003056 0.000193 0.182000 0.006106
50% 0.007836 0.007967 0.008436 0.006548 0.004011 0.000385 0.604500 0.008981
75% 0.013161 0.012172 0.010946 0.008140 0.006390 0.001151 1.644000 0.013220
max 0.039485 0.029934 0.044280 0.046735 0.060087 0.031832 72.120000 0.046735

No cleanup necessary but we need to bring stuff to a similar scale. I'll try two ways: taking the lognormal (e.g. Campbell et al., '94(?), which will agree with the OC4 formulation anyway) The other is to standardize the data. I'll try both and see what I come up with. First, I'll do the log transform and run the model.


In [31]:
Xlog = np.log10(dfSwf.loc[:,swfWvl ])
ylog = np.log10(dfSwf.loc[:,'chl']+1e-7)

In [32]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(Xlog, ylog,
                                           test_size=0.33)

In [33]:
def CI(df):
    # assumes df has rrs data at specific wavelengths
    blue, green, red=443, 555, 670
    ci = df[green] - (df[blue] + (green - blue) / (red - blue) * (df[red] - df[blue]))
    return ci

In [36]:
def OC4(rrsMaxBlue, rrsGreen, log=True):
    # maxblue is last column of rrsData
    # note the log option to specify whether the data has already been log transformed
    a=[0.3272, -2.9940, 2.7218, -1.2259, -0.5683]
    if log:
        poly = np.sum([a[i]*np.power((rrsMaxBlue - rrsGreen),i)
                       for i in range(1,5) ], axis=0)
    else:
        poly = np.sum([a[i]*np.power(np.log10(rrsMaxBlue/rrsGreen),i )
                       for i in range(1,5) ], axis=0)
    poly+=a[0]
    chl = np.power(10,poly)
    return chl

In [39]:
# making sure the log option in oc4 works
logchlMdl = OC4(np.log10(dfSwf.maxBlue.values), np.log10(dfSwf[green].values),
                  log=True)
chlMdl = OC4(dfSwf.maxBlue.values, dfSwf[green].values,log=False)
pl.plot(logchlMdl, chlMdl);



In [89]:
#X_train comes from X, which was log transformed, so...
green = 555
maxBlueTrainLog = X_train_log[[443, 489, 510]].max(axis=1)
chlOC4_train = OC4(maxBlueTrainLog.values, X_train_log[green].values, log=True)

In [43]:
pl.figure(figsize=(6,6))
pl.plot(chl_train,np.power(10,y_train_log),'ko',alpha=0.5)
pl.yscale('log')
pl.ylim(1e-2,1e2)
pl.xlim(1e-2,1e2)
pl.xscale('log')
pl.plot([1e-2,1e2],[1e-2,1e2],'k')
pl.xlabel('chl estimated by OC4', fontsize=16)
pl.ylabel('original chl', fontsize=16);



In [55]:
est_gp = SymbolicRegressor(population_size=5000,const_range=(-5,5),
                           generations=30,transformer=True,
                           trigonometric=False,
                           p_crossover=0.6, p_subtree_mutation=0.1,
                           p_hoist_mutation=0.05, p_point_mutation=0.2,
                           max_samples=0.9, verbose=1, comparison=True, 
                           parsimony_coefficient=0.01, n_jobs=3)
est_gp.fit(X_train_log, y_train_log)


    |    Population Average   |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    13.03    4.77486719435       21   0.245304037182   0.261590535511      1.28m
   1     6.67    1.55790997942        7    0.23201307744   0.211389786585      3.13m
   2     5.46    1.22851240116        7    0.21772848329   0.219449610604      3.54m
   3      4.7   0.978694839825        7   0.215493255104   0.239477964747      3.49m
   4      4.0   0.903963582962        7   0.211986121771   0.270902992782      2.97m
   5     5.66   0.897395078392        7   0.212733159933   0.264209293698      2.63m
   6     6.27    1.05514228434        7   0.211989969278   0.270868517898      2.38m
   7     6.11   0.966464225853        6   0.215757661875    0.28032347674      2.15m
   8      6.0    0.92674943446        6   0.215582355576    0.28189427683      1.97m
   9     6.08   0.910966608816        6    0.21560452395   0.281695641164      1.80m
  10     6.04   0.943231415592        6   0.213886875788   0.262817230281      1.66m
  11     6.03   0.939895112578        6   0.212215784573   0.277790738072      1.53m
  12     6.01   0.994852139638        6   0.211653420763   0.279039572133      1.41m
  13     6.05    1.03574804665        6   0.211612347455   0.283197726217      1.30m
  14     6.09    0.96779717789        6    0.21323781824   0.268632991957      1.20m
  15     6.08    1.01782561421        6    0.21285573005   0.269495538818      1.10m
  16     6.07    1.10630569858        6   0.211992800105   0.275998625492      1.01m
  17     6.04    1.09778554441        6   0.212513193967   0.275125855487     55.22s
  18     6.14    1.05237782266        6   0.211764084085   0.281838117844     50.17s
  19     6.05    1.05541663058        6    0.21190548084   0.276781033828     45.15s
  20     6.05    1.10231396728        6   0.212256205752   0.277428551479     40.22s
  21     6.05    1.08788842656        6   0.211368813605   0.281589742621     35.46s
  22     6.04    1.11058621005        6   0.210959467608   0.285257612706     30.80s
  23     6.05    1.08499675836        6   0.212491902109   0.271526513093     26.24s
  24     6.05    1.05528379469        6   0.211721666758   0.279657105933     21.75s
  25     6.05    1.27539527141        6   0.212367614345   0.272640170912     17.30s
  26     6.08    1.16174115724        6   0.211773960168   0.277959500799     12.90s
  27     6.09    1.04270019351        6   0.211300099421   0.282205443523      8.55s
  28     6.14    1.08541395223        6   0.211760953552    0.27930508379      4.26s
  29     6.08    1.06603733062        6    0.21147851799   0.280606756509      0.00s
Out[55]:
SymbolicRegressor(comparison=True, const_range=(-5, 5), generations=30,
         init_depth=(2, 6), init_method='half and half', max_samples=0.9,
         metric='mean absolute error', n_jobs=3, p_crossover=0.6,
         p_hoist_mutation=0.05, p_point_mutation=0.2, p_point_replace=0.05,
         p_subtree_mutation=0.1, parsimony_coefficient=0.01,
         population_size=5000, random_state=None, stopping_criteria=0.0,
         tournament_size=20, transformer=True, trigonometric=False,
         verbose=1)

In [98]:
chlGP_train_log = est_gp.predict(X_train_log)

In [56]:
graph = pydotplus.graphviz.graph_from_dot_data(est_gp._program.export_graphviz())
Image(graph.create_png())


Out[56]:

Above is best the formula that the GP came up with in 30 generations from a pool of 5000. Remember when you are looking at this, that you're in log space. Reference, X{0:412, 1:443, 2:489, 3:510, 4:555, 5:670}. Here the algorithm is tuned in to the fact that some kind of blue-green ratio matters. Interestingly, it is the inverse of the OCx formulation.


In [109]:
def PlotCompRegs(fieldChl,oc4Chl,gpChl):
    oc4Chl = np.log10(oc4Chl)
    rmse0 = sqrt(mean_squared_error(fieldChl, gpChl))
    rmse1 = sqrt(mean_squared_error(fieldChl, oc4Chl))
    f,axs = pl.subplots(ncols=2, figsize=(10,6),sharey=True)
    sb.regplot(gpChl, fieldChl, ax=axs[0])
    axs[0].set_xlim((-2,2)), axs[0].set_ylim((-2,2))
    axs[0].set_xlabel('log(chl) from GP fit', fontsize=16)
    axs[0].set_ylabel('log(chl) from data',fontsize=16)
    axs[0].plot([-2,2],[-2,2], 'k--', linewidth=2)
    axs[0].plot(0, 0, alpha=0, label='rmse=%.2f' % rmse0)
    axs[0].legend(loc='best', fontsize=16)
    sb.regplot(oc4Chl, fieldChl, ax=axs[1])
    axs[1].set_ylim((-2,2)), axs[1].set_xlim((-2,2))
    axs[1].set_ylabel('')
    axs[1].set_xlabel('log(chl) from OC4', fontsize=16)
    axs[1].plot([-2,2],[-2,2],'k--', linewidth=2);
    axs[1].plot(0, 0, alpha=0, label='rmse=%.2f' % rmse1)
    axs[1].legend(loc='best', fontsize=16)

In [106]:
def PlotCompHists(fieldChl, oc4Chl, gpChl):
    f,axs=pl.subplots(ncols=2,figsize=(12,6), sharey=True)
    axs[0].hist(fieldChl, bins=50, range=(-2,2), label='field chl',color='gray',
               normed=True);
    axs[0].hist(np.log10(oc4Chl), bins=50, range=(-2,2), label='OC4 chl',
                color='green', normed=True, alpha=0.3);
    axs[0].legend(fontsize=16)
    axs[1].hist(fieldChl, bins=50, range=(-2,2), label='field chl',color='gray',
               normed=True);
    axs[1].hist(gpChl, bins=50, range=(-2,2), label='GP chl', color='orange', alpha=0.3,
               normed=True);
    axs[1].legend(fontsize=16);
    axs[0].set_xlabel('log(chl)', fontsize=16)
    axs[0].set_ylabel('freq.')
    axs[1].set_xlabel('log(chl)', fontsize=16);

In [110]:
PlotCompRegs(fieldChl=y_train_log, oc4Chl=chlOC4_train, gpChl=chlGP_train_log)



In [97]:
PlotCompHists(fieldChl=y_train_log, oc4Chl=chlOC4_train, gpChl=chlGP_train_log)


Now this is very interesting. The plot on the left suggest that the GP algorithm converged to a solution that fit the most frequently observed data. As a result it mimicks the field data quite well at values it is most frequently observed. As such it seem to deal with central values better than the OC4. A quick check is to compare an rmse for the two algorithms relative to the field data, with the entire data sets, and then with just some central values.


In [ ]:
oc4Msk = np.where(((np.log10(chlOC4_train)<=0.5) & (chlOC4_train>=-0.5)))

In [50]:
est_gp.score(X_test_log, y_test_log)


Out[50]:
0.78662188880476536

In [67]:
#X_train comes from X, which was log transformed, so...
green = 555
maxBlueTestLog = X_test_log[[443, 489, 510]].max(axis=1)
chlOC4_test = OC4(maxBlueTestLog.values, X_test_log[green].values, log=True)

In [99]:
chlGP_test_log = est_gp.predict(X_test_log)

In [111]:
PlotCompRegs(fieldChl=y_test_log, oc4Chl=chlOC4_test, gpChl=chlGP_test_log)



In [112]:
PlotCompHists(fieldChl=y_test_log, oc4Chl=chlOC4_test, gpChl=chlGP_test_log)