In an earlier post, I explained how to apply a Bayesian linear regression model to chlorophyll retrieval. This time I use the historical dataset behind the development of NASA OBPG's chlorophyll algorithms.
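To recap the form of these algorithms: the OCx family models log10(chl) as a fourth-order polynomial of the log-transformed blue-to-green band ratio, log10(chl) = a0 + a1*X + a2*X^2 + a3*X^3 + a4*X^4, where X = log10(max(Rrs_blue)/Rrs_green). The coefficients a0 through a4 are exactly what the Bayesian model further down estimates.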
In [31]:
import pandas as pd
import matplotlib.pyplot as pl
from sklearn.linear_model import LinearRegression
import re
import os
import numpy as np
import seaborn as sb
from mpl_toolkits.basemap import Basemap
import pymc3 as pm
import warnings
from cmocean import cm
warnings.filterwarnings('ignore')
In [2]:
%matplotlib inline
In [19]:
def ParseTextFile(textFileHandle, topickle=False, convert2DateTime=False, **kwargs):
    """Parse a text file with a '/fields=' header line and an '/end_header'
    marker into a DataFrame.
    * topickle: pickle resulting DataFrame if True
    * convert2DateTime: join date/time columns and convert entries to datetime objects
    * kwargs:
        pkl_fname: pickle file name to save DataFrame by, if topickle=True
    """
    # Pre-compile some regex
    columns = re.compile('^/fields=(.+)')   # to get field/column names
    units = re.compile('^/units=(.+)')      # to get units -- optional
    endHeader = re.compile('^/end_header')  # to know when to start storing data
    # Set some milestones
    noFields = True
    getData = False
    # Loop through the text data
    for line in textFileHandle:
        if noFields:
            fieldStr = columns.findall(line)
            if len(fieldStr) > 0:
                noFields = False
                fieldList = fieldStr[0].split(',')
                dataDict = dict.fromkeys(fieldList)
            continue  # nothing left to do with this line, keep looping
        if not getData:
            if endHeader.match(line):
                # end of header reached, start acquiring data
                getData = True
        else:
            dataList = line.strip().split(',')
            for field, datum in zip(fieldList, dataList):
                if not dataDict[field]:
                    dataDict[field] = []
                dataDict[field].append(datum)
    df = pd.DataFrame(dataDict, columns=fieldList)
    if convert2DateTime:
        datetimelabels = ['year', 'month', 'day', 'hour', 'minute', 'second']
        # pd.to_datetime assembles a datetime directly from these columns;
        # no format string is needed for DataFrame input
        df['Datetime'] = pd.to_datetime(df[datetimelabels])
        df.drop(datetimelabels, axis=1, inplace=True)
    if topickle:
        fname = kwargs.pop('pkl_fname', 'dfNomad2.pkl')
        df.to_pickle(fname)
    return df
def FindNaNs(df):
    """Report how many 'NaN' and '-999' flag entries each column contains."""
    for col in df.columns:
        sn = np.where(df[col].values == 'NaN', True, False).sum()
        s9 = np.where(df[col].values == '-999', True, False).sum()
        print("%s: %d NaNs & %d -999s" % (col, sn, s9))
def FitPoly(X, y, order=4, lin=False):
    """Numpy polynomial regression. Returns coefficients, highest order first.
    kwargs:
        lin: set True if X and y are in linear space so they get log10-transformed
             here; by default the data are assumed to be log-transformed already."""
    if lin:
        X = np.log10(X)
        y = np.log10(y)
    coeffs = np.polyfit(X, y, deg=order)
    return coeffs
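To make concrete what ParseTextFile expects, here is a tiny, made-up input in the same layout (a '/fields=' line, an optional '/units=' line, an '/end_header' marker, then comma-separated rows); the numbers are purely illustrative.
In [ ]:
import io
# Purely illustrative two-row file in the layout ParseTextFile looks for.
sample = io.StringIO(
    "/fields=year,month,day,chl,rrs443\n"
    "/units=yyyy,mo,dd,mg/m^3,1/sr\n"
    "/end_header\n"
    "1997,9,15,0.35,0.0042\n"
    "1998,3,2,1.20,0.0021\n")
ParseTextFile(sample)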
In [4]:
savDir = '/accounts/ekarakoy/DEV-ALL/BLOGS/DataScienceCorner/posts/bayesianChl_stuff/'  # output directory, needed below for pkl_fname
with open('/accounts/ekarakoy/DATA/ocprep_v4_iop.txt') as fdata:
    df = ParseTextFile(fdata, topickle=True, convert2DateTime=True,
                       pkl_fname=os.path.join(savDir, 'JeremyOCx_data'))
In [ ]:
df.info() # skipping output which shows a lot of unnecessary features for this exercise
Select the features I want for this modeling exercise.
In [6]:
basicCols = ['cruise', 'lat', 'lon', 'type', 'chl', 'Datetime']
IwantCols = basicCols + [col for col in df.columns if 'rrs' in col]
dfRrs = df[IwantCols]
swflbls = ['rrs411','rrs443','rrs489','rrs510','rrs555','rrs670']
swfCols = basicCols + swflbls
dfSwf = dfRrs[swfCols]
In [9]:
df.to_pickle(os.path.join(savDir, 'dfOcPrepHistoric.pkl'))
dfRrs.to_pickle(os.path.join(savDir, 'dfOcPrepRrs.pkl'))
del df, dfRrs
In [ ]:
dfSwf.info() # skipping the output which shows that most columns are object type...
In [12]:
FindNaNs(dfSwf)
In [14]:
dfSwf.replace(to_replace='NaN',value=np.NaN,inplace=True)
dfSwf.dropna(inplace=True)
numCols = ['chl','lat','lon','rrs411','rrs443','rrs489','rrs510','rrs555','rrs670']
dfSwf[numCols] = dfSwf[numCols].apply(pd.to_numeric)
dfSwf.info()
In [26]:
dfSwf['maxBlue'] = dfSwf[['rrs443', 'rrs489', 'rrs510']].max(axis=1)
In [27]:
dfSwf['OCxRatio'] = dfSwf.maxBlue/dfSwf.rrs555
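For orientation (this baseline is not part of the original analysis), here is a minimal sketch of how the operational OC4v6 polynomial would map this maximum-band-ratio to a chlorophyll estimate, using the published coefficients that also reappear below as prior means.
In [ ]:
# OC4v6 coefficients (the same values are used below as prior means).
oc4v6 = {'a0': 0.3272, 'a1': -2.9940, 'a2': 2.7218, 'a3': -1.2259, 'a4': -0.5683}

def OC4v6Chl(ratio):
    """Chlorophyll (mg m^-3) from the blue/green Rrs ratio via the OC4v6 polynomial."""
    x = np.log10(ratio)
    logChl = sum(oc4v6['a%d' % i] * x**i for i in range(5))
    return 10**logChl

chlOC4v6 = OC4v6Chl(dfSwf.OCxRatio)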
In [29]:
dfLogOCx = pd.DataFrame(columns = ['OCxRatio','chl','type','cruise'])
dfLogOCx.OCxRatio = np.log10(dfSwf.OCxRatio)
dfLogOCx.chl = np.log10(dfSwf.chl)
dfLogOCx[['type','cruise']] = dfSwf[['type','cruise']]
In [34]:
dfSwf.to_pickle(os.path.join(savDir, 'dfSwf'))
dfLogOCx.to_pickle(os.path.join(savDir, 'dfLogOCx'))
In [46]:
sb.set(font_scale=1.5)
g = sb.PairGrid(dfLogOCx, hue='type', vars=['chl','OCxRatio'], size=5,
palette=sb.color_palette("cubehelix",2))
g = g.map_upper(pl.scatter, alpha=0.5, edgecolor='k',linewidth=2)
g = g.map_diag(sb.kdeplot, lw=3)
g = g.map_lower(sb.kdeplot,cmap="Reds_d")
g.add_legend();
In [74]:
f,ax2 = pl.subplots(ncols=2, figsize=(14,6))
sb.violinplot(x='OCxRatio',y='type',data=dfLogOCx, hue='type', ax=ax2[0])
sb.violinplot(x='chl', y='type', data=dfLogOCx, hue='type', ax=ax2[1]);
ax2[0].legend().set_visible(False)
ax2[1].legend().set_visible(False)
In [18]:
dfSwf.type.unique()
Out[18]:
A pooled Bayesian model: a single set of polynomial coefficients is fit to all observations at once, regardless of the 'type' grouping, with Normal priors centered on the operational OC4v6 coefficients.
In [56]:
logChlObs = dfLogOCx.chl.values
logOCxRatio = dfLogOCx.OCxRatio.values
OC4v6_coeffs = {'a0': 0.3272, 'a1': -2.9940, 'a2': 2.7218, 'a3': -1.2259, 'a4': -0.5683}
with pm.Model() as pooled_model:
    a0 = pm.Normal('a0', mu=OC4v6_coeffs['a0'], sd=10)
    a1 = pm.Normal('a1', mu=OC4v6_coeffs['a1'], sd=10)
    a2 = pm.Normal('a2', mu=OC4v6_coeffs['a2'], sd=10)
    a3 = pm.Normal('a3', mu=OC4v6_coeffs['a3'], sd=10)
    a4 = pm.Normal('a4', mu=OC4v6_coeffs['a4'], sd=10)
    epsilon = pm.Uniform('epsilon', lower=0, upper=10)
    mu = a0 + a1 * logOCxRatio + a2 * logOCxRatio**2 + \
         a3 * logOCxRatio**3 + a4 * logOCxRatio**4
    logChlPred = pm.Normal('chlPred', mu=mu, sd=epsilon, observed=logChlObs)
    start = pm.find_MAP()
    step = pm.NUTS(scaling=start)
    traceOCx_pooled = pm.sample(10000, step=step, start=start)
In [57]:
chainOCx_pooled = traceOCx_pooled[1000:]
varnames=['a%d' %d for d in range(5)] + ['epsilon']
pm.traceplot(chainOCx_pooled,varnames=varnames, grid=True);
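For a numerical complement to the trace plots, something like the following prints posterior means and credible intervals for the coefficients; this assumes a PyMC3 version in which summary still accepts varnames (newer releases spell it var_names).
In [ ]:
# Posterior summary statistics for the polynomial coefficients and the noise term.
pm.summary(chainOCx_pooled, varnames=varnames)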
In [48]:
cfs = FitPoly(logOCxRatio,logChlObs)
In [68]:
{'a%d' %d:rv for d,rv in zip(range(5),cfs[::-1])}
Out[68]:
In [67]:
OC4v6_coeffs
Out[67]:
In [61]:
refvals = [chainOCx_pooled['a%d'% d].mean() for d in range(5)]
In [55]:
# bayes means with OC4_v6 mean normal priors
refvals
Out[55]:
In [59]:
# bayes means with 0-mean normal priors
refvals
Out[59]:
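To compare the coefficient sets at a glance, here is a small convenience snippet (not part of the original notebook) that puts the published OC4v6 values, the least-squares polyfit, and the Bayesian posterior means into one table; the bayesMean column is simply whatever refvals holds from the most recent run.
In [ ]:
# Side-by-side comparison of the three coefficient sets.
coeffNames = ['a%d' % d for d in range(5)]
dfCoeffs = pd.DataFrame({'OC4v6': [OC4v6_coeffs[c] for c in coeffNames],
                         'polyfit': cfs[::-1],  # np.polyfit returns highest order first
                         'bayesMean': refvals},
                        index=coeffNames)
dfCoeffs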