In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor as rfr
%matplotlib inline

In [44]:
# Random forest regressor: 50 trees, all cores, progress logging.
cf = rfr(n_estimators=50, n_jobs=-1, verbose=1)

In [10]:
# Input spectra and target variables, loaded from headerless CSV files.
zspectra = pd.read_csv('fitted_cest.csv', header=None).values
diff = pd.read_csv('dif.csv', header=None).values
conc = pd.read_csv('conc.csv', header=None).values
pH = pd.read_csv('pH.csv', header=None).values
concs = pd.read_csv('concs.csv', header=None).values
pHs = pd.read_csv('pHs.csv', header=None).values
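
The PLS cells further down also use t2_signal and t2, which are never
defined in this section. The loads presumably mirror the ones above; the
file names here are assumptions, not taken from the original notebook:

In [ ]:
# Assumed loads -- 't2_signal.csv' and 't2.csv' are guessed file names.
t2_signal = pd.read_csv('t2_signal.csv', header=None).values
t2 = pd.read_csv('t2.csv', header=None).values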

In [3]:
def mymetric(yexp, ypred):
    """RMSE normalized by the mean of the measured values, in percent."""
    d = np.sum((yexp - ypred)**2)
    d = d / ypred.shape[0]
    d = np.sqrt(d)
    d = d / np.mean(yexp)
    d = 100 * d
    return d
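
mymetric is a root-mean-square error normalized by the mean of the measured
values, expressed in percent: d = 100 * sqrt(mean((yexp - ypred)^2)) / mean(yexp).
A quick sanity check against the (otherwise unused) mean_squared_error
import, on made-up numbers:

In [ ]:
# Sketch: mymetric(yexp, ypred) == 100 * sqrt(MSE) / mean(yexp).
yexp = np.array([7.0, 6.8, 7.2])
ypred = np.array([7.1, 6.7, 7.3])
assert np.isclose(mymetric(yexp, ypred),
                  100 * np.sqrt(mean_squared_error(yexp, ypred)) / yexp.mean())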

In [4]:
# Regression target is pH; Ys holds the sorted pH values, used later for
# the per-pH error breakdown.
Y = pH
Ys = np.sort(pHs)

In [24]:
# Depends on y_test / y_hat from an earlier fit in the session; the cells
# in this notebook appear out of execution order.
mymetric(y_test.squeeze(), y_hat)


Out[24]:
2.4510586354648836

In [46]:
# Baseline: random forest on the full Z-spectra, 5% of samples held out.
X = zspectra
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
cf.fit(X_train, y_train.squeeze())
y_hat = cf.predict(X_test)
mymetric(y_test.squeeze(), y_hat)


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   10.6s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
Out[46]:
2.4775295701047839
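
RandomizedSearchCV and sp_randint are imported at the top but never used in
this section. A sketch of how they could tune the forest; the search space
below is illustrative, not from the original analysis:

In [ ]:
# Hypothetical tuning sketch -- parameter ranges are illustrative only.
param_dist = {'max_depth': sp_randint(3, 20),
              'min_samples_leaf': sp_randint(1, 10)}
search = RandomizedSearchCV(rfr(n_estimators=50, n_jobs=-1), param_dist,
                            n_iter=10, cv=3)
search.fit(X_train, y_train.squeeze())
print(search.best_params_)
print(mymetric(y_test.squeeze(), search.predict(X_test)))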

In [48]:
import seaborn as sns
sns.distplot(y_hat);


F:\Users\Luis\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [30]:
# Feature importances across the 101 saturation offsets.
imp = cf.feature_importances_
offset = np.linspace(-8, 15, 101)
plt.plot(offset, imp, '-o');
plt.xlim((-1, 5))


Out[30]:
(-1, 5)
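
Before pruning in the next cell, it helps to see which saturation offsets
clear the importance threshold:

In [ ]:
# Offsets whose importance exceeds the 0.05 cutoff used below.
print(offset[imp > 0.05])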

In [40]:
# Refit using only the offsets whose importance exceeds 0.05.
cf.fit(X_train[:, imp > 0.05], y_train.squeeze())
y_hat = cf.predict(X_test[:, imp > 0.05])
mymetric(y_test.squeeze(), y_hat)


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    0.0s finished
Out[40]:
3.1121767142081564
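
Pruning to importances above 0.05 costs accuracy here (3.11% vs. 2.48% on
the full spectrum). A sketch for scanning the cutoff to see the trade-off;
the threshold values are illustrative:

In [ ]:
# Illustrative threshold scan: error vs. number of offsets kept. A fresh
# forest is used so the fitted `cf` above is left untouched.
for thr in [0.0, 0.01, 0.02, 0.05]:
    keep = imp > thr
    m = rfr(n_estimators=50, n_jobs=-1).fit(X_train[:, keep], y_train.squeeze())
    print(thr, keep.sum(), mymetric(y_test.squeeze(), m.predict(X_test[:, keep])))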

In [51]:
sns.distplot(y_hat);


F:\Users\Luis\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j

In [50]:
num_components = X.shape[1]

# Sweep the number of PLS components over the full spectrum.
Error = np.zeros((num_components - 1, 1))

for idx, K in enumerate(np.arange(1, num_components)):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
    pls = PLSRegression(n_components=K, scale=False)
    pls.fit(X_train, y_train)
    y_hat = pls.predict(X_test)
    Error[idx] = mymetric(y_test, y_hat)

plt.plot(np.arange(1, num_components), Error, 'o-')
print('Min = ', Error.min(), '%')


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-50-1e6c4dd45705> in <module>()
      7     X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.05, random_state=42)
      8     pls = PLSRegression(n_components = K, scale = False)
----> 9     pls.fit(X_train, y_train)
     10     y_hat = pls.predict(X_test)
     11     Error[idx] = mymetric(y_test , y_hat)

F:\Users\Luis\Anaconda3\lib\site-packages\sklearn\cross_decomposition\pls_.py in fit(self, X, Y)
    337             if self.deflation_mode == "regression":
    338                 # - regress Yk's on x_score, then subtract rank-one approx.
--> 339                 y_loadings = (np.dot(Yk.T, x_scores)
    340                               / np.dot(x_scores.T, x_scores))
    341                 Yk -= np.dot(x_scores, y_loadings.T)

KeyboardInterrupt: 
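
The full sweep over all 101 components is slow, which is why the run above
was interrupted. A cheaper variant caps the component range; the cap of 20
is an arbitrary choice, not a value from the original run:

In [ ]:
# Capped sweep -- max_K = 20 is an illustrative cap, and Error_capped avoids
# clobbering the Error array consumed by the next cell.
max_K = 20
Error_capped = np.zeros(max_K)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
for idx, K in enumerate(np.arange(1, max_K + 1)):
    pls = PLSRegression(n_components=K, scale=False)
    pls.fit(X_train, y_train)
    Error_capped[idx] = mymetric(y_test, pls.predict(X_test))
plt.plot(np.arange(1, max_K + 1), Error_capped, 'o-');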

In [6]:
# Note: if the sweep above was interrupted, Error still holds zeros for the
# unvisited K values, so argmin can land on an unfitted component count.
K = Error.argmin() + 1
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
pls = PLSRegression(n_components=K, scale=False)
pls.fit(X_train, y_train)
y_hat = pls.predict(X_test)

# Per-pH error: score the test samples at each pH value in Ys separately.
Error_dum = np.zeros(Ys.shape)
for K in range(Ys.shape[1]):
    mask = (y_test == Ys[0, K])
    Error_dum[0, K] = mymetric(y_test[mask], y_hat[mask])

plt.plot(Ys, Error_dum, 'o-');



In [49]:
# Append the T2 signal columns to the Z-spectra and repeat the sweep.
X = np.concatenate((zspectra, t2_signal), axis=1)
num_components = X.shape[1]

Error = np.zeros((num_components - 1, 1))

for idx, K in enumerate(np.arange(1, num_components)):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
    pls = PLSRegression(n_components=K, scale=False)
    pls.fit(X_train, y_train)
    y_hat = pls.predict(X_test)
    Error[idx] = mymetric(y_test, y_hat)

plt.plot(np.arange(1, num_components), Error, 'o-')
print('Min = ', Error.min(), '%')


Min =  4.46196072362 %

In [8]:
K = Error.argmin() + 1
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
pls = PLSRegression(n_components=K, scale=False)
pls.fit(X_train, y_train)
y_hat = pls.predict(X_test)

# Per-pH error at the selected component count.
Error_dum = np.zeros(Ys.shape)
for K in range(Ys.shape[1]):
    mask = (y_test == Ys[0, K])
    Error_dum[0, K] = mymetric(y_test[mask], y_hat[mask])

plt.plot(Ys, Error_dum, 'o-');



In [9]:
# Same sweep with t2 appended in place of t2_signal.
X = np.concatenate((zspectra, t2), axis=1)
num_components = X.shape[1]

Error = np.zeros((num_components - 1, 1))

for idx, K in enumerate(np.arange(1, num_components)):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
    pls = PLSRegression(n_components=K, scale=False)
    pls.fit(X_train, y_train)
    y_hat = pls.predict(X_test)
    Error[idx] = mymetric(y_test, y_hat)

plt.plot(np.arange(1, num_components), Error, 'o-')
print('Min = ', Error.min(), '%')


Min =  4.52295837337 %

In [10]:
K = Error.argmin() + 1
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
pls = PLSRegression(n_components=K, scale=False)
pls.fit(X_train, y_train)
y_hat = pls.predict(X_test)

# Per-pH error at the selected component count.
Error_dum = np.zeros(Ys.shape)
for K in range(Ys.shape[1]):
    mask = (y_test == Ys[0, K])
    Error_dum[0, K] = mymetric(y_test[mask], y_hat[mask])

plt.plot(Ys, Error_dum, 'o-');



In [11]:
# Sweep using the T2 signal alone.
X = t2_signal
num_components = X.shape[1]

Error = np.zeros((num_components - 1, 1))

for idx, K in enumerate(np.arange(1, num_components)):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
    pls = PLSRegression(n_components=K, scale=False)
    pls.fit(X_train, y_train)
    y_hat = pls.predict(X_test)
    Error[idx] = mymetric(y_test, y_hat)

plt.plot(np.arange(1, num_components), Error, 'o-')
print('Min = ', Error.min(), '%')


Min =  7.198785079 %

In [12]:
K = Error.argmin() + 1
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
pls = PLSRegression(n_components=K, scale=False)
pls.fit(X_train, y_train)
y_hat = pls.predict(X_test)

# Per-pH error at the selected component count.
Error_dum = np.zeros(Ys.shape)
for K in range(Ys.shape[1]):
    mask = (y_test == Ys[0, K])
    Error_dum[0, K] = mymetric(y_test[mask], y_hat[mask])

plt.plot(Ys, Error_dum, 'o-');



In [6]:
np.mean(Ys)


Out[6]:
6.9049999999999994

In [7]:
np.median(Ys)


Out[7]:
6.8899999999999997

In [13]:
# T2 alone: a single PLS fit using all available components.
X = t2
num_components = X.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.05, random_state=42)
pls = PLSRegression(n_components=num_components, scale=False)
pls.fit(X_train, y_train)
y_hat = pls.predict(X_test)
Error = mymetric(y_test, y_hat)
print('Min = ', Error.min(), '%')


Min =  7.34321700086 %

In [14]:
# Per-pH error for the T2-only model.
Error_dum = np.zeros(Ys.shape)
for K in range(Ys.shape[1]):
    mask = (y_test == Ys[0, K])
    Error_dum[0, K] = mymetric(y_test[mask], y_hat[mask])

plt.plot(Ys, Error_dum, 'o-');



In [15]:
# Disabled experiment (kept for reference): PLS error vs. component count
# for spectra subsampled at every 1st, 4th, and 8th offset. Assigning the
# string suppresses the Out echo.
_ = '''
steps = [1,4,8]
labels = list()

for step in steps:
    X = zspectra[:, 0:101:step]
    labels.append(int(X.shape[1]))
    Y = pH
    num_components = 10
    Error = np.zeros((num_components -1,1))

    for idx,K in enumerate(np.arange(1,num_components)):
        X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.50, random_state=42)
        pls = PLSRegression(n_components = K, scale = False)
        pls.fit(X_train, y_train)
        y_hat = pls.predict(X_test)
        Error[idx] = mymetric(y_test , y_hat)

    plt.plot( np.arange(1,num_components), Error ,'o-')
    plt.legend(labels)
'''

