In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from decimal import Decimal as deci
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score as rsquare
from sklearn.model_selection import train_test_split
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV as gsv
from sklearn.ensemble import RandomForestRegressor as rfr
get_ipython().magic('matplotlib inline')
plt.rcParams['figure.figsize'] = [10,8]

In [2]:
cf = rfr(n_jobs=-1)
max_f = list(np.arange(1,70,1))
max_f.append("auto")
cf_params = {"n_estimators":list(np.arange(1,25,1)), "max_features":max_f}
pls = PLSRegression(scale=False)
pls_params = {"n_components":list(np.arange(1,101,1))}

In [3]:
csv_path = "..\..\Data\csv"
here = os.getcwd()
os.chdir(csv_path)
zspectra = pd.read_csv('fitted_cest.csv', header = None).values.squeeze()
diff = pd.read_csv('diff.csv', header = None).values.squeeze()
conc = pd.read_csv('conc.csv', header = None).values.squeeze()
pH = pd.read_csv('pH.csv', header = None).values.squeeze()
concs = pd.read_csv('concs.csv', header = None).values.squeeze()
pHs = pd.read_csv('pHs.csv', header = None).values.squeeze()
rsq = pd.read_csv('rsq.csv', header= None).values.squeeze()
os.chdir(here)

In [4]:
def mymetric(yexp, ypred):
    yexp=yexp.squeeze()
    ypred=ypred.squeeze()
    d = np.sqrt(mean_squared_error(yexp, ypred))
    d = d / np.mean(yexp)
    d = 100 * d
    return d

In [5]:
def mystddev(yexp,ypred):
    yexp=yexp.squeeze()
    ypred=ypred.squeeze()
    sy=np.std(ypred)
    d = np.sum(yexp - ypred) / np.sqrt(np.sum((yexp - ypred)**2 ))
    d = d / np.sqrt(ypred.shape[0])
    d = d / np.mean(yexp)
    d = 100 * d
    sd=np.sqrt(np.square(d)*np.square(sy))
    return sd

In [6]:
X = diff
Y = pH
Ys = np.sort(pHs)

In [7]:
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.1, random_state=42)
grid_rfr = gsv(cf,cf_params,n_jobs = -1)
grid_rfr.fit(X_train,y_train)
y_hat_rfr = grid_rfr.predict(X_test)
mymetric(y_test,y_hat_rfr)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-495d83d42a2c> in <module>()
      3 grid_rfr.fit(X_train,y_train)
      4 y_hat_rfr = grid_rfr.predict(X_test)
----> 5 mymetric(y_test,y_hat)

NameError: name 'y_hat' is not defined

In [9]:
grid_rfr.best_estimator_


Out[9]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=34, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=22, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [10]:
grid_pls = gsv(pls,pls_params,n_jobs = -1)
grid_pls.fit(X_train,y_train)
y_hat_pls = grid_pls.predict(X_test)
mymetric(y_test,y_hat_pls)


Out[10]:
4.3614817257848344

In [11]:
grid_pls.best_estimator_


Out[11]:
PLSRegression(copy=True, max_iter=500, n_components=30, scale=False,
       tol=1e-06)

In [ ]: