In [65]:
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn.svm import SVR, SVC
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [64]:
# color dict (use: colors['XColor'])
colors = {'DBrwn' : '#8c510a', 'MBrwn' : '#d8b365', 'LBrwn' : '#f6e8c3',
          'DTeal' : '#01665e', 'MTeal' : '#5ab4ac', 'LTeal' : '#c7eae5',
          'DPurp' : '#762a83', 'MPurp' : '#af8dc3', 'LPurp' : '#e7d4e8'}
# set params
# save as .mplstyle file and import with: plt.style.use('file')
#mpl.rcParams['lines.linewidth'] = 2
#mpl.rcParams['lines.color'] = 'r'
# colors for seaborn
dark = ["#8c510a", "#01665e", "#762a83"]
med = ["#d8b365", "#5ab4ac", "#af8dc3"]
In [62]:
train = '../../prep-pkls/nucmoles_opusupdate_aug2019/not-scaled_15nuc.pkl'
trainXY = pd.read_pickle(train)
In [33]:
def add_error(percent_err, df):
    """
    Given a dataframe of nuclide vectors, multiply each entry by a random
    factor drawn uniformly from [1 - err, 1 + err], where err is the
    fractional form of percent_err.

    Parameters
    ----------
    percent_err : float
        maximum percent error applied to the nuclide vectors
    df : dataframe
        dataframe of nuclide concentrations only

    Returns
    -------
    df_err : dataframe
        nuclide concentrations perturbed by random error

    """
    x = df.shape[0]
    y = df.shape[1]
    err = percent_err / 100.0
    errs = np.random.uniform(1 - err, 1 + err, (x, y))
    # scaling is applied downstream, after the error is injected ('scale-last')
    df_err = df * errs
    return df_err
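A quick check of the perturbation (an illustrative sketch, not part of the analysis; the demo frame and its values are made up): each entry should move by at most percent_err.
In [ ]:
# illustrative only: perturb a made-up 2x3 frame by at most 5% and
# confirm no entry moved by more than that (small float tolerance)
demo = pd.DataFrame(np.full((2, 3), 10.0), columns=['n1', 'n2', 'n3'])
demo_err = add_error(5.0, demo)
assert ((demo_err / demo - 1).abs() <= 0.05 + 1e-9).all().all()
demo_err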
In [17]:
def random_error(X, Y, alg1, alg2, alg3, scores, CV, csv_name):
    """
    For each simulated error magnitude, perturb a copy of the training set
    with add_error, scale it, and record the cross-validated score (mean
    and std) of the three given algorithms. Results are written to
    <csv_name>_random-error.csv.
    """
    #err_percent = np.arange(0, 9.25, 0.25)
    err_percent = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1,
                   1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.25, 2.5,
                   2.75, 3, 3.25, 3.5, 3.75, 4, 4.25, 4.5, 4.75, 5, 5.5, 6,
                   6.5, 7, 7.5, 8, 8.5, 9, 10, 13, 17, 20]
    knn_mean = []
    knn_std = []
    dtr_mean = []
    dtr_std = []
    svr_mean = []
    svr_std = []
    for err in err_percent:
        # perturb a fresh copy of X each iteration; reassigning X itself
        # would compound the error from one iteration to the next
        X_err = scale(add_error(err, X))
        knn_scr = cross_val_score(alg1, X_err, Y, scoring=scores, cv=CV, n_jobs=-1)
        knn_mean.append(knn_scr.mean())
        knn_std.append(knn_scr.std())
        dtr_scr = cross_val_score(alg2, X_err, Y, scoring=scores, cv=CV, n_jobs=-1)
        dtr_mean.append(dtr_scr.mean())
        dtr_std.append(dtr_scr.std())
        svr_scr = cross_val_score(alg3, X_err, Y, scoring=scores, cv=CV, n_jobs=-1)
        svr_mean.append(svr_scr.mean())
        svr_std.append(svr_scr.std())
    df = pd.DataFrame({'Percent Error' : err_percent,
                       'kNN Score' : knn_mean, 'kNN Std' : knn_std,
                       'DTree Score' : dtr_mean, 'DTree Std' : dtr_std,
                       'SVR Score' : svr_mean, 'SVR Std' : svr_std}, index=err_percent)
    df.to_csv(csv_name + '_random-error.csv')
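A minimal smoke test of the sweep (a sketch on made-up data; the synthetic frame, labels, untuned default learners, and the 'smoke-test' filename are all assumptions, not part of the analysis):
In [ ]:
# illustrative only: run the error sweep on tiny synthetic regression
# data, then read back the CSV it writes
Xs = pd.DataFrame(np.random.rand(60, 4))
ys = pd.Series(np.random.rand(60))
random_error(Xs, ys, KNeighborsRegressor(), DecisionTreeRegressor(), SVR(),
             'explained_variance', 3, 'smoke-test')
pd.read_csv('smoke-test_random-error.csv').head()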
In [58]:
def splitXY(dfXY):
    """Split a combined dataframe into the nuclide feature matrix and one
    label series per prediction parameter."""
    lbls = ['ReactorType', 'CoolingTime', 'Enrichment', 'Burnup', 'OrigenReactor']
    dfX = dfXY.drop(lbls, axis=1)
    if 'total' in dfX.columns:
        dfX.drop('total', axis=1, inplace=True)
    r_dfY = dfXY.loc[:, lbls[0]]
    c_dfY = dfXY.loc[:, lbls[1]]
    e_dfY = dfXY.loc[:, lbls[2]]
    b_dfY = dfXY.loc[:, lbls[3]]
    #o_dfY = dfXY.loc[:, lbls[4]]
    return dfX, r_dfY, c_dfY, e_dfY, b_dfY

trainXY.reset_index(inplace=True, drop=True)
trainXY = trainXY.sample(frac=0.8)
trainX, rY, cY, eY, bY = splitXY(trainXY)
#trainX = scale(trainX)
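A quick sanity check on the split (illustrative; the printed shape depends on the pickled training set):
In [ ]:
# the feature matrix and every label series should cover the same rows
assert len(trainX) == len(rY) == len(cY) == len(eY) == len(bY)
trainX.shape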
In [36]:
CV = 5
for Y in ('r',):
    # get param names and set ground truth
    if Y == 'b':
        # burnup needs much less training data: 40% of the 80% sample,
        # i.e., 32% of the full data set
        trainXY = trainXY.sample(frac=0.4)
        trainX, rY, cY, eY, bY = splitXY(trainXY)
        #trainX = scale(trainX)
        trainY = bY
        parameter = 'burnup'
        k = 7
        depth = 3
        feats = 5
        g = 0.1
        c = 1500
    elif Y == 'c':
        trainY = cY
        parameter = 'cooling'
        k = 3
        depth = 20
        feats = 15
        g = 0.1
        c = 1500
    else:
        trainY = rY
        parameter = 'reactor'
        k = 1
        depth = 3
        feats = 5
        g = 0.1
        c = 1500
    csv_name = '15nuc_m60_scale-last_' + parameter
    ## initialize learners
    score = 'explained_variance'
    kfold = KFold(n_splits=CV, shuffle=True)
    knn_init = KNeighborsRegressor(n_neighbors=k, weights='distance')
    dtr_init = DecisionTreeRegressor(max_depth=depth, max_features=feats)
    svr_init = SVR(gamma=g, C=c)
    if Y == 'r':
        score = 'accuracy'
        kfold = StratifiedKFold(n_splits=CV, shuffle=True)
        knn_init = KNeighborsClassifier(n_neighbors=k, weights='distance')
        dtr_init = DecisionTreeClassifier(max_depth=depth, max_features=feats, class_weight='balanced')
        svr_init = SVC(gamma=g, C=c, class_weight='balanced')
    #random_error(trainX, trainY, knn_init, dtr_init, svr_init, score, CV, csv_name)
In [59]:
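# re-create the reactor-type setup from the loop above to fit and visualize a single decision tree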
trainY = rY
parameter = 'reactor'
k = 1
depth = 3
feats = 5
g = 0.1
c = 1500
score = 'accuracy'
kfold = StratifiedKFold(n_splits=CV, shuffle=True)
knn_init = KNeighborsClassifier(n_neighbors=k, weights='distance')
dtr_init = DecisionTreeClassifier(max_depth=depth, max_features=feats, class_weight='balanced')
svr_init = SVC(gamma=g, C=c, class_weight='balanced')
plt.figure(figsize=(25,15))
trainX = scale(trainX)
dtree = dtr_init.fit(trainX, trainY)
plot_tree(dtree, fontsize=14, filled=True)
plt.show()
plt.clf()
In [3]:
burn = pd.read_csv('../results/19jan2020_15nuc/15nuc_m60_scale-last_burnup_random-error.csv')
rxtr = pd.read_csv('../results/19jan2020_15nuc/15nuc_m60_scale-last_reactor_random-error.csv')
cool = pd.read_csv('../results/19jan2020_15nuc/15nuc_m60_scale-last_cooling_random-error.csv')
cool.head()
Out[3]:
[first five rows of the cooling-time results: Percent Error, kNN Score, kNN Std, DTree Score, DTree Std, SVR Score, SVR Std]
In [66]:
def plot_errorcurves(df):
    err = df['Percent Error']
    knn_mean = df['kNN Score']
    knn_std = df['kNN Std']
    dtr_mean = df['DTree Score']
    dtr_std = df['DTree Std']
    svr_mean = df['SVR Score']
    svr_std = df['SVR Std']
    # note: the title and savefig path below are specific to the
    # reactor-type results
    fig = plt.figure(figsize=(7, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_title('Reactor Type Prediction', fontsize=20)
    ax.set_xlabel('Percent Error', fontsize=14)
    #ax.set_xlim(0, 10)
    ax.set_ylabel('Score', fontsize=16)
    #ax.set_ylim(0.98, 1.001)
    # shaded bands show +/- one standard deviation around each mean score
    ax.fill_between(err, knn_mean - knn_std, knn_mean + knn_std, color=med[0], alpha=0.2)
    ax.fill_between(err, dtr_mean - dtr_std, dtr_mean + dtr_std, color=med[1], alpha=0.2)
    ax.fill_between(err, svr_mean - svr_std, svr_mean + svr_std, color=med[2], alpha=0.2)
    ax.plot(err, knn_mean, 'o-', color=med[0], mew=0, ms=5, label="kNN")
    ax.plot(err, dtr_mean, 'o-', color=med[1], mew=0, ms=5, label="Decision Tree")
    ax.plot(err, svr_mean, 'o-', color=med[2], mew=0, ms=5, label="Support Vector")
    ax.tick_params(labelsize=14)
    ax.legend(loc='best')#'right', bbox_to_anchor=(1.4, 0.5))
    plt.savefig('../results/19jan2020_15nuc/graphs/rxtr_error.png', bbox_inches='tight', transparent=True)
    plt.show()
    # return the figure so callers can save it elsewhere
    return fig
In [67]:
plot_errorcurves(rxtr)
In [ ]:
plt.clf()
rxtrerr = plot_errorcurves(rxtr)
rxtrerr.savefig('../results/19jan2020_15nuc/graphs/rxtr_error.png', bbox_inches='tight', transparent=True)