In [4]:
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import learning_curve, cross_val_score, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
def splitXY(dfXY):
    """Split a combined feature+label frame into features and four label series.

    Parameters
    ----------
    dfXY : pd.DataFrame
        Frame with nuclide feature columns plus the label columns
        'ReactorType', 'CoolingTime', 'Enrichment', 'Burnup', 'OrigenReactor',
        and optionally a 'total' column (dropped from the features if present).

    Returns
    -------
    tuple
        (dfX, r_dfY, c_dfY, e_dfY, b_dfY): the feature-only frame plus the
        ReactorType, CoolingTime, Enrichment and Burnup label Series.
        'OrigenReactor' is removed from the features but not returned.
    """
    lbls = ['ReactorType', 'CoolingTime', 'Enrichment', 'Burnup', 'OrigenReactor']
    # Drop labels and 'total' in one non-mutating call instead of a second
    # inplace drop — keeps the cell idempotent and dfXY untouched.
    to_drop = lbls + (['total'] if 'total' in dfXY.columns else [])
    dfX = dfXY.drop(to_drop, axis=1)
    # One label Series per prediction target (OrigenReactor is not a target).
    r_dfY, c_dfY, e_dfY, b_dfY = (dfXY.loc[:, lbl] for lbl in lbls[:4])
    return dfX, r_dfY, c_dfY, e_dfY, b_dfY
In [5]:
# Data provenance: 15-nuclide training set, Aug 2019 ORIGEN/opus update.
# NOTE(review): paths are relative to this notebook's directory — confirm layout.
#trainset1 = '../2jul2018_testset1_nucs_fissact_not-scaled.pkl'
trainset2 = '../../prep-pkls/nucmoles_opusupdate_aug2019/not-scaled_15nuc.pkl'
# SECURITY: read_pickle can execute arbitrary code — only load trusted files.
#trainXY1 = pd.read_pickle(trainset1)
trainXY2 = pd.read_pickle(trainset2)
# Split into features + the four label Series (reactor, cooling, enrich, burnup).
# NOTE(review): the trainXY1/trainX1 cells are commented out, so later cells
# that reference trainXY1 or trainX1 will NameError on Restart & Run All.
#trainX1, rY1, cY1, eY1, bY1 = splitXY(trainXY1)
trainX2, rY2, cY2, eY2, bY2 = splitXY(trainXY2)
#trainX2 = scale(trainX2)
In [6]:
trainX2.shape
Out[6]:
In [103]:
# NOTE(review): binding `scale` here shadows sklearn's `scale` function imported
# at the top of the notebook; the name is kept only because the inspection cell
# below reads scale.mean_ / scale.var_. Prefer renaming to `scaler` throughout.
scale = StandardScaler()
# fit_transform == fit(...).transform(...) in a single call.
# NOTE: this replaces the trainX2 DataFrame with a plain numpy array, so later
# cells that use trainX2.columns will break.
trainX2 = scale.fit_transform(trainX2)
In [105]:
# A non-final bare expression in a cell is evaluated and its value discarded,
# so the original `scale.mean_` line displayed nothing. Show both fitted
# per-feature statistics together as a tuple instead.
scale.mean_, scale.var_
Out[105]:
In [97]:
# NOTE(review): trainXY1's load cell is commented out above, so referencing it
# here raised NameError on Restart & Run All; only print it if it exists.
if 'trainXY1' in globals():
    print(trainXY1.shape)
print(trainXY2.shape)
In [106]:
# Show only the first few rows — dumping the entire scaled array bloats the
# notebook output without adding information.
trainX2[:5]
#pd.DataFrame(trainX2)
Out[106]:
In [29]:
# Columns present in trainX2 but not in trainX1 (iterating a DataFrame yields
# its column names — assumes this cell runs before the scaling cell, while
# trainX2 is still a DataFrame; TODO confirm cell order).
# NOTE(review): trainX1 is never defined on a fresh run (its cells are
# commented out above), so the original loop raised NameError; guarded so the
# notebook survives Restart & Run All.
if 'trainX1' in globals():
    extra = [col for col in trainX2 if col not in trainX1]
else:
    extra = []
extra
Out[29]:
In [107]:
# Fix the seed so feature tie-breaking in the tree (and everything downstream
# that reuses `dtr`) is reproducible across kernel restarts.
dtr = DecisionTreeRegressor(random_state=0)
dtr
Out[107]:
In [108]:
# Fit once on the full set (used for feature_importances_ / tree_ inspection
# in the cells below).
dtr.fit(trainX2, cY2)
# cross_val_score clones the estimator, so this 5-fold CV is independent of the
# fit above; default scoring for a regressor is R^2.
cv_scr = cross_val_score(dtr, trainX2, cY2, cv=5)
In [109]:
# Grab the fitted tree's per-feature importances alongside the CV scores.
fi = dtr.feature_importances_
scr = cv_scr
print(scr)
In [110]:
# Rank features by importance (ascending).
# NOTE(review): after the StandardScaler cell, trainX2 is a numpy array and no
# longer has .columns — fall back to positional indices so this cell runs in
# either state; rerun it before scaling to get real column names.
cols = trainX2.columns if hasattr(trainX2, 'columns') else range(len(fi))
features = sorted(zip(cols, fi), key=lambda pair: pair[1])
#features
In [123]:
dtr.tree_.max_depth
Out[123]:
In [112]:
# Fractions of the training data used for each learning-curve point.
trainset_frac = np.array( [0.1, 0.2, 0.3, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0] )
# shuffle=True without a random_state made the folds differ on every run; pin
# it so the curve is reproducible. Returns (sizes, train scores, cv scores),
# one row per training-set size, one column per fold.
tsize, train, cv = learning_curve(dtr, trainX2, cY2, train_sizes=trainset_frac,
                                  cv=8, shuffle=True, random_state=0)
In [113]:
# Average the per-fold scores for each training-set size.
train_mean = train.mean(axis=1)
cv_mean = cv.mean(axis=1)
In [114]:
lc = pd.DataFrame({'Training Score' : train_mean, 'CrossVal Score' : cv_mean}, index=trainset_frac)
# DataFrame.plot creates its own Axes, so a preceding bare plt.figure() just
# left an empty stray figure — plot directly and label the result so the
# figure stands alone.
ax = lc.plot(linewidth=3)
ax.set(xlabel='Training set fraction', ylabel='Score',
       title='Decision tree learning curve');
Out[114]:
In [125]:
dtr_grid = {"max_depth": np.linspace(3, 90).astype(int), "max_features": np.linspace(5, len(trainXY2.columns)-6).astype(int)}
In [126]:
# Randomized hyperparameter search: 20 random draws from dtr_grid, 5-fold CV,
# scored by explained variance, parallelized across all cores.
# NOTE(review): added random_state so the sampled candidates (and hence the
# chosen best_params_) are reproducible across runs.
alg1_opt = RandomizedSearchCV(estimator=dtr, param_distributions=dtr_grid,
                              n_iter=20, scoring='explained_variance',
                              n_jobs=-1, cv=5, return_train_score=True,
                              random_state=0)
alg1_opt.fit(trainX2, cY2)
d1 = alg1_opt.best_params_['max_depth']
f1 = alg1_opt.best_params_['max_features']
print(d1, f1)
In [131]:
dtr.feature_importances_
Out[131]:
In [26]:
alg1_init = alg1_opt.best_estimator_
In [27]:
k
Out[27]:
In [ ]: