In [2]:
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.model_selection import cross_validate, KFold, StratifiedKFold
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
In [22]:
def splitXY(dfXY):
    """Split a labeled dataframe into the feature matrix and the four label series."""
    lbls = ['ReactorType', 'CoolingTime', 'Enrichment', 'Burnup', 'OrigenReactor']
    dfX = dfXY.drop(lbls, axis=1)
    # drop the aggregate 'total' column if present
    if 'total' in dfX.columns:
        dfX.drop('total', axis=1, inplace=True)
    r_dfY = dfXY.loc[:, lbls[0]]
    c_dfY = dfXY.loc[:, lbls[1]]
    e_dfY = dfXY.loc[:, lbls[2]]
    b_dfY = dfXY.loc[:, lbls[3]]
    return dfX, r_dfY, c_dfY, e_dfY, b_dfY

CV = 5
trainset = '../pkl_trainsets/2jul2018/22jul2018_trainset3_nucs_fissact_not-scaled.pkl'
trainXY = pd.read_pickle(trainset)
trainXY = trainXY.sample(frac=0.6)
trainX, rY, cY, eY, bY = splitXY(trainXY)
trainX = scale(trainX)
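A quick sanity check of the split (an illustrative addition, not part of the original run): after `scale()`, trainX is a NumPy array, and each label series should have one entry per sampled row.

In [ ]:
# Illustrative check: features and labels stay aligned after sampling and splitting.
assert trainX.shape[0] == len(rY) == len(cY) == len(eY) == len(bY)
print(trainX.shape)  # (sampled instances, nuclide features)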
In [23]:
def make_sparser(array, min_x):
    """Zero out every entry below min_x; returns a copy so the input is untouched."""
    # copy explicitly: np.asarray would alias the input array and mutate it in place
    array = np.array(array, copy=True)
    array[array < min_x] = 0
    return array
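A minimal demonstration of make_sparser on a toy array (the values here are made up for illustration): entries below the threshold are zeroed, and the caller's array is left intact because the function copies.

In [ ]:
demo = np.array([-1.0, -0.2, 0.0, 0.5, 2.0])  # hypothetical scaled feature values
print(make_sparser(demo, 0.3))  # [0.  0.  0.  0.5 2. ]
print(demo)                     # unchanged, because make_sparser copies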
In [24]:
baseX = trainX
mbyn = np.prod(baseX.shape)  # total entry count (m instances x n features)
In [25]:
def get_inits(Y):
    CV = 5
    # set ground truth and per-parameter hyperparameters
    # (commented values are alternates from earlier tuning runs)
    if Y == 'c':
        trainY = cY
        parameter = 'cooling'
        k = 5       # 3, 7
        depth = 50  # 50, 12
        feats = 25  # 36, 47
        g = 0.06    # 0.2
        c = 50000   # 200, 75000
    elif Y == 'e':
        trainY = eY
        parameter = 'enrichment'
        k = 5       # 7, 8
        depth = 50  # 53, 38
        feats = 25  # 33, 16
        g = 0.8     # 0.2
        c = 25000   # 420
    elif Y == 'b':
        # burnup needs much less training data...this is 24% of the data set
        #trainXY = trainXY.sample(frac=0.4)
        #trainX, rY, cY, eY, bY = splitXY(trainXY)
        #trainX = scale(trainX)
        trainY = bY
        parameter = 'burnup'
        k = 5       # 4, 7
        depth = 50  # 50, 78
        feats = 25  # 23, 42
        g = 0.025   # 0.025
        c = 42000   # 105
    else:
        trainY = rY
        parameter = 'reactor'
        k = 1       # 1, 2, or 12
        depth = 50  # 50, 97
        feats = 25  # 37, 37
        g = 0.07    # 0.2
        c = 10000   # 220
    csv_name = 'trainset3_fissact_m60_sparsity_' + parameter
    # initialize learners: regressors by default, classifiers for reactor type
    kfold = KFold(n_splits=CV, shuffle=True)
    knn_init = KNeighborsRegressor(n_neighbors=k, weights='distance')
    dtr_init = DecisionTreeRegressor(max_depth=depth, max_features=feats)
    svr_init = SVR(gamma=g, C=c)
    scores = ['explained_variance', 'neg_mean_absolute_error']
    if Y == 'r':  # '==', not 'is': identity comparison of strings is unreliable
        kfold = StratifiedKFold(n_splits=CV, shuffle=True)
        knn_init = KNeighborsClassifier(n_neighbors=k, weights='distance')
        dtr_init = DecisionTreeClassifier(max_depth=depth, max_features=feats,
                                          class_weight='balanced')
        svr_init = SVC(gamma=g, C=c, class_weight='balanced')
        scores = ['accuracy']
    return trainY, knn_init, dtr_init, svr_init, scores, kfold, CV, csv_name
In [26]:
thresholds = [-2, -1.5, -1.2, -1, -0.8, -0.6, -0.4, -0.2, -0.1, 0, 0.3, 0.6, 0.9, 1, 1.5]
for pred in ('e',):  # full run: ('r', 'b', 'e', 'c')
    Y, alg1, alg2, alg3, scores, kfold, CV, csv_name = get_inits(pred)
    all_results = pd.DataFrame()
    for t in thresholds:
        sparser = make_sparser(baseX, t)
        nonzeros = np.count_nonzero(sparser)
        sparsity = 1 - nonzeros / mbyn
        X = pd.DataFrame(sparser)
        print('Learning and prediction underway: ' + pred + ' at sparsity ' + str(sparsity))
        # pass the shuffled (Stratified)KFold built in get_inits, not the bare int CV
        cv_scr = cross_validate(alg1, X, Y, scoring=scores, cv=kfold,
                                return_train_score=False, n_jobs=-1)
        df1 = pd.DataFrame(cv_scr)
        df1['Algorithm'] = 'knn'
        cv_scr = cross_validate(alg2, X, Y, scoring=scores, cv=kfold,
                                return_train_score=False, n_jobs=-1)
        df2 = pd.DataFrame(cv_scr)
        df2['Algorithm'] = 'dtree'
        cv_scr = cross_validate(alg3, X, Y, scoring=scores, cv=kfold,
                                return_train_score=False, n_jobs=-1)
        df3 = pd.DataFrame(cv_scr)
        df3['Algorithm'] = 'svr'
        cv_results = [df1, df2, df3]
        df = pd.concat(cv_results)
        df['Sparsity'] = sparsity
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        all_results = pd.concat([all_results, df], ignore_index=True)
    all_results.to_csv(csv_name + '.csv')
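A sketch of how the saved results could be inspected afterwards, assuming the enrichment CSV written above; 'test_explained_variance' is one of the score columns cross_validate records for the regression cases, and 'Algorithm'/'Sparsity' are the columns added in the loop.

In [ ]:
# Sketch: mean CV explained variance per algorithm at each sparsity level.
res = pd.read_csv('trainset3_fissact_m60_sparsity_enrichment.csv', index_col=0)
summary = (res.groupby(['Sparsity', 'Algorithm'])['test_explained_variance']
              .mean()
              .unstack())
print(summary)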