In [2]:
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.svm import SVR, SVC
from sklearn.model_selection import cross_validate, KFold, StratifiedKFold

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


/usr/local/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)

In [22]:
def splitXY(dfXY):
    lbls = ['ReactorType', 'CoolingTime', 'Enrichment', 'Burnup', 'OrigenReactor']
    dfX = dfXY.drop(lbls, axis=1)
    if 'total' in dfX.columns:
        dfX.drop('total', axis=1, inplace=True)
    r_dfY = dfXY.loc[:, lbls[0]]
    c_dfY = dfXY.loc[:, lbls[1]]
    e_dfY = dfXY.loc[:, lbls[2]]
    b_dfY = dfXY.loc[:, lbls[3]]
    return dfX, r_dfY, c_dfY, e_dfY, b_dfY

CV = 5
trainset = '../pkl_trainsets/2jul2018/22jul2018_trainset3_nucs_fissact_not-scaled.pkl'
trainXY = pd.read_pickle(trainset)
trainXY = trainXY.sample(frac=0.6)
trainX, rY, cY, eY, bY = splitXY(trainXY)
trainX = scale(trainX)

In [23]:
def make_sparser(array, min_x):
    array = np.asarray(array)
    array[array < min_x] = 0
    return array

In [24]:
baseX = trainX
mbyn = np.prod(baseX.shape)

In [25]:
def get_inits(Y):
    CV = 5
    
    trainY = pd.Series()
    # get param names and set ground truth
    if Y == 'c':
        trainY = cY
        parameter = 'cooling'
        k = 5 #3, 7
        depth = 50 #50, 12
        feats = 25 #36, 47 
        g = 0.06 #0.2
        c = 50000 #200, 75000
    elif Y == 'e': 
        trainY = eY
        parameter = 'enrichment'
        k = 5 #7, 8
        depth = 50 #53, 38
        feats = 25 #33, 16 
        g = 0.8 #0.2
        c = 25000 #420
    elif Y == 'b':
        # burnup needs much less training data...this is 24% of data set
        #trainXY = trainXY.sample(frac=0.4)
        #trainX, rY, cY, eY, bY = splitXY(trainXY)
        #trainX = scale(trainX)
        trainY = bY
        parameter = 'burnup'
        k = 5 #4, 7
        depth = 50 #50, 78
        feats = 25 #23, 42 
        g = 0.025 #0.025
        c = 42000 #105
    else:
        trainY = rY
        parameter = 'reactor'
        k = 1 #1, 2, or 12
        depth = 50 #50, 97
        feats = 25 # 37, 37 
        g = 0.07 #0.2
        c = 10000 #220
        
    csv_name = 'trainset3_fissact_m60_sparsity_' + parameter
        
    # initialize learners
    score = 'explained_variance'
    kfold = KFold(n_splits=CV, shuffle=True)
    knn_init = KNeighborsRegressor(n_neighbors=k, weights='distance')
    dtr_init = DecisionTreeRegressor(max_depth=depth, max_features=feats)
    svr_init = SVR(gamma=g, C=c)
    if Y is 'r':
        score = 'accuracy'
        kfold = StratifiedKFold(n_splits=CV, shuffle=True)
        knn_init = KNeighborsClassifier(n_neighbors=k, weights='distance')
        dtr_init = DecisionTreeClassifier(max_depth=depth, max_features=feats, class_weight='balanced')
        svr_init = SVC(gamma=g, C=c, class_weight='balanced')
    
    scores = ['explained_variance', 'neg_mean_absolute_error']
    if Y is 'r':
        scores = ['accuracy', ]
    
    return trainY, knn_init, dtr_init, svr_init, scores, kfold, CV, csv_name

In [26]:
thresholds = [-2, -1.5, -1.2, -1, -0.8, -0.6, -0.4, -0.2, -0.1, 0, 0.3, 0.6, 0.9, 1, 1.5]
for pred in ('e',):#'r', 'b', 'e', 'c'):
    Y, alg1, alg2, alg3, scores, kfold, CV, csv_name = get_inits(pred)
    all_results = pd.DataFrame()
    for t in thresholds:
        sparser = make_sparser(baseX, t)
        nonzeros = np.count_nonzero(sparser)
        sparsity = 1 - nonzeros/mbyn
        X = pd.DataFrame(sparser)
        
        print('Learning and prediction underway: ' + pred + ' at sparsity ' + str(sparsity))
    
        cv_scr = cross_validate(alg1, X, Y, scoring=scores, cv=CV,
                                return_train_score=False, n_jobs=-1)
        df1 = pd.DataFrame(cv_scr)
        df1['Algorithm'] = 'knn'
        
        cv_scr = cross_validate(alg2, X, Y, scoring=scores, cv=CV,
                                return_train_score=False, n_jobs=-1)
        df2 = pd.DataFrame(cv_scr)
        df2['Algorithm'] = 'dtree'
    
        cv_scr = cross_validate(alg3, X, Y, scoring=scores, cv=CV,
                                return_train_score=False, n_jobs=-1)
        df3 = pd.DataFrame(cv_scr)
        df3['Algorithm'] = 'svr'
    
        cv_results = [df1, df2, df3]
        df = pd.concat(cv_results)
        df['Sparsity'] = sparsity
        
        all_results = all_results.append(df, ignore_index=True)
    
    all_results.to_csv(csv_name + '.csv')


Learning and prediction underway: e at sparsity 0.0024913510645449
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-26-4130b865d671> in <module>()
     22 
     23         cv_scr = cross_validate(alg3, X, Y, scoring=scores, cv=CV,
---> 24                                 return_train_score=False, n_jobs=-1)
     25         df3 = pd.DataFrame(cv_scr)
     26         df3['Algorithm'] = 'svr'

~/.local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
    204             fit_params, return_train_score=return_train_score,
    205             return_times=True)
--> 206         for train, test in cv.split(X, y, groups))
    207 
    208     if return_train_score:

~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    787                 # consumption.
    788                 self._iterating = False
--> 789             self.retrieve()
    790             # Make sure that we get a last message telling us we are done
    791             elapsed_time = time.time() - self._start_time

~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in retrieve(self)
    697             try:
    698                 if getattr(self._backend, 'supports_timeout', False):
--> 699                     self._output.extend(job.get(timeout=self.timeout))
    700                 else:
    701                     self._output.extend(job.get())

/usr/local/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
    636 
    637     def get(self, timeout=None):
--> 638         self.wait(timeout)
    639         if not self.ready():
    640             raise TimeoutError

/usr/local/lib/python3.6/multiprocessing/pool.py in wait(self, timeout)
    633 
    634     def wait(self, timeout=None):
--> 635         self._event.wait(timeout)
    636 
    637     def get(self, timeout=None):

/usr/local/lib/python3.6/threading.py in wait(self, timeout)
    549             signaled = self._flag
    550             if not signaled:
--> 551                 signaled = self._cond.wait(timeout)
    552             return signaled
    553 

/usr/local/lib/python3.6/threading.py in wait(self, timeout)
    293         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    294             if timeout is None:
--> 295                 waiter.acquire()
    296                 gotit = True
    297             else:

KeyboardInterrupt: 

In [ ]:


In [ ]:


In [ ]: