notebook.community

Edit and run



In [3]:

    
import pandas as pd
import numpy as np
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from scikitplot import classifier_factory



In [4]:

    
labels = pd.read_csv('./labels.csv', index_col=0)
pts = pd.read_csv('./pts.csv', index_col=0)



In [3]:

    
pts.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 871535 entries, 0 to 871534
Data columns (total 83 columns):
system_id                               871535 non-null int64
HPL_Tflops                              871535 non-null float64
StarDGEMM_Gflops                        871535 non-null float64
SingleDGEMM_Gflops                      871535 non-null float64
PTRANS_GBs                              871535 non-null float64
MPIRandomAccess_LCG_GUPs                871535 non-null float64
MPIRandomAccess_GUPs                    871535 non-null float64
StarRandomAccess_LCG_GUPs               871535 non-null float64
SingleRandomAccess_LCG_GUPs             871535 non-null float64
StarRandomAccess_GUPs                   871535 non-null float64
SingleRandomAccess_GUPs                 871535 non-null float64
StarSTREAM_Copy                         871535 non-null float64
StarSTREAM_Scale                        871535 non-null float64
StarSTREAM_Add                          871535 non-null float64
StarSTREAM_Triad                        871535 non-null float64
SingleSTREAM_Copy                       871535 non-null float64
SingleSTREAM_Scale                      871535 non-null float64
SingleSTREAM_Add                        871535 non-null float64
SingleSTREAM_Triad                      871535 non-null float64
StarFFT_Gflops                          871535 non-null float64
SingleFFT_Gflops                        871535 non-null float64
MPIFFT_Gflops                           871535 non-null float64
MaxPingPongLatency_usec                 871535 non-null float64
RandomlyOrderedRingLatency_usec         871535 non-null float64
MinPingPongBandwidth_GBytes             871535 non-null float64
NaturallyOrderedRingBandwidth_GBytes    871535 non-null float64
RandomlyOrderedRingBandwidth_GBytes     871535 non-null float64
MinPingPongLatency_usec                 871535 non-null float64
AvgPingPongLatency_usec                 871535 non-null float64
MaxPingPongBandwidth_GBytes             871535 non-null float64
AvgPingPongBandwidth_GBytes             871535 non-null float64
NaturallyOrderedRingLatency_usec        871535 non-null float64
MemProc                                 871535 non-null int64
core_count                              871535 non-null int64
cpu_freq                                871535 non-null int64
bogo_mips                               871535 non-null float64
l1_cache                                871535 non-null int64
l2_cache                                871535 non-null int64
l3_cache                                871535 non-null int64
memory_size                             871535 non-null int64
memory_freq                             871535 non-null int64
memory_type                             871535 non-null int64
rows                                    871535 non-null int64
cols                                    871535 non-null int64
min_nnz_row                             871535 non-null int64
row_var                                 871535 non-null float64
col_var                                 871535 non-null float64
diag_var                                871535 non-null float64
nnz                                     871535 non-null int64
frob_norm                               871535 non-null float64
symm_frob_norm                          871535 non-null float64
antisymm_frob_norm                      871535 non-null float64
one_norm                                871535 non-null float64
inf_norm                                871535 non-null float64
symm_inf_norm                           871535 non-null float64
antisymm_inf_norm                       871535 non-null float64
max_nnz_row                             871535 non-null int64
trace                                   871535 non-null float64
abs_trace                               871535 non-null float64
min_nnz_row.1                           871535 non-null int64
avg_nnz_row                             871535 non-null int64
dummy_rows                              871535 non-null int64
dummy_rows_kind                         871535 non-null int64
num_value_symm_1                        871535 non-null int64
nnz_pattern_symm_1                      871535 non-null int64
num_value_symm_2                        871535 non-null float64
nnz_pattern_symm_2                      871535 non-null float64
row_diag_dom                            871535 non-null int64
col_diag_dom                            871535 non-null int64
diag_avg                                871535 non-null float64
diag_sign                               871535 non-null int64
diag_nnz                                871535 non-null int64
lower_bw                                871535 non-null int64
upper_bw                                871535 non-null int64
row_log_val_spread                      871535 non-null float64
col_log_val_spread                      871535 non-null float64
symm                                    871535 non-null int64
matrix_id                               871535 non-null int64
np                                      871535 non-null int64
time                                    871535 non-null float64
solver_id                               871535 non-null int64
prec_id                                 871535 non-null int64
status_id                               871535 non-null int64
dtypes: float64(50), int64(33)
memory usage: 558.5 MB



In [5]:

    
all_data = pts.drop([
'matrix_id',
'system_id',
'time',
'matrix_id',
'status_id'], axis=1)



In [ ]:

    
# plain_data = pts.drop([
# 'matrix_id', 
# 'status_id', 
# 'time', 
# 'HPL_Tflops',        
# 'StarDGEMM_Gflops',                        
# 'SingleDGEMM_Gflops',                      
# 'PTRANS_GBs',                              
# 'MPIRandomAccess_LCG_GUPs',                
# 'MPIRandomAccess_GUPs',                    
# 'StarRandomAccess_LCG_GUPs',               
# 'SingleRandomAccess_LCG_GUPs',             
# 'StarRandomAccess_GUPs',                   
# 'SingleRandomAccess_GUPs',                 
# 'StarSTREAM_Copy',                         
# 'StarSTREAM_Scale',                        
# 'StarSTREAM_Add',                          
# 'StarSTREAM_Triad',                        
# 'SingleSTREAM_Copy',                       
# 'SingleSTREAM_Scale',                      
# 'SingleSTREAM_Add',                        
# 'SingleSTREAM_Triad',                      
# 'StarFFT_Gflops',                          
# 'SingleFFT_Gflops',                        
# 'MPIFFT_Gflops',                           
# 'MaxPingPongLatency_usec',                 
# 'RandomlyOrderedRingLatency_usec',         
# 'MinPingPongBandwidth_GBytes',             
# 'NaturallyOrderedRingBandwidth_GBytes',    
# 'RandomlyOrderedRingBandwidth_GBytes',     
# 'MinPingPongLatency_usec',                 
# 'AvgPingPongLatency_usec',                 
# 'MaxPingPongBandwidth_GBytes',             
# 'AvgPingPongBandwidth_GBytes',             
# 'NaturallyOrderedRingLatency_usec',        
# 'MemProc',                                
# 'core_count',                              
# 'cpu_freq',                                
# 'bogo_mips',                              
# 'l1_cache',                                
# 'l2_cache',                                
# 'l3_cache',                                
# 'memory_size',                             
# 'memory_freq',                             
# 'memory_type'], axis=1)



In [ ]:

    
all_data.info()



In [ ]:

    
summit_only_data = plain_data[plain_data['system_id'] == 3]
summit_only_labels = labels[plain_data['system_id'] == 3]



In [ ]:

    
summit_only_data.info()
summit_only_labels.info()



In [6]:

    
X = all_data.as_matrix()
y = labels.as_matrix()



In [ ]:

    
y = y[:,0]
y.transpose()



In [7]:

    
from sklearn.externals import joblib
classifier = RandomForestClassifier()
classifier = classifier.fit(X,y)
joblib.dump(classifier, 'all_data_classifier.pkl')









    Out[7]:





['all_data_classifier.pkl']



In [10]:

    
y.shape









    Out[10]:





(871535, 18)



In [15]:

    
classifiers_list = []
for i in range(0,18):
    classifier = RandomForestClassifier()
    classifier.fit(X,y[:,i])
    #classifiers_list.append(classifier)
    classifier_factory(classifier)
    classifier.plot_confusion_matrix(X, y[:,i], 
                                 cv=10,
                                 title=str(i),
                                 shuffle=True,
                                    figsize=[8,8]
                                 )
    plt.show()
#joblib.dump(classifiers_list, 'individual_label_classifiers.pkl')



In [16]:

    
classifiers_list = []
for i in range(0,18):
    classifier = RandomForestClassifier()
    classifier.fit(X,y[:,i])
    #classifiers_list.append(classifier)
    classifier_factory(classifier)
    classifier.plot_precision_recall_curve(X, y[:,i], 
                                 cv=10,
                                 title=str(i),
                                 shuffle=True,
                                    figsize=[8,8]
                                 )
    plt.show()
#joblib.dump(classifiers_list, 'individual_label_classifiers.pkl')



In [17]:

    
classifiers_list = []
for i in range(0,18):
    classifier = RandomForestClassifier()
    classifier.fit(X,y[:,i])
    #classifiers_list.append(classifier)
    classifier_factory(classifier)
    classifier.plot_learning_curve(X, y[:,i], 
                                 cv=10,
                                 title=str(i),
                                 shuffle=True,
                                    figsize=[8,8]
                                 )
    plt.show()
#joblib.dump(classifiers_list, 'individual_label_classifiers.pkl')









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-17-69caa335816b> in <module>()
      9                                  title=str(i),
     10                                  shuffle=True,
---> 11                                     figsize=[8,8]
     12                                  )
     13     plt.show()

TypeError: plot_learning_curve() got an unexpected keyword argument 'shuffle'



In [ ]:

    
classifiers_list = []
for i in range(0,18):
    classifier = RandomForestClassifier()
    classifier.fit(X,y[:,i])
    #classifiers_list.append(classifier)
    classifier_factory(classifier)
    classifier.plot_feature_importances(X, y[:,i], 
                                 cv=10,
                                 title=str(i),
                                 shuffle=True,
                                    figsize=[8,8]
                                 )
    plt.show()
#joblib.dump(classifiers_list, 'individual_label_classifiers.pkl')



In [ ]:

    
classifiers_list = []
for i in range(0,18):
    classifier = RandomForestClassifier()
    classifier.fit(X,y[:,i])
    #classifiers_list.append(classifier)
    classifier_factory(classifier)
    classifier.plot_ks_statistic(X, y[:,i], 
                                 cv=10,
                                 title=str(i),
                                 shuffle=True,
                                    figsize=[8,8]
                                 )
    plt.show()
#joblib.dump(classifiers_list, 'individual_label_classifiers.pkl')



In [7]:

    
classifiers_list = joblib.load('individual_label_classifiers.pkl')



In [11]:

    
for i in range (0,18):
    cur_y = y[:,i]
    pred_y = classifiers_list[i].pred
    #     classifiers_list[i].plot_confusion_matrix(X, y[:,i],
    #                                      cv=10,
    #                                      title=str(i),
    #                                      shuffle=True)
    plt.show()









    



/usr/local/lib/python3.5/dist-packages/scikitplot/classifiers.py:51: UserWarning: "plot_feature_importances" method already in clf. Overriding anyway. This may result in unintended behavior.
  'Overriding anyway. This may result in unintended behavior.'.format(key))
/usr/local/lib/python3.5/dist-packages/scikitplot/classifiers.py:51: UserWarning: "plot_confusion_matrix" method already in clf. Overriding anyway. This may result in unintended behavior.
  'Overriding anyway. This may result in unintended behavior.'.format(key))
/usr/local/lib/python3.5/dist-packages/scikitplot/classifiers.py:51: UserWarning: "plot_learning_curve" method already in clf. Overriding anyway. This may result in unintended behavior.
  'Overriding anyway. This may result in unintended behavior.'.format(key))
/usr/local/lib/python3.5/dist-packages/scikitplot/classifiers.py:51: UserWarning: "plot_precision_recall_curve" method already in clf. Overriding anyway. This may result in unintended behavior.
  'Overriding anyway. This may result in unintended behavior.'.format(key))
/usr/local/lib/python3.5/dist-packages/scikitplot/classifiers.py:51: UserWarning: "plot_roc_curve" method already in clf. Overriding anyway. This may result in unintended behavior.
  'Overriding anyway. This may result in unintended behavior.'.format(key))
/usr/local/lib/python3.5/dist-packages/scikitplot/classifiers.py:51: UserWarning: "plot_ks_statistic" method already in clf. Overriding anyway. This may result in unintended behavior.
  'Overriding anyway. This may result in unintended behavior.'.format(key))






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-11-de5661cbfe07> in <module>()
      4                                      cv=10,
      5                                      title=str(i),
----> 6                                      shuffle=True)
      7     plt.show()

/usr/local/lib/python3.5/dist-packages/scikitplot/classifiers.py in plot_confusion_matrix(clf, X, y, labels, title, normalize, x_tick_rotation, do_cv, cv, shuffle, random_state, ax, figsize, title_fontsize, text_fontsize)
    154             X_train, X_test = X[train_index], X[test_index]
    155             y_train, y_test = y[train_index], y[test_index]
--> 156             clf_clone.fit(X_train, y_train)
    157             preds = clf_clone.predict(X_test)
    158             preds_list.append(preds)

/usr/local/lib/python3.5/dist-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
    324                     t, self, X, y, sample_weight, i, len(trees),
    325                     verbose=self.verbose, class_weight=self.class_weight)
--> 326                 for i, t in enumerate(trees))
    327 
    328             # Collect newly grown trees

/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
    324         # Don't delay the application, to avoid keeping the input
    325         # arguments in memory
--> 326         self.results = batch()
    327 
    328     def get(self):

/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/usr/local/lib/python3.5/dist-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

/usr/local/lib/python3.5/dist-packages/sklearn/ensemble/forest.py in _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose, class_weight)
    118             curr_sample_weight *= compute_sample_weight('balanced', y, indices)
    119 
--> 120         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    121     else:
    122         tree.fit(X, y, sample_weight=sample_weight, check_input=False)

/usr/local/lib/python3.5/dist-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    737             sample_weight=sample_weight,
    738             check_input=check_input,
--> 739             X_idx_sorted=X_idx_sorted)
    740         return self
    741 

/usr/local/lib/python3.5/dist-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    348                                            self.min_impurity_split)
    349 
--> 350         builder.build(self.tree_, X, y, sample_weight, X_idx_sorted)
    351 
    352         if self.n_outputs_ == 1:

KeyboardInterrupt:



In [6]:

    
classifier.plot_precision_recall_curve(X,y[:,0],
                                       shuffle=True,
                                       cv=10,
                                       curves='each_class')
plt.show()









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-6679b8061e3e> in <module>()
----> 1 classifier.plot_precision_recall_curve(X,y[:,0],
      2                                        shuffle=True,
      3                                        cv=10,
      4                                        curves='each_class')
      5 plt.show()

NameError: name 'classifier' is not defined



In [ ]:



In [ ]:

    
classifier = RandomForestClassifier()
classifier_factory(classifier)
sss = StratifiedShuffleSplit()
y_preds = []
y_tests = []
i = 0
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    classifier.fit(X_train, y_train)
    classifier.plot_roc_curve(X_test,y_test)
    plt.show()
    y_preds.append(classifier.predict(X_test))
    y_tests.append(y_test)
    i+=1



In [ ]:

    
j = i
for i in range(0,j):
    y_preds[i] = y_preds[i].astype(int)
y_preds



In [ ]:

    
from sklearn.metrics import classification_report, confusion_matrix
for i in range(0,10):
    print(classification_report(y_tests[i],y_preds[i]))



In [ ]:

    
from sklearn.metrics import classification_report, confusion_matrix
for i in range(0,10):
    print(classification_report(y_tests[i],y_preds[i]))



In [ ]:

    
from sklearn.model_selection import KFold
classifier = RandomForestClassifier()
kf = KFold(n_splits=10)
y_preds = []
y_tests = []
i = 0
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    classifier.fit(X_train, y_train)
    y_preds.append(classifier.predict(X_test))
    y_tests.append(y_test)
    i+=1



In [ ]:

    
from sklearn.metrics import classification_report, confusion_matrix
for i in range(0,10):
    print(classification_report(y_tests[i],y_preds[i]))



In [ ]:

    
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
print(coverage_error(y_tests[0], y_preds[0]))
print(coverage_error(y_tests[1], y_preds[0]))
print(coverage_error(y_tests[0], y_preds[0]))



In [ ]:

    
y_pred = classifier.predict(X)
y_pred = y_pred.astype(int)
y_pred



In [ ]:

    
y_pred_prob = classifier.predict_proba(X)
y_pred_prob



In [ ]:

    
print("Predicted number of instances:\t",
sum(y_pred[:,0]),
sum(y_pred[:,1]),
sum(y_pred[:,2]),
sum(y_pred[:,3]),
sum(y_pred[:,4]),
sum(y_pred[:,5]))

print("Predicted probabilities of instances:\n",
sum(y_pred_prob[0]), '\n',
sum(y_pred_prob[1]),'\n',
sum(y_pred_prob[2]),'\n',
sum(y_pred_prob[3]),'\n',
sum(y_pred_prob[4]),'\n',
sum(y_pred_prob[5]))

print("Actual number of instances:\t",
sum(y[:,0]),
sum(y[:,1]),
sum(y[:,2]),
sum(y[:,3]),
sum(y[:,4]),
sum(y[:,5]))



In [ ]:

    
classifier.score(X,y)



In [ ]:

    
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss
print(coverage_error(y, results_y))
print(label_ranking_average_precision_score(y, results_y))
print(label_ranking_loss(y, results_y))



In [ ]:

    
from sklearn.metrics import confusion_matrix, f1_score, fbeta_score
print(f1_score(y, y_pred, average=None))
print(f1_score(y, y_pred, average='micro'))
print(f1_score(y, y_pred, average='macro'))
print(f1_score(y, y_pred, average='samples'))
print(f1_score(y, y_pred, average='weighted'))
scores = fbeta_score(y, y_pred, beta=0.5, average=None)



In [ ]:

    
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y,y_pred))
print(confusion_matrix(y[:,0],y_pred[:,0]))



In [ ]:

    
from sklearn.metrics import roc_auc_score, roc_curve
print(roc_auc_score(y, y_pred))



In [ ]:

    
from sklearn.metrics import zero_one_loss
print(zero_one_loss(y,y_pred)) # Fraction of misclassifications
print(zero_one_loss(y,y_pred,normalize=False)) # num of misclassifications