In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # needed for the plt.title()/plt.show() calls below
# Note: sklearn.externals.joblib, RandomizedLasso, scikitplot.plotters and
# classifier_factory come from older scikit-learn / scikit-plot releases and
# have been removed from current versions.
from sklearn.externals import joblib
import scikitplot.plotters as skplt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import imblearn.pipeline as pl
from scikitplot import classifier_factory
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import RandomizedLasso
from sklearn.utils import class_weight
import warnings; warnings.simplefilter('ignore')
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
In [11]:
labels = pd.read_csv('./labels2.csv', index_col=0)
pts = pd.read_csv('./pts.csv', index_col=0)
In [ ]:
# For determining Ch 5 features
summit_data = pts[pts['system_id'] == 3]
summit_labels = labels[pts['system_id'] == 3]
summit_data = summit_data.drop([
'min_nnz_row.1',
'system_id',
'matrix_id',
'status_id',
'time',
'HPL_Tflops',
'StarDGEMM_Gflops',
'SingleDGEMM_Gflops',
'PTRANS_GBs',
'MPIRandomAccess_LCG_GUPs',
'MPIRandomAccess_GUPs',
'StarRandomAccess_LCG_GUPs',
'SingleRandomAccess_LCG_GUPs',
'StarRandomAccess_GUPs',
'SingleRandomAccess_GUPs',
'StarSTREAM_Copy',
'StarSTREAM_Scale',
'StarSTREAM_Add',
'StarSTREAM_Triad',
'SingleSTREAM_Copy',
'SingleSTREAM_Scale',
'SingleSTREAM_Add',
'SingleSTREAM_Triad',
'StarFFT_Gflops',
'SingleFFT_Gflops',
'MPIFFT_Gflops',
'MaxPingPongLatency_usec',
'RandomlyOrderedRingLatency_usec',
'MinPingPongBandwidth_GBytes',
'NaturallyOrderedRingBandwidth_GBytes',
'RandomlyOrderedRingBandwidth_GBytes',
'MinPingPongLatency_usec',
'AvgPingPongLatency_usec',
'MaxPingPongBandwidth_GBytes',
'AvgPingPongBandwidth_GBytes',
'NaturallyOrderedRingLatency_usec',
'MemProc',
'core_count',
'cpu_freq',
'bogo_mips',
'l1_cache',
'l2_cache',
'l3_cache',
'memory_size',
'memory_freq',
'memory_type'], axis=1)
summit_data.info()
X = summit_data.as_matrix()
y = summit_labels.as_matrix()
clfLasso = RandomizedLasso()
clfLasso.fit(X,y[:,7])
for i, j in zip(summit_data.columns, clfLasso.scores_):
    print(i, j)
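RandomizedLasso was removed in scikit-learn 0.21, so the ranking cell above only runs against older releases. A minimal sketch of a comparable stability-selection ranking with current scikit-learn, assuming an illustrative alpha, subsample fraction, and resample count, is:
In [ ]:
# Approximate stability selection: fit a Lasso on repeated random subsamples
# and rank features by how often each coefficient is nonzero.
# alpha, sample_frac and n_resamples below are illustrative assumptions.
from sklearn.linear_model import Lasso

def stability_scores(X, y, n_resamples=100, sample_frac=0.75, alpha=0.01):
    rng = np.random.RandomState(0)
    X_scaled = StandardScaler().fit_transform(X)
    counts = np.zeros(X.shape[1])
    n_sub = int(sample_frac * X.shape[0])
    for _ in range(n_resamples):
        idx = rng.choice(X.shape[0], size=n_sub, replace=False)
        lasso = Lasso(alpha=alpha, max_iter=10000).fit(X_scaled[idx], y[idx])
        counts += (lasso.coef_ != 0)
    return counts / n_resamples

# Rank the Summit features for the same target column as above
scores = stability_scores(summit_data.values, summit_labels.values[:, 7])
for name, score in sorted(zip(summit_data.columns, scores),
                          key=lambda t: t[1], reverse=True):
    print(name, score)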
In [ ]:
# For determining Ch 6 features
ch6_data = pts[pts['np'] == 4]
ch6_labels = labels[pts['np'] == 4]
ch6_data = ch6_data.drop([
'min_nnz_row.1',
'np',
'matrix_id',
'status_id',
'system_id',
'time'], axis=1)
ch6_data.info()
X = ch6_data.as_matrix()
y = ch6_labels.as_matrix()
clfLasso = RandomizedLasso()
clfLasso.fit(X,y[:,15]) # The 25% threshold, specific to each np and sys
for i, j in zip(ch6_data.columns, clfLasso.scores_):
    print(i, j)
In [ ]:
# For determining Ch 7 features
ch7_data = pts
ch7_labels = labels
ch7_data = ch7_data.drop([
'min_nnz_row.1',
'matrix_id',
'status_id',
'system_id',
'core_count',
'time'], axis=1)
ch7_data.info()
X = ch7_data.as_matrix()
y = ch7_labels.as_matrix()
clfLasso = RandomizedLasso()
for num in range(0, 18):
    print('\n', num)
    clfLasso.fit(X, y[:, num])  # one ranking per label column (each np/system-specific threshold)
    for i, j in zip(ch7_data.columns, clfLasso.scores_):
        print(i, j)
In [12]:
# Adding in Cavity flow stuff for Ch8 predictions
cavity_flow = pd.read_csv('./cavity_flow_pts.csv',
header=0, index_col=0)
cavity_flow_results = pd.read_csv('./labels_cavity.csv',
header=0, index_col=0)
cavity_flow = cavity_flow.drop([
'matrix_id',
'system_id',
'time',
'min_nnz_row.1',
'status_id'], axis=1)
ch8_data = pts
ch8_labels = labels
ch8_data = ch8_data.drop([
'matrix_id',
'system_id',
'min_nnz_row.1',
'time',
'status_id'], axis=1)
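A quick sanity check, not in the original notebook, that the UF and cavity-flow feature tables expose the same columns in the same order before the cross-dataset runs below:
In [ ]:
# Assumption: the cross-dataset experiments require identical feature
# columns, in the same order, in ch8_data and cavity_flow.
print(ch8_data.shape, cavity_flow.shape)
print('columns only in one table:',
      set(ch8_data.columns) ^ set(cavity_flow.columns))
assert list(ch8_data.columns) == list(cavity_flow.columns)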
In [42]:
# Trains on UF matrices and tests on cavity flow
X = ch8_data.as_matrix()
y = ch8_labels.as_matrix()
print(X.shape, y.shape)
print(cavity_flow.shape, cavity_flow_results.shape)
for i in range(0, 18):
    clf = RandomForestClassifier()
    start_time = time.time()
    clf = clf.fit(X, y[:, i])
    #a = pd.DataFrame(clf.predict(cavity_flow))
    a = clf.predict_proba(cavity_flow)
    #skplt.plot_confusion_matrix(cavity_flow_results.iloc[:,i], a)
    skplt.plot_precision_recall_curve(cavity_flow_results.iloc[:, i], a)
    print(time.time() - start_time)
    plt.title(i)
    plt.show()
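The precision-recall plots above can be paired with a scalar summary. A sketch that reports a per-label ROC AUC from the same probabilities, assuming each label column is binary, is:
In [ ]:
# Per-label ROC AUC for the UF-to-cavity-flow transfer, skipping any
# label whose training or test column contains a single class.
for i in range(0, 18):
    clf = RandomForestClassifier().fit(X, y[:, i])
    y_true = cavity_flow_results.iloc[:, i]
    if len(clf.classes_) < 2 or y_true.nunique() < 2:
        print(i, 'skipped: single-class label')
        continue
    proba = clf.predict_proba(cavity_flow)[:, 1]
    print(i, 'ROC AUC: %.3f' % roc_auc_score(y_true, proba))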
In [63]:
# Trains on UF matrices + half cavity flow and tests
# on the other half of cavity flow
# Adding in Cavity flow stuff for Ch8 predictions
X_train = pd.read_csv('./pts+first_half_cavity.csv',
header=0, index_col=0)
y_train = pd.read_csv('./labels2+first_half_cavity.csv',
header=0, index_col=0)
X_test = pd.read_csv('./cavity_flow_pts_second_half.csv',
header=0, index_col=0)
y_test = pd.read_csv('./labels_cavity_second_half.csv',
header=0, index_col=0)
X_train = X_train.drop([
'matrix_id',
'system_id',
'time',
'min_nnz_row.1',
'status_id'], axis=1)
X_test = X_test.drop([
'matrix_id',
'system_id',
'time',
'min_nnz_row.1',
'status_id'], axis=1)
for i in range(0, 18):
    clf = RandomForestClassifier()
    start_time = time.time()
    clf = clf.fit(X_train.as_matrix(), y_train.as_matrix()[:, i])
    #a = pd.DataFrame(clf.predict(X_test))
    a = clf.predict_proba(X_test)
    #skplt.plot_confusion_matrix(y_test.as_matrix()[:,i], a)
    #skplt.plot_precision_recall_curve(y_test.iloc[:,i], a)
    skplt.plot_roc_curve(y_test.iloc[:, i], a)
    print(time.time() - start_time)
    plt.title(i)
    plt.show()
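The pre-built CSVs above are not reproduced here. A hedged in-memory reconstruction of the same split, assuming the files were produced by taking the first and second halves of the cavity-flow rows, is:
In [ ]:
# Assumed reconstruction of the train/test split used above: UF features
# plus the first half of the cavity-flow rows for training, the remaining
# cavity-flow rows for testing.
half = len(cavity_flow) // 2
X_train_alt = pd.concat([ch8_data, cavity_flow.iloc[:half]], ignore_index=True)
y_train_alt = pd.concat([ch8_labels, cavity_flow_results.iloc[:half]], ignore_index=True)
X_test_alt = cavity_flow.iloc[half:]
y_test_alt = cavity_flow_results.iloc[half:]
print(X_train_alt.shape, y_train_alt.shape, X_test_alt.shape, y_test_alt.shape)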
In [57]:
# Trains on cavity data and tests on UF matrices
X = cavity_flow.as_matrix()
y = cavity_flow_results.as_matrix()
for i in range(0, 18):
    clf = RandomForestClassifier()
    start_time = time.time()
    clf = clf.fit(X, y[:, i])
    #a = pd.DataFrame(clf.predict(ch8_data))
    a = clf.predict_proba(ch8_data)
    #skplt.plot_confusion_matrix(ch8_labels.iloc[:,i], a)
    skplt.plot_roc_curve(ch8_labels.iloc[:, i], a)
    #skplt.plot_precision_recall_curve(ch8_labels.iloc[:,i], a)
    print(time.time() - start_time)
    plt.title(i)
    plt.show()
In [43]:
# Trains and tests on cavity flow data
X = cavity_flow.as_matrix()
y = cavity_flow_results.as_matrix()
for i in range(0, 18):
    clf = RandomForestClassifier()
    classifier_factory(clf)  # old scikit-plot API: attaches cross-validated plot_* methods to clf
    start_time = time.time()
    #clf.plot_confusion_matrix(X, y[:,i])
    clf.plot_roc_curve(X, y[:, i])
    #clf.plot_precision_recall_curve(X,y[:,i])
    print(time.time() - start_time)
    plt.title(i)
    plt.show()
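classifier_factory and scikitplot.plotters were removed in scikit-plot 0.3. A rough equivalent of the cell above using plain cross-validated probabilities plus the current scikitplot.metrics API (an assumption about the installed version) would be:
In [ ]:
# Cross-validated ROC curves on the cavity-flow data with current APIs.
from sklearn.model_selection import cross_val_predict
from scikitplot.metrics import plot_roc

for i in range(0, 18):
    clf = RandomForestClassifier()
    # 3-fold cross-validated class probabilities over the cavity-flow data
    probas = cross_val_predict(clf, X, y[:, i], cv=3, method='predict_proba')
    plot_roc(y[:, i], probas)
    plt.title(i)
    plt.show()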
In [5]:
# Ch8 Feature ranking for Cavity flow only
for r in range(0, 18):
    print(str(r) + '\n')
    clfLasso = RandomizedLasso()
    clfLasso.fit(cavity_flow, cavity_flow_results.iloc[:, r])
    zipped = sorted(zip(cavity_flow.columns, clfLasso.scores_),
                    key=lambda x: x[1], reverse=True)
    for i, j in zipped:
        print(i, j)
In [33]:
# Trains on UF matrices w/ random over sampling
# and tests on cavity_flow data
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
for i in range(0, 18):
    new_x, new_y = ros.fit_sample(ch8_data.as_matrix(),
                                  ch8_labels.iloc[:, i])
    clf = RandomForestClassifier()
    classifier_factory(clf)
    start_time = time.time()
    clf.fit(new_x, new_y)
    a = pd.DataFrame(clf.predict(cavity_flow))
    skplt.plot_confusion_matrix(cavity_flow_results.iloc[:, i], a)
    print(time.time() - start_time)
    plt.title(i)
    plt.show()
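As a variant of the oversampling loop above, the RandomOverSampler and the forest can be combined in an imblearn Pipeline (imported earlier as pl), so resampling happens inside fit and never touches the test rows. A minimal sketch:
In [ ]:
# Oversample-then-classify as a single imblearn pipeline; the cavity-flow
# test rows pass through untouched at predict time.
for i in range(0, 18):
    pipe = pl.Pipeline([
        ('oversample', RandomOverSampler()),
        ('forest', RandomForestClassifier()),
    ])
    pipe.fit(ch8_data.as_matrix(), ch8_labels.iloc[:, i])
    a = pd.DataFrame(pipe.predict(cavity_flow))
    skplt.plot_confusion_matrix(cavity_flow_results.iloc[:, i], a)
    plt.title(i)
    plt.show()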