In [2]:
import pandas as pd
import numpy as np
# Matrix property features: first row is the header, first column the index.
properties_path = '../matrix_properties/processed_properties.csv'
properties = pd.read_csv(properties_path, header=0, index_col=0)
properties
Out[2]:
In [3]:
# One processed timing file per HPC system; stack them into a single frame.
time_files = ['../processed_timings/system_specific/bridges_all_np_timings_processed.csv',
'../processed_timings/system_specific/comet_all_np_timings_processed.csv',
'../processed_timings/system_specific/laptop_all_np_timings_processed.csv',
'../processed_timings/system_specific/summit_all_np_timings_processed.csv',
'../processed_timings/system_specific/stampede_all_np_timings_processed.csv']
times_array = [pd.read_csv(path, header=0, index_col=0) for path in time_files]
combined_times = pd.concat(times_array)
# Drop per-run metadata columns that are not needed for label construction,
# then collapse rows that became exact duplicates.
run_metadata_cols = ['system', 'solver', 'prec', 'status',
                     'new_time', 'good_or_bad', 'resid', 'iters']
combined_times = combined_times.drop(labels=run_metadata_cols, axis=1)
combined_times = combined_times.drop_duplicates()
combined_times
Out[3]:
In [4]:
# System hardware descriptions keyed by integer system_id; the free-text
# 'system' name column is dropped (system_id is the join key used later).
systems_info = pd.read_csv('../systems_info/systems_info.csv')
systems_info['system_id'] = systems_info['system_id'].astype(int)
systems_info = systems_info.drop(labels=['system'], axis=1)
systems_info
Out[4]:
In [15]:
# Join matrix properties with timings (on matrix_id), then attach system
# hardware info (on system_id); inner joins drop unmatched rows.
# matrix_x / matrix_y are the suffixed duplicate 'matrix' columns created
# by the first merge, so they carry no extra information.
pts = (
    systems_info
    .merge(properties.merge(combined_times, on='matrix_id'), on='system_id')
    .dropna()
    .drop(labels=['matrix_y', 'matrix_x'], axis=1)
)
pts.to_csv('./pts.csv')
In [6]:
# Makes label for error
lbl_error = []
i = 0
for row in pts.itertuples():
if getattr(row, "status_id") == -1:
lbl_error.append(1)
else:
lbl_error.append(0)
lbl_error
Out[6]:
In [7]:
# Persist the error labels (one row per pts row) for later model training.
df = pd.DataFrame(lbl_error)
df.to_csv('lbl_error.csv')
In [8]:
# Number of runs labeled as errored.
sum(lbl_error)
Out[8]:
In [9]:
# Makes label for convergence
# Binary label: 1 when the run converged (status_id == 1), else 0.
# Vectorized comparison replaces the original row-by-row itertuples loop
# and drops its unused counter variable; the result is the same Python
# list of ints, one entry per pts row.
lbl_converged = (pts['status_id'] == 1).astype(int).tolist()
lbl_converged
Out[9]:
In [10]:
# Persist the convergence labels for later model training.
df = pd.DataFrame(lbl_converged)
df.to_csv('lbl_converged.csv')
In [11]:
# Number of runs labeled as converged.
sum(lbl_converged)
Out[11]:
In [12]:
# Fastest observed time for each (matrix, system, np, status) combination.
# GroupBy.min() replaces aggregate(np.min): identical result, and it avoids
# the modern-pandas deprecation warning for passing a NumPy reduction
# function to aggregate().
grouped = pts.groupby(['matrix_id', 'system_id', 'np', 'status_id'])
best_times = grouped['time'].min()
best_times
Out[12]:
In [ ]:
# Finds the best times for each specific combination of np and sys
lbl_best_at_specific_sys_and_np = []
i = 0
for row in pts.itertuples():
cur_matrix_id = getattr(row, 'matrix_id')
cur_system_id = getattr(row, 'system_id')
cur_np = getattr(row, 'np')
cur_status_id = getattr(row, 'status_id')
try:
if best_times[cur_matrix_id][cur_system_id][cur_np][1] == getattr(row, 'time'):
lbl_best_at_specific_sys_and_np.append(1)
else:
lbl_best_at_specific_sys_and_np.append(0)
except KeyError:
lbl_best_at_specific_sys_and_np.append(0)
lbl_best_at_specific_sys_and_np
In [ ]:
# Persist the per-(system, np) best-time labels.
df = pd.DataFrame(lbl_best_at_specific_sys_and_np)
df.to_csv('lbl_best_at_specific_sys_and_np.csv')
In [ ]:
# Number of rows flagged as best for their (matrix, system, np) combination.
np.sum(lbl_best_at_specific_sys_and_np)
In [ ]:
# Fastest observed time per (matrix, np, status), pooled across systems.
# GroupBy.min() replaces aggregate(np.min): identical result, no
# deprecation warning in modern pandas.
grouped = pts.groupby(['matrix_id', 'np', 'status_id'])
best_times = grouped['time'].min()
best_times
In [ ]:
# Finds the best times for each specific np, regardless of system
# Label: 1 if this row's time equals the best converged (status_id == 1)
# time for its (matrix, np) combination across all systems, else 0.
# Combinations with no converged run raise KeyError and are labeled 0.
# Changes vs. original: unused counter and unused cur_status_id removed,
# and the chained best_times[a][b][1] lookup is replaced by a single .loc
# tuple lookup on the MultiIndex.
lbl_best_at_specific_np = []
for row in pts.itertuples():
    key = (getattr(row, 'matrix_id'), getattr(row, 'np'), 1)  # 1 == converged
    try:
        best = best_times.loc[key]
    except KeyError:
        # No converged run recorded for this combination.
        lbl_best_at_specific_np.append(0)
        continue
    lbl_best_at_specific_np.append(1 if best == getattr(row, 'time') else 0)
lbl_best_at_specific_np
In [ ]:
# Persist the per-np best-time labels.
df = pd.DataFrame(lbl_best_at_specific_np)
df.to_csv('lbl_best_at_specific_np.csv')
In [ ]:
# Number of rows flagged as best for their (matrix, np) combination.
np.sum(lbl_best_at_specific_np)
In [ ]:
# Non-null counts per column. The original `pts.count` was missing the
# call parentheses, so it displayed the bound method object instead of
# the counts.
pts.count()
In [ ]:
# Fastest time per (matrix, system, status), pooled across np counts.
# GroupBy.min() replaces aggregate(np.min) (identical result, avoids the
# modern-pandas deprecation warning), and count() gains the call
# parentheses the original omitted (it displayed the bound method).
grouped = pts.groupby(['matrix_id', 'system_id', 'status_id'])
best_times = grouped['time'].min()
best_times.count()
In [ ]:
# Finds the best times for each specific system, regardless of np
# Label: 1 if this row's time equals the best converged (status_id == 1)
# time for its (matrix, system) pair across all np counts, else 0.
# Pairs with no converged run raise KeyError and are labeled 0.
# Changes vs. original: unused counter and unused cur_status_id removed,
# and the chained best_times[a][b][1] lookup is replaced by a single .loc
# tuple lookup on the MultiIndex.
lbl_best_at_specific_sys = []
for row in pts.itertuples():
    key = (getattr(row, 'matrix_id'), getattr(row, 'system_id'), 1)
    try:
        best = best_times.loc[key]
    except KeyError:
        # No converged run recorded for this (matrix, system) pair.
        lbl_best_at_specific_sys.append(0)
        continue
    lbl_best_at_specific_sys.append(1 if best == getattr(row, 'time') else 0)
lbl_best_at_specific_sys
In [ ]:
# Persist the per-system best-time labels.
df = pd.DataFrame(lbl_best_at_specific_sys)
df.to_csv('lbl_best_at_specific_sys.csv')
In [ ]:
# Number of rows flagged as best for their (matrix, system) pair.
sum(lbl_best_at_specific_sys)
In [ ]:
# Fastest time per (matrix, status) over every system and np count.
# GroupBy.min() replaces aggregate(np.min) (identical result, avoids the
# modern-pandas deprecation warning), and count() gains the call
# parentheses the original omitted (it displayed the bound method).
grouped = pts.groupby(['matrix_id', 'status_id'])
best_times = grouped['time'].min()
best_times.count()
In [ ]:
# Finds the best overall
# Label: 1 if this row's time equals the best converged (status_id == 1)
# time for the matrix overall (any system, any np), else 0.
# Matrices with no converged run raise KeyError and are labeled 0.
# Changes vs. original: unused counter removed, and the chained
# best_times[a][1] lookup is replaced by a single .loc tuple lookup.
lbl_best_overall = []
for row in pts.itertuples():
    key = (getattr(row, 'matrix_id'), 1)  # 1 == converged status
    try:
        best = best_times.loc[key]
    except KeyError:
        # No converged run recorded for this matrix.
        lbl_best_overall.append(0)
        continue
    lbl_best_overall.append(1 if best == getattr(row, 'time') else 0)
lbl_best_overall
In [ ]:
# Number of rows flagged as the overall best run for their matrix.
sum(lbl_best_overall)
In [ ]:
# Persist the overall-best labels.
df = pd.DataFrame(lbl_best_overall)
df.to_csv('lbl_best_overall.csv')
In [ ]:
# Load the combined label matrix.  NOTE(review): labels.csv is not written
# anywhere in this notebook (only the individual lbl_*.csv files are) —
# presumably it is assembled externally from them; verify it exists before
# a fresh Restart-&-Run-All.
all_labels = pd.read_csv('./labels.csv', index_col=0)
In [ ]:
# Convert the label table to a NumPy array for scikit-learn.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# to_numpy() is the drop-in replacement and returns the same ndarray.
all_labels_array = all_labels.to_numpy()
all_labels_array
In [ ]:
# Strip identifier/target columns so pts holds only model features.
# errors='ignore' makes this cell safe under Restart-&-Run-All: matrix_x
# and matrix_y are already dropped when pts is assembled (cell In[15]),
# so the original unconditional drop raised KeyError on a fresh run.
pts = pts.drop(['matrix_x', 'matrix_y', 'status_id', 'system_id', 'time'],
               axis=1, errors='ignore')
In [ ]:
# Feature matrix X and label matrix y for scikit-learn.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
# drop-in replacement.
X = pts.to_numpy()
y = all_labels_array
In [ ]:
# Spot-check a single feature row.
X[1]
In [ ]:
# Spot-check the matching label row.
y[1]
In [ ]:
# Display the full label table for reference.
all_labels
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
In [ ]:
# Fit a one-vs-rest linear SVM on the full data and persist it.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and
# removed in 0.23; import the standalone joblib package (a scikit-learn
# dependency) instead, falling back for very old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
classifier = OneVsRestClassifier(LinearSVC(), n_jobs=6)
classifier = classifier.fit(X, y)
joblib.dump(classifier, 'classifier.pkl')
In [ ]:
# Re-fit the classifier on stratified shuffle splits, saving one model per
# split (classifier-0.pkl .. classifier-2.pkl).
# NOTE(review): StratifiedShuffleSplit stratifies on a 1-D class vector;
# if y is the multi-column label matrix loaded from labels.csv, sss.split
# may not accept it — confirm y's shape before a fresh run.
# NOTE(review): relies on `joblib` imported in the previous cell, and the
# held-out X_test/y_test are never scored here.
classifier = Pipeline([('clf', OneVsRestClassifier(LinearSVC(), n_jobs=6))])
sss = StratifiedShuffleSplit(n_splits=3, random_state=0)
i = 0
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    classifier.fit(X_train, y_train)
    joblib.dump(classifier, 'classifier-' + str(i) + '.pkl')
    i+=1
In [ ]:
# Save the feature-only frame.  NOTE(review): this overwrites the fuller
# pts.csv written right after the merge cell — use a different filename if
# both versions are needed.
pts.to_csv('./pts.csv')