In [1]:
import pandas as pd
import numpy as np
In [2]:
from pandas import read_csv
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import RandomizedLasso
In [3]:
in_all = pd.read_csv('../classifications/merged_with_all_features.csv', index_col=0)
In [4]:
in_all.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 748916 entries, 0 to 748915
Data columns (total 81 columns):
system_id 748916 non-null int64
HPL_Tflops 748916 non-null float64
StarDGEMM_Gflops 748916 non-null float64
SingleDGEMM_Gflops 748916 non-null float64
PTRANS_GBs 748916 non-null float64
MPIRandomAccess_LCG_GUPs 748916 non-null float64
MPIRandomAccess_GUPs 748916 non-null float64
StarRandomAccess_LCG_GUPs 748916 non-null float64
SingleRandomAccess_LCG_GUPs 748916 non-null float64
StarRandomAccess_GUPs 748916 non-null float64
SingleRandomAccess_GUPs 748916 non-null float64
StarSTREAM_Copy 748916 non-null float64
StarSTREAM_Scale 748916 non-null float64
StarSTREAM_Add 748916 non-null float64
StarSTREAM_Triad 748916 non-null float64
SingleSTREAM_Copy 748916 non-null float64
SingleSTREAM_Scale 748916 non-null float64
SingleSTREAM_Add 748916 non-null float64
SingleSTREAM_Triad 748916 non-null float64
StarFFT_Gflops 748916 non-null float64
SingleFFT_Gflops 748916 non-null float64
MPIFFT_Gflops 748916 non-null float64
MaxPingPongLatency_usec 748916 non-null float64
RandomlyOrderedRingLatency_usec 748916 non-null float64
MinPingPongBandwidth_GBytes 748916 non-null float64
NaturallyOrderedRingBandwidth_GBytes 748916 non-null float64
RandomlyOrderedRingBandwidth_GBytes 748916 non-null float64
MinPingPongLatency_usec 748916 non-null float64
AvgPingPongLatency_usec 748916 non-null float64
MaxPingPongBandwidth_GBytes 748916 non-null float64
AvgPingPongBandwidth_GBytes 748916 non-null float64
NaturallyOrderedRingLatency_usec 748916 non-null float64
MemProc 748916 non-null int64
core_count 748916 non-null int64
cpu_freq 748916 non-null int64
bogo_mips 748916 non-null float64
l1_cache 748916 non-null int64
l2_cache 748916 non-null int64
l3_cache 748916 non-null int64
memory_size 748916 non-null int64
memory_freq 748916 non-null int64
memory_type 748916 non-null int64
rows 748916 non-null int64
cols 748916 non-null int64
min_nnz_row 748916 non-null int64
row_var 748916 non-null float64
col_var 748916 non-null float64
diag_var 748916 non-null float64
nnz 748916 non-null int64
frob_norm 748916 non-null float64
symm_frob_norm 748916 non-null float64
antisymm_frob_norm 748916 non-null float64
one_norm 748916 non-null float64
inf_norm 748916 non-null float64
symm_inf_norm 748916 non-null float64
antisymm_inf_norm 748916 non-null float64
max_nnz_row 748916 non-null int64
trace 748916 non-null float64
abs_trace 748916 non-null float64
min_nnz_row.1 748916 non-null int64
avg_nnz_row 748916 non-null int64
dummy_rows 748916 non-null int64
dummy_rows_kind 748916 non-null int64
num_value_symm_1 748916 non-null int64
nnz_pattern_symm_1 748916 non-null int64
num_value_symm_2 748916 non-null float64
nnz_pattern_symm_2 748916 non-null float64
row_diag_dom 748916 non-null int64
col_diag_dom 748916 non-null int64
diag_avg 748916 non-null float64
diag_sign 748916 non-null int64
diag_nnz 748916 non-null int64
lower_bw 748916 non-null int64
upper_bw 748916 non-null int64
row_log_val_spread 748916 non-null float64
col_log_val_spread 748916 non-null float64
symm 748916 non-null int64
np 748916 non-null int64
solver_id 748916 non-null int64
prec_id 748916 non-null int64
good_or_bad 748916 non-null float64
dtypes: float64(50), int64(31)
memory usage: 468.5 MB
In [5]:
in_all.system_id.value_counts()
Out[5]:
1 243200
2 215561
4 149605
3 140550
Name: system_id, dtype: int64
In [6]:
X = in_all.iloc[:,0:-1]
y = in_all.iloc[:,-1]
In [7]:
model = ExtraTreesClassifier()
model.fit(X,y)
print(model.feature_importances_)
a = model.feature_importances_
[ 9.56537652e-05 6.88950128e-05 8.42141372e-05 9.60209722e-05
7.14296321e-05 7.46885515e-05 5.70044001e-05 9.15346525e-05
7.90386718e-05 1.03400095e-04 7.45316390e-05 9.89229634e-05
5.77601825e-05 8.80103025e-05 9.11406709e-05 1.12980090e-04
6.75211401e-05 8.45308869e-05 6.58600044e-05 1.42550883e-04
1.01703685e-04 6.37397061e-05 1.05653907e-04 7.67865119e-05
7.84360970e-05 7.27527849e-05 8.26801349e-05 1.12181442e-04
1.08394574e-04 6.04507763e-05 7.39511893e-05 8.76492382e-05
1.17584234e-04 7.00964907e-05 7.98117120e-05 7.72067674e-05
0.00000000e+00 0.00000000e+00 1.04177456e-04 5.65010429e-05
5.88066425e-05 5.17133738e-05 6.43986583e-03 6.31097202e-03
4.66827822e-03 4.52261553e-03 4.97713437e-03 4.75236555e-03
7.07855692e-03 5.58799254e-03 5.18597439e-03 4.83340806e-03
5.28379215e-03 4.75088050e-03 4.89161459e-03 5.24574437e-03
7.95030133e-03 5.06175970e-03 4.67310248e-03 4.21019673e-03
8.34061232e-03 9.85523166e-03 4.54532635e-03 2.17366249e-04
1.84996859e-04 1.14959453e-02 1.09906624e-02 2.28071486e-03
2.83970720e-03 4.79478069e-03 1.43253150e-02 7.57705210e-03
7.19327349e-03 5.49421660e-03 1.05155357e-02 1.01605287e-02
2.23490776e-04 1.22839409e-01 4.36548113e-01 2.29807201e-01]
In [8]:
blah = {}
blah[0] = in_all.columns
blah[1] = a
new_df = pd.DataFrame()
In [9]:
X.columns
Out[9]:
Index(['system_id', 'HPL_Tflops', 'StarDGEMM_Gflops', 'SingleDGEMM_Gflops',
'PTRANS_GBs', 'MPIRandomAccess_LCG_GUPs', 'MPIRandomAccess_GUPs',
'StarRandomAccess_LCG_GUPs', 'SingleRandomAccess_LCG_GUPs',
'StarRandomAccess_GUPs', 'SingleRandomAccess_GUPs', 'StarSTREAM_Copy',
'StarSTREAM_Scale', 'StarSTREAM_Add', 'StarSTREAM_Triad',
'SingleSTREAM_Copy', 'SingleSTREAM_Scale', 'SingleSTREAM_Add',
'SingleSTREAM_Triad', 'StarFFT_Gflops', 'SingleFFT_Gflops',
'MPIFFT_Gflops', 'MaxPingPongLatency_usec',
'RandomlyOrderedRingLatency_usec', 'MinPingPongBandwidth_GBytes',
'NaturallyOrderedRingBandwidth_GBytes',
'RandomlyOrderedRingBandwidth_GBytes', 'MinPingPongLatency_usec',
'AvgPingPongLatency_usec', 'MaxPingPongBandwidth_GBytes',
'AvgPingPongBandwidth_GBytes', 'NaturallyOrderedRingLatency_usec',
'MemProc', 'core_count', 'cpu_freq', 'bogo_mips', 'l1_cache',
'l2_cache', 'l3_cache', 'memory_size', 'memory_freq', 'memory_type',
'rows', 'cols', 'min_nnz_row', 'row_var', 'col_var', 'diag_var', 'nnz',
'frob_norm', 'symm_frob_norm', 'antisymm_frob_norm', 'one_norm',
'inf_norm', 'symm_inf_norm', 'antisymm_inf_norm', 'max_nnz_row',
'trace', 'abs_trace', 'min_nnz_row.1', 'avg_nnz_row', 'dummy_rows',
'dummy_rows_kind', 'num_value_symm_1', 'nnz_pattern_symm_1',
'num_value_symm_2', 'nnz_pattern_symm_2', 'row_diag_dom',
'col_diag_dom', 'diag_avg', 'diag_sign', 'diag_nnz', 'lower_bw',
'upper_bw', 'row_log_val_spread', 'col_log_val_spread', 'symm', 'np',
'solver_id', 'prec_id'],
dtype='object')
In [10]:
clfLasso = RandomizedLasso()
clfLasso.fit(X,y)
clfLasso.scores_
/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 14 iterations, i.e. alpha=3.459e-06, with an active set of 14 regressors, and the smallest cholesky pivot element being 2.220e-16
ConvergenceWarning)
/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/least_angle.py:334: ConvergenceWarning: Early stopping the lars path, as the residues are small and the current value of alpha is no longer well controlled. 27 iterations, alpha=1.644e-06, previous alpha=1.644e-06, with an active set of 22 regressors.
ConvergenceWarning)
/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/least_angle.py:381: RuntimeWarning: overflow encountered in true_divide
g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))
/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/least_angle.py:381: RuntimeWarning: overflow encountered in true_divide
g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))
Out[10]:
array([ 0.04 , 0.115, 0. , 0. , 0.005, 0. , 0. , 0.045,
0. , 0.075, 0. , 0.025, 0.035, 0.05 , 0.015, 0. ,
0. , 0. , 0. , 0.115, 0.01 , 0.005, 0.42 , 0.08 ,
0.005, 0.035, 0.455, 0. , 0.32 , 0. , 0. , 0.16 ,
0. , 0. , 0.025, 0.005, 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0.71 , 0.04 , 0.015, 0.005,
0.27 , 0.16 , 0.19 , 0.005, 0.175, 0.295, 0.375, 0.43 ,
0.68 , 0.035, 0.04 , 0.28 , 0.885, 0.995, 1. , 0.605,
0.225, 0.51 , 0.525, 0.705, 0.76 , 0.24 , 0.045, 0.84 ,
0.855, 0.48 , 0.3 , 0.73 , 0.17 , 0.49 , 1. , 1. ])
In [11]:
for i,j in zip(X.columns, clfLasso.scores_):
print(i,j)
system_id 0.04
HPL_Tflops 0.115
StarDGEMM_Gflops 0.0
SingleDGEMM_Gflops 0.0
PTRANS_GBs 0.005
MPIRandomAccess_LCG_GUPs 0.0
MPIRandomAccess_GUPs 0.0
StarRandomAccess_LCG_GUPs 0.045
SingleRandomAccess_LCG_GUPs 0.0
StarRandomAccess_GUPs 0.075
SingleRandomAccess_GUPs 0.0
StarSTREAM_Copy 0.025
StarSTREAM_Scale 0.035
StarSTREAM_Add 0.05
StarSTREAM_Triad 0.015
SingleSTREAM_Copy 0.0
SingleSTREAM_Scale 0.0
SingleSTREAM_Add 0.0
SingleSTREAM_Triad 0.0
StarFFT_Gflops 0.115
SingleFFT_Gflops 0.01
MPIFFT_Gflops 0.005
MaxPingPongLatency_usec 0.42
RandomlyOrderedRingLatency_usec 0.08
MinPingPongBandwidth_GBytes 0.005
NaturallyOrderedRingBandwidth_GBytes 0.035
RandomlyOrderedRingBandwidth_GBytes 0.455
MinPingPongLatency_usec 0.0
AvgPingPongLatency_usec 0.32
MaxPingPongBandwidth_GBytes 0.0
AvgPingPongBandwidth_GBytes 0.0
NaturallyOrderedRingLatency_usec 0.16
MemProc 0.0
core_count 0.0
cpu_freq 0.025
bogo_mips 0.005
l1_cache 0.0
l2_cache 0.0
l3_cache 0.0
memory_size 0.0
memory_freq 0.0
memory_type 0.0
rows 0.0
cols 0.0
min_nnz_row 0.71
row_var 0.04
col_var 0.015
diag_var 0.005
nnz 0.27
frob_norm 0.16
symm_frob_norm 0.19
antisymm_frob_norm 0.005
one_norm 0.175
inf_norm 0.295
symm_inf_norm 0.375
antisymm_inf_norm 0.43
max_nnz_row 0.68
trace 0.035
abs_trace 0.04
min_nnz_row.1 0.28
avg_nnz_row 0.885
dummy_rows 0.995
dummy_rows_kind 1.0
num_value_symm_1 0.605
nnz_pattern_symm_1 0.225
num_value_symm_2 0.51
nnz_pattern_symm_2 0.525
row_diag_dom 0.705
col_diag_dom 0.76
diag_avg 0.24
diag_sign 0.045
diag_nnz 0.84
lower_bw 0.855
upper_bw 0.48
row_log_val_spread 0.3
col_log_val_spread 0.73
symm 0.17
np 0.49
solver_id 1.0
prec_id 1.0
Content source: patemotter/trilinos-prediction
Similar notebooks: