In [1]:
import pandas as pd
import numpy as np

In [2]:
from pandas import read_csv
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import RandomizedLasso

In [3]:
# Load the merged feature table; the first CSV column is used as the row index.
in_all = pd.read_csv(
    '../classifications/merged_with_all_features.csv',
    index_col=0,
)

In [4]:
# Summarize the loaded frame: column names, non-null counts, dtypes and memory use.
in_all.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 748916 entries, 0 to 748915
Data columns (total 81 columns):
system_id                               748916 non-null int64
HPL_Tflops                              748916 non-null float64
StarDGEMM_Gflops                        748916 non-null float64
SingleDGEMM_Gflops                      748916 non-null float64
PTRANS_GBs                              748916 non-null float64
MPIRandomAccess_LCG_GUPs                748916 non-null float64
MPIRandomAccess_GUPs                    748916 non-null float64
StarRandomAccess_LCG_GUPs               748916 non-null float64
SingleRandomAccess_LCG_GUPs             748916 non-null float64
StarRandomAccess_GUPs                   748916 non-null float64
SingleRandomAccess_GUPs                 748916 non-null float64
StarSTREAM_Copy                         748916 non-null float64
StarSTREAM_Scale                        748916 non-null float64
StarSTREAM_Add                          748916 non-null float64
StarSTREAM_Triad                        748916 non-null float64
SingleSTREAM_Copy                       748916 non-null float64
SingleSTREAM_Scale                      748916 non-null float64
SingleSTREAM_Add                        748916 non-null float64
SingleSTREAM_Triad                      748916 non-null float64
StarFFT_Gflops                          748916 non-null float64
SingleFFT_Gflops                        748916 non-null float64
MPIFFT_Gflops                           748916 non-null float64
MaxPingPongLatency_usec                 748916 non-null float64
RandomlyOrderedRingLatency_usec         748916 non-null float64
MinPingPongBandwidth_GBytes             748916 non-null float64
NaturallyOrderedRingBandwidth_GBytes    748916 non-null float64
RandomlyOrderedRingBandwidth_GBytes     748916 non-null float64
MinPingPongLatency_usec                 748916 non-null float64
AvgPingPongLatency_usec                 748916 non-null float64
MaxPingPongBandwidth_GBytes             748916 non-null float64
AvgPingPongBandwidth_GBytes             748916 non-null float64
NaturallyOrderedRingLatency_usec        748916 non-null float64
MemProc                                 748916 non-null int64
core_count                              748916 non-null int64
cpu_freq                                748916 non-null int64
bogo_mips                               748916 non-null float64
l1_cache                                748916 non-null int64
l2_cache                                748916 non-null int64
l3_cache                                748916 non-null int64
memory_size                             748916 non-null int64
memory_freq                             748916 non-null int64
memory_type                             748916 non-null int64
rows                                    748916 non-null int64
cols                                    748916 non-null int64
min_nnz_row                             748916 non-null int64
row_var                                 748916 non-null float64
col_var                                 748916 non-null float64
diag_var                                748916 non-null float64
nnz                                     748916 non-null int64
frob_norm                               748916 non-null float64
symm_frob_norm                          748916 non-null float64
antisymm_frob_norm                      748916 non-null float64
one_norm                                748916 non-null float64
inf_norm                                748916 non-null float64
symm_inf_norm                           748916 non-null float64
antisymm_inf_norm                       748916 non-null float64
max_nnz_row                             748916 non-null int64
trace                                   748916 non-null float64
abs_trace                               748916 non-null float64
min_nnz_row.1                           748916 non-null int64
avg_nnz_row                             748916 non-null int64
dummy_rows                              748916 non-null int64
dummy_rows_kind                         748916 non-null int64
num_value_symm_1                        748916 non-null int64
nnz_pattern_symm_1                      748916 non-null int64
num_value_symm_2                        748916 non-null float64
nnz_pattern_symm_2                      748916 non-null float64
row_diag_dom                            748916 non-null int64
col_diag_dom                            748916 non-null int64
diag_avg                                748916 non-null float64
diag_sign                               748916 non-null int64
diag_nnz                                748916 non-null int64
lower_bw                                748916 non-null int64
upper_bw                                748916 non-null int64
row_log_val_spread                      748916 non-null float64
col_log_val_spread                      748916 non-null float64
symm                                    748916 non-null int64
np                                      748916 non-null int64
solver_id                               748916 non-null int64
prec_id                                 748916 non-null int64
good_or_bad                             748916 non-null float64
dtypes: float64(50), int64(31)
memory usage: 468.5 MB

In [5]:
# Count how many rows belong to each system_id value.
in_all['system_id'].value_counts()


Out[5]:
1    243200
2    215561
4    149605
3    140550
Name: system_id, dtype: int64

In [6]:
# Positional split: the last column of in_all is the target,
# every column before it is a feature.
y = in_all.iloc[:, -1]
X = in_all.iloc[:, :-1]

In [7]:
# Fit an extra-trees classifier on all features to obtain per-feature
# importances.
# random_state is pinned: the estimator is randomized, so without a seed the
# importances (and any ranking derived from them) change on every re-run.
model = ExtraTreesClassifier(random_state=0)
model.fit(X, y)

# Keep the importance vector in `a` for the following cell; its entries are in
# the same order as the columns of X.
a = model.feature_importances_
print(a)


[  9.56537652e-05   6.88950128e-05   8.42141372e-05   9.60209722e-05
   7.14296321e-05   7.46885515e-05   5.70044001e-05   9.15346525e-05
   7.90386718e-05   1.03400095e-04   7.45316390e-05   9.89229634e-05
   5.77601825e-05   8.80103025e-05   9.11406709e-05   1.12980090e-04
   6.75211401e-05   8.45308869e-05   6.58600044e-05   1.42550883e-04
   1.01703685e-04   6.37397061e-05   1.05653907e-04   7.67865119e-05
   7.84360970e-05   7.27527849e-05   8.26801349e-05   1.12181442e-04
   1.08394574e-04   6.04507763e-05   7.39511893e-05   8.76492382e-05
   1.17584234e-04   7.00964907e-05   7.98117120e-05   7.72067674e-05
   0.00000000e+00   0.00000000e+00   1.04177456e-04   5.65010429e-05
   5.88066425e-05   5.17133738e-05   6.43986583e-03   6.31097202e-03
   4.66827822e-03   4.52261553e-03   4.97713437e-03   4.75236555e-03
   7.07855692e-03   5.58799254e-03   5.18597439e-03   4.83340806e-03
   5.28379215e-03   4.75088050e-03   4.89161459e-03   5.24574437e-03
   7.95030133e-03   5.06175970e-03   4.67310248e-03   4.21019673e-03
   8.34061232e-03   9.85523166e-03   4.54532635e-03   2.17366249e-04
   1.84996859e-04   1.14959453e-02   1.09906624e-02   2.28071486e-03
   2.83970720e-03   4.79478069e-03   1.43253150e-02   7.57705210e-03
   7.19327349e-03   5.49421660e-03   1.05155357e-02   1.01605287e-02
   2.23490776e-04   1.22839409e-01   4.36548113e-01   2.29807201e-01]

In [8]:
# Pair feature names (key 0) with their extra-trees importances (key 1).
# The names must come from X.columns -- the columns the model was fitted on.
# in_all.columns also contains the target column, so it is one entry longer
# than the importance vector and the two could never be aligned.
blah = {0: X.columns, 1: a}

# Empty frame kept as in the original cell; nothing visible here populates it.
new_df = pd.DataFrame()

In [9]:
# Show the feature column names, in the order used when fitting the models.
X.columns


Out[9]:
Index(['system_id', 'HPL_Tflops', 'StarDGEMM_Gflops', 'SingleDGEMM_Gflops',
       'PTRANS_GBs', 'MPIRandomAccess_LCG_GUPs', 'MPIRandomAccess_GUPs',
       'StarRandomAccess_LCG_GUPs', 'SingleRandomAccess_LCG_GUPs',
       'StarRandomAccess_GUPs', 'SingleRandomAccess_GUPs', 'StarSTREAM_Copy',
       'StarSTREAM_Scale', 'StarSTREAM_Add', 'StarSTREAM_Triad',
       'SingleSTREAM_Copy', 'SingleSTREAM_Scale', 'SingleSTREAM_Add',
       'SingleSTREAM_Triad', 'StarFFT_Gflops', 'SingleFFT_Gflops',
       'MPIFFT_Gflops', 'MaxPingPongLatency_usec',
       'RandomlyOrderedRingLatency_usec', 'MinPingPongBandwidth_GBytes',
       'NaturallyOrderedRingBandwidth_GBytes',
       'RandomlyOrderedRingBandwidth_GBytes', 'MinPingPongLatency_usec',
       'AvgPingPongLatency_usec', 'MaxPingPongBandwidth_GBytes',
       'AvgPingPongBandwidth_GBytes', 'NaturallyOrderedRingLatency_usec',
       'MemProc', 'core_count', 'cpu_freq', 'bogo_mips', 'l1_cache',
       'l2_cache', 'l3_cache', 'memory_size', 'memory_freq', 'memory_type',
       'rows', 'cols', 'min_nnz_row', 'row_var', 'col_var', 'diag_var', 'nnz',
       'frob_norm', 'symm_frob_norm', 'antisymm_frob_norm', 'one_norm',
       'inf_norm', 'symm_inf_norm', 'antisymm_inf_norm', 'max_nnz_row',
       'trace', 'abs_trace', 'min_nnz_row.1', 'avg_nnz_row', 'dummy_rows',
       'dummy_rows_kind', 'num_value_symm_1', 'nnz_pattern_symm_1',
       'num_value_symm_2', 'nnz_pattern_symm_2', 'row_diag_dom',
       'col_diag_dom', 'diag_avg', 'diag_sign', 'diag_nnz', 'lower_bw',
       'upper_bw', 'row_log_val_spread', 'col_log_val_spread', 'symm', 'np',
       'solver_id', 'prec_id'],
      dtype='object')

In [10]:
# Randomized-Lasso stability selection: scores_ holds one selection score per
# column of X.
# random_state is pinned because the procedure is based on random resampling;
# without a seed the scores differ between runs.
# NOTE(review): the recorded run emitted ConvergenceWarning and overflow
# RuntimeWarning from the LARS path -- presumably caused by features on very
# different scales; consider standardizing X before fitting (TODO confirm).
# NOTE(review): RandomizedLasso was deprecated in scikit-learn 0.19 and removed
# in 0.21, so this cell requires an older scikit-learn.
clfLasso = RandomizedLasso(random_state=0)
clfLasso.fit(X, y)
clfLasso.scores_


/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/least_angle.py:309: ConvergenceWarning: Regressors in active set degenerate. Dropping a regressor, after 14 iterations, i.e. alpha=3.459e-06, with an active set of 14 regressors, and the smallest cholesky pivot element being 2.220e-16
  ConvergenceWarning)
/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/least_angle.py:334: ConvergenceWarning: Early stopping the lars path, as the residues are small and the current value of alpha is no longer well controlled. 27 iterations, alpha=1.644e-06, previous alpha=1.644e-06, with an active set of 22 regressors.
  ConvergenceWarning)
/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/least_angle.py:381: RuntimeWarning: overflow encountered in true_divide
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))
/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/least_angle.py:381: RuntimeWarning: overflow encountered in true_divide
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))
Out[10]:
array([ 0.04 ,  0.115,  0.   ,  0.   ,  0.005,  0.   ,  0.   ,  0.045,
        0.   ,  0.075,  0.   ,  0.025,  0.035,  0.05 ,  0.015,  0.   ,
        0.   ,  0.   ,  0.   ,  0.115,  0.01 ,  0.005,  0.42 ,  0.08 ,
        0.005,  0.035,  0.455,  0.   ,  0.32 ,  0.   ,  0.   ,  0.16 ,
        0.   ,  0.   ,  0.025,  0.005,  0.   ,  0.   ,  0.   ,  0.   ,
        0.   ,  0.   ,  0.   ,  0.   ,  0.71 ,  0.04 ,  0.015,  0.005,
        0.27 ,  0.16 ,  0.19 ,  0.005,  0.175,  0.295,  0.375,  0.43 ,
        0.68 ,  0.035,  0.04 ,  0.28 ,  0.885,  0.995,  1.   ,  0.605,
        0.225,  0.51 ,  0.525,  0.705,  0.76 ,  0.24 ,  0.045,  0.84 ,
        0.855,  0.48 ,  0.3  ,  0.73 ,  0.17 ,  0.49 ,  1.   ,  1.   ])

In [11]:
# Print every feature name alongside its randomized-Lasso selection score.
for feature_name, stability_score in zip(X.columns, clfLasso.scores_):
    print(feature_name, stability_score)


system_id 0.04
HPL_Tflops 0.115
StarDGEMM_Gflops 0.0
SingleDGEMM_Gflops 0.0
PTRANS_GBs 0.005
MPIRandomAccess_LCG_GUPs 0.0
MPIRandomAccess_GUPs 0.0
StarRandomAccess_LCG_GUPs 0.045
SingleRandomAccess_LCG_GUPs 0.0
StarRandomAccess_GUPs 0.075
SingleRandomAccess_GUPs 0.0
StarSTREAM_Copy 0.025
StarSTREAM_Scale 0.035
StarSTREAM_Add 0.05
StarSTREAM_Triad 0.015
SingleSTREAM_Copy 0.0
SingleSTREAM_Scale 0.0
SingleSTREAM_Add 0.0
SingleSTREAM_Triad 0.0
StarFFT_Gflops 0.115
SingleFFT_Gflops 0.01
MPIFFT_Gflops 0.005
MaxPingPongLatency_usec 0.42
RandomlyOrderedRingLatency_usec 0.08
MinPingPongBandwidth_GBytes 0.005
NaturallyOrderedRingBandwidth_GBytes 0.035
RandomlyOrderedRingBandwidth_GBytes 0.455
MinPingPongLatency_usec 0.0
AvgPingPongLatency_usec 0.32
MaxPingPongBandwidth_GBytes 0.0
AvgPingPongBandwidth_GBytes 0.0
NaturallyOrderedRingLatency_usec 0.16
MemProc 0.0
core_count 0.0
cpu_freq 0.025
bogo_mips 0.005
l1_cache 0.0
l2_cache 0.0
l3_cache 0.0
memory_size 0.0
memory_freq 0.0
memory_type 0.0
rows 0.0
cols 0.0
min_nnz_row 0.71
row_var 0.04
col_var 0.015
diag_var 0.005
nnz 0.27
frob_norm 0.16
symm_frob_norm 0.19
antisymm_frob_norm 0.005
one_norm 0.175
inf_norm 0.295
symm_inf_norm 0.375
antisymm_inf_norm 0.43
max_nnz_row 0.68
trace 0.035
abs_trace 0.04
min_nnz_row.1 0.28
avg_nnz_row 0.885
dummy_rows 0.995
dummy_rows_kind 1.0
num_value_symm_1 0.605
nnz_pattern_symm_1 0.225
num_value_symm_2 0.51
nnz_pattern_symm_2 0.525
row_diag_dom 0.705
col_diag_dom 0.76
diag_avg 0.24
diag_sign 0.045
diag_nnz 0.84
lower_bw 0.855
upper_bw 0.48
row_log_val_spread 0.3
col_log_val_spread 0.73
symm 0.17
np 0.49
solver_id 1.0
prec_id 1.0