1. Final Selection of Features in the 50 Percent Best Feature Set



In [ ]:

    
# Just import everything we might need, it saves time.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

2. Load The Sorted Training Data Features

- sorted-train-malware-features-asm-reduced.csv
- sorted-train-malware-features-byte.csv
- sorted-train-labels.csv
- sorted-train-features-combined.csv



In [2]:

    
# First load the .asm and .byte training/test data and training labels
sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-50percent.csv')
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_test_data_asm = pd.read_csv('data/sorted-test-malware-features-asm-50percent.csv')
sorted_test_data_byte = pd.read_csv('data/sorted-test-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
#combined_train_data = pd.read_csv('data/sorted-train-features-combined.csv')

# Load the image data for training data
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm-50percent.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm-50percent.csv')
#sorted_train_image_byte = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')

# Now load the row statistics for the original feature set, these will be combined with the new features
all_train_rowstats = pd.read_csv('data/all-train-asm-rowstats.csv')
all_test_rowstats = pd.read_csv('data/all-test-asm-rowstats.csv')
#all_train_image_asm_rowstats = pd.read_csv('data/all-train-image-asm-rowstats.csv')
#all_test_image_asm_rowstats = pd.read_csv('data/all-test-image-asm-rowstats.csv')



In [3]:

    
sorted_train_data_asm.head()









    Out[3]:






  
    
      
      filename
      edx
      esi
      es
      fs
      ds
      ss
      gs
      cs
      ah
      al
      ax
      bh
      bl
      bx
      ch
      cl
      cx
      dh
      dl
      
    
  
  
    
      0
       01IsoiSMh5gxyDYTl4CB
        750
        496
       3
       0
       0
       0
       0
       0
       8
       224
       49
       34
       25
       0
       41
       191
       52
       38
       163
      ...
    
    
      1
       01SuzwMJEIXsK7A8dQbl
       1121
         24
       3
       0
       1
       4
       0
       2
       6
        22
        7
        1
        4
       0
        3
        37
        2
        4
         9
      ...
    
    
      2
       01azqd4InC7m9JpocGv5
       1493
       1900
       0
       0
       0
       0
       0
       0
       1
       398
        0
        0
       47
       0
        1
        77
        4
        1
        56
      ...
    
    
      3
       01jsnpXSAlgw6aPeDxrU
        525
          4
       0
       0
       0
       0
       0
       0
       0
         0
        0
        0
        1
       0
        0
         1
        2
        0
         0
      ...
    
    
      4
       01kcPWA9K2BOxQeS5Rju
         23
         35
       0
       0
       0
       0
       0
       0
       0
         3
        0
        0
        1
       0
        0
         1
        0
        0
         0
      ...
    
  

5 rows × 504 columns



In [4]:

    
sorted_train_labels.head()









    Out[4]:






  
    
      
      Id
      Class
    
  
  
    
      0
       01IsoiSMh5gxyDYTl4CB
       2
    
    
      1
       01SuzwMJEIXsK7A8dQbl
       8
    
    
      2
       01azqd4InC7m9JpocGv5
       9
    
    
      3
       01jsnpXSAlgw6aPeDxrU
       9
    
    
      4
       01kcPWA9K2BOxQeS5Rju
       1
    
  

5 rows × 2 columns



In [4]:

    
sorted_train_image_asm.head()









    Out[4]:






  
    
      
      filename
      ASM_1
      ASM_3
      ASM_4
      ASM_14
      ASM_20
      ASM_21
      ASM_23
      ASM_24
      ASM_25
      ASM_26
      ASM_28
      ASM_29
      ASM_31
      ASM_32
      ASM_33
      ASM_34
      ASM_40
      ASM_41
      ASM_42
      
    
  
  
    
      0
       01IsoiSMh5gxyDYTl4CB
       116
       120
       116
        9
       32
       32
       32
       32
       32
       32
       13
       10
       116
       101
       120
       116
       49
       48
       48
      ...
    
    
      1
       01SuzwMJEIXsK7A8dQbl
        69
        68
        69
       48
        9
        9
       13
       10
       72
       69
       68
       69
        58
        48
        48
        52
        9
        9
        9
      ...
    
    
      2
       01azqd4InC7m9JpocGv5
        69
        68
        69
       48
        9
        9
       13
       10
       72
       69
       68
       69
        58
        48
        48
        52
        9
        9
        9
      ...
    
    
      3
       01jsnpXSAlgw6aPeDxrU
        69
        68
        69
       48
        9
        9
       13
       10
       72
       69
       68
       69
        58
        48
        48
        52
        9
        9
        9
      ...
    
    
      4
       01kcPWA9K2BOxQeS5Rju
        69
        68
        69
       48
        9
        9
       13
       10
       72
       69
       68
       69
        58
        49
        48
        48
        9
        9
        9
      ...
    
  

5 rows × 501 columns



In [5]:

    
sorted_test_image_asm.head()









    Out[5]:






  
    
      
      filename
      ASM_1
      ASM_3
      ASM_4
      ASM_14
      ASM_20
      ASM_21
      ASM_23
      ASM_24
      ASM_25
      ASM_26
      ASM_28
      ASM_29
      ASM_31
      ASM_32
      ASM_33
      ASM_34
      ASM_40
      ASM_41
      ASM_42
      
    
  
  
    
      0
       ITSUPtCmh7WdJcsYDwQ5
       69
       68
       69
       48
       9
       9
       13
       10
       72
       69
       68
       69
       58
       48
       48
       52
       9
       9
       9
      ...
    
    
      1
       Ig2DB5tSiEy1cJvV0zdw
       69
       68
       69
       48
       9
       9
       13
       10
       72
       69
       68
       69
       58
       48
       48
       52
       9
       9
       9
      ...
    
    
      2
       Jmo6eIhLZ4t9r8QsxEg5
       69
       68
       69
       48
       9
       9
       13
       10
       72
       69
       68
       69
       58
       48
       48
       52
       9
       9
       9
      ...
    
    
      3
       JtPFl4ewgdD78OzCMa3o
       69
       68
       69
       48
       9
       9
       13
       10
       72
       69
       68
       69
       58
       48
       48
       52
       9
       9
       9
      ...
    
    
      4
       K3ZtByPHGSFYNljDUEXp
       69
       68
       69
       48
       9
       9
       13
       10
       72
       69
       68
       69
       58
       48
       48
       52
       9
       9
       9
      ...
    
  

5 rows × 501 columns



In [7]:

    
sorted_train_data_byte.head()









    Out[7]:






  
    
      
      filename
      entropy
      filesize
    
  
  
    
      0
       01IsoiSMh5gxyDYTl4CB
       0.614952
       6874624
    
    
      1
       01SuzwMJEIXsK7A8dQbl
       0.843262
        460288
    
    
      2
       01azqd4InC7m9JpocGv5
       0.703961
       5256192
    
    
      3
       01jsnpXSAlgw6aPeDxrU
       0.806035
       4825600
    
    
      4
       01kcPWA9K2BOxQeS5Rju
       0.871610
        712704
    
  

5 rows × 3 columns



In [6]:

    
sorted_test_data_asm.head()









    Out[6]:






  
    
      
      filename
      edx
      esi
      es
      fs
      ds
      ss
      gs
      cs
      ah
      al
      ax
      bh
      bl
      bx
      ch
      cl
      cx
      dh
      dl
      
    
  
  
    
      0
       ITSUPtCmh7WdJcsYDwQ5
       245
       434
       0
       0
       1
       0
       0
       0
        9
       51
       1
       0
       4
       1
       3
       1
       3
       0
       1
      ...
    
    
      1
       Ig2DB5tSiEy1cJvV0zdw
       258
       437
       0
       0
       0
       0
       0
       0
       11
       60
       1
       1
       2
       2
       0
       1
       2
       2
       1
      ...
    
    
      2
       Jmo6eIhLZ4t9r8QsxEg5
       238
       365
       0
       0
       0
       0
       0
       0
        8
       51
       0
       0
       4
       1
       0
       1
       2
       1
       0
      ...
    
    
      3
       JtPFl4ewgdD78OzCMa3o
       241
       556
       1
       0
       1
       1
       0
       0
        4
       52
       0
       1
       0
       1
       2
       2
       2
       0
       1
      ...
    
    
      4
       K3ZtByPHGSFYNljDUEXp
        92
        75
       0
       0
       0
       0
       0
       0
        0
        1
       0
       0
       0
       0
       0
       2
       0
       0
       0
      ...
    
  

5 rows × 504 columns



In [7]:

    
# Assign asm data to X,y for brevity.
X = sorted_train_data_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]
# X_image_byte = sorted_train_image_byte.iloc[:,1:]



In [8]:

    
X_train_image_asm.shape









    Out[8]:





(10868, 500)



In [9]:

    
X_test_image_asm.shape









    Out[9]:





(10873, 500)

3. Perform Some Basic Statistical Analysis on the Feature Sets

Find the following:
- Feature and row mean
- Feature and row standard deviation
- Feature and row correlation coefficient
- Feature and row covariance
- Feature and row minimum and maximum



In [10]:

    
# Train feature stats
X_means = X.mean()
X_std = X.std()
X_cor = X.corr()
X_cov = X.cov()

# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()

# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()

# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()

# Not using byte image features
#X_image_byte_means = X_image_byte.mean()
#X_image_byte_std = X_image_byte.std()
#X_image_byte_cor = X_image_byte.corr()
#X_image_byte_cov = X_image_byte.cov()



In [11]:

    
X_train_image_asm_means.head()









    Out[11]:





ASM_1     79.944056
ASM_3     79.556220
ASM_4     79.304564
ASM_14    39.147405
ASM_20    14.223040
dtype: float64



In [12]:

    
X_train_image_asm_std.head()









    Out[12]:





ASM_1     19.950966
ASM_3     21.598677
ASM_4     19.392095
ASM_14    16.344400
ASM_20     9.636302
dtype: float64



In [13]:

    
X_test_image_asm_means.head()









    Out[13]:





ASM_1     79.938471
ASM_3     79.558080
ASM_4     79.319415
ASM_14    39.151936
ASM_20    14.224133
dtype: float64



In [14]:

    
# The byte image data has low standard deviation and mean variance and is not very useful for learning
# so this data will not be used for further analysis.
X_test_image_asm_std.head()









    Out[14]:





ASM_1     19.925523
ASM_3     21.626103
ASM_4     19.435313
ASM_14    16.335403
ASM_20     9.648070
dtype: float64



In [ ]:

    
X_means.head()

X_std.head()

X_cor.head()

X_cov.head()

X_means.min()

X_means.max()

X_std.min()

X_std.max()

X_means[X_means == X_means.min()]

X_means[X_means == X_means.max()]

X_std[X_std == X_std.min()]

X_std[X_std == X_std.max()]



In [15]:

    
# Row stats for train and test data
X_train_rowstats = pd.DataFrame(index=np.arange(X.shape[0]), columns=['filename','mean','std','min','max','total','logtotal'], dtype=float64)
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','e_mean','e_std','e_min','e_max','e_total','e_logtotal'], dtype=float64)

X_train_rowstats['filename'] = sorted_train_data_asm['filename']
for i in range(0,X.shape[0]):
    X_train_rowstats['mean'][i] = X.iloc[i,:].mean()
    X_train_rowstats['std'][i] = X.iloc[i,:].std()
    X_train_rowstats['min'][i] = X.iloc[i,:].min()
    X_train_rowstats['max'][i] = X.iloc[i,:].max()
    

X_train_rowstats['total'] = X_train_rowstats['max'] * X_train_rowstats['mean'] * X_train_rowstats['std']
X_train_rowstats['logtotal'] = np.log(X_train_rowstats['total']) # natural logarithm

X_test_rowstats['filename'] = sorted_test_data_asm['filename']
for i in range(0,X_test.shape[0]):
    X_test_rowstats['e_mean'][i] = X_test.iloc[i,:].mean()
    X_test_rowstats['e_std'][i] = X_test.iloc[i,:].std()
    X_test_rowstats['e_min'][i] = X_test.iloc[i,:].min()
    X_test_rowstats['e_max'][i] = X_test.iloc[i,:].max()
    

X_test_rowstats['e_total'] = X_test_rowstats['e_max'] * X_test_rowstats['e_mean'] * X_test_rowstats['e_std']
X_test_rowstats['e_logtotal'] = np.log(X_test_rowstats['e_total']) # natural logarithm

X_train_rowstats.head()









    Out[15]:






  
    
      
      filename
      mean
      std
      min
      max
      total
      logtotal
    
  
  
    
      0
       01IsoiSMh5gxyDYTl4CB
        648.620278
        6811.551280
       0
         87555
       3.868276e+11
       26.681245
    
    
      1
       01SuzwMJEIXsK7A8dQbl
         68.274354
         462.249118
       0
          5817
       1.835831e+08
       19.028178
    
    
      2
       01azqd4InC7m9JpocGv5
       5550.391650
       86112.319736
       0
       1367070
       6.534008e+14
       34.113212
    
    
      3
       01jsnpXSAlgw6aPeDxrU
        284.071571
        4153.720695
       0
         65928
       7.779200e+10
       25.077304
    
    
      4
       01kcPWA9K2BOxQeS5Rju
          4.930417
          26.064759
       0
           445
       5.718701e+04
       10.954082
    
  

5 rows × 7 columns



In [16]:

    
X_test_rowstats.head()









    Out[16]:






  
    
      
      filename
      e_mean
      e_std
      e_min
      e_max
      e_total
      e_logtotal
    
  
  
    
      0
       ITSUPtCmh7WdJcsYDwQ5
        144.286282
        1977.507215
       0
        31361
       8.948145e+09
       22.914712
    
    
      1
       Ig2DB5tSiEy1cJvV0zdw
        961.739563
       14923.803067
       0
       236923
       3.400511e+12
       28.854947
    
    
      2
       Jmo6eIhLZ4t9r8QsxEg5
       1039.353877
       16206.598011
       0
       257289
       4.333876e+12
       29.097483
    
    
      3
       JtPFl4ewgdD78OzCMa3o
         85.041750
        1035.014770
       0
        16341
       1.438326e+09
       21.086746
    
    
      4
       K3ZtByPHGSFYNljDUEXp
       3133.671968
       49409.150011
       0
       784363
       1.214445e+14
       32.430479
    
  

5 rows × 7 columns



In [17]:

    
# Image row stats
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','tr_mean','tr_std','tr_min','tr_max','tr_total','tr_logtotal'], dtype=float64)
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','te_mean','te_std','te_min','te_max','te_total','te_logtotal'], dtype=float64)

X_train_image_asm_rowstats['filename'] = sorted_train_data_asm['filename']
for i in range(0,X_train_image_asm.shape[0]):
    X_train_image_asm_rowstats['tr_mean'][i] = X_train_image_asm.iloc[i,:].mean()
    X_train_image_asm_rowstats['tr_std'][i] = X_train_image_asm.iloc[i,:].std()
    X_train_image_asm_rowstats['tr_min'][i] = X_train_image_asm.iloc[i,:].min()
    X_train_image_asm_rowstats['tr_max'][i] = X_train_image_asm.iloc[i,:].max()
    

X_train_image_asm_rowstats['tr_total'] = X_train_image_asm_rowstats['tr_max'] * X_train_image_asm_rowstats['tr_mean'] * X_train_image_asm_rowstats['tr_std']
X_train_image_asm_rowstats['tr_logtotal'] = np.log(X_train_image_asm_rowstats['tr_total']) # natural logarithm

X_test_image_asm_rowstats['filename'] = sorted_test_data_asm['filename']
for i in range(0,X_test_image_asm.shape[0]):
    X_test_image_asm_rowstats['te_mean'][i] = X_test_image_asm.iloc[i,:].mean()
    X_test_image_asm_rowstats['te_std'][i] = X_test_image_asm.iloc[i,:].std()
    X_test_image_asm_rowstats['te_min'][i] = X_test_image_asm.iloc[i,:].min()
    X_test_image_asm_rowstats['te_max'][i] = X_test_image_asm.iloc[i,:].max()
    

X_test_image_asm_rowstats['te_total'] = X_test_image_asm_rowstats['te_max'] * X_test_image_asm_rowstats['te_mean'] * X_test_image_asm_rowstats['te_std']
X_test_image_asm_rowstats['te_logtotal'] = np.log(X_test_image_asm_rowstats['te_total']) # natural logarithm

X_train_image_asm_rowstats.head()









    Out[17]:






  
    
      
      filename
      tr_mean
      tr_std
      tr_min
      tr_max
      tr_total
      tr_logtotal
    
  
  
    
      0
       01IsoiSMh5gxyDYTl4CB
       55.516
       37.775597
       9
       124
       260046.605955
       12.468616
    
    
      1
       01SuzwMJEIXsK7A8dQbl
       49.790
       33.782605
       9
       124
       208572.454549
       12.248042
    
    
      2
       01azqd4InC7m9JpocGv5
       49.790
       33.782605
       9
       124
       208572.454549
       12.248042
    
    
      3
       01jsnpXSAlgw6aPeDxrU
       49.790
       33.782605
       9
       124
       208572.454549
       12.248042
    
    
      4
       01kcPWA9K2BOxQeS5Rju
       52.780
       36.023534
       9
       124
       235763.942362
       12.370586
    
  

5 rows × 7 columns



In [18]:

    
X_test_image_asm_rowstats.head()









    Out[18]:






  
    
      
      filename
      te_mean
      te_std
      te_min
      te_max
      te_total
      te_logtotal
    
  
  
    
      0
       ITSUPtCmh7WdJcsYDwQ5
       50.432
       34.630506
       9
       124
       216564.221800
       12.285642
    
    
      1
       Ig2DB5tSiEy1cJvV0zdw
       50.240
       34.704686
       9
       124
       216201.864665
       12.283968
    
    
      2
       Jmo6eIhLZ4t9r8QsxEg5
       52.826
       36.020698
       9
       124
       235950.846586
       12.371379
    
    
      3
       JtPFl4ewgdD78OzCMa3o
       52.826
       36.020698
       9
       124
       235950.846586
       12.371379
    
    
      4
       K3ZtByPHGSFYNljDUEXp
       50.240
       34.704686
       9
       124
       216201.864665
       12.283968
    
  

5 rows × 7 columns



In [19]:

    
# Write column stats and row stats to file

X_train_rowstats.to_csv('data/train-asm-rowstats-50percent.csv', index=False)
X_test_rowstats.to_csv('data/test-asm-rowstats-50percent.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/train-image-asm-rowstats-50percent.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/test-image-asm-rowstats-50percent.csv', index=False)



In [20]:

    
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

# Result is better without the image rowstats for all image features
#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')

combined_train_data.to_csv('data/final-combined-train-data-50percent.csv', index=False)

combined_train_data.head()









    Out[20]:






  
    
      
      filename
      edx
      esi
      es
      fs
      ds
      ss
      gs
      cs
      ah
      al
      ax
      bh
      bl
      bx
      ch
      cl
      cx
      dh
      dl
      
    
  
  
    
      0
       01IsoiSMh5gxyDYTl4CB
        750
        496
       3
       0
       0
       0
       0
       0
       8
       224
       49
       34
       25
       0
       41
       191
       52
       38
       163
      ...
    
    
      1
       01SuzwMJEIXsK7A8dQbl
       1121
         24
       3
       0
       1
       4
       0
       2
       6
        22
        7
        1
        4
       0
        3
        37
        2
        4
         9
      ...
    
    
      2
       01azqd4InC7m9JpocGv5
       1493
       1900
       0
       0
       0
       0
       0
       0
       1
       398
        0
        0
       47
       0
        1
        77
        4
        1
        56
      ...
    
    
      3
       01jsnpXSAlgw6aPeDxrU
        525
          4
       0
       0
       0
       0
       0
       0
       0
         0
        0
        0
        1
       0
        0
         1
        2
        0
         0
      ...
    
    
      4
       01kcPWA9K2BOxQeS5Rju
         23
         35
       0
       0
       0
       0
       0
       0
       0
         3
        0
        0
        1
       0
        0
         1
        0
        0
         0
      ...
    
  

5 rows × 1024 columns



In [21]:

    
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')

combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')

combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')

combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')

# Result is better without the image rowstats for all image features
#combined_test_data = combined_test_data.merge(all_test_image_asm_rowstats, on='filename')

combined_test_data.to_csv('data/final-combined-test-data-50percent.csv', index=False)

combined_test_data.head()









    Out[21]:






  
    
      
      filename
      edx
      esi
      es
      fs
      ds
      ss
      gs
      cs
      ah
      al
      ax
      bh
      bl
      bx
      ch
      cl
      cx
      dh
      dl
      
    
  
  
    
      0
       ITSUPtCmh7WdJcsYDwQ5
       245
       434
       0
       0
       1
       0
       0
       0
        9
       51
       1
       0
       4
       1
       3
       1
       3
       0
       1
      ...
    
    
      1
       Ig2DB5tSiEy1cJvV0zdw
       258
       437
       0
       0
       0
       0
       0
       0
       11
       60
       1
       1
       2
       2
       0
       1
       2
       2
       1
      ...
    
    
      2
       Jmo6eIhLZ4t9r8QsxEg5
       238
       365
       0
       0
       0
       0
       0
       0
        8
       51
       0
       0
       4
       1
       0
       1
       2
       1
       0
      ...
    
    
      3
       JtPFl4ewgdD78OzCMa3o
       241
       556
       1
       0
       1
       1
       0
       0
        4
       52
       0
       1
       0
       1
       2
       2
       2
       0
       1
      ...
    
    
      4
       K3ZtByPHGSFYNljDUEXp
        92
        75
       0
       0
       0
       0
       0
       0
        0
        1
       0
       0
       0
       0
       0
       2
       0
       0
       0
      ...
    
  

5 rows × 1024 columns

4. Perform Some Classification Tests



In [22]:

    
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class, intergers in [0, n_classes - 1)
    y_pred : array, shape = [n_samples, n_classes]

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss



In [23]:

    
def run_cv(X,y, clf):

    # Construct a kfolds object
    kf = KFold(len(y),n_folds=10,shuffle=True)
    y_prob = np.zeros((len(y),9))
    y_pred = np.zeros(len(y))
    
    # Iterate through folds
    for train_index, test_index in kf:
        print(test_index, train_index)
        X_train = X.loc[train_index,:]
        X_test = X.loc[test_index,:]
        y_train = y[train_index]

        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    
    return y_prob, y_pred



In [24]:

    
# Set our X,y for the classifiers
X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1



In [25]:

    
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)









    



(array([    3,     4,     7, ..., 10843, 10846, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([    0,    19,    22, ..., 10848, 10856, 10867]), array([    1,     2,     3, ..., 10864, 10865, 10866]))
(array([   12,    20,    28, ..., 10803, 10817, 10845]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   15,    71,    75, ..., 10851, 10864, 10866]), array([    0,     1,     2, ..., 10863, 10865, 10867]))
(array([   10,    43,    53, ..., 10852, 10853, 10857]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    5,     8,    48, ..., 10837, 10860, 10861]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   14,    25,    31, ..., 10818, 10836, 10854]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    18,    23, ..., 10840, 10862, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,    16,    17, ..., 10844, 10855, 10859]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
(array([    1,     9,    11, ..., 10832, 10849, 10858]), array([    0,     2,     3, ..., 10865, 10866, 10867]))
logloss = 0.0134
multiclass logloss = 0.0134
score = 0.9973
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2941    0    0    0    1    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   3    0    0    0   38    1    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   2    0    0    0    0    0  396    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    9 1004]]



In [26]:

    
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)









    



(array([   10,    35,    46, ..., 10852, 10855, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    3,     8,     9, ..., 10840, 10860, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([   26,    27,    34, ..., 10801, 10808, 10814]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,     5,    14, ..., 10861, 10866, 10867]), array([    0,     2,     3, ..., 10863, 10864, 10865]))
(array([   12,    41,    49, ..., 10845, 10854, 10858]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,    16,    19, ..., 10831, 10857, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   11,    13,    42, ..., 10846, 10847, 10848]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    20,    22, ..., 10836, 10839, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     4,    15, ..., 10833, 10853, 10862]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([    2,    28,    31, ..., 10844, 10849, 10851]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
logloss = 0.0135
multiclass logloss = 0.0135
score = 0.9976
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   3    0    0    0   38    1    0    0    0]
 [   4    0    0    0    0  747    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    9 1004]]



In [25]:

    
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)









    



(array([   14,    22,    25, ..., 10840, 10859, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,     9,    36, ..., 10826, 10850, 10861]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,     2,     6, ..., 10856, 10858, 10867]), array([    0,     3,     4, ..., 10864, 10865, 10866]))
(array([   10,    11,    15, ..., 10831, 10841, 10844]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    5,    16,    23, ..., 10838, 10863, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,    13,    20, ..., 10848, 10853, 10860]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([    4,     8,    44, ..., 10847, 10852, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   17,    24,    27, ..., 10855, 10857, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([   12,    18,    32, ..., 10830, 10833, 10849]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    3,    51,    60, ..., 10793, 10798, 10814]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
logloss = 0.0133
multiclass logloss = 0.0133
score = 0.9978
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   2    0    0    0   38    2    0    0    0]
 [   3    0    0    0    0  748    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    8 1005]]



In [24]:

    
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)









    



(array([    8,    33,    41, ..., 10830, 10838, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     3,    18, ..., 10844, 10858, 10865]), array([    1,     2,     4, ..., 10864, 10866, 10867]))
(array([    9,    14,    27, ..., 10854, 10855, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   26,    39,    40, ..., 10829, 10853, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,    12,    31, ..., 10842, 10863, 10867]), array([    0,     2,     3, ..., 10864, 10865, 10866]))
(array([   20,    28,    32, ..., 10823, 10836, 10852]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    11,    19, ..., 10848, 10849, 10851]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    4,     5,    10, ..., 10850, 10857, 10859]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   21,    37,    72, ..., 10860, 10861, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,     7,    13, ..., 10805, 10825, 10845]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
logloss = 0.0140
multiclass logloss = 0.0140
score = 0.9977
[[1541    0    0    0    0    0    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   5    0    0    0   37    0    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1227    1]
 [   0    0    0    0    0    0    0    9 1004]]

5. Test/Experimental Code Only



In [ ]:

    
combined_train_data['class'] = sorted_train_labels.iloc[:,1]

class1 = X[combined_train_data['class'] == 1]
class2 = X[combined_train_data['class'] == 2]
class3 = X[combined_train_data['class'] == 3]
class4 = X[combined_train_data['class'] == 4]
class5 = X[combined_train_data['class'] == 5]
class6 = X[combined_train_data['class'] == 6]
class7 = X[combined_train_data['class'] == 7]
class8 = X[combined_train_data['class'] == 8]
class9 = X[combined_train_data['class'] == 9]

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns)

class_stats['mean'][1] = 1.0
class_stats.head()

classx = X[combined_train_data['class'] == i]
classxmean = classx.mean()
classxmean.head()

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
  classx = X[combined_train_data['class'] == i]
  class_stats['mean'][i] = classx.mean().sum()
  class_stats['std'][i] = classx.std().sum()
  #class_stats['corr'][i] = classx.corr().sum()
  #class_stats['cov'][i] = classx.cov().sum()

class_stats.head()

plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[ 1,2,3,4,5,6,7,8,9 ], cmap='brg')

	filename	edx	esi	es	ds	ss	cs	ah	al	ax	bh	bl	ch	cl	cx	dh	dl
0	01IsoiSMh5gxyDYTl4CB	750	496	3	0	0	0	8	224	49	34	25	41	191	52	38	163	...
1	01SuzwMJEIXsK7A8dQbl	1121	24	3	1	4	2	6	22	7	1	4	3	37	2	4	9	...
2	01azqd4InC7m9JpocGv5	1493	1900	0	0	0	0	1	398	0	0	47	1	77	4	1	56	...
3	01jsnpXSAlgw6aPeDxrU	525	4	0	0	0	0	0	0	0	0	1	0	1	2	0	0	...
4	01kcPWA9K2BOxQeS5Rju	23	35	0	0	0	0	0	3	0	0	1	0	1	0	0	0	...

	filename	ASM_1	ASM_3	ASM_4	ASM_14	ASM_20	ASM_21	ASM_23	ASM_24	ASM_25	ASM_26	ASM_28	ASM_29	ASM_31	ASM_32	ASM_33	ASM_34	ASM_40	ASM_41	ASM_42
0	ITSUPtCmh7WdJcsYDwQ5	69	68	69	48	9	9	13	10	72	69	68	69	58	48	48	52	9	9	9	...
1	Ig2DB5tSiEy1cJvV0zdw	69	68	69	48	9	9	13	10	72	69	68	69	58	48	48	52	9	9	9	...
2	Jmo6eIhLZ4t9r8QsxEg5	69	68	69	48	9	9	13	10	72	69	68	69	58	48	48	52	9	9	9	...
3	JtPFl4ewgdD78OzCMa3o	69	68	69	48	9	9	13	10	72	69	68	69	58	48	48	52	9	9	9	...
4	K3ZtByPHGSFYNljDUEXp	69	68	69	48	9	9	13	10	72	69	68	69	58	48	48	52	9	9	9	...

	filename	entropy	filesize
0	01IsoiSMh5gxyDYTl4CB	0.614952	6874624
1	01SuzwMJEIXsK7A8dQbl	0.843262	460288
2	01azqd4InC7m9JpocGv5	0.703961	5256192
3	01jsnpXSAlgw6aPeDxrU	0.806035	4825600
4	01kcPWA9K2BOxQeS5Rju	0.871610	712704

	filename	mean	std	max	total	logtotal
0	01IsoiSMh5gxyDYTl4CB	648.620278	6811.551280	87555	3.868276e+11	26.681245
1	01SuzwMJEIXsK7A8dQbl	68.274354	462.249118	5817	1.835831e+08	19.028178
2	01azqd4InC7m9JpocGv5	5550.391650	86112.319736	1367070	6.534008e+14	34.113212
3	01jsnpXSAlgw6aPeDxrU	284.071571	4153.720695	65928	7.779200e+10	25.077304
4	01kcPWA9K2BOxQeS5Rju	4.930417	26.064759	445	5.718701e+04	10.954082

	filename	e_mean	e_std	e_max	e_total	e_logtotal
0	ITSUPtCmh7WdJcsYDwQ5	144.286282	1977.507215	31361	8.948145e+09	22.914712
1	Ig2DB5tSiEy1cJvV0zdw	961.739563	14923.803067	236923	3.400511e+12	28.854947
2	Jmo6eIhLZ4t9r8QsxEg5	1039.353877	16206.598011	257289	4.333876e+12	29.097483
3	JtPFl4ewgdD78OzCMa3o	85.041750	1035.014770	16341	1.438326e+09	21.086746
4	K3ZtByPHGSFYNljDUEXp	3133.671968	49409.150011	784363	1.214445e+14	32.430479

	filename	tr_mean	tr_std	tr_min	tr_max	tr_total	tr_logtotal
0	01IsoiSMh5gxyDYTl4CB	55.516	37.775597	9	124	260046.605955	12.468616
1	01SuzwMJEIXsK7A8dQbl	49.790	33.782605	9	124	208572.454549	12.248042
2	01azqd4InC7m9JpocGv5	49.790	33.782605	9	124	208572.454549	12.248042
3	01jsnpXSAlgw6aPeDxrU	49.790	33.782605	9	124	208572.454549	12.248042
4	01kcPWA9K2BOxQeS5Rju	52.780	36.023534	9	124	235763.942362	12.370586

	filename	te_mean	te_std	te_min	te_max	te_total	te_logtotal
0	ITSUPtCmh7WdJcsYDwQ5	50.432	34.630506	9	124	216564.221800	12.285642
1	Ig2DB5tSiEy1cJvV0zdw	50.240	34.704686	9	124	216201.864665	12.283968
2	Jmo6eIhLZ4t9r8QsxEg5	52.826	36.020698	9	124	235950.846586	12.371379
3	JtPFl4ewgdD78OzCMa3o	52.826	36.020698	9	124	235950.846586	12.371379
4	K3ZtByPHGSFYNljDUEXp	50.240	34.704686	9	124	216201.864665	12.283968