1. Final Selection of Features in the 40 Percent Best Feature Set


In [ ]:
# Import everything we might need up front; it saves time later.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# cross_validation and grid_search were removed from scikit-learn; model_selection replaces both.
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

2. Load the Sorted Training and Test Data Features

- sorted-train-malware-features-asm-40percent.csv
- sorted-train-malware-features-byte.csv
- sorted-test-malware-features-asm-40percent.csv
- sorted-test-malware-features-byte.csv
- sorted-train-labels.csv
- sorted-train-image-features-asm-40percent.csv
- sorted-test-image-features-asm-40percent.csv
- all-train-asm-rowstats.csv and all-test-asm-rowstats.csv

In [3]:
# First load the .asm and .byte training/test data and training labels
sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-40percent.csv')
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_test_data_asm = pd.read_csv('data/sorted-test-malware-features-asm-40percent.csv')
sorted_test_data_byte = pd.read_csv('data/sorted-test-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
#combined_train_data = pd.read_csv('data/sorted-train-features-combined.csv')

# Load the image data for training data
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm-40percent.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm-40percent.csv')
#sorted_train_image_byte = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')

# Now load the row statistics for the original feature set; these will be combined with the new features.
all_train_rowstats = pd.read_csv('data/all-train-asm-rowstats.csv')
all_test_rowstats = pd.read_csv('data/all-test-asm-rowstats.csv')
#all_train_image_asm_rowstats = pd.read_csv('data/all-train-image-asm-rowstats.csv')
#all_test_image_asm_rowstats = pd.read_csv('data/all-test-image-asm-rowstats.csv')

In [4]:
sorted_train_data_asm.head()


Out[4]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 403 columns


In [4]:
sorted_train_labels.head()


Out[4]:
Id Class
0 01IsoiSMh5gxyDYTl4CB 2
1 01SuzwMJEIXsK7A8dQbl 8
2 01azqd4InC7m9JpocGv5 9
3 01jsnpXSAlgw6aPeDxrU 9
4 01kcPWA9K2BOxQeS5Rju 1

5 rows × 2 columns


In [5]:
sorted_train_image_asm.head()


Out[5]:
filename ASM_14 ASM_20 ASM_21 ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ASM_40 ASM_41 ASM_42 ASM_43 ASM_47 ASM_124 ASM_125 ASM_135 ASM_137 ASM_138
0 01IsoiSMh5gxyDYTl4CB 9 32 32 13 10 116 101 120 116 49 48 48 48 9 45 45 10 116 101 ...
1 01SuzwMJEIXsK7A8dQbl 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
2 01azqd4InC7m9JpocGv5 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
3 01jsnpXSAlgw6aPeDxrU 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
4 01kcPWA9K2BOxQeS5Rju 48 9 9 68 69 58 49 48 48 9 9 9 9 59 13 10 48 48 48 ...

5 rows × 401 columns


In [6]:
sorted_test_image_asm.head()


Out[6]:
filename ASM_14 ASM_20 ASM_21 ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ASM_40 ASM_41 ASM_42 ASM_43 ASM_47 ASM_124 ASM_125 ASM_135 ASM_137 ASM_138
0 ITSUPtCmh7WdJcsYDwQ5 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
1 Ig2DB5tSiEy1cJvV0zdw 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
2 Jmo6eIhLZ4t9r8QsxEg5 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
3 JtPFl4ewgdD78OzCMa3o 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
4 K3ZtByPHGSFYNljDUEXp 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...

5 rows × 401 columns


In [7]:
sorted_train_data_byte.head()


Out[7]:
filename entropy filesize
0 01IsoiSMh5gxyDYTl4CB 0.614952 6874624
1 01SuzwMJEIXsK7A8dQbl 0.843262 460288
2 01azqd4InC7m9JpocGv5 0.703961 5256192
3 01jsnpXSAlgw6aPeDxrU 0.806035 4825600
4 01kcPWA9K2BOxQeS5Rju 0.871610 712704

5 rows × 3 columns
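
The `entropy` column lies in [0, 1]. One plausible way such a feature is computed upstream (an assumption about the extraction pipeline, not code from this notebook) is Shannon entropy of the byte histogram, normalized by 8 bits:

In [ ]:
def byte_entropy(data):
    # Shannon entropy of the byte-value histogram, scaled to [0, 1] by dividing by 8 bits.
    counts = np.bincount(np.frombuffer(data, dtype=np.uint8), minlength=256)
    p = counts[counts > 0] / float(len(data))
    return -(p * np.log2(p)).sum() / 8.0

byte_entropy(b'hello world')  # toy input; the real features came from the .bytes files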


In [7]:
sorted_test_data_asm.head()


Out[7]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 0 4 1 3 1 3 0 1 2 553 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 3 554 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 0 519 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 1 0 1 2 2 2 0 1 2 668 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 402 ...

5 rows × 403 columns


In [8]:
# Assign asm data to X,y for brevity.
X = sorted_train_data_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]
# X_image_byte = sorted_train_image_byte.iloc[:,1:]

In [9]:
X_train_image_asm.shape


Out[9]:
(10868, 400)

In [10]:
X_test_image_asm.shape


Out[10]:
(10873, 400)

3. Perform Some Basic Statistical Analysis on the Feature Sets

Find the following (a short sketch of the pandas axis convention follows the list):
- Feature and row mean
- Feature and row standard deviation
- Feature and row correlation coefficient
- Feature and row covariance
- Feature and row minimum and maximum
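
All of the feature-level statistics below reduce over rows (pandas `axis=0`, the default), while the row statistics reduce over columns (`axis=1`). A minimal illustration on a toy frame (hypothetical values):

In [ ]:
toy = pd.DataFrame({'eax': [1, 2, 3], 'ebx': [4, 5, 6]})
print(toy.mean())        # feature (column) means: one value per register
print(toy.mean(axis=1))  # row means: one value per sample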

In [11]:
# Train feature stats
X_means = X.mean()
X_std = X.std()
X_cor = X.corr()
X_cov = X.cov()

# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()

# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()

# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()

# Not using byte image features
#X_image_byte_means = X_image_byte.mean()
#X_image_byte_std = X_image_byte.std()
#X_image_byte_cor = X_image_byte.corr()
#X_image_byte_cov = X_image_byte.cov()

In [12]:
X_train_image_asm_means.head()


Out[12]:
ASM_14    39.147405
ASM_20    14.223040
ASM_21    14.223040
ASM_28    55.418936
ASM_29    55.515366
dtype: float64

In [13]:
X_train_image_asm_std.head()


Out[13]:
ASM_14    16.344400
ASM_20     9.636302
ASM_21     9.636302
ASM_28    23.048969
ASM_29    24.740569
dtype: float64

In [14]:
X_test_image_asm_means.head()


Out[14]:
ASM_14    39.151936
ASM_20    14.224133
ASM_21    14.220638
ASM_28    55.430608
ASM_29    55.529937
dtype: float64

In [15]:
# The byte image data has low variance in both its means and standard deviations and is not
# very useful for learning, so it is excluded from any further analysis.
X_test_image_asm_std.head()


Out[15]:
ASM_14    16.335403
ASM_20     9.648070
ASM_21     9.634737
ASM_28    23.041988
ASM_29    24.736962
dtype: float64

In [ ]:
# Assorted interactive checks on the column statistics.
X_means.head()
X_std.head()
X_cor.head()
X_cov.head()

# Range of the feature means and standard deviations.
X_means.min(), X_means.max()
X_std.min(), X_std.max()

# Which features attain those extremes.
X_means[X_means == X_means.min()]
X_means[X_means == X_means.max()]
X_std[X_std == X_std.min()]
X_std[X_std == X_std.max()]
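
The boolean-mask lookups above can also be written with `idxmin`/`idxmax`, which return the feature names directly (equivalent, assuming the extremes are unique):

In [ ]:
# Same information as the masks above, one call each.
print(X_means.idxmin(), X_means.idxmax())
print(X_std.idxmin(), X_std.idxmax())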

In [16]:
# Row stats for train and test data
X_train_rowstats = pd.DataFrame(index=np.arange(X.shape[0]), columns=['filename','mean','std','min','max','total','logtotal'])
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','e_mean','e_std','e_min','e_max','e_total','e_logtotal'])

# Vectorized row reductions (axis=1) replace the per-row loop and avoid chained assignment.
X_train_rowstats['filename'] = sorted_train_data_asm['filename']
X_train_rowstats['mean'] = X.mean(axis=1)
X_train_rowstats['std'] = X.std(axis=1)
X_train_rowstats['min'] = X.min(axis=1)
X_train_rowstats['max'] = X.max(axis=1)
X_train_rowstats['total'] = X_train_rowstats['max'] * X_train_rowstats['mean'] * X_train_rowstats['std']
X_train_rowstats['logtotal'] = np.log(X_train_rowstats['total']) # natural logarithm

X_test_rowstats['filename'] = sorted_test_data_asm['filename']
X_test_rowstats['e_mean'] = X_test.mean(axis=1)
X_test_rowstats['e_std'] = X_test.std(axis=1)
X_test_rowstats['e_min'] = X_test.min(axis=1)
X_test_rowstats['e_max'] = X_test.max(axis=1)
X_test_rowstats['e_total'] = X_test_rowstats['e_max'] * X_test_rowstats['e_mean'] * X_test_rowstats['e_std']
X_test_rowstats['e_logtotal'] = np.log(X_test_rowstats['e_total']) # natural logarithm

X_train_rowstats.head()


Out[16]:
filename mean std min max total logtotal
0 01IsoiSMh5gxyDYTl4CB 811.335821 7612.563931 0 87555 5.407700e+11 27.016260
1 01SuzwMJEIXsK7A8dQbl 85.353234 515.786719 0 5817 2.560880e+08 19.361032
2 01azqd4InC7m9JpocGv5 6944.706468 96298.117166 0 1367070 9.142447e+14 34.449119
3 01jsnpXSAlgw6aPeDxrU 355.380597 4644.741791 0 65928 1.088241e+11 25.412999
4 01kcPWA9K2BOxQeS5Rju 6.146766 29.034778 0 445 7.941914e+04 11.282495

5 rows × 7 columns
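
One caveat: a sample whose features were all zero would give `total = 0` and `logtotal = -inf`. A minimal guard, assuming we wanted a similar scale (not applied here, since no such rows appear above), is `np.log1p`:

In [ ]:
# Hypothetical alternative: log1p maps a total of 0 to 0 instead of -inf.
safe_logtotal = np.log1p(X_train_rowstats['total'])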


In [17]:
X_test_rowstats.head()


Out[17]:
filename e_mean e_std e_min e_max e_total e_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 180.465174 2211.096404 0 31361 1.251385e+10 23.250102
1 Ig2DB5tSiEy1cJvV0zdw 1203.296020 16689.071688 0 236923 4.757862e+12 29.190820
2 Jmo6eIhLZ4t9r8QsxEg5 1300.333333 18123.699199 0 257289 6.063491e+12 29.433307
3 JtPFl4ewgdD78OzCMa3o 106.320896 1157.070503 0 16341 2.010282e+09 21.421541
4 K3ZtByPHGSFYNljDUEXp 3920.910448 55254.429136 0 784363 1.699304e+14 32.766410

5 rows × 7 columns


In [18]:
# Image row stats, computed the same vectorized way.
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','tr_mean','tr_std','tr_min','tr_max','tr_total','tr_logtotal'])
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','te_mean','te_std','te_min','te_max','te_total','te_logtotal'])

X_train_image_asm_rowstats['filename'] = sorted_train_data_asm['filename']
X_train_image_asm_rowstats['tr_mean'] = X_train_image_asm.mean(axis=1)
X_train_image_asm_rowstats['tr_std'] = X_train_image_asm.std(axis=1)
X_train_image_asm_rowstats['tr_min'] = X_train_image_asm.min(axis=1)
X_train_image_asm_rowstats['tr_max'] = X_train_image_asm.max(axis=1)
X_train_image_asm_rowstats['tr_total'] = X_train_image_asm_rowstats['tr_max'] * X_train_image_asm_rowstats['tr_mean'] * X_train_image_asm_rowstats['tr_std']
X_train_image_asm_rowstats['tr_logtotal'] = np.log(X_train_image_asm_rowstats['tr_total']) # natural logarithm

X_test_image_asm_rowstats['filename'] = sorted_test_data_asm['filename']
X_test_image_asm_rowstats['te_mean'] = X_test_image_asm.mean(axis=1)
X_test_image_asm_rowstats['te_std'] = X_test_image_asm.std(axis=1)
X_test_image_asm_rowstats['te_min'] = X_test_image_asm.min(axis=1)
X_test_image_asm_rowstats['te_max'] = X_test_image_asm.max(axis=1)
X_test_image_asm_rowstats['te_total'] = X_test_image_asm_rowstats['te_max'] * X_test_image_asm_rowstats['te_mean'] * X_test_image_asm_rowstats['te_std']
X_test_image_asm_rowstats['te_logtotal'] = np.log(X_test_image_asm_rowstats['te_total']) # natural logarithm

X_train_image_asm_rowstats.head()


Out[18]:
filename tr_mean tr_std tr_min tr_max tr_total tr_logtotal
0 01IsoiSMh5gxyDYTl4CB 56.7100 38.936693 9 124 273804.382950 12.520169
1 01SuzwMJEIXsK7A8dQbl 49.1675 35.480934 9 124 216319.094666 12.284510
2 01azqd4InC7m9JpocGv5 49.1675 35.480934 9 124 216319.094666 12.284510
3 01jsnpXSAlgw6aPeDxrU 49.1675 35.480934 9 124 216319.094666 12.284510
4 01kcPWA9K2BOxQeS5Rju 52.4775 37.763470 9 124 245734.831905 12.412008

5 rows × 7 columns


In [19]:
X_test_image_asm_rowstats.head()


Out[19]:
filename te_mean te_std te_min te_max te_total te_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 48.2375 36.535506 9 124 218535.302616 12.294703
1 Ig2DB5tSiEy1cJvV0zdw 48.1175 36.252162 9 124 216301.061581 12.284427
2 Jmo6eIhLZ4t9r8QsxEg5 52.5225 37.761081 9 124 245929.991034 12.412802
3 JtPFl4ewgdD78OzCMa3o 52.5225 37.761081 9 124 245929.991034 12.412802
4 K3ZtByPHGSFYNljDUEXp 48.1175 36.252162 9 124 216301.061581 12.284427

5 rows × 7 columns


In [20]:
# Write the row stats for the train/test data and image features to file.

X_train_rowstats.to_csv('data/train-asm-rowstats-40percent.csv', index=False)
X_test_rowstats.to_csv('data/test-asm-rowstats-40percent.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/train-image-asm-rowstats-40percent.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/test-image-asm-rowstats-40percent.csv', index=False)

In [21]:
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

# Results were better without the row stats computed over the full image feature set.
#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')

combined_train_data.to_csv('data/final-combined-train-data-40percent.csv', index=False)

combined_train_data.head()


Out[21]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 823 columns
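
A quick optional check on the merge chain, assuming `filename` is a unique key in every frame:

In [ ]:
# Each merge is one-to-one on 'filename', so the row count and key uniqueness should survive.
assert combined_train_data['filename'].is_unique
print(combined_train_data.shape)  # expect (10868, 823)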


In [22]:
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')

combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')

combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')

combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')

# Results were better without the row stats computed over the full image feature set.
#combined_test_data = combined_test_data.merge(all_test_image_asm_rowstats, on='filename')

combined_test_data.to_csv('data/final-combined-test-data-40percent.csv', index=False)

combined_test_data.head()


Out[22]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 0 4 1 3 1 3 0 1 2 553 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 3 554 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 0 519 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 1 0 1 2 2 2 0 1 2 668 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 402 ...

5 rows × 823 columns

4. Perform Some Classification Tests


In [23]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class labels, integers in [0, n_classes)
    y_pred : array, shape = [n_samples, n_classes]
            predicted class probabilities, one row per sample

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss
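
A minimal sanity check on toy values (hypothetical, not from the dataset): with 0-based labels and rows summing to 1, the custom metric should agree with sklearn's log_loss, as the matching scores in the runs below confirm.

In [ ]:
y_true_toy = np.array([0, 1, 2])
y_prob_toy = np.array([[0.8, 0.1, 0.1],
                       [0.2, 0.7, 0.1],
                       [0.1, 0.2, 0.7]])
print(multiclass_log_loss(y_true_toy, y_prob_toy))         # ~0.3122
print(log_loss(y_true_toy, y_prob_toy, labels=[0, 1, 2]))  # same value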

In [24]:
def run_cv(X, y, clf):

    # Construct a KFold object (model_selection API) for 10-fold CV.
    kf = KFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))

    # Iterate through the folds, collecting out-of-fold predictions.
    for train_index, test_index in kf.split(X):
        print(test_index, train_index)
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]

        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)

    return y_prob, y_pred
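
Note that `run_cv` uses an unstratified split; with classes as small as a few dozen samples (see the confusion matrices below), a stratified variant can give more stable per-class estimates. A sketch of the alternative, not what was run here:

In [ ]:
# Hypothetical variant: preserve class proportions in every fold.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True)
for train_index, test_index in skf.split(X, y):
    pass  # same per-fold fit/predict logic as run_cv above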

In [25]:
# Set our X,y for the classifiers
X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1   # shift the labels from 1-9 to 0-8 for the custom log-loss metric

In [26]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([   10,    35,    46, ..., 10852, 10855, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    3,     8,     9, ..., 10840, 10860, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([   26,    27,    34, ..., 10801, 10808, 10814]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,     5,    14, ..., 10861, 10866, 10867]), array([    0,     2,     3, ..., 10863, 10864, 10865]))
(array([   12,    41,    49, ..., 10845, 10854, 10858]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,    16,    19, ..., 10831, 10857, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   11,    13,    42, ..., 10846, 10847, 10848]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    20,    22, ..., 10836, 10839, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     4,    15, ..., 10833, 10853, 10862]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([    2,    28,    31, ..., 10844, 10849, 10851]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
logloss = 0.0135
multiclass logloss = 0.0135
score = 0.9976
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   3    0    0    0   38    1    0    0    0]
 [   4    0    0    0    0  747    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    9 1004]]

In [25]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([   14,    22,    25, ..., 10840, 10859, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,     9,    36, ..., 10826, 10850, 10861]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,     2,     6, ..., 10856, 10858, 10867]), array([    0,     3,     4, ..., 10864, 10865, 10866]))
(array([   10,    11,    15, ..., 10831, 10841, 10844]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    5,    16,    23, ..., 10838, 10863, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,    13,    20, ..., 10848, 10853, 10860]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([    4,     8,    44, ..., 10847, 10852, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   17,    24,    27, ..., 10855, 10857, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([   12,    18,    32, ..., 10830, 10833, 10849]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    3,    51,    60, ..., 10793, 10798, 10814]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
logloss = 0.0133
multiclass logloss = 0.0133
score = 0.9978
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   2    0    0    0   38    2    0    0    0]
 [   3    0    0    0    0  748    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    8 1005]]

In [24]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([    8,    33,    41, ..., 10830, 10838, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     3,    18, ..., 10844, 10858, 10865]), array([    1,     2,     4, ..., 10864, 10866, 10867]))
(array([    9,    14,    27, ..., 10854, 10855, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   26,    39,    40, ..., 10829, 10853, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,    12,    31, ..., 10842, 10863, 10867]), array([    0,     2,     3, ..., 10864, 10865, 10866]))
(array([   20,    28,    32, ..., 10823, 10836, 10852]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    11,    19, ..., 10848, 10849, 10851]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    4,     5,    10, ..., 10850, 10857, 10859]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   21,    37,    72, ..., 10860, 10861, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,     7,    13, ..., 10805, 10825, 10845]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
logloss = 0.0140
multiclass logloss = 0.0140
score = 0.9977
[[1541    0    0    0    0    0    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   5    0    0    0   37    0    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1227    1]
 [   0    0    0    0    0    0    0    9 1004]]
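
The three runs above score loglosses of 0.0135, 0.0133, and 0.0140, so there is visible run-to-run variance from the random fold assignment. One way to smooth this, sketched as an assumption rather than something done here, is to average the out-of-fold probabilities across repeats:

In [ ]:
# Hypothetical: average out-of-fold probabilities over several CV repeats.
probs = [run_cv(X, y, clf1)[0] for _ in range(3)]
p_avg = np.mean(probs, axis=0)
print("averaged logloss = {:.4f}".format(log_loss(y, p_avg)))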

5. Test/Experimental Code Only


In [ ]:
# Attach the class labels so we can slice the feature matrix per class.
combined_train_data['class'] = sorted_train_labels.iloc[:,1]

class1 = X[combined_train_data['class'] == 1]
class2 = X[combined_train_data['class'] == 2]
class3 = X[combined_train_data['class'] == 3]
class4 = X[combined_train_data['class'] == 4]
class5 = X[combined_train_data['class'] == 5]
class6 = X[combined_train_data['class'] == 6]
class7 = X[combined_train_data['class'] == 7]
class8 = X[combined_train_data['class'] == 8]
class9 = X[combined_train_data['class'] == 9]

# Aggregate per-class summary statistics (corr and cov left for later).
columns = ['mean','std','corr','cov']
index = [1,2,3,4,5,6,7,8,9]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
    classx = X[combined_train_data['class'] == i]
    class_stats.loc[i,'mean'] = classx.mean().sum()
    class_stats.loc[i,'std'] = classx.std().sum()
    #class_stats.loc[i,'corr'] = classx.corr().sum()
    #class_stats.loc[i,'cov'] = classx.cov().sum()

class_stats.head()

# Scatter the per-class mean/std aggregates, one colour per class.
plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[1,2,3,4,5,6,7,8,9], cmap='brg')