1. Analysis and Transformation of Features in the 20 Percent Best Feature Set


In [ ]:
# Import everything we might need up front; it saves time later.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
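# Note: cross_validation and grid_search are the pre-0.18 scikit-learn module
# paths; in 0.18+ these utilities live in sklearn.model_selection.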
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

2. Load the Sorted Training and Test Data Features

- sorted-train-malware-features-asm-reduced.csv
- sorted-train-malware-features-byte.csv
- sorted-train-labels.csv
- sorted-train-features-combined.csv

In [2]:
# First load the .asm and .byte training/test data and training labels
sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-20percent.csv')
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_test_data_asm = pd.read_csv('data/sorted-test-malware-features-asm-20percent.csv')
sorted_test_data_byte = pd.read_csv('data/sorted-test-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
#combined_train_data = pd.read_csv('data/sorted-train-features-combined.csv')

# Load the image data for training data
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm-20percent.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm-20percent.csv')
#sorted_train_image_byte = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')

# Now load the row statistics for the original feature set; these will be combined with the new features.
all_train_rowstats = pd.read_csv('data/all-train-asm-rowstats.csv')
all_test_rowstats = pd.read_csv('data/all-test-asm-rowstats.csv')
all_train_image_asm_rowstats = pd.read_csv('data/all-train-image-asm-rowstats.csv')
all_test_image_asm_rowstats = pd.read_csv('data/all-test-image-asm-rowstats.csv')

In [3]:
sorted_train_data_asm.head()


Out[3]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 202 columns


In [4]:
sorted_train_labels.head()


Out[4]:
Id Class
0 01IsoiSMh5gxyDYTl4CB 2
1 01SuzwMJEIXsK7A8dQbl 8
2 01azqd4InC7m9JpocGv5 9
3 01jsnpXSAlgw6aPeDxrU 9
4 01kcPWA9K2BOxQeS5Rju 1

5 rows × 2 columns


In [5]:
sorted_train_image_asm.head()


Out[5]:
filename ASM_33 ASM_40 ASM_41 ASM_42 ASM_43 ASM_125 ASM_137 ASM_139 ASM_140 ASM_141 ASM_142 ASM_143 ASM_144 ASM_145 ASM_146 ASM_147 ASM_150 ASM_154 ASM_155
0 01IsoiSMh5gxyDYTl4CB 120 49 48 48 48 45 116 120 116 58 48 48 52 48 49 48 9 9 9 ...
1 01SuzwMJEIXsK7A8dQbl 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
2 01azqd4InC7m9JpocGv5 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
3 01jsnpXSAlgw6aPeDxrU 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
4 01kcPWA9K2BOxQeS5Rju 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...

5 rows × 201 columns


In [6]:
sorted_test_image_asm.head()


Out[6]:
filename ASM_33 ASM_40 ASM_41 ASM_42 ASM_43 ASM_125 ASM_137 ASM_139 ASM_140 ASM_141 ASM_142 ASM_143 ASM_144 ASM_145 ASM_146 ASM_147 ASM_150 ASM_154 ASM_155
0 ITSUPtCmh7WdJcsYDwQ5 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
1 Ig2DB5tSiEy1cJvV0zdw 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
2 Jmo6eIhLZ4t9r8QsxEg5 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
3 JtPFl4ewgdD78OzCMa3o 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
4 K3ZtByPHGSFYNljDUEXp 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...

5 rows × 201 columns


In [7]:
sorted_train_data_byte.head()


Out[7]:
filename entropy filesize
0 01IsoiSMh5gxyDYTl4CB 0.614952 6874624
1 01SuzwMJEIXsK7A8dQbl 0.843262 460288
2 01azqd4InC7m9JpocGv5 0.703961 5256192
3 01jsnpXSAlgw6aPeDxrU 0.806035 4825600
4 01kcPWA9K2BOxQeS5Rju 0.871610 712704

5 rows × 3 columns


In [9]:
sorted_test_data_asm.head()


Out[9]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 0 4 1 3 1 3 0 1 2 553 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 3 554 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 0 519 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 1 0 1 2 2 2 0 1 2 668 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 402 ...

5 rows × 202 columns


In [8]:
# Assign the asm data to X and y for brevity.
X = sorted_train_data_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]
# X_image_byte = sorted_train_image_byte.iloc[:,1:]
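
Since the labels are joined to the features purely by row order, it is worth checking that the sorted files actually line up. A minimal sanity check, assuming the Id column of the labels file uses the same hashes as the filename column of the feature files:

In [ ]:
# X and y are aligned by row order only, so the sorted filename
# and Id columns must match exactly.
assert (sorted_train_data_asm['filename'] == sorted_train_labels['Id']).all()
assert (sorted_train_image_asm['filename'] == sorted_train_labels['Id']).all()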

In [10]:
X_train_image_asm.shape


Out[10]:
(10868, 200)

In [11]:
X_test_image_asm.shape


Out[11]:
(10873, 200)

3. Perform Some Basic Statistical Analysis on the Feature Sets

Find the following:
- Feature and row mean
- Feature and row standard deviation
- Feature and row correlation coefficient
- Feature and row covariance
- Feature and row minimum and maximum
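
Most of the per-feature summaries can also be pulled in a single call, as a cross-check on the cells below:

In [ ]:
# One-shot per-feature summary: count, mean, std, min, quartiles, max.
X.describe()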

In [12]:
# Train feature stats
X_means = X.mean()
X_std = X.std()
X_cor = X.corr()
X_cov = X.cov()

# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()

# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()

# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()

# Not using byte image features
#X_image_byte_means = X_image_byte.mean()
#X_image_byte_std = X_image_byte.std()
#X_image_byte_cor = X_image_byte.corr()
#X_image_byte_cov = X_image_byte.cov()
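
The correlation matrices are computed above but never inspected directly. One quick way to surface the most strongly correlated feature pairs (a sketch, assuming a pandas version with sort_values):

In [ ]:
# Rank feature pairs by correlation, using only the upper triangle of the
# train correlation matrix so each pair appears once.
upper = X_cor.where(np.triu(np.ones(X_cor.shape), k=1).astype(bool))
top_pairs = upper.stack().sort_values(ascending=False)
print(top_pairs.head(10))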

In [13]:
X_train_image_asm_means.head()


Out[13]:
ASM_33    64.914704
ASM_40    18.125322
ASM_41    17.866949
ASM_42    17.868237
ASM_43    17.857195
dtype: float64

In [14]:
X_train_image_asm_std.head()


Out[14]:
ASM_33    29.676660
ASM_40    16.799549
ASM_41    16.358924
ASM_42    16.362247
ASM_43    16.339508
dtype: float64

In [15]:
X_test_image_asm_means.head()


Out[15]:
ASM_33    64.949508
ASM_40    18.126000
ASM_41    17.852387
ASM_42    17.850915
ASM_43    17.853030
dtype: float64

In [16]:
# The byte image data showed low standard deviation and little variance in its
# means, so it is not very useful for learning and is excluded from further analysis.
X_test_image_asm_std.head()


Out[16]:
ASM_33    29.682114
ASM_40    16.784976
ASM_41    16.337162
ASM_42    16.335167
ASM_43    16.338488
dtype: float64

In [ ]:
# Quick look at the column statistics and their extremes.
print(X_means.head())
print(X_std.head())
print(X_cor.head())
print(X_cov.head())
print(X_means.min(), X_means.max())
print(X_std.min(), X_std.max())
# Which features sit at the extremes?
print(X_means[X_means == X_means.min()])
print(X_means[X_means == X_means.max()])
print(X_std[X_std == X_std.min()])
print(X_std[X_std == X_std.max()])

In [17]:
# Row stats for the train and test data, computed across each sample's features.
X_train_rowstats = pd.DataFrame(index=np.arange(X.shape[0]), columns=['filename','mean','std','min','max','total','logtotal'])
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','e_mean','e_std','e_min','e_max','e_total','e_logtotal'])

X_train_rowstats['filename'] = sorted_train_data_asm['filename']
X_train_rowstats['mean'] = X.mean(axis=1)
X_train_rowstats['std'] = X.std(axis=1)
X_train_rowstats['min'] = X.min(axis=1)
X_train_rowstats['max'] = X.max(axis=1)
X_train_rowstats['total'] = X_train_rowstats['max'] * X_train_rowstats['mean'] * X_train_rowstats['std']
X_train_rowstats['logtotal'] = np.log(X_train_rowstats['total'])  # natural logarithm; -inf if a row were all zeros

X_test_rowstats['filename'] = sorted_test_data_asm['filename']
X_test_rowstats['e_mean'] = X_test.mean(axis=1)
X_test_rowstats['e_std'] = X_test.std(axis=1)
X_test_rowstats['e_min'] = X_test.min(axis=1)
X_test_rowstats['e_max'] = X_test.max(axis=1)
X_test_rowstats['e_total'] = X_test_rowstats['e_max'] * X_test_rowstats['e_mean'] * X_test_rowstats['e_std']
X_test_rowstats['e_logtotal'] = np.log(X_test_rowstats['e_total'])  # natural logarithm

X_train_rowstats.head()


Out[17]:
filename mean std min max total logtotal
0 01IsoiSMh5gxyDYTl4CB 1620.895522 10717.960833 0 87555 1.521066e+12 28.050433
1 01SuzwMJEIXsK7A8dQbl 170.199005 720.368718 0 5817 7.131993e+08 20.385272
2 01azqd4InC7m9JpocGv5 13888.179104 136000.425110 0 1367070 2.582119e+15 35.487387
3 01jsnpXSAlgw6aPeDxrU 710.004975 6557.617262 0 65928 3.069569e+11 26.449973
4 01kcPWA9K2BOxQeS5Rju 12.099502 40.226360 0 445 2.165899e+05 12.285761

5 rows × 7 columns


In [18]:
X_test_rowstats.head()


Out[18]:
filename e_mean e_std e_min e_max e_total e_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 360.348259 3120.464387 0 31361 3.526400e+10 24.286128
1 Ig2DB5tSiEy1cJvV0zdw 2405.940299 23569.804426 0 236923 1.343532e+13 30.228908
2 Jmo6eIhLZ4t9r8QsxEg5 2600.109453 25596.553591 0 257289 1.712357e+13 30.471477
3 JtPFl4ewgdD78OzCMa3o 212.213930 1631.495862 0 16341 5.657682e+09 22.456280
4 K3ZtByPHGSFYNljDUEXp 7841.134328 78041.521891 0 784363 4.799784e+14 33.804762

5 rows × 7 columns


In [19]:
# Image row stats; the image feature files share the same sorted row order
# as the asm files, so the asm filename column is reused.
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','tr_mean','tr_std','tr_min','tr_max','tr_total','tr_logtotal'])
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','te_mean','te_std','te_min','te_max','te_total','te_logtotal'])

X_train_image_asm_rowstats['filename'] = sorted_train_data_asm['filename']
X_train_image_asm_rowstats['tr_mean'] = X_train_image_asm.mean(axis=1)
X_train_image_asm_rowstats['tr_std'] = X_train_image_asm.std(axis=1)
X_train_image_asm_rowstats['tr_min'] = X_train_image_asm.min(axis=1)
X_train_image_asm_rowstats['tr_max'] = X_train_image_asm.max(axis=1)
X_train_image_asm_rowstats['tr_total'] = X_train_image_asm_rowstats['tr_max'] * X_train_image_asm_rowstats['tr_mean'] * X_train_image_asm_rowstats['tr_std']
X_train_image_asm_rowstats['tr_logtotal'] = np.log(X_train_image_asm_rowstats['tr_total'])  # natural logarithm

X_test_image_asm_rowstats['filename'] = sorted_test_data_asm['filename']
X_test_image_asm_rowstats['te_mean'] = X_test_image_asm.mean(axis=1)
X_test_image_asm_rowstats['te_std'] = X_test_image_asm.std(axis=1)
X_test_image_asm_rowstats['te_min'] = X_test_image_asm.min(axis=1)
X_test_image_asm_rowstats['te_max'] = X_test_image_asm.max(axis=1)
X_test_image_asm_rowstats['te_total'] = X_test_image_asm_rowstats['te_max'] * X_test_image_asm_rowstats['te_mean'] * X_test_image_asm_rowstats['te_std']
X_test_image_asm_rowstats['te_logtotal'] = np.log(X_test_image_asm_rowstats['te_total'])  # natural logarithm

X_train_image_asm_rowstats.head()


Out[19]:
filename tr_mean tr_std tr_min tr_max tr_total tr_logtotal
0 01IsoiSMh5gxyDYTl4CB 65.27 41.652271 9 124 337111.820237 12.72817
1 01SuzwMJEIXsK7A8dQbl 45.74 38.597815 9 124 218917.545703 12.29645
2 01azqd4InC7m9JpocGv5 45.74 38.597815 9 124 218917.545703 12.29645
3 01jsnpXSAlgw6aPeDxrU 45.74 38.597815 9 124 218917.545703 12.29645
4 01kcPWA9K2BOxQeS5Rju 46.65 41.145135 9 124 238008.146261 12.38006

5 rows × 7 columns


In [20]:
X_test_image_asm_rowstats.head()


Out[20]:
filename te_mean te_std te_min te_max te_total te_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 39.830 37.757506 9 124 186481.300676 12.136086
1 Ig2DB5tSiEy1cJvV0zdw 39.285 37.532482 9 124 182833.481254 12.116331
2 Jmo6eIhLZ4t9r8QsxEg5 46.660 41.146318 9 124 238066.013418 12.380303
3 JtPFl4ewgdD78OzCMa3o 46.660 41.146318 9 124 238066.013418 12.380303
4 K3ZtByPHGSFYNljDUEXp 39.285 37.532482 9 124 182833.481254 12.116331

5 rows × 7 columns


In [21]:
# Write column stats and row stats to file

X_train_rowstats.to_csv('data/train-asm-rowstats-20percent.csv', index=False)
X_test_rowstats.to_csv('data/test-asm-rowstats-20percent.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/train-image-asm-rowstats-20percent.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/test-image-asm-rowstats-20percent.csv', index=False)

In [22]:
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

# Result is better without the image rowstats for all image features
#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')

combined_train_data.to_csv('data/all-combined-train-data-20percent.csv', index=False)

combined_train_data.head()


Out[22]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 428 columns
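
The chain of pairwise merges above can also be collapsed with functools.reduce; a compact sketch (it omits the explicit suffixes, which pandas would fill in as _x/_y on any colliding column names):

In [ ]:
# Compact alternative to the chained merges above (suffix handling omitted).
from functools import reduce
frames = [sorted_train_data_asm, sorted_train_data_byte, X_train_rowstats,
          X_train_image_asm_rowstats, sorted_train_image_asm, all_train_rowstats]
combined_alt = reduce(lambda left, right: left.merge(right, on='filename'), frames)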


In [23]:
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')

combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')

combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')

combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')

# Result is better without the image rowstats for all image features
#combined_test_data = combined_test_data.merge(all_test_image_asm_rowstats, on='filename')

combined_test_data.to_csv('data/all-combined-test-data-20percent.csv', index=False)

combined_test_data.head()


Out[23]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 0 4 1 3 1 3 0 1 2 553 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 3 554 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 0 519 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 1 0 1 2 2 2 0 1 2 668 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 402 ...

5 rows × 428 columns


In [ ]:
# Do some plotting
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel('logtotal')
plt.scatter(X_train_rowstats['mean'], X_train_rowstats['logtotal'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("total")
plt.ylabel('class')
plt.scatter(Xrowstats['logtotal'], y, c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel("std")
plt.scatter(X_train_rowstats['mean'], X_train_rowstats['std'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel('class')
plt.scatter(X_train_rowstats['mean'], y, c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("standard deviation")
plt.ylabel('class')
plt.scatter(X_train_rowstats['std'], y, c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("max")
plt.ylabel('class')
plt.scatter(X_train_rowstats['max'], y, c=y, cmap='brg')

In [ ]:
# pairplot needs the class labels as a column for hue; work on a copy so the
# rowstats frame itself stays label-free for the later merges.
rowstats_plot = X_train_rowstats.copy()
rowstats_plot['class'] = y
sns.pairplot(rowstats_plot, vars=['mean','std'], kind='reg', hue='class', size=6)

In [ ]:
sns.pairplot(rowstats_plot, vars=['min','max'], kind='reg', hue='class', size=6)

In [ ]:
sns.pairplot(rowstats_plot, vars=['mean','std','min','max'], kind='reg', hue='class', size=4)

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(X_means, X_std)

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel("malware class")
ya = range(0,101)
plt.scatter(X_means, ya, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("fword")
plt.ylabel("db")
plt.scatter(X['fword'], X['db'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("WinMain")
plt.ylabel("db.1")
plt.scatter(X['WinMain'], X['db.1'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("X_image_asm")
plt.ylabel("Malware Class")
plt.scatter(X_image_asm.iloc[:,5], y, c=y, cmap='brg')

In [ ]:
#sns.set()
#sns.set_context("paper")
# Attach the class labels to a small copy so hue works without contaminating
# combined_train_data before the classification runs below.
ctd_plot = combined_train_data[['fword','db']].copy()
ctd_plot['class'] = y
sns.pairplot(ctd_plot, vars=['fword','db'], kind='reg', hue='class', size=6)


4. Perform Some Classification Tests


In [3]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class, integers in [0, n_classes)
    y_pred : array, shape = [n_samples, n_classes]
            predicted probability for each class

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss
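
As a quick sanity check on this implementation: on a toy 3-class problem where the true class always receives probability 0.8, the loss should come out to -ln(0.8) ≈ 0.2231.

In [ ]:
# Toy check of multiclass_log_loss: each row gives the true class 0.8,
# so the expected loss is -log(0.8) ~= 0.2231.
y_true_toy = np.array([0, 1, 2])
y_pred_toy = np.array([[0.8, 0.1, 0.1],
                       [0.1, 0.8, 0.1],
                       [0.1, 0.1, 0.8]])
print(multiclass_log_loss(y_true_toy, y_pred_toy))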

In [4]:
def run_cv(X,y, clf):

    # Construct a kfolds object
    kf = KFold(len(y),n_folds=10,shuffle=True)
    y_prob = np.zeros((len(y),9))
    y_pred = np.zeros(len(y))
    
    # Iterate through folds
    for train_index, test_index in kf:
        print(test_index, train_index)  # show each fold's test/train index split
        X_train = X.loc[train_index,:]
        X_test = X.loc[test_index,:]
        y_train = y[train_index]

        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    
    return y_prob, y_pred
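
On scikit-learn 0.18 and later the same out-of-fold probabilities can be produced with model_selection.cross_val_predict. A rough equivalent sketch, not used in this notebook (KFold is aliased to avoid clashing with the cross_validation import above):

In [ ]:
# Sketch of run_cv on the newer model_selection API (scikit-learn >= 0.18).
from sklearn.model_selection import KFold as MSKFold, cross_val_predict

def run_cv_new(X, y, clf):
    kf = MSKFold(n_splits=10, shuffle=True, random_state=0)
    # Out-of-fold class probabilities; columns follow the sorted class labels.
    y_prob = cross_val_predict(clf, X, y, cv=kf, method='predict_proba')
    # Recover hard predictions from the probabilities rather than refitting.
    y_pred = np.unique(y)[np.argmax(y_prob, axis=1)]
    return y_prob, y_pred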

In [27]:
X = combined_train_data.iloc[:,1:]

In [28]:
X.head()


Out[28]:
edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax ebp
0 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 905 ...
1 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 1544 ...
2 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 591 ...
3 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 451 ...
4 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 43 ...

5 rows × 427 columns


In [29]:
X.shape


Out[29]:
(10868, 427)


In [30]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
yloss = y - 1  # shift the 1-9 class labels to 0-8 for multiclass_log_loss
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([    2,     4,    15, ..., 10845, 10863, 10866]), array([    0,     1,     3, ..., 10864, 10865, 10867]))
(array([    3,     8,    27, ..., 10832, 10860, 10867]), array([    0,     1,     2, ..., 10864, 10865, 10866]))
(array([   46,    55,    71, ..., 10854, 10859, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   11,    13,    16, ..., 10837, 10839, 10848]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   18,    21,    30, ..., 10821, 10830, 10844]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,    12,    41, ..., 10853, 10855, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([    5,     6,     9, ..., 10849, 10851, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,    26,    28, ..., 10846, 10847, 10850]), array([    0,     2,     3, ..., 10865, 10866, 10867]))
(array([   14,    32,    33, ..., 10840, 10857, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,    20,    43, ..., 10852, 10858, 10861]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
logloss = 0.0168
multiclass logloss = 0.0168
score = 0.9977
[[1540    0    0    0    0    1    0    0    0]
 [   1 2474    3    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   2    0    0    0   39    1    0    0    0]
 [   4    0    0    0    0  747    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1226    2]
 [   0    0    0    0    0    0    0    9 1004]]
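
classification_report is already imported and gives a per-class precision/recall view of the same out-of-fold predictions:

In [ ]:
# Per-class precision, recall and F1 for the cross-validated predictions.
print(classification_report(y, pred1))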

In [47]:
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


score = 0.9979
[[1541    0    0    0    0    0    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   1    0    0    0   40    1    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1226    2]
 [   0    0    0    0    0    0    0    9 1004]]

In [48]:
yloss = y - 1
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))


multiclass logloss = 0.0164

In [31]:
# Combine all the training features and write to file.

# removing image rowstats for all image features produces the best ExtraTreesClassifier result yet: 
# logloss: 0.0162
# score: 0.9976

combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')

combined_train_data.to_csv('data/all-combined-train-data-20percent.csv', index=False)

X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1

combined_train_data.head()

In [32]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([   44,    54,    55, ..., 10851, 10853, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([    1,    23,    35, ..., 10839, 10856, 10860]), array([    0,     2,     3, ..., 10865, 10866, 10867]))
(array([   10,    25,    41, ..., 10826, 10831, 10846]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   28,    30,    36, ..., 10843, 10845, 10861]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   12,    20,    24, ..., 10809, 10852, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,     7,    15, ..., 10847, 10855, 10862]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
(array([    3,     8,    11, ..., 10830, 10833, 10834]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    4,     5,    17, ..., 10854, 10859, 10867]), array([    0,     1,     2, ..., 10864, 10865, 10866]))
(array([    6,    14,    18, ..., 10848, 10857, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     9,    13, ..., 10850, 10858, 10866]), array([    1,     2,     3, ..., 10864, 10865, 10867]))
logloss = 0.0162
multiclass logloss = 0.0162
score = 0.9976
[[1541    0    0    0    0    0    0    0    0]
 [   1 2473    3    0    0    0    0    1    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   2    0    0    0   40    0    0    0    0]
 [   6    0    0    0    0  745    0    0    0]
 [   0    0    0    0    0    0  398    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    9 1004]]

In [48]:
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


score = 0.9975
[[1539    0    0    0    0    2    0    0    0]
 [   2 2474    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  473    0    1    0    0    0]
 [   3    0    0    0   39    0    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1226    2]
 [   0    0    0    0    0    0    0    8 1005]]

In [49]:
yloss = y - 1
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))


multiclass logloss = 0.0174

5. Test/Experimental Code Only


In [ ]:
combined_train_data['class'] = sorted_train_labels.iloc[:,1]

class1 = X[combined_train_data['class'] == 1]
class2 = X[combined_train_data['class'] == 2]
class3 = X[combined_train_data['class'] == 3]
class4 = X[combined_train_data['class'] == 4]
class5 = X[combined_train_data['class'] == 5]
class6 = X[combined_train_data['class'] == 6]
class7 = X[combined_train_data['class'] == 7]
class8 = X[combined_train_data['class'] == 8]
class9 = X[combined_train_data['class'] == 9]

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns)

class_stats.loc[1, 'mean'] = 1.0  # .loc avoids chained-assignment pitfalls
class_stats.head()

i = 1  # pick one class to inspect (i was previously undefined at this point)
classx = X[combined_train_data['class'] == i]
classxmean = classx.mean()
classxmean.head()

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
    classx = X[combined_train_data['class'] == i]
    class_stats.loc[i, 'mean'] = classx.mean().sum()
    class_stats.loc[i, 'std'] = classx.std().sum()
    #class_stats.loc[i, 'corr'] = classx.corr().sum()
    #class_stats.loc[i, 'cov'] = classx.cov().sum()

class_stats.head()

plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[ 1,2,3,4,5,6,7,8,9 ], cmap='brg')