In [1]:
# Import everything we might need up front; it saves time later.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# Note: sklearn.cross_validation and sklearn.grid_search were merged into
# sklearn.model_selection in scikit-learn 0.18 and removed in 0.20.
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC
import seaborn as sns
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
In [4]:
import sklearn
print(sklearn.__version__)
In [2]:
# First load the .asm and .byte training/test data and training labels
sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-30percent.csv')
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_test_data_asm = pd.read_csv('data/sorted-test-malware-features-asm-30percent.csv')
sorted_test_data_byte = pd.read_csv('data/sorted-test-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
#combined_train_data = pd.read_csv('data/sorted-train-features-combined.csv')
# Load the asm image features for the training and test data
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm-30percent.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm-30percent.csv')
#sorted_train_image_byte = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')
# Now load the row statistics for the original feature set; these will be combined with the new features
all_train_rowstats = pd.read_csv('data/all-train-asm-rowstats.csv')
all_test_rowstats = pd.read_csv('data/all-test-asm-rowstats.csv')
#all_train_image_asm_rowstats = pd.read_csv('data/all-train-image-asm-rowstats.csv')
#all_test_image_asm_rowstats = pd.read_csv('data/all-test-image-asm-rowstats.csv')
In [3]:
sorted_train_data_asm.head()
Out[3]:
In [4]:
sorted_train_labels.head()
Out[4]:
In [4]:
sorted_train_image_asm.head()
Out[4]:
In [5]:
sorted_test_image_asm.head()
Out[5]:
In [7]:
sorted_train_data_byte.head()
Out[7]:
In [6]:
sorted_test_data_asm.head()
Out[6]:
In [3]:
# Assign asm data to X,y for brevity.
X = sorted_train_data_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]
# X_image_byte = sorted_train_image_byte.iloc[:,1:]
In [7]:
X_train_image_asm.shape
Out[7]:
In [8]:
X_test_image_asm.shape
Out[8]:
In [10]:
# Train feature stats
X_means = X.mean()
X_std = X.std()
X_cor = X.corr()
X_cov = X.cov()
# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()
# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()
# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()
# Not using byte image features
#X_image_byte_means = X_image_byte.mean()
#X_image_byte_std = X_image_byte.std()
#X_image_byte_cor = X_image_byte.corr()
#X_image_byte_cov = X_image_byte.cov()
In [11]:
X_train_image_asm_means.head()
Out[11]:
In [12]:
X_train_image_asm_std.head()
Out[12]:
In [13]:
X_test_image_asm_means.head()
Out[13]:
In [14]:
# The byte image data has very low variance in both its means and standard deviations,
# so it is not very useful for learning and will not be used in further analysis.
X_test_image_asm_std.head()
Out[14]:
In [ ]:
X_means.head()
X_std.head()
X_cor.head()
X_cov.head()
X_means.min()
X_means.max()
X_std.min()
X_std.max()
X_means[X_means == X_means.min()]
X_means[X_means == X_means.max()]
X_std[X_std == X_std.min()]
X_std[X_std == X_std.max()]
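In [ ]:
# Optional sketch, not something this notebook relies on: the column statistics
# above can be used to flag near-constant features, in the same spirit as the
# low-variance argument used against the byte image data below. The 1e-6
# threshold is an arbitrary illustrative choice.
low_var_cols = X_std[X_std < 1e-6].index
print("near-constant columns:", list(low_var_cols))
# These could be dropped from both train and test before modelling:
#X_reduced = X.drop(low_var_cols, axis=1)
#X_test_reduced = X_test.drop(low_var_cols, axis=1)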
In [4]:
# Row stats for train and test data
X_train_rowstats = pd.DataFrame(index=np.arange(X.shape[0]), columns=['filename','mean','std','min','max','total','logtotal'], dtype=np.float64)
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','e_mean','e_std','e_min','e_max','e_total','e_logtotal'], dtype=np.float64)
X_train_rowstats['filename'] = sorted_train_data_asm['filename']
for i in range(0, X.shape[0]):
    X_train_rowstats.loc[i,'mean'] = X.iloc[i,:].mean()
    X_train_rowstats.loc[i,'std'] = X.iloc[i,:].std()
    X_train_rowstats.loc[i,'min'] = X.iloc[i,:].min()
    X_train_rowstats.loc[i,'max'] = X.iloc[i,:].max()
X_train_rowstats['total'] = X_train_rowstats['max'] * X_train_rowstats['mean'] * X_train_rowstats['std']
X_train_rowstats['logtotal'] = np.log(X_train_rowstats['total']) # natural logarithm
X_test_rowstats['filename'] = sorted_test_data_asm['filename']
for i in range(0, X_test.shape[0]):
    X_test_rowstats.loc[i,'e_mean'] = X_test.iloc[i,:].mean()
    X_test_rowstats.loc[i,'e_std'] = X_test.iloc[i,:].std()
    X_test_rowstats.loc[i,'e_min'] = X_test.iloc[i,:].min()
    X_test_rowstats.loc[i,'e_max'] = X_test.iloc[i,:].max()
X_test_rowstats['e_total'] = X_test_rowstats['e_max'] * X_test_rowstats['e_mean'] * X_test_rowstats['e_std']
X_test_rowstats['e_logtotal'] = np.log(X_test_rowstats['e_total']) # natural logarithm
X_train_rowstats.head()
Out[4]:
In [5]:
X_test_rowstats.head()
Out[5]:
In [6]:
# Image row stats
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','tr_mean','tr_std','tr_min','tr_max','tr_total','tr_logtotal'], dtype=np.float64)
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','te_mean','te_std','te_min','te_max','te_total','te_logtotal'], dtype=np.float64)
X_train_image_asm_rowstats['filename'] = sorted_train_data_asm['filename']
for i in range(0, X_train_image_asm.shape[0]):
    X_train_image_asm_rowstats.loc[i,'tr_mean'] = X_train_image_asm.iloc[i,:].mean()
    X_train_image_asm_rowstats.loc[i,'tr_std'] = X_train_image_asm.iloc[i,:].std()
    X_train_image_asm_rowstats.loc[i,'tr_min'] = X_train_image_asm.iloc[i,:].min()
    X_train_image_asm_rowstats.loc[i,'tr_max'] = X_train_image_asm.iloc[i,:].max()
X_train_image_asm_rowstats['tr_total'] = X_train_image_asm_rowstats['tr_max'] * X_train_image_asm_rowstats['tr_mean'] * X_train_image_asm_rowstats['tr_std']
X_train_image_asm_rowstats['tr_logtotal'] = np.log(X_train_image_asm_rowstats['tr_total']) # natural logarithm
X_test_image_asm_rowstats['filename'] = sorted_test_data_asm['filename']
for i in range(0, X_test_image_asm.shape[0]):
    X_test_image_asm_rowstats.loc[i,'te_mean'] = X_test_image_asm.iloc[i,:].mean()
    X_test_image_asm_rowstats.loc[i,'te_std'] = X_test_image_asm.iloc[i,:].std()
    X_test_image_asm_rowstats.loc[i,'te_min'] = X_test_image_asm.iloc[i,:].min()
    X_test_image_asm_rowstats.loc[i,'te_max'] = X_test_image_asm.iloc[i,:].max()
X_test_image_asm_rowstats['te_total'] = X_test_image_asm_rowstats['te_max'] * X_test_image_asm_rowstats['te_mean'] * X_test_image_asm_rowstats['te_std']
X_test_image_asm_rowstats['te_logtotal'] = np.log(X_test_image_asm_rowstats['te_total']) # natural logarithm
X_train_image_asm_rowstats.head()
Out[6]:
In [7]:
X_test_image_asm_rowstats.head()
Out[7]:
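In [ ]:
# Optional sketch: the per-row loops above can also be written as vectorized
# pandas operations over axis=1, which computes the same statistics but is much
# faster on large frames. Shown for the train asm features only; the other
# frames follow the same pattern.
X_train_rowstats_fast = pd.DataFrame({
    'filename': sorted_train_data_asm['filename'],
    'mean': X.mean(axis=1),
    'std': X.std(axis=1),
    'min': X.min(axis=1),
    'max': X.max(axis=1),
})
X_train_rowstats_fast['total'] = X_train_rowstats_fast['max'] * X_train_rowstats_fast['mean'] * X_train_rowstats_fast['std']
X_train_rowstats_fast['logtotal'] = np.log(X_train_rowstats_fast['total'])
X_train_rowstats_fast.head()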
In [8]:
# Write the row stats for the reduced feature set to file
X_train_rowstats.to_csv('data/train-asm-rowstats-30percent.csv', index=False)
X_test_rowstats.to_csv('data/test-asm-rowstats-30percent.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/train-image-asm-rowstats-30percent.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/test-image-asm-rowstats-30percent.csv', index=False)
In [9]:
# Generate some polynomial features
X_train_polyize = sorted_train_data_asm[['edi','esi','eax']]
X_test_polyize = sorted_test_data_asm[['edi','esi','eax']]
poly = PolynomialFeatures(3)
#X_train_byte_poly = DataFrame(poly.fit_transform(sorted_train_data_byte[['entropy','filesize']]), columns=['p1','p2','p3'])
X_train_byte_poly = poly.fit_transform(sorted_train_data_byte[['entropy','filesize']])
X_test_byte_poly = poly.fit_transform(sorted_test_data_byte[['entropy','filesize']])
X_train_asm_poly = poly.fit_transform(X_train_polyize)
X_test_asm_poly = poly.fit_transform(X_test_polyize)
X_train_asm_poly
Out[9]:
In [10]:
X_train_asm_poly.shape
Out[10]:
In [11]:
X_train_byte_poly.shape
Out[11]:
In [22]:
X_train_byte_poly_df = pd.DataFrame(X_train_byte_poly, columns=[ 'train_byte_p{:d}'.format(i) for i in range(1,11) ])
X_test_byte_poly_df = pd.DataFrame(X_test_byte_poly, columns=[ 'test_byte_p{:d}'.format(i) for i in range(1,11) ])
X_train_asm_poly_df = pd.DataFrame(X_train_asm_poly, columns=[ 'train_asm_p{:d}'.format(i) for i in range(1,21) ])
X_test_asm_poly_df = pd.DataFrame(X_test_asm_poly, columns=[ 'test_asm_p{:d}'.format(i) for i in range(1,21) ])
X_train_asm_poly_df['filename'] = sorted_train_data_asm['filename']
X_train_byte_poly_df['filename'] = sorted_train_data_asm['filename']
X_test_byte_poly_df['filename'] = sorted_test_data_asm['filename']
X_test_asm_poly_df['filename'] = sorted_test_data_asm['filename']
X_train_asm_poly_df.head()
Out[22]:
In [23]:
X_test_byte_poly_df.head()
Out[23]:
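In [ ]:
# Optional sketch: why 10 and 20 columns? PolynomialFeatures(3) on n input
# columns emits all monomials of total degree <= 3 (including the bias term),
# i.e. C(n + 3, 3) features: n = 2 -> 10, n = 3 -> 20, matching the shapes above.
from math import factorial

def n_poly_features(n_inputs, degree):
    # number of monomials of total degree <= degree in n_inputs variables
    return factorial(n_inputs + degree) // (factorial(n_inputs) * factorial(degree))

print(n_poly_features(2, 3), n_poly_features(3, 3))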
In [24]:
#TODO:
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')
combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')
combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))
combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')
combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')
# merge polynomial features
combined_train_data = combined_train_data.merge(X_train_asm_poly_df, on='filename')
combined_train_data = combined_train_data.merge(X_train_byte_poly_df, on='filename')
combined_train_data.to_csv('data/final-combined-train-data-30percent-poly.csv', index=False)
combined_train_data.head()
Out[24]:
In [25]:
#TODO:
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')
combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')
combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))
combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')
combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')
# merge polynomial features
combined_test_data = combined_test_data.merge(X_test_asm_poly_df, on='filename')
combined_test_data = combined_test_data.merge(X_test_byte_poly_df, on='filename')
combined_test_data.to_csv('data/final-combined-test-data-30percent-poly.csv', index=False)
combined_test_data.head()
Out[25]:
In [20]:
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')
combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')
combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))
combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')
combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')
# Results were better without the row stats for the full image feature set
#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')
combined_train_data.to_csv('data/final-combined-train-data-30percent.csv', index=False)
combined_train_data.head()
Out[20]:
In [21]:
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')
combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')
combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))
combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')
combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')
# Results were better without the row stats for the full image feature set
#combined_test_data = combined_test_data.merge(all_test_image_asm_rowstats, on='filename')
combined_test_data.to_csv('data/final-combined-test-data-30percent.csv', index=False)
combined_test_data.head()
Out[21]:
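In [ ]:
# Optional sanity check: pandas merge defaults to an inner join, which silently
# drops rows whose filenames do not match across feature files. Assuming each
# file appears exactly once in every feature table, the row counts should match.
print(sorted_train_data_asm.shape[0], combined_train_data.shape[0])
print(sorted_test_data_asm.shape[0], combined_test_data.shape[0])
assert combined_train_data.shape[0] == sorted_train_data_asm.shape[0]
assert combined_test_data.shape[0] == sorted_test_data_asm.shape[0]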
In [26]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi-class version of the Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True class labels, integers in [0, n_classes).
    y_pred : array, shape = [n_samples, n_classes]
        Predicted class probabilities.

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)
    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]
    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss
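In [ ]:
# Optional sketch: a tiny toy check of multiclass_log_loss against sklearn's
# log_loss. With three samples, each predicted correctly with probability 0.8,
# both should return about -ln(0.8) = 0.2231.
toy_true = np.array([0, 1, 2])
toy_prob = np.array([[0.8, 0.1, 0.1],
                     [0.1, 0.8, 0.1],
                     [0.1, 0.1, 0.8]])
print(multiclass_log_loss(toy_true, toy_prob))
print(log_loss(toy_true, toy_prob))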
In [27]:
def run_cv(X, y, clf):
    # Construct a k-folds object
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))
    # Iterate through folds
    for train_index, test_index in kf:
        print(test_index, train_index)
        X_train = X.loc[train_index,:]
        X_test = X.loc[test_index,:]
        y_train = y[train_index]
        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    return y_prob, y_pred
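In [ ]:
# Optional sketch: on scikit-learn >= 0.18 the same out-of-fold probabilities
# can be produced with sklearn.model_selection instead of the deprecated
# cross_validation module. This is an alternative, not what the results below
# were produced with, so it is left commented out.
#from sklearn.model_selection import cross_val_predict
#y_prob = cross_val_predict(clf1, X, y, cv=10, method='predict_proba')
#y_pred = y_prob.argmax(axis=1) + 1   # classes are labelled 1..9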
In [28]:
# Set our X,y for the classifiers
X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1
In [29]:
# combined train data plus polynomial features
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)
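In [ ]:
# Optional sketch: the raw confusion matrix is easier to read as a heatmap,
# using the seaborn import from the first cell (class labels 1..9 assumed).
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=range(1,10), yticklabels=range(1,10))
plt.xlabel("predicted class")
plt.ylabel("true class")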
In [25]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)
In [24]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)
In [ ]:
# Per-class feature summaries: slice the training data by malware class, then
# summarise each class by the sum of its column means and standard deviations.
combined_train_data['class'] = sorted_train_labels.iloc[:,1]
class1 = X[combined_train_data['class'] == 1]
class2 = X[combined_train_data['class'] == 2]
class3 = X[combined_train_data['class'] == 3]
class4 = X[combined_train_data['class'] == 4]
class5 = X[combined_train_data['class'] == 5]
class6 = X[combined_train_data['class'] == 6]
class7 = X[combined_train_data['class'] == 7]
class8 = X[combined_train_data['class'] == 8]
class9 = X[combined_train_data['class'] == 9]
columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
    classx = X[combined_train_data['class'] == i]
    class_stats.loc[i,'mean'] = classx.mean().sum()
    class_stats.loc[i,'std'] = classx.std().sum()
    #class_stats.loc[i,'corr'] = classx.corr().sum().sum()
    #class_stats.loc[i,'cov'] = classx.cov().sum().sum()
class_stats.head()
plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[ 1,2,3,4,5,6,7,8,9 ], cmap='brg')
In [17]:
['filename',[ 'train_asm_p{:d}'.format(i) for i in range(1,11) ]]
Out[17]:
In [ ]:
help(pd.DataFrame)