1. Final Selection of Features in the 30 Percent Best Feature Set


In [1]:
# Just import everything we might need up front; it saves time later.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC
import seaborn as sns
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


Populating the interactive namespace from numpy and matplotlib

In [4]:
import sklearn
print(sklearn.__version__)


0.17.1

2. Load The Sorted Training Data Features

- sorted-train-malware-features-asm-30percent.csv
- sorted-train-malware-features-byte.csv
- sorted-train-labels.csv
- sorted-train-image-features-asm-30percent.csv
- plus the corresponding test feature files and the original asm row-statistics files

In [2]:
# First load the .asm and .byte training/test data and training labels
sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-30percent.csv')
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_test_data_asm = pd.read_csv('data/sorted-test-malware-features-asm-30percent.csv')
sorted_test_data_byte = pd.read_csv('data/sorted-test-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
#combined_train_data = pd.read_csv('data/sorted-train-features-combined.csv')

# Load the image data for training data
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm-30percent.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm-30percent.csv')
#sorted_train_image_byte = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')

# Now load the row statistics for the original feature set; these will be combined with the new features
all_train_rowstats = pd.read_csv('data/all-train-asm-rowstats.csv')
all_test_rowstats = pd.read_csv('data/all-test-asm-rowstats.csv')
#all_train_image_asm_rowstats = pd.read_csv('data/all-train-image-asm-rowstats.csv')
#all_test_image_asm_rowstats = pd.read_csv('data/all-test-image-asm-rowstats.csv')
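
Because these files were all pre-sorted by sample name, the feature tables and the label table are assumed to line up row for row when X and y are built below. A quick check, sketched here with the columns shown in the previews that follow, confirms that the filename columns agree with the label Id column:

In [ ]:
# Sanity check: the sorted feature frames and the label frame should agree row by row.
print((sorted_train_data_asm['filename'] == sorted_train_labels['Id']).all())
print((sorted_train_data_asm['filename'] == sorted_train_data_byte['filename']).all())
print((sorted_train_data_asm['filename'] == sorted_train_image_asm['filename']).all())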

In [3]:
sorted_train_data_asm.head()


Out[3]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 303 columns


In [4]:
sorted_train_labels.head()


Out[4]:
Id Class
0 01IsoiSMh5gxyDYTl4CB 2
1 01SuzwMJEIXsK7A8dQbl 8
2 01azqd4InC7m9JpocGv5 9
3 01jsnpXSAlgw6aPeDxrU 9
4 01kcPWA9K2BOxQeS5Rju 1

5 rows × 2 columns


In [4]:
sorted_train_image_asm.head()


Out[4]:
filename ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ASM_40 ASM_41 ASM_42 ASM_43 ASM_47 ASM_124 ASM_125 ASM_137 ASM_138 ASM_139 ASM_140 ASM_141 ASM_142
0 01IsoiSMh5gxyDYTl4CB 13 10 116 101 120 116 49 48 48 48 9 45 45 116 101 120 116 58 48 ...
1 01SuzwMJEIXsK7A8dQbl 68 69 58 48 48 52 9 9 9 9 59 13 10 48 48 48 48 9 9 ...
2 01azqd4InC7m9JpocGv5 68 69 58 48 48 52 9 9 9 9 59 13 10 48 48 48 48 9 9 ...
3 01jsnpXSAlgw6aPeDxrU 68 69 58 48 48 52 9 9 9 9 59 13 10 48 48 48 48 9 9 ...
4 01kcPWA9K2BOxQeS5Rju 68 69 58 49 48 48 9 9 9 9 59 13 10 48 48 48 48 9 9 ...

5 rows × 301 columns


In [5]:
sorted_test_image_asm.head()


Out[5]:
filename ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ASM_40 ASM_41 ASM_42 ASM_43 ASM_47 ASM_124 ASM_125 ASM_137 ASM_138 ASM_139 ASM_140 ASM_141 ASM_142
0 ITSUPtCmh7WdJcsYDwQ5 68 69 58 48 48 52 9 9 9 9 59 13 10 48 48 48 48 9 9 ...
1 Ig2DB5tSiEy1cJvV0zdw 68 69 58 48 48 52 9 9 9 9 59 13 10 48 48 48 48 9 9 ...
2 Jmo6eIhLZ4t9r8QsxEg5 68 69 58 48 48 52 9 9 9 9 59 13 10 48 48 48 48 9 9 ...
3 JtPFl4ewgdD78OzCMa3o 68 69 58 48 48 52 9 9 9 9 59 13 10 48 48 48 48 9 9 ...
4 K3ZtByPHGSFYNljDUEXp 68 69 58 48 48 52 9 9 9 9 59 13 10 48 48 48 48 9 9 ...

5 rows × 301 columns


In [7]:
sorted_train_data_byte.head()


Out[7]:
filename entropy filesize
0 01IsoiSMh5gxyDYTl4CB 0.614952 6874624
1 01SuzwMJEIXsK7A8dQbl 0.843262 460288
2 01azqd4InC7m9JpocGv5 0.703961 5256192
3 01jsnpXSAlgw6aPeDxrU 0.806035 4825600
4 01kcPWA9K2BOxQeS5Rju 0.871610 712704

5 rows × 3 columns


In [6]:
sorted_test_data_asm.head()


Out[6]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 0 4 1 3 1 3 0 1 2 553 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 3 554 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 0 519 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 1 0 1 2 2 2 0 1 2 668 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 402 ...

5 rows × 303 columns


In [3]:
# Assign asm data to X,y for brevity.
X = sorted_train_data_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]
# X_image_byte = sorted_train_image_byte.iloc[:,1:]

In [7]:
X_train_image_asm.shape


Out[7]:
(10868, 300)

In [8]:
X_test_image_asm.shape


Out[8]:
(10873, 300)

3. Perform Some Basic Statistical Analysis on the Feature Sets

Find the following:
- Feature and row mean
- Feature and row standard deviation
- Feature and row correlation coefficient
- Feature and row covariance
- Feature and row minimum and maximum
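
The per-feature (column) statistics come directly from the pandas aggregation methods. The per-row statistics are computed further down with an explicit loop over rows; an equivalent and much faster sketch aggregates along axis=1 instead (same column names as the loop version):

In [ ]:
# Vectorized row statistics: aggregate across the columns (axis=1) rather than looping over rows.
X_train_rowstats_fast = pd.DataFrame({'filename': sorted_train_data_asm['filename'],
                                      'mean': X.mean(axis=1),
                                      'std': X.std(axis=1),
                                      'min': X.min(axis=1),
                                      'max': X.max(axis=1)})
X_train_rowstats_fast['total'] = (X_train_rowstats_fast['max'] *
                                  X_train_rowstats_fast['mean'] *
                                  X_train_rowstats_fast['std'])
X_train_rowstats_fast['logtotal'] = np.log(X_train_rowstats_fast['total'])  # natural logarithm
X_train_rowstats_fast.head()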

In [10]:
# Train feature stats
X_means = X.mean()
X_std = X.std()
X_cor = X.corr()
X_cov = X.cov()

# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()

# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()

# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()

# Not using byte image features
#X_image_byte_means = X_image_byte.mean()
#X_image_byte_std = X_image_byte.std()
#X_image_byte_cor = X_image_byte.corr()
#X_image_byte_cov = X_image_byte.cov()

In [11]:
X_train_image_asm_means.head()


Out[11]:
ASM_28    55.418936
ASM_29    55.515366
ASM_31    71.289474
ASM_32    60.463103
ASM_33    64.914704
dtype: float64

In [12]:
X_train_image_asm_std.head()


Out[12]:
ASM_28    23.048969
ASM_29    24.740569
ASM_31    24.510906
ASM_32    22.028613
ASM_33    29.676660
dtype: float64

In [13]:
X_test_image_asm_means.head()


Out[13]:
ASM_28    55.430608
ASM_29    55.529937
ASM_31    71.282995
ASM_32    60.451301
ASM_33    64.949508
dtype: float64

In [14]:
# The byte image data has low variance in both its means and standard deviations, so it is
# not very useful for learning and is not included in any further analysis.
X_test_image_asm_std.head()


Out[14]:
ASM_28    23.041988
ASM_29    24.736962
ASM_31    24.477592
ASM_32    22.010681
ASM_33    29.682114
dtype: float64
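
Inspecting means and standard deviations like this is a quick manual way to spot features that carry little information. The same idea can be automated with scikit-learn's VarianceThreshold; a minimal sketch follows, where the threshold value is purely illustrative and not the one used for the 30 percent selection:

In [ ]:
from sklearn.feature_selection import VarianceThreshold

# Keep only image features whose variance exceeds an (illustrative) threshold.
selector = VarianceThreshold(threshold=1.0)
X_train_image_asm_highvar = selector.fit_transform(X_train_image_asm)
print(X_train_image_asm.shape, X_train_image_asm_highvar.shape)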

In [ ]:
X_means.head()

X_std.head()

X_cor.head()

X_cov.head()

X_means.min()

X_means.max()

X_std.min()

X_std.max()

X_means[X_means == X_means.min()]

X_means[X_means == X_means.max()]

X_std[X_std == X_std.min()]

X_std[X_std == X_std.max()]

In [4]:
# Row stats for train and test data
X_train_rowstats = pd.DataFrame(index=np.arange(X.shape[0]), columns=['filename','mean','std','min','max','total','logtotal'], dtype=np.float64)
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','e_mean','e_std','e_min','e_max','e_total','e_logtotal'], dtype=np.float64)

X_train_rowstats['filename'] = sorted_train_data_asm['filename']
for i in range(0,X.shape[0]):
    X_train_rowstats.loc[i,'mean'] = X.iloc[i,:].mean()
    X_train_rowstats.loc[i,'std'] = X.iloc[i,:].std()
    X_train_rowstats.loc[i,'min'] = X.iloc[i,:].min()
    X_train_rowstats.loc[i,'max'] = X.iloc[i,:].max()
    

X_train_rowstats['total'] = X_train_rowstats['max'] * X_train_rowstats['mean'] * X_train_rowstats['std']
X_train_rowstats['logtotal'] = np.log(X_train_rowstats['total']) # natural logarithm

X_test_rowstats['filename'] = sorted_test_data_asm['filename']
for i in range(0,X_test.shape[0]):
    X_test_rowstats.loc[i,'e_mean'] = X_test.iloc[i,:].mean()
    X_test_rowstats.loc[i,'e_std'] = X_test.iloc[i,:].std()
    X_test_rowstats.loc[i,'e_min'] = X_test.iloc[i,:].min()
    X_test_rowstats.loc[i,'e_max'] = X_test.iloc[i,:].max()
    

X_test_rowstats['e_total'] = X_test_rowstats['e_max'] * X_test_rowstats['e_mean'] * X_test_rowstats['e_std']
X_test_rowstats['e_logtotal'] = np.log(X_test_rowstats['e_total']) # natural logarithm

X_train_rowstats.head()


Out[4]:
filename mean std min max total logtotal
0 01IsoiSMh5gxyDYTl4CB 1079.440397 8770.067154 0.0 87555.0 8.288626e+11 27.443320
1 01SuzwMJEIXsK7A8dQbl 113.470199 592.647411 0.0 5817.0 3.911806e+08 19.784680
2 01azqd4InC7m9JpocGv5 9243.897351 111053.345673 0.0 1367070.0 1.403387e+15 34.877665
3 01jsnpXSAlgw6aPeDxrU 472.860927 5355.863744 0.0 65928.0 1.669678e+11 25.841067
4 01kcPWA9K2BOxQeS5Rju 8.149007 33.268778 0.0 445.0 1.206428e+05 11.700590

In [5]:
X_test_rowstats.head()


Out[5]:
filename e_mean e_std e_min e_max e_total e_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 240.016556 2549.289656 0.0 31361.0 1.918891e+10 23.677598
1 Ig2DB5tSiEy1cJvV0zdw 1601.523179 19246.265720 0.0 236923.0 7.302758e+12 29.619273
2 Jmo6eIhLZ4t9r8QsxEg5 1730.692053 20900.889887 0.0 257289.0 9.306916e+12 29.861779
3 JtPFl4ewgdD78OzCMa3o 141.384106 1333.657108 0.0 16341.0 3.081225e+09 21.848593
4 K3ZtByPHGSFYNljDUEXp 5219.086093 63722.505091 0.0 784363.0 2.608581e+14 33.194998

In [6]:
# Image row stats
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','tr_mean','tr_std','tr_min','tr_max','tr_total','tr_logtotal'], dtype=np.float64)
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','te_mean','te_std','te_min','te_max','te_total','te_logtotal'], dtype=np.float64)

X_train_image_asm_rowstats['filename'] = sorted_train_data_asm['filename']
for i in range(0,X_train_image_asm.shape[0]):
    X_train_image_asm_rowstats.loc[i,'tr_mean'] = X_train_image_asm.iloc[i,:].mean()
    X_train_image_asm_rowstats.loc[i,'tr_std'] = X_train_image_asm.iloc[i,:].std()
    X_train_image_asm_rowstats.loc[i,'tr_min'] = X_train_image_asm.iloc[i,:].min()
    X_train_image_asm_rowstats.loc[i,'tr_max'] = X_train_image_asm.iloc[i,:].max()
    

X_train_image_asm_rowstats['tr_total'] = X_train_image_asm_rowstats['tr_max'] * X_train_image_asm_rowstats['tr_mean'] * X_train_image_asm_rowstats['tr_std']
X_train_image_asm_rowstats['tr_logtotal'] = np.log(X_train_image_asm_rowstats['tr_total']) # natural logarithm

X_test_image_asm_rowstats['filename'] = sorted_test_data_asm['filename']
for i in range(0,X_test_image_asm.shape[0]):
    X_test_image_asm_rowstats.loc[i,'te_mean'] = X_test_image_asm.iloc[i,:].mean()
    X_test_image_asm_rowstats.loc[i,'te_std'] = X_test_image_asm.iloc[i,:].std()
    X_test_image_asm_rowstats.loc[i,'te_min'] = X_test_image_asm.iloc[i,:].min()
    X_test_image_asm_rowstats.loc[i,'te_max'] = X_test_image_asm.iloc[i,:].max()
    

X_test_image_asm_rowstats['te_total'] = X_test_image_asm_rowstats['te_max'] * X_test_image_asm_rowstats['te_mean'] * X_test_image_asm_rowstats['te_std']
X_test_image_asm_rowstats['te_logtotal'] = np.log(X_test_image_asm_rowstats['te_total']) # natural logarithm

X_train_image_asm_rowstats.head()


Out[6]:
filename tr_mean tr_std tr_min tr_max tr_total tr_logtotal
0 01IsoiSMh5gxyDYTl4CB 60.090000 39.496521 9.0 124.0 294294.895471 12.592338
1 01SuzwMJEIXsK7A8dQbl 49.860000 37.157179 9.0 124.0 229729.458551 12.344658
2 01azqd4InC7m9JpocGv5 49.860000 37.157179 9.0 124.0 229729.458551 12.344658
3 01jsnpXSAlgw6aPeDxrU 49.860000 37.157179 9.0 124.0 229729.458551 12.344658
4 01kcPWA9K2BOxQeS5Rju 53.863333 39.352425 9.0 124.0 262836.946508 12.479289

In [7]:
X_test_image_asm_rowstats.head()


Out[7]:
filename te_mean te_std te_min te_max te_total te_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 46.600000 38.026219 9.0 124.0 219730.702320 12.300158
1 Ig2DB5tSiEy1cJvV0zdw 46.083333 37.798043 9.0 124.0 215990.615111 12.282990
2 Jmo6eIhLZ4t9r8QsxEg5 53.903333 39.348975 9.0 124.0 263009.070268 12.479944
3 JtPFl4ewgdD78OzCMa3o 53.903333 39.348975 9.0 124.0 263009.070268 12.479944
4 K3ZtByPHGSFYNljDUEXp 46.083333 37.798043 9.0 124.0 215990.615111 12.282990

In [8]:
# Write column stats and row stats to file

X_train_rowstats.to_csv('data/train-asm-rowstats-30percent.csv', index=False)
X_test_rowstats.to_csv('data/test-asm-rowstats-30percent.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/train-image-asm-rowstats-30percent.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/test-image-asm-rowstats-30percent.csv', index=False)

In [9]:
# Generate some polynomial features
X_train_polyize = sorted_train_data_asm[['edi','esi','eax']]
X_test_polyize = sorted_test_data_asm[['edi','esi','eax']]
poly = PolynomialFeatures(3)
#X_train_byte_poly = DataFrame(poly.fit_transform(sorted_train_data_byte[['entropy','filesize']]), columns=['p1','p2','p3'])
X_train_byte_poly = poly.fit_transform(sorted_train_data_byte[['entropy','filesize']])
X_test_byte_poly = poly.fit_transform(sorted_test_data_byte[['entropy','filesize']])
X_train_asm_poly = poly.fit_transform(X_train_polyize)
X_test_asm_poly = poly.fit_transform(X_test_polyize)
X_train_asm_poly


Out[9]:
array([[  1.00000000e+00,   3.93000000e+02,   4.96000000e+02, ...,
          3.55985152e+08,   1.03852926e+09,   3.02974162e+09],
       [  1.00000000e+00,   2.40000000e+01,   2.40000000e+01, ...,
          7.02720000e+05,   3.57216000e+07,   1.81584800e+09],
       [  1.00000000e+00,   1.28400000e+03,   1.90000000e+03, ...,
          1.60211800e+10,   3.74221036e+10,   8.74101557e+10],
       ..., 
       [  1.00000000e+00,   1.59000000e+02,   2.45000000e+02, ...,
          2.08286750e+07,   2.95002050e+07,   4.17819230e+07],
       [  1.00000000e+00,   7.20000000e+01,   1.53000000e+02, ...,
          4.26043800e+06,   5.06797200e+06,   6.02856800e+06],
       [  1.00000000e+00,   9.90000000e+01,   1.83000000e+02, ...,
          1.07499690e+07,   1.88565030e+07,   3.30761610e+07]])

In [10]:
X_train_asm_poly.shape


Out[10]:
(10868, 20)

In [11]:
X_train_byte_poly.shape


Out[11]:
(10868, 10)
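
These shapes follow from how PolynomialFeatures counts monomials: with the default include_bias=True, degree 3 on n input columns produces C(n + 3, 3) output columns, so the two byte features (entropy, filesize) give C(5, 3) = 10 columns and the three asm registers give C(6, 3) = 20. A small check of that arithmetic:

In [ ]:
# Number of polynomial features of degree <= d over n inputs, including the bias column.
from math import factorial
def n_poly_features(n, d):
    return factorial(n + d) // (factorial(n) * factorial(d))  # C(n + d, d)

print(n_poly_features(2, 3), n_poly_features(3, 3))  # 10, 20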

In [22]:
X_train_byte_poly_df = pd.DataFrame(X_train_byte_poly, columns=[ 'train_byte_p{:d}'.format(i) for i in range(1,11) ])
X_test_byte_poly_df = pd.DataFrame(X_test_byte_poly, columns=[ 'test_byte_p{:d}'.format(i) for i in range(1,11) ])
X_train_asm_poly_df = pd.DataFrame(X_train_asm_poly, columns=[ 'train_asm_p{:d}'.format(i) for i in range(1,21) ])
X_test_asm_poly_df = pd.DataFrame(X_test_asm_poly, columns=[ 'test_asm_p{:d}'.format(i) for i in range(1,21) ])
X_train_asm_poly_df['filename'] = sorted_train_data_asm['filename']
X_train_byte_poly_df['filename'] = sorted_train_data_asm['filename']
X_test_byte_poly_df['filename'] = sorted_test_data_asm['filename']
X_test_asm_poly_df['filename'] = sorted_test_data_asm['filename']
X_train_asm_poly_df.head()


Out[22]:
train_asm_p1 train_asm_p2 train_asm_p3 train_asm_p4 train_asm_p5 train_asm_p6 train_asm_p7 train_asm_p8 train_asm_p9 train_asm_p10 ... train_asm_p12 train_asm_p13 train_asm_p14 train_asm_p15 train_asm_p16 train_asm_p17 train_asm_p18 train_asm_p19 train_asm_p20 filename
0 1.0 393.0 496.0 1447.0 154449.0 194928.0 568671.0 246016.0 717712.0 2093809.0 ... 7.660670e+07 2.234877e+08 9.668429e+07 2.820608e+08 8.228669e+08 1.220239e+08 3.559852e+08 1.038529e+09 3.029742e+09 01IsoiSMh5gxyDYTl4CB
1 1.0 24.0 24.0 1220.0 576.0 576.0 29280.0 576.0 29280.0 1488400.0 ... 1.382400e+04 7.027200e+05 1.382400e+04 7.027200e+05 3.572160e+07 1.382400e+04 7.027200e+05 3.572160e+07 1.815848e+09 01SuzwMJEIXsK7A8dQbl
2 1.0 1284.0 1900.0 4438.0 1648656.0 2439600.0 5698392.0 3610000.0 8432200.0 19695844.0 ... 3.132446e+09 7.316735e+09 4.635240e+09 1.082694e+10 2.528946e+10 6.859000e+09 1.602118e+10 3.742210e+10 8.741016e+10 01azqd4InC7m9JpocGv5
3 1.0 5.0 4.0 942.0 25.0 20.0 4710.0 16.0 3768.0 887364.0 ... 1.000000e+02 2.355000e+04 8.000000e+01 1.884000e+04 4.436820e+06 6.400000e+01 1.507200e+04 3.549456e+06 8.358969e+08 01jsnpXSAlgw6aPeDxrU
4 1.0 15.0 35.0 137.0 225.0 525.0 2055.0 1225.0 4795.0 18769.0 ... 7.875000e+03 3.082500e+04 1.837500e+04 7.192500e+04 2.815350e+05 4.287500e+04 1.678250e+05 6.569150e+05 2.571353e+06 01kcPWA9K2BOxQeS5Rju

5 rows × 21 columns


In [23]:
X_test_byte_poly_df.head()


Out[23]:
test_byte_p1 test_byte_p2 test_byte_p3 test_byte_p4 test_byte_p5 test_byte_p6 test_byte_p7 test_byte_p8 test_byte_p9 test_byte_p10 filename
0 1.0 0.210240 7112192.0 0.044201 1.495265e+06 5.058328e+13 0.009293 3.143639e+05 1.063461e+13 3.597580e+20 ITSUPtCmh7WdJcsYDwQ5
1 1.0 0.269725 4870144.0 0.072751 1.313599e+06 2.371830e+13 0.019623 3.543103e+05 6.397416e+12 1.155115e+20 Ig2DB5tSiEy1cJvV0zdw
2 1.0 0.237825 6176768.0 0.056561 1.468992e+06 3.815246e+13 0.013452 3.493633e+05 9.073620e+12 2.356589e+20 Jmo6eIhLZ4t9r8QsxEg5
3 1.0 0.266051 3385344.0 0.070783 9.006726e+05 1.146055e+13 0.018832 2.396244e+05 3.049087e+12 3.879792e+19 JtPFl4ewgdD78OzCMa3o
4 1.0 0.456041 8493056.0 0.207973 3.873179e+06 7.213200e+13 0.094844 1.766327e+06 3.289512e+13 6.126211e+20 K3ZtByPHGSFYNljDUEXp

In [24]:
#TODO:

# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

# merge polynomial features
combined_train_data = combined_train_data.merge(X_train_asm_poly_df, on='filename')
combined_train_data = combined_train_data.merge(X_train_byte_poly_df, on='filename')

combined_train_data.to_csv('data/final-combined-train-data-30percent-poly.csv', index=False)

combined_train_data.head()


Out[24]:
filename edx esi es ds ss cs ah al ax ... train_byte_p1 train_byte_p2 train_byte_p3 train_byte_p4 train_byte_p5 train_byte_p6 train_byte_p7 train_byte_p8 train_byte_p9 train_byte_p10
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 ... 1.0 0.614952 6874624.0 0.378166 4.227563e+06 4.726046e+13 0.232554 2.599748e+06 2.906291e+13 3.248979e+20
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 ... 1.0 0.843262 460288.0 0.711091 3.881435e+05 2.118650e+11 0.599636 3.273068e+05 1.786578e+11 9.751894e+16
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 ... 1.0 0.703961 5256192.0 0.495561 3.700153e+06 2.762755e+13 0.348855 2.604762e+06 1.944871e+13 1.452157e+20
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 1.0 0.806035 4825600.0 0.649692 3.889601e+06 2.328642e+13 0.523674 3.135154e+06 1.876966e+13 1.123709e+20
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 ... 1.0 0.871610 712704.0 0.759704 6.211998e+05 5.079470e+11 0.662165 5.414439e+05 4.427316e+11 3.620159e+17

5 rows × 653 columns


In [25]:
#TODO:

# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')

combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')

combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')

combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')

# merge polynomial features
combined_test_data = combined_test_data.merge(X_test_asm_poly_df, on='filename')
combined_test_data = combined_test_data.merge(X_test_byte_poly_df, on='filename')

combined_test_data.to_csv('data/final-combined-test-data-30percent-poly.csv', index=False)

combined_test_data.head()


Out[25]:
filename edx esi es ds ss cs ah al ax ... test_byte_p1 test_byte_p2 test_byte_p3 test_byte_p4 test_byte_p5 test_byte_p6 test_byte_p7 test_byte_p8 test_byte_p9 test_byte_p10
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 ... 1.0 0.210240 7112192.0 0.044201 1.495265e+06 5.058328e+13 0.009293 3.143639e+05 1.063461e+13 3.597580e+20
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 ... 1.0 0.269725 4870144.0 0.072751 1.313599e+06 2.371830e+13 0.019623 3.543103e+05 6.397416e+12 1.155115e+20
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 ... 1.0 0.237825 6176768.0 0.056561 1.468992e+06 3.815246e+13 0.013452 3.493633e+05 9.073620e+12 2.356589e+20
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 ... 1.0 0.266051 3385344.0 0.070783 9.006726e+05 1.146055e+13 0.018832 2.396244e+05 3.049087e+12 3.879792e+19
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 ... 1.0 0.456041 8493056.0 0.207973 3.873179e+06 7.213200e+13 0.094844 1.766327e+06 3.289512e+13 6.126211e+20

5 rows × 653 columns


In [20]:
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

# Result is better without the image rowstats for all image features
#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')

combined_train_data.to_csv('data/final-combined-train-data-30percent.csv', index=False)

combined_train_data.head()


Out[20]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 623 columns


In [21]:
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')

combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')

combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')

combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')

# Result is better without the image rowstats for all image features
#combined_test_data = combined_test_data.merge(all_test_image_asm_rowstats, on='filename')

combined_test_data.to_csv('data/final-combined-test-data-30percent.csv', index=False)

combined_test_data.head()


Out[21]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 0 4 1 3 1 3 0 1 2 553 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 3 554 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 0 519 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 1 0 1 2 2 2 0 1 2 668 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 402 ...

5 rows × 623 columns

4. Perform Some Classification Tests


In [26]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class labels, integers in [0, n_classes)
    y_pred : array, shape = [n_samples, n_classes]
            predicted class probabilities

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss
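
As a quick sanity check, the hand-rolled metric should agree with sklearn's log_loss on a toy example where the labels are already 0-based:

In [ ]:
# Toy example: both implementations should return roughly 0.312.
y_toy = np.array([0, 1, 2])
p_toy = np.array([[0.8, 0.1, 0.1],
                  [0.2, 0.7, 0.1],
                  [0.1, 0.2, 0.7]])
print(multiclass_log_loss(y_toy, p_toy))
print(log_loss(y_toy, p_toy))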

In [27]:
def run_cv(X, y, clf):
    # 10-fold cross-validation: collect out-of-fold class probabilities and
    # predicted labels for every training sample.
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))

    # Iterate through folds (KFold yields positional indices, so use iloc)
    for train_index, test_index in kf:
        print(test_index, train_index)
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]

        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)

    return y_prob, y_pred

In [28]:
# Set our X,y for the classifiers
X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1  # shift labels from 1..9 to 0..8 for multiclass_log_loss

In [29]:
# combined train data plus polynomial features
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


[    3    18    26 ..., 10823 10829 10832] [    0     1     2 ..., 10865 10866 10867]
[   13    16    17 ..., 10855 10856 10857] [    0     1     2 ..., 10865 10866 10867]
[    0     1    10 ..., 10772 10785 10825] [    2     3     4 ..., 10865 10866 10867]
[   21    24    33 ..., 10841 10842 10852] [    0     1     2 ..., 10865 10866 10867]
[    2    12    15 ..., 10846 10858 10866] [    0     1     3 ..., 10864 10865 10867]
[    5     7     9 ..., 10839 10850 10864] [    0     1     2 ..., 10865 10866 10867]
[    4    14    23 ..., 10845 10851 10853] [    0     1     2 ..., 10865 10866 10867]
[   47    50    54 ..., 10847 10854 10862] [    0     1     2 ..., 10865 10866 10867]
[   19    20    41 ..., 10861 10865 10867] [    0     1     2 ..., 10863 10864 10866]
[    6     8    36 ..., 10859 10860 10863] [    0     1     2 ..., 10865 10866 10867]
logloss = 0.0134
multiclass logloss = 0.0134
score = 0.9970
[[1538    0    0    0    0    3    0    0    0]
 [   1 2473    3    0    0    0    0    1    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   1    0    0    0   39    2    0    0    0]
 [   6    0    0    0    1  744    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1223    5]
 [   0    0    0    0    0    0    0    8 1005]]

In [25]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([   14,    22,    25, ..., 10840, 10859, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,     9,    36, ..., 10826, 10850, 10861]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,     2,     6, ..., 10856, 10858, 10867]), array([    0,     3,     4, ..., 10864, 10865, 10866]))
(array([   10,    11,    15, ..., 10831, 10841, 10844]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    5,    16,    23, ..., 10838, 10863, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,    13,    20, ..., 10848, 10853, 10860]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([    4,     8,    44, ..., 10847, 10852, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   17,    24,    27, ..., 10855, 10857, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([   12,    18,    32, ..., 10830, 10833, 10849]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    3,    51,    60, ..., 10793, 10798, 10814]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
logloss = 0.0133
multiclass logloss = 0.0133
score = 0.9978
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   2    0    0    0   38    2    0    0    0]
 [   3    0    0    0    0  748    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    8 1005]]

In [24]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([    8,    33,    41, ..., 10830, 10838, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     3,    18, ..., 10844, 10858, 10865]), array([    1,     2,     4, ..., 10864, 10866, 10867]))
(array([    9,    14,    27, ..., 10854, 10855, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   26,    39,    40, ..., 10829, 10853, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,    12,    31, ..., 10842, 10863, 10867]), array([    0,     2,     3, ..., 10864, 10865, 10866]))
(array([   20,    28,    32, ..., 10823, 10836, 10852]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    11,    19, ..., 10848, 10849, 10851]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    4,     5,    10, ..., 10850, 10857, 10859]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   21,    37,    72, ..., 10860, 10861, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,     7,    13, ..., 10805, 10825, 10845]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
logloss = 0.0140
multiclass logloss = 0.0140
score = 0.9977
[[1541    0    0    0    0    0    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   5    0    0    0   37    0    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1227    1]
 [   0    0    0    0    0    0    0    9 1004]]
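
With a cross-validated multiclass log loss of roughly 0.013-0.014 and accuracy of about 0.997 on the combined 30 percent feature set, a final step would be to refit the classifier on all of the training data and write out class probabilities for the combined test set. A sketch only, assuming the test columns are in the same order as the training columns and using illustrative output column names:

In [ ]:
# Sketch: fit on the full training set and predict class probabilities for the test set.
X_final_test = combined_test_data.iloc[:,1:]
clf_final = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1,
                                 min_samples_split=9, n_jobs=4, criterion='gini')
clf_final.fit(X, y)
test_probs = pd.DataFrame(clf_final.predict_proba(X_final_test),
                          columns=[ 'Prediction{:d}'.format(i) for i in range(1,10) ])
test_probs.insert(0, 'Id', combined_test_data['filename'])
test_probs.to_csv('data/final-test-predictions-30percent.csv', index=False)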

5. Test/Experimental Code Only


In [ ]:
combined_train_data['class'] = sorted_train_labels.iloc[:,1]

class1 = X[combined_train_data['class'] == 1]
class2 = X[combined_train_data['class'] == 2]
class3 = X[combined_train_data['class'] == 3]
class4 = X[combined_train_data['class'] == 4]
class5 = X[combined_train_data['class'] == 5]
class6 = X[combined_train_data['class'] == 6]
class7 = X[combined_train_data['class'] == 7]
class8 = X[combined_train_data['class'] == 8]
class9 = X[combined_train_data['class'] == 9]

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns)

class_stats.loc[1,'mean'] = 1.0
class_stats.head()

i = 1  # pick one class to inspect
classx = X[combined_train_data['class'] == i]
classxmean = classx.mean()
classxmean.head()

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
    classx = X[combined_train_data['class'] == i]
    class_stats.loc[i,'mean'] = classx.mean().sum()
    class_stats.loc[i,'std'] = classx.std().sum()
    #class_stats.loc[i,'corr'] = classx.corr().sum()
    #class_stats.loc[i,'cov'] = classx.cov().sum()

class_stats.head()

plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[ 1,2,3,4,5,6,7,8,9 ], cmap='brg')

In [17]:
['filename',[ 'train_asm_p{:d}'.format(i) for i in range(1,11) ]]


Out[17]:
['filename',
 ['train_asm_p1',
  'train_asm_p2',
  'train_asm_p3',
  'train_asm_p4',
  'train_asm_p5',
  'train_asm_p6',
  'train_asm_p7',
  'train_asm_p8',
  'train_asm_p9',
  'train_asm_p10']]

In [ ]:
help(pd.DataFrame)