1. Analysis and Transformation of Features in the 20 Percent Best Feature Set


In [ ]:
# Import everything we might need up front; it saves time later.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
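# Note: cross_validation and grid_search are the pre-0.18 scikit-learn module
# paths; in 0.18+ these utilities live in sklearn.model_selection.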
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

2. Load the Sorted Training and Test Data Features

- sorted-train-malware-features-asm-reduced.csv
- sorted-train-malware-features-byte.csv
- sorted-train-labels.csv
- sorted-train-features-combined.csv

In [2]:
# First load the .asm and .byte training/test data and training labels
sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-20percent.csv')
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_test_data_asm = pd.read_csv('data/sorted-test-malware-features-asm-20percent.csv')
sorted_test_data_byte = pd.read_csv('data/sorted-test-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
#combined_train_data = pd.read_csv('data/sorted-train-features-combined.csv')

# Load the image data for training data
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm-20percent.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm-20percent.csv')
#sorted_train_image_byte = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')

# Now load the row statistics for the original feature set; these will be combined with the new features.
all_train_rowstats = pd.read_csv('data/all-train-asm-rowstats.csv')
all_test_rowstats = pd.read_csv('data/all-test-asm-rowstats.csv')
all_train_image_asm_rowstats = pd.read_csv('data/all-train-image-asm-rowstats.csv')
all_test_image_asm_rowstats = pd.read_csv('data/all-test-image-asm-rowstats.csv')

In [3]:
sorted_train_data_asm.head()


Out[3]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 202 columns


In [4]:
sorted_train_labels.head()


Out[4]:
Id Class
0 01IsoiSMh5gxyDYTl4CB 2
1 01SuzwMJEIXsK7A8dQbl 8
2 01azqd4InC7m9JpocGv5 9
3 01jsnpXSAlgw6aPeDxrU 9
4 01kcPWA9K2BOxQeS5Rju 1

5 rows × 2 columns


In [5]:
sorted_train_image_asm.head()


Out[5]:
filename ASM_33 ASM_40 ASM_41 ASM_42 ASM_43 ASM_125 ASM_137 ASM_139 ASM_140 ASM_141 ASM_142 ASM_143 ASM_144 ASM_145 ASM_146 ASM_147 ASM_150 ASM_154 ASM_155
0 01IsoiSMh5gxyDYTl4CB 120 49 48 48 48 45 116 120 116 58 48 48 52 48 49 48 9 9 9 ...
1 01SuzwMJEIXsK7A8dQbl 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
2 01azqd4InC7m9JpocGv5 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
3 01jsnpXSAlgw6aPeDxrU 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
4 01kcPWA9K2BOxQeS5Rju 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...

5 rows × 201 columns


In [6]:
sorted_test_image_asm.head()


Out[6]:
filename ASM_33 ASM_40 ASM_41 ASM_42 ASM_43 ASM_125 ASM_137 ASM_139 ASM_140 ASM_141 ASM_142 ASM_143 ASM_144 ASM_145 ASM_146 ASM_147 ASM_150 ASM_154 ASM_155
0 ITSUPtCmh7WdJcsYDwQ5 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
1 Ig2DB5tSiEy1cJvV0zdw 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
2 Jmo6eIhLZ4t9r8QsxEg5 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
3 JtPFl4ewgdD78OzCMa3o 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
4 K3ZtByPHGSFYNljDUEXp 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...

5 rows × 201 columns


In [7]:
sorted_train_data_byte.head()


Out[7]:
filename entropy filesize
0 01IsoiSMh5gxyDYTl4CB 0.614952 6874624
1 01SuzwMJEIXsK7A8dQbl 0.843262 460288
2 01azqd4InC7m9JpocGv5 0.703961 5256192
3 01jsnpXSAlgw6aPeDxrU 0.806035 4825600
4 01kcPWA9K2BOxQeS5Rju 0.871610 712704

5 rows × 3 columns


In [9]:
sorted_test_data_asm.head()


Out[9]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 0 4 1 3 1 3 0 1 2 553 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 3 554 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 0 519 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 1 0 1 2 2 2 0 1 2 668 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 402 ...

5 rows × 202 columns


In [8]:
# Assign the asm data to X and y for brevity.
X = sorted_train_data_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]
# X_image_byte = sorted_train_image_byte.iloc[:,1:]
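
Since the labels are joined to the features purely by row order, it is worth checking that the sorted files actually line up. A minimal sanity check, assuming the Id column of the labels file uses the same hashes as the filename column of the feature files:

In [ ]:
# X and y are aligned by row order only, so the sorted filename
# and Id columns must match exactly.
assert (sorted_train_data_asm['filename'] == sorted_train_labels['Id']).all()
assert (sorted_train_image_asm['filename'] == sorted_train_labels['Id']).all()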

In [10]:
X_train_image_asm.shape


Out[10]:
(10868, 200)

In [11]:
X_test_image_asm.shape


Out[11]:
(10873, 200)

3. Perform Some Basic Statistical Analysis on the Feature Sets

Find the following:
- Feature and row mean
- Feature and row standard deviation
- Feature and row correlation coefficient
- Feature and row covariance
- Feature and row minimum and maximum
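
Most of the per-feature summaries can also be pulled in a single call, as a cross-check on the cells below:

In [ ]:
# One-shot per-feature summary: count, mean, std, min, quartiles, max.
X.describe()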

In [12]:
# Train feature stats
X_means = X.mean()
X_std = X.std()
X_cor = X.corr()
X_cov = X.cov()

# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()

# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()

# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()

# Not using byte image features
#X_image_byte_means = X_image_byte.mean()
#X_image_byte_std = X_image_byte.std()
#X_image_byte_cor = X_image_byte.corr()
#X_image_byte_cov = X_image_byte.cov()
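
The correlation matrices are computed above but never inspected directly. One quick way to surface the most strongly correlated feature pairs (a sketch, assuming a pandas version with sort_values):

In [ ]:
# Rank feature pairs by correlation, using only the upper triangle of the
# train correlation matrix so each pair appears once.
upper = X_cor.where(np.triu(np.ones(X_cor.shape), k=1).astype(bool))
top_pairs = upper.stack().sort_values(ascending=False)
print(top_pairs.head(10))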

In [13]:
X_train_image_asm_means.head()


Out[13]:
ASM_33    64.914704
ASM_40    18.125322
ASM_41    17.866949
ASM_42    17.868237
ASM_43    17.857195
dtype: float64

In [14]:
X_train_image_asm_std.head()


Out[14]:
ASM_33    29.676660
ASM_40    16.799549
ASM_41    16.358924
ASM_42    16.362247
ASM_43    16.339508
dtype: float64

In [15]:
X_test_image_asm_means.head()


Out[15]:
ASM_33    64.949508
ASM_40    18.126000
ASM_41    17.852387
ASM_42    17.850915
ASM_43    17.853030
dtype: float64

In [16]:
# The byte image data showed low standard deviation and little variance in its
# means, so it is not very useful for learning and is excluded from further analysis.
X_test_image_asm_std.head()


Out[16]:
ASM_33    29.682114
ASM_40    16.784976
ASM_41    16.337162
ASM_42    16.335167
ASM_43    16.338488
dtype: float64

In [ ]:
# Quick look at the column statistics and their extremes.
print(X_means.head())
print(X_std.head())
print(X_cor.head())
print(X_cov.head())
print(X_means.min(), X_means.max())
print(X_std.min(), X_std.max())
# Which features sit at the extremes?
print(X_means[X_means == X_means.min()])
print(X_means[X_means == X_means.max()])
print(X_std[X_std == X_std.min()])
print(X_std[X_std == X_std.max()])

In [17]:
# Row stats for the train and test data, computed across each sample's features.
X_train_rowstats = pd.DataFrame(index=np.arange(X.shape[0]), columns=['filename','mean','std','min','max','total','logtotal'])
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','e_mean','e_std','e_min','e_max','e_total','e_logtotal'])

X_train_rowstats['filename'] = sorted_train_data_asm['filename']
X_train_rowstats['mean'] = X.mean(axis=1)
X_train_rowstats['std'] = X.std(axis=1)
X_train_rowstats['min'] = X.min(axis=1)
X_train_rowstats['max'] = X.max(axis=1)
X_train_rowstats['total'] = X_train_rowstats['max'] * X_train_rowstats['mean'] * X_train_rowstats['std']
X_train_rowstats['logtotal'] = np.log(X_train_rowstats['total'])  # natural logarithm; -inf if a row were all zeros

X_test_rowstats['filename'] = sorted_test_data_asm['filename']
X_test_rowstats['e_mean'] = X_test.mean(axis=1)
X_test_rowstats['e_std'] = X_test.std(axis=1)
X_test_rowstats['e_min'] = X_test.min(axis=1)
X_test_rowstats['e_max'] = X_test.max(axis=1)
X_test_rowstats['e_total'] = X_test_rowstats['e_max'] * X_test_rowstats['e_mean'] * X_test_rowstats['e_std']
X_test_rowstats['e_logtotal'] = np.log(X_test_rowstats['e_total'])  # natural logarithm

X_train_rowstats.head()


Out[17]:
filename mean std min max total logtotal
0 01IsoiSMh5gxyDYTl4CB 1620.895522 10717.960833 0 87555 1.521066e+12 28.050433
1 01SuzwMJEIXsK7A8dQbl 170.199005 720.368718 0 5817 7.131993e+08 20.385272
2 01azqd4InC7m9JpocGv5 13888.179104 136000.425110 0 1367070 2.582119e+15 35.487387
3 01jsnpXSAlgw6aPeDxrU 710.004975 6557.617262 0 65928 3.069569e+11 26.449973
4 01kcPWA9K2BOxQeS5Rju 12.099502 40.226360 0 445 2.165899e+05 12.285761

5 rows × 7 columns


In [18]:
X_test_rowstats.head()


Out[18]:
filename e_mean e_std e_min e_max e_total e_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 360.348259 3120.464387 0 31361 3.526400e+10 24.286128
1 Ig2DB5tSiEy1cJvV0zdw 2405.940299 23569.804426 0 236923 1.343532e+13 30.228908
2 Jmo6eIhLZ4t9r8QsxEg5 2600.109453 25596.553591 0 257289 1.712357e+13 30.471477
3 JtPFl4ewgdD78OzCMa3o 212.213930 1631.495862 0 16341 5.657682e+09 22.456280
4 K3ZtByPHGSFYNljDUEXp 7841.134328 78041.521891 0 784363 4.799784e+14 33.804762

5 rows × 7 columns


In [19]:
# Image row stats; the image feature files share the same sorted row order
# as the asm files, so the asm filename column is reused.
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','tr_mean','tr_std','tr_min','tr_max','tr_total','tr_logtotal'])
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','te_mean','te_std','te_min','te_max','te_total','te_logtotal'])

X_train_image_asm_rowstats['filename'] = sorted_train_data_asm['filename']
X_train_image_asm_rowstats['tr_mean'] = X_train_image_asm.mean(axis=1)
X_train_image_asm_rowstats['tr_std'] = X_train_image_asm.std(axis=1)
X_train_image_asm_rowstats['tr_min'] = X_train_image_asm.min(axis=1)
X_train_image_asm_rowstats['tr_max'] = X_train_image_asm.max(axis=1)
X_train_image_asm_rowstats['tr_total'] = X_train_image_asm_rowstats['tr_max'] * X_train_image_asm_rowstats['tr_mean'] * X_train_image_asm_rowstats['tr_std']
X_train_image_asm_rowstats['tr_logtotal'] = np.log(X_train_image_asm_rowstats['tr_total'])  # natural logarithm

X_test_image_asm_rowstats['filename'] = sorted_test_data_asm['filename']
X_test_image_asm_rowstats['te_mean'] = X_test_image_asm.mean(axis=1)
X_test_image_asm_rowstats['te_std'] = X_test_image_asm.std(axis=1)
X_test_image_asm_rowstats['te_min'] = X_test_image_asm.min(axis=1)
X_test_image_asm_rowstats['te_max'] = X_test_image_asm.max(axis=1)
X_test_image_asm_rowstats['te_total'] = X_test_image_asm_rowstats['te_max'] * X_test_image_asm_rowstats['te_mean'] * X_test_image_asm_rowstats['te_std']
X_test_image_asm_rowstats['te_logtotal'] = np.log(X_test_image_asm_rowstats['te_total'])  # natural logarithm

X_train_image_asm_rowstats.head()


Out[19]:
filename tr_mean tr_std tr_min tr_max tr_total tr_logtotal
0 01IsoiSMh5gxyDYTl4CB 65.27 41.652271 9 124 337111.820237 12.72817
1 01SuzwMJEIXsK7A8dQbl 45.74 38.597815 9 124 218917.545703 12.29645
2 01azqd4InC7m9JpocGv5 45.74 38.597815 9 124 218917.545703 12.29645
3 01jsnpXSAlgw6aPeDxrU 45.74 38.597815 9 124 218917.545703 12.29645
4 01kcPWA9K2BOxQeS5Rju 46.65 41.145135 9 124 238008.146261 12.38006

5 rows × 7 columns


In [20]:
X_test_image_asm_rowstats.head()


Out[20]:
filename te_mean te_std te_min te_max te_total te_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 39.830 37.757506 9 124 186481.300676 12.136086
1 Ig2DB5tSiEy1cJvV0zdw 39.285 37.532482 9 124 182833.481254 12.116331
2 Jmo6eIhLZ4t9r8QsxEg5 46.660 41.146318 9 124 238066.013418 12.380303
3 JtPFl4ewgdD78OzCMa3o 46.660 41.146318 9 124 238066.013418 12.380303
4 K3ZtByPHGSFYNljDUEXp 39.285 37.532482 9 124 182833.481254 12.116331

5 rows × 7 columns


In [21]:
# Write column stats and row stats to file

X_train_rowstats.to_csv('data/train-asm-rowstats-20percent.csv', index=False)
X_test_rowstats.to_csv('data/test-asm-rowstats-20percent.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/train-image-asm-rowstats-20percent.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/test-image-asm-rowstats-20percent.csv', index=False)

In [22]:
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

# Result is better without the image rowstats for all image features
#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')

combined_train_data.to_csv('data/all-combined-train-data-20percent.csv', index=False)

combined_train_data.head()


Out[22]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 428 columns
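
The chain of pairwise merges above can also be collapsed with functools.reduce; a compact sketch (it omits the explicit suffixes, which pandas would fill in as _x/_y on any colliding column names):

In [ ]:
# Compact alternative to the chained merges above (suffix handling omitted).
from functools import reduce
frames = [sorted_train_data_asm, sorted_train_data_byte, X_train_rowstats,
          X_train_image_asm_rowstats, sorted_train_image_asm, all_train_rowstats]
combined_alt = reduce(lambda left, right: left.merge(right, on='filename'), frames)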


In [23]:
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')

combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')

combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')

combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')

# Result is better without the image rowstats for all image features
#combined_test_data = combined_test_data.merge(all_test_image_asm_rowstats, on='filename')

combined_test_data.to_csv('data/all-combined-test-data-20percent.csv', index=False)

combined_test_data.head()


Out[23]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 0 4 1 3 1 3 0 1 2 553 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 3 554 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 0 519 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 1 0 1 2 2 2 0 1 2 668 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 402 ...

5 rows × 428 columns


In [ ]:
# Do some plotting
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel('logtotal')
plt.scatter(X_train_rowstats['mean'], X_train_rowstats['logtotal'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("total")
plt.ylabel('class')
plt.scatter(Xrowstats['logtotal'], y, c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel("std")
plt.scatter(X_train_rowstats['mean'], X_train_rowstats['std'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel('class')
plt.scatter(X_train_rowstats['mean'], y, c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("standard deviation")
plt.ylabel('class')
plt.scatter(X_train_rowstats['std'], y, c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("max")
plt.ylabel('class')
plt.scatter(X_train_rowstats['max'], y, c=y, cmap='brg')

In [ ]:
# pairplot needs the class labels as a column for hue; work on a copy so the
# rowstats frame itself stays label-free for the later merges.
rowstats_plot = X_train_rowstats.copy()
rowstats_plot['class'] = y
sns.pairplot(rowstats_plot, vars=['mean','std'], kind='reg', hue='class', size=6)

In [ ]:
sns.pairplot(rowstats_plot, vars=['min','max'], kind='reg', hue='class', size=6)

In [ ]:
sns.pairplot(rowstats_plot, vars=['mean','std','min','max'], kind='reg', hue='class', size=4)

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(X_means, X_std)

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel("malware class")
ya = range(0,101)
plt.scatter(X_means, ya, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("fword")
plt.ylabel("db")
plt.scatter(X['fword'], X['db'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("WinMain")
plt.ylabel("db.1")
plt.scatter(X['WinMain'], X['db.1'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("X_image_asm")
plt.ylabel("Malware Class")
plt.scatter(X_image_asm.iloc[:,5], y, c=y, cmap='brg')

In [ ]:
#sns.set()
#sns.set_context("paper")
# Attach the class labels to a small copy so hue works without contaminating
# combined_train_data before the classification runs below.
ctd_plot = combined_train_data[['fword','db']].copy()
ctd_plot['class'] = y
sns.pairplot(ctd_plot, vars=['fword','db'], kind='reg', hue='class', size=6)


4. Perform Some Classification Tests


In [3]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class, integers in [0, n_classes)
    y_pred : array, shape = [n_samples, n_classes]
            predicted probability for each class

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss
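
As a quick sanity check on this implementation: on a toy 3-class problem where the true class always receives probability 0.8, the loss should come out to -ln(0.8) ≈ 0.2231.

In [ ]:
# Toy check of multiclass_log_loss: each row gives the true class 0.8,
# so the expected loss is -log(0.8) ~= 0.2231.
y_true_toy = np.array([0, 1, 2])
y_pred_toy = np.array([[0.8, 0.1, 0.1],
                       [0.1, 0.8, 0.1],
                       [0.1, 0.1, 0.8]])
print(multiclass_log_loss(y_true_toy, y_pred_toy))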

In [4]:
def run_cv(X,y, clf):

    # Construct a kfolds object
    kf = KFold(len(y),n_folds=10,shuffle=True)
    y_prob = np.zeros((len(y),9))
    y_pred = np.zeros(len(y))
    
    # Iterate through folds
    for train_index, test_index in kf:
        print(test_index, train_index)  # show each fold's test/train index split
        X_train = X.loc[train_index,:]
        X_test = X.loc[test_index,:]
        y_train = y[train_index]

        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    
    return y_prob, y_pred
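
On scikit-learn 0.18 and later the same out-of-fold probabilities can be produced with model_selection.cross_val_predict. A rough equivalent sketch, not used in this notebook (KFold is aliased to avoid clashing with the cross_validation import above):

In [ ]:
# Sketch of run_cv on the newer model_selection API (scikit-learn >= 0.18).
from sklearn.model_selection import KFold as MSKFold, cross_val_predict

def run_cv_new(X, y, clf):
    kf = MSKFold(n_splits=10, shuffle=True, random_state=0)
    # Out-of-fold class probabilities; columns follow the sorted class labels.
    y_prob = cross_val_predict(clf, X, y, cv=kf, method='predict_proba')
    # Recover hard predictions from the probabilities rather than refitting.
    y_pred = np.unique(y)[np.argmax(y_prob, axis=1)]
    return y_prob, y_pred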

In [27]:
X = combined_train_data.iloc[:,1:]

In [28]:
X.head()


Out[28]:
edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax ebp
0 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 905 ...
1 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 1544 ...
2 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 591 ...
3 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 451 ...
4 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 43 ...

5 rows × 427 columns


In [29]:
X.shape


Out[29]:
(10868, 427)


In [30]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
yloss = y - 1  # shift the 1-9 class labels to 0-8 for multiclass_log_loss
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([    2,     4,    15, ..., 10845, 10863, 10866]), array([    0,     1,     3, ..., 10864, 10865, 10867]))
(array([    3,     8,    27, ..., 10832, 10860, 10867]), array([    0,     1,     2, ..., 10864, 10865, 10866]))
(array([   46,    55,    71, ..., 10854, 10859, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   11,    13,    16, ..., 10837, 10839, 10848]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   18,    21,    30, ..., 10821, 10830, 10844]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,    12,    41, ..., 10853, 10855, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([    5,     6,     9, ..., 10849, 10851, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,    26,    28, ..., 10846, 10847, 10850]), array([    0,     2,     3, ..., 10865, 10866, 10867]))
(array([   14,    32,    33, ..., 10840, 10857, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,    20,    43, ..., 10852, 10858, 10861]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
logloss = 0.0168
multiclass logloss = 0.0168
score = 0.9977
[[1540    0    0    0    0    1    0    0    0]
 [   1 2474    3    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   2    0    0    0   39    1    0    0    0]
 [   4    0    0    0    0  747    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1226    2]
 [   0    0    0    0    0    0    0    9 1004]]
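
classification_report is already imported and gives a per-class precision/recall view of the same out-of-fold predictions:

In [ ]:
# Per-class precision, recall and F1 for the cross-validated predictions.
print(classification_report(y, pred1))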

In [47]:
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


score = 0.9979
[[1541    0    0    0    0    0    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   1    0    0    0   40    1    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1226    2]
 [   0    0    0    0    0    0    0    9 1004]]

In [48]:
yloss = y - 1
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))


multiclass logloss = 0.0164

In [31]:
# Combine all the training features and write to file.

# removing image rowstats for all image features produces the best ExtraTreesClassifier result yet: 
# logloss: 0.0162
# score: 0.9976

combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')

combined_train_data.to_csv('data/all-combined-train-data-20percent.csv', index=False)

X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1

combined_train_data.head()

In [32]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([   44,    54,    55, ..., 10851, 10853, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([    1,    23,    35, ..., 10839, 10856, 10860]), array([    0,     2,     3, ..., 10865, 10866, 10867]))
(array([   10,    25,    41, ..., 10826, 10831, 10846]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   28,    30,    36, ..., 10843, 10845, 10861]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   12,    20,    24, ..., 10809, 10852, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,     7,    15, ..., 10847, 10855, 10862]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
(array([    3,     8,    11, ..., 10830, 10833, 10834]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    4,     5,    17, ..., 10854, 10859, 10867]), array([    0,     1,     2, ..., 10864, 10865, 10866]))
(array([    6,    14,    18, ..., 10848, 10857, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     9,    13, ..., 10850, 10858, 10866]), array([    1,     2,     3, ..., 10864, 10865, 10867]))
logloss = 0.0162
multiclass logloss = 0.0162
score = 0.9976
[[1541    0    0    0    0    0    0    0    0]
 [   1 2473    3    0    0    0    0    1    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   2    0    0    0   40    0    0    0    0]
 [   6    0    0    0    0  745    0    0    0]
 [   0    0    0    0    0    0  398    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    9 1004]]

In [48]:
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


score = 0.9975
[[1539    0    0    0    0    2    0    0    0]
 [   2 2474    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  473    0    1    0    0    0]
 [   3    0    0    0   39    0    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1226    2]
 [   0    0    0    0    0    0    0    8 1005]]

In [49]:
yloss = y - 1
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))


multiclass logloss = 0.0174

5. Test/Experimental Code Only


In [ ]:
combined_train_data['class'] = sorted_train_labels.iloc[:,1]

class1 = X[combined_train_data['class'] == 1]
class2 = X[combined_train_data['class'] == 2]
class3 = X[combined_train_data['class'] == 3]
class4 = X[combined_train_data['class'] == 4]
class5 = X[combined_train_data['class'] == 5]
class6 = X[combined_train_data['class'] == 6]
class7 = X[combined_train_data['class'] == 7]
class8 = X[combined_train_data['class'] == 8]
class9 = X[combined_train_data['class'] == 9]

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns)

class_stats.loc[1, 'mean'] = 1.0  # .loc avoids chained-assignment pitfalls
class_stats.head()

i = 1  # pick one class to inspect (i was previously undefined at this point)
classx = X[combined_train_data['class'] == i]
classxmean = classx.mean()
classxmean.head()

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
    classx = X[combined_train_data['class'] == i]
    class_stats.loc[i, 'mean'] = classx.mean().sum()
    class_stats.loc[i, 'std'] = classx.std().sum()
    #class_stats.loc[i, 'corr'] = classx.corr().sum()
    #class_stats.loc[i, 'cov'] = classx.cov().sum()

class_stats.head()

plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[ 1,2,3,4,5,6,7,8,9 ], cmap='brg')