1. Final Selection of Features

   - 30 Percent Best ASM Feature Set
   - 10 Percent Best Call Graph Feature Set
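
These two sets were produced in earlier notebooks by scoring each feature and keeping only the top fraction. As a reminder of the idea, here is a minimal sketch of one way such a percentile cut can be made; the use of ExtraTreesClassifier importances is an illustrative assumption, not the exact selection code used to build the two sets above.

In [ ]:
# Illustrative sketch only: keep the top `pct` percent of features, ranked by
# the importances of a fitted tree ensemble (the ranking method is an assumption).
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier

def select_top_percent(X, y, pct=30):
    model = ExtraTreesClassifier(n_estimators=200, n_jobs=4)
    model.fit(X, y)
    importances = pd.Series(model.feature_importances_, index=X.columns)
    n_keep = max(1, int(len(importances) * pct / 100))
    keep = importances.sort_values(ascending=False).index[:n_keep]
    return X[keep]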

In [1]:
# Just import everything we might need; it saves time.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# cross_validation and grid_search were removed in scikit-learn 0.20; both now live in model_selection.
from sklearn.model_selection import cross_val_score, KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


Populating the interactive namespace from numpy and matplotlib

2. Load and Combine Training Data

- final-combined-train-data-30percent.csv
- final-call-graph-features-10percent.csv
- sorted-train-labels.csv

In [2]:
# Load the training labels, the best-30% ASM features, and the best-10% call graph features.
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
combined_train_data = pd.read_csv('data/final-combined-train-data-30percent.csv')
call_graph_features_train = pd.read_csv('data/final-call-graph-features-10percent.csv')

In [3]:
sorted_train_labels.head()


Out[3]:
Id Class
0 01IsoiSMh5gxyDYTl4CB 2
1 01SuzwMJEIXsK7A8dQbl 8
2 01azqd4InC7m9JpocGv5 9
3 01jsnpXSAlgw6aPeDxrU 9
4 01kcPWA9K2BOxQeS5Rju 1

In [4]:
combined_train_data.head()


Out[4]:
filename edx esi es ds ss cs ah al ax ... ASM_964 ASM_972 ASM_977 ASM_990 trainmean trainstd trainmin trainmax traintotal trainlogtotal
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 ... 32 49 53 10 586.160040 12877.609022 0.0 288961.0 2.181176e+12 28.410885
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 ... 48 9 9 116 5.908549 60.063976 0.0 1068.0 3.790235e+05 12.845354
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 ... 48 9 9 116 7.002982 64.756651 0.0 1173.0 5.319434e+05 13.184292
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 48 9 9 116 327.150099 3278.958529 0.0 81305.0 8.721682e+10 25.191663
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 ... 48 89 32 71 5.932406 60.189034 0.0 1068.0 3.813462e+05 12.851463

5 rows × 623 columns


In [5]:
call_graph_features_train.head()


Out[5]:
filename vertex_count edge_count delta_max density $+5 $+5_x $+5_y $5MGU$ch $6MDU$ch ... subst_x subst_y unkno unkno_x unkno_x.1 unkno_x.2 unkno_y unkno_y.1 unkno_y.2 wpa_hexd
0 01IsoiSMh5gxyDYTl4CB 274 333 137 0.081319 0.0 0 0 0 0.0 ... 0.0 0.0 1.0 1 0.0 0.0 1 0.0 0.0 0.0
1 01SuzwMJEIXsK7A8dQbl 187 196 82 0.181314 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
2 01azqd4InC7m9JpocGv5 158 1533 95 0.140927 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
3 01jsnpXSAlgw6aPeDxrU 26 126 35 0.600000 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
4 01kcPWA9K2BOxQeS5Rju 61 53 24 0.504762 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0

5 rows × 1562 columns


In [3]:
# Merge the original ASM feature set with the call graph feature set.
all_combined_train_data = combined_train_data.merge(call_graph_features_train, on='filename', suffixes=('_asm','_cg'))
all_combined_train_data.head(20)


Out[3]:
filename edx_asm esi es ds ss cs ah al ax ... subst_x subst_y unkno unkno_x unkno_x.1 unkno_x.2 unkno_y unkno_y.1 unkno_y.2 wpa_hexd
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 ... 0.0 0.0 1.0 1 0.0 0.0 1 0.0 0.0 0.0
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
5 02IOCvYEy8mjiuAQHax3 19 15 1 0 2 1 2 6 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
6 02JqQ7H3yEoD8viYWlmS 337 250 8 0 0 0 3 82 1 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
7 02K5GMYITj7bBoAisEmD 10182 20055 0 0 0 0 0 97 307 ... 0.0 0.0 0.0 1 1.0 1.0 1 1.0 1.0 0.0
8 02MRILoE6rNhmt7FUi45 462 576 0 0 0 0 0 48 21 ... 0.0 0.0 0.0 1 1.0 1.0 1 1.0 1.0 0.0
9 02mlBLHZTDFXGa7Nt6cr 141 96 2 2 1 2 6 13 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
10 02zcUmKV16Lya5xqnPGB 600 981 0 0 0 0 45 87 49 ... 0.0 0.0 1.0 1 1.0 0.0 1 1.0 0.0 0.0
11 03nJaQV6K2ObICUmyWoR 360 351 43 30 34 14 62 231 1 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
12 04BfoQRA6XEshiNuI7pF 125 106 0 0 0 0 0 1 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
13 04EjIdbPV5e1XroFOpiN 936 5027 1 1 1 1 3 45 157 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
14 04QzZ3DVdPsEp9elLR65 123 112 0 0 0 0 1 1 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
15 04cvLCVPqBMs6yn5xGlE 119 103 0 0 0 0 0 1 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
16 04hSzLv5s2TDYPlcgpHB 1023 23 3 0 3 0 0 16 2 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
17 04mcPSei852tgIKUwTJr 1127 4 0 0 0 0 0 1 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
18 04sJnMaORYc1SV5pKjrP 126 83 0 0 0 0 2 1 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
19 05EeG39MTRrI6VY21DPd 1011 895 0 0 0 0 0 60 2 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0

20 rows × 2184 columns


In [5]:
all_combined_train_data.to_csv('data/all-combined-train-data-final.csv', index=False)
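
Before relying on the saved file, a quick sanity check that the merge behaved: the ASM set has 623 columns and the call graph set 1562, sharing only the filename key, so the merged frame should have 623 + 1562 - 1 = 2184 columns (as the head() output above confirms) and one row per labelled sample, in the same sorted order.

In [ ]:
# Sanity-check the merge: the shared 'filename' key is counted once, and every
# labelled sample should survive the inner join, in label order.
assert all_combined_train_data.shape[1] == \
    combined_train_data.shape[1] + call_graph_features_train.shape[1] - 1
assert len(all_combined_train_data) == len(sorted_train_labels)
assert (all_combined_train_data['filename'].values == sorted_train_labels['Id'].values).all()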

3. Load and Combine Test Data

- TODO: mirror the training-data merge for the test set (sketched in the next cell)

In [ ]:
# TODO: combine the 30% best ASM and 10% best call graph test features,
# mirroring the training merge above (the call graph file name is assumed).
combined_test_data = pd.read_csv('data/final-combined-test-data-30percent.csv')
call_graph_features_test = pd.read_csv('data/final-call-graph-features-test-10percent.csv')
all_combined_test_data = combined_test_data.merge(call_graph_features_test, on='filename', suffixes=('_asm','_cg'))

4. Perform Some Classification Tests


In [7]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi-class version of the logarithmic loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            True class labels, integers in [0, n_classes).
    y_pred : array, shape = [n_samples, n_classes]
            Predicted probability of each class, one row per sample.

    Returns
    -------
    loss : float
    """
    # clip away 0/1 so the log is finite, then renormalize row sums to 1
    predictions = np.clip(y_pred, eps, 1 - eps)
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    # one-hot encode the true labels and average the negative log-likelihood
    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss
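
As a quick check, the hand-rolled metric should agree with sklearn's log_loss on a small example (the three-sample, three-class data below is hypothetical):

In [ ]:
# Toy check with hypothetical data: both metrics should print ~0.3122.
y_true_toy = np.array([0, 1, 2])
y_pred_toy = np.array([[0.8, 0.1, 0.1],
                       [0.2, 0.7, 0.1],
                       [0.1, 0.2, 0.7]])
print(multiclass_log_loss(y_true_toy, y_pred_toy))
print(log_loss(y_true_toy, y_pred_toy))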

In [8]:
def run_cv(X, y, clf):

    # Construct a k-fold object (modern API: n_splits, and the indices
    # come from kf.split rather than iterating the object directly).
    kf = KFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))

    # Iterate through folds, collecting out-of-fold probabilities and predictions.
    for train_index, test_index in kf.split(X):
        print(test_index, train_index)
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]

        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)

    return y_prob, y_pred
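
The class counts are far from balanced (class 5 has only 42 samples, as the confusion matrices below show), so a stratified variant that preserves per-class proportions in each fold may be a safer default; a minimal sketch:

In [ ]:
# Stratified variant of run_cv: StratifiedKFold keeps the class proportions
# of y in every fold, which matters for the rare classes.
from sklearn.model_selection import StratifiedKFold

def run_cv_stratified(X, y, clf):
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))
    for train_index, test_index in skf.split(X, y):
        clf.fit(X.iloc[train_index, :], y[train_index])
        y_prob[test_index] = clf.predict_proba(X.iloc[test_index, :])
        y_pred[test_index] = clf.predict(X.iloc[test_index, :])
    return y_prob, y_pred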

In [9]:
# Set our X, y for the classifiers: X drops the 'filename' column, and the
# labels run 1-9, so yloss shifts them to 0-8 for multiclass_log_loss.
#all_combined_train_data = pd.read_csv('data/all-combined-train-data-final.csv')
#sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = all_combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1

In [10]:
all_combined_train_data.head()


Out[10]:
filename edx_x esi es ds ss cs ah al ax ... subst_x subst_y unkno unkno_x unkno_x.1 unkno_x.2 unkno_y unkno_y.1 unkno_y.2 wpa_hexd
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 ... 0.0 0.0 1.0 1 0.0 0.0 1 0.0 0.0 0.0
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0

5 rows × 2184 columns


In [11]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss, p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


[   14    19    36 ..., 10826 10854 10866] [    0     1     2 ..., 10864 10865 10867]
[    7    15    30 ..., 10844 10855 10864] [    0     1     2 ..., 10865 10866 10867]
[   27    39    41 ..., 10846 10851 10865] [    0     1     2 ..., 10864 10866 10867]
[   12    13    33 ..., 10843 10850 10863] [    0     1     2 ..., 10865 10866 10867]
[    1    11    21 ..., 10835 10847 10857] [    0     2     3 ..., 10865 10866 10867]
[    0     4     5 ..., 10849 10859 10860] [    1     2     3 ..., 10865 10866 10867]
[    3     9    16 ..., 10840 10856 10867] [    0     1     2 ..., 10864 10865 10866]
[    2    17    20 ..., 10809 10822 10858] [    0     1     3 ..., 10865 10866 10867]
[   25    31    38 ..., 10853 10861 10862] [    0     1     2 ..., 10865 10866 10867]
[    6     8    26 ..., 10842 10845 10852] [    0     1     2 ..., 10865 10866 10867]
logloss = 0.0138
multiclass logloss = 0.0138
score = 0.9976
[[1541    0    0    0    0    0    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   5    0    0    0   37    0    0    0    0]
 [   3    0    0    0    0  748    0    0    0]
 [   0    0    0    0    0    0  398    0    0]
 [   0    0    0    0    0    0    0 1223    5]
 [   0    0    0    0    0    0    0    9 1004]]
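
Seaborn is imported above but unused so far; the confusion matrix is easier to read as an annotated heatmap (classes labelled 1-9 on both axes):

In [ ]:
# Render the confusion matrix from the previous cell as a heatmap.
labels = list(range(1, 10))
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
plt.xlabel("predicted class")
plt.ylabel("true class")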

In [24]:
# Repeat the run: the folds are reshuffled each time, so the loss shifts slightly.
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([    8,    33,    41, ..., 10830, 10838, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     3,    18, ..., 10844, 10858, 10865]), array([    1,     2,     4, ..., 10864, 10866, 10867]))
(array([    9,    14,    27, ..., 10854, 10855, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   26,    39,    40, ..., 10829, 10853, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,    12,    31, ..., 10842, 10863, 10867]), array([    0,     2,     3, ..., 10864, 10865, 10866]))
(array([   20,    28,    32, ..., 10823, 10836, 10852]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    11,    19, ..., 10848, 10849, 10851]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    4,     5,    10, ..., 10850, 10857, 10859]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   21,    37,    72, ..., 10860, 10861, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,     7,    13, ..., 10805, 10825, 10845]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
logloss = 0.0140
multiclass logloss = 0.0140
score = 0.9977
[[1541    0    0    0    0    0    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   5    0    0    0   37    0    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1227    1]
 [   0    0    0    0    0    0    0    9 1004]]

5. Test/Experimental Code Only


In [ ]:
# Attach the class labels so the training data can be sliced per class.
combined_train_data['class'] = sorted_train_labels.iloc[:,1]

# Per-class feature subsets (classes are labelled 1 to 9).
class_subsets = {i: X[combined_train_data['class'] == i] for i in range(1, 10)}

# Aggregate summary statistics for each class. Chained assignment like
# class_stats['mean'][i] = ... can silently fail in pandas; use .loc instead.
columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in index:
    classx = class_subsets[i]
    class_stats.loc[i, 'mean'] = classx.mean().sum()
    class_stats.loc[i, 'std'] = classx.std().sum()
    #class_stats.loc[i, 'corr'] = classx.corr().sum().sum()
    #class_stats.loc[i, 'cov'] = classx.cov().sum().sum()

class_stats.head()

# Scatter the per-class aggregate mean against the aggregate standard
# deviation, one point per class, coloured by class label.
plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=index, cmap='brg')

In [4]:
# Check whether any column name appears more than once after the merges.
name_map = {}
column_names = all_combined_train_data.columns
for cname in column_names:
    if cname not in name_map:
        name_map[cname] = 1
    else:
        name_map[cname] += 1
    if name_map[cname] > 1:
        print("Feature Name: {:s} -> {:d}".format(cname, name_map[cname]))