In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import log_loss, confusion_matrix
import xgboost as xgb

h = .02  # step size in the mesh

names = ["Nearest Neighbors", 
         #"Linear SVM", 
         #"RBF SVM", 
         "Decision Tree",
         "Random Forest", 
         "AdaBoost", 
         "Naive Bayes", 
         #"Linear Discriminant Analysis",
         #"Quadratic Discriminant Analysis",
         "Extra Trees"]
classifiers = [
    KNeighborsClassifier(),
    #SVC(kernel="linear", C=0.025, probability=True),
    #SVC(gamma=2, C=1, probability=True),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    #LinearDiscriminantAnalysis(),
    #QuadraticDiscriminantAnalysis(),
    ExtraTreesClassifier(n_estimators=1000)]
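
xgboost is imported above but never added to the comparison; a minimal sketch of including it, assuming the scikit-learn-compatible XGBClassifier wrapper (hyperparameters here are illustrative, not tuned):

# Hypothetical addition, not part of the original run.
names.append("XGBoost")
classifiers.append(xgb.XGBClassifier(n_estimators=500, max_depth=5))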

In [3]:
data = pd.read_csv('data/malware-features-asm.csv')
labels = pd.read_csv('data/trainLabels.csv')
X = data.iloc[:, 1:]           # feature columns (column 0 is the filename)
y = [0] * 500
fnames = data['filename']
# Look up each sample's class label in the label table by filename.
for i in range(500):
    fname = data.loc[i, 'filename']
    gr = labels[labels['Id'] == fname]
    y[i] = gr.iloc[0, 1]       # second column of trainLabels.csv holds the label
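
The per-row lookup can also be written as a single join; a sketch (assuming every filename in the feature table appears exactly once in trainLabels.csv):

merged = data.merge(labels, left_on='filename', right_on='Id', how='left')
y_alt = merged[labels.columns[1]].tolist()   # same column as gr.iloc[0, 1] above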

In [4]:
def run_cv(X, y, clf):

    # Construct a k-fold splitter (10 shuffled folds)
    kf = KFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))

    # Iterate through folds, collecting out-of-fold predictions
    for train_index, test_index in kf.split(X):
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]

        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)

    return y_prob, y_pred
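
The same out-of-fold predictions can be obtained with scikit-learn's cross_val_predict; a sketch of the equivalent (random_state pins the shuffle so both passes see identical folds):

from sklearn.model_selection import cross_val_predict

def run_cv_sklearn(X, y, clf):
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    y_prob = cross_val_predict(clf, X, y, cv=cv, method='predict_proba')
    y_pred = cross_val_predict(clf, X, y, cv=cv, method='predict')
    return y_prob, y_pred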

In [14]:
# Iterate over the classifiers, reporting multiclass log loss and a confusion matrix
ytrain = np.array(y)
for name, clf in zip(names, classifiers):
    print(name)
    prob, pred = run_cv(X, ytrain, clf)
    print("logloss: %.3f" % log_loss(y, prob))
    cm = confusion_matrix(y, pred)
    print(cm)


Nearest Neighbors
logloss: 1.708
[[ 61   1   3   0   0   1   0   0   0]
 [  7 111   0   0   1   2   0   0   0]
 [  0   1 147   0   0   0   0   0   0]
 [  0   0   0   2   1   0   0   2   1]
 [  0   0   0   0   3   0   0   0   0]
 [  1   1   3   0   0  18   0   1   4]
 [  0   1   2   0   1   1  18   0   0]
 [  0   0   1   3   0   1   0  55   4]
 [  0   0   2   2   0   0   0   1  36]]
Decision Tree
logloss: 1.546
[[ 55   2   0   0   0   5   2   1   1]
 [  3 114   0   0   0   0   2   1   1]
 [  0   0 141   0   0   1   5   0   1]
 [  2   0   0   0   1   1   1   0   1]
 [  0   0   0   2   0   1   0   0   0]
 [  4   1   0   0   0  19   3   1   0]
 [  2   0   0   0   0   0  21   0   0]
 [  2   0   1   0   0   4   0  57   0]
 [  1   0   0   0   2   3   0   0  35]]
Random Forest
logloss: 0.297
[[ 63   0   2   0   0   0   0   0   1]
 [  4 115   0   0   0   0   0   2   0]
 [  0   0 147   0   0   0   0   0   1]
 [  0   0   0   0   0   0   0   6   0]
 [  0   0   0   0   0   0   0   3   0]
 [  3   0   0   0   0  22   1   1   1]
 [  0   0   2   0   0   0  20   1   0]
 [  2   0   0   0   0   0   0  62   0]
 [  0   0   0   0   0   0   0   1  40]]
AdaBoost
logloss: 2.015
[[  0  53   1   0   1   0   1  10   0]
 [  0 118   3   0   0   0   0   0   0]
 [  0  11 136   0   0   0   1   0   0]
 [  0   5   1   0   0   0   0   0   0]
 [  0   2   1   0   0   0   0   0   0]
 [  0  25   1   0   0   0   1   1   0]
 [  0   2   3   0   0   0  17   1   0]
 [  0  61   0   0   0   0   1   2   0]
 [  0  36   0   0   0   0   1   2   2]]
Naive Bayes
logloss: 6.851
[[ 35   5   1   0   0  14   1   2   8]
 [  2  66   2   1   0   7   0   0  43]
 [  0   0 147   0   0   0   0   0   1]
 [  0   0   0   4   0   1   1   0   0]
 [  0   0   0   3   0   0   0   0   0]
 [  1   1   3   0   0  18   0   2   3]
 [  0   0   0   0   0   0  21   0   2]
 [  0   0   1   0   0   2   3  57   1]
 [  0   1   0   0   0   0   0   4  36]]
Extra Trees
logloss: 0.185
[[ 64   0   0   0   0   0   0   0   2]
 [  2 116   0   0   0   0   0   3   0]
 [  0   0 146   0   0   0   0   2   0]
 [  0   0   0   4   0   0   0   2   0]
 [  0   0   0   0   2   0   0   1   0]
 [  2   0   0   0   0  24   0   1   1]
 [  0   0   1   0   0   0  20   2   0]
 [  0   0   1   0   0   0   0  63   0]
 [  0   0   0   0   0   0   0   0  41]]
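
matplotlib is imported at the top but unused so far; a sketch of rendering the last confusion matrix (cm, left over from the loop above) as a heat map for visual inspection:

plt.figure(figsize=(6, 6))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Extra Trees confusion matrix')
plt.colorbar()
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()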

In [6]: