In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import log_loss, confusion_matrix
import xgboost as xgb
h = .02  # step size in the mesh

# Each entry pairs a display name with the estimator it labels. Keeping the
# two together means a model cannot be enabled or disabled in one list and
# forgotten in the other. Entries that are commented out are models that are
# currently switched off.
_MODEL_SPECS = [
    ("Nearest Neighbors", KNeighborsClassifier()),
    # ("Linear SVM", SVC(kernel="linear", C=0.025, probability=True)),
    # ("RBF SVM", SVC(gamma=2, C=1, probability=True)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=1000)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    # ("Linear Discriminant Analysis", LinearDiscriminantAnalysis()),
    # ("Quadratic Discriminant Analysis", QuadraticDiscriminantAnalysis()),
    ("Extra Trees", ExtraTreesClassifier(n_estimators=1000)),
]

# Unzip into the two index-aligned lists the rest of the notebook iterates over.
# map(list, ...) is used instead of a comprehension so no loop variable leaks
# into the notebook namespace.
names, classifiers = map(list, zip(*_MODEL_SPECS))
In [3]:
# Load the per-file feature table and the table of training labels.
data = pd.read_csv('data/malware-features-asm.csv')
labels = pd.read_csv('data/trainLabels.csv')

# Every column after the first is treated as a feature
# (the first column is presumably the file name -- confirm against the CSV).
X = data.iloc[:, 1:]
fnames = data['filename']

# Build an Id -> label lookup once instead of scanning `labels` for every row.
# The label is taken from the second column of `labels`; when an Id occurs more
# than once the first occurrence wins.
unique_labels = labels.drop_duplicates(subset='Id')
label_by_id = pd.Series(unique_labels.iloc[:, 1].values,
                        index=unique_labels['Id'].values)

# One label per row of `data`, so that `y` always has exactly as many entries
# as `X` has rows (no hard-coded row count).
matched = fnames.map(label_by_id)
missing = fnames[matched.isnull()]
if len(missing):
    # Fail loudly rather than letting NaN labels reach the classifiers.
    raise KeyError("no label found for %d file(s), e.g.: %s"
                   % (len(missing), ", ".join(map(str, missing[:5]))))
y = matched.tolist()
In [4]:
def run_cv(X, y, clf, n_folds=10, random_state=None):
    """Collect out-of-fold class probabilities and predictions for ``clf``.

    The rows are split with a shuffled K-fold. For every fold ``clf`` is
    re-fitted on the training rows and then asked to predict the held-out rows.

    Parameters
    ----------
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix. Rows are always selected by position, so the index of
        a DataFrame does not matter.
    y : array-like
        One label per row of ``X``.
    clf : classifier
        Object providing ``fit``, ``predict`` and ``predict_proba`` and, once
        fitted, a ``classes_`` attribute. It is fitted in place.
    n_folds : int, optional
        Number of folds (default 10).
    random_state : int, RandomState or None, optional
        Forwarded to ``KFold`` so the shuffled split can be made reproducible.

    Returns
    -------
    y_prob : numpy.ndarray, shape (len(y), n_classes)
        Predicted probabilities. Column ``j`` belongs to ``np.unique(y)[j]``.
        A class that was absent from a fold's training rows keeps
        probability 0 for that fold's held-out rows.
    y_pred : numpy.ndarray, shape (len(y),)
        Predicted labels, stored with the dtype of ``y``.
    """
    y = np.asarray(y)
    classes = np.unique(y)

    # Construct a kfolds object (legacy sklearn.cross_validation signature).
    kf = KFold(len(y), n_folds=n_folds, shuffle=True, random_state=random_state)

    y_prob = np.zeros((len(y), len(classes)))
    y_pred = np.empty(len(y), dtype=y.dtype)

    # Fold indices are positions, not index labels: use .iloc for DataFrames
    # and plain indexing for arrays.
    rows = X.iloc if hasattr(X, "iloc") else X

    # Iterate through folds
    for train_index, test_index in kf:
        X_test = rows[test_index]
        clf.fit(rows[train_index], y[train_index])

        # predict_proba only has columns for the classes seen during fit, in
        # the order of clf.classes_; place them in the matching global columns.
        cols = np.searchsorted(classes, clf.classes_)
        y_prob[np.ix_(test_index, cols)] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)

    return y_prob, y_pred
In [14]:
# Cross-validate every configured classifier and report, for each one, the
# log loss of its out-of-fold probabilities and the confusion matrix of its
# out-of-fold predictions.
ytrain = np.array(y)
for name, clf in zip(names, classifiers):
    print(name)
    prob, pred = run_cv(X, ytrain, clf)
    # print() as a call with a single argument prints the same text on
    # Python 2 and Python 3; the former print statement was Python 2 only.
    print("logloss: %.3f" % log_loss(ytrain, prob))
    cm = confusion_matrix(ytrain, pred)
    print(cm)
In [6]: