1. Model Selection.

Candidate libraries considered for this stage (a quick comparison sketch follows the list):

- XGBoost
- LightGBM (Microsoft)
- CNTK (Microsoft)
- Leaf (https://github.com/autumnai/leaf)
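
A minimal comparison sketch, assuming a feature matrix X and label vector y prepared as in the cells below; it cross-validates each candidate with log loss (the classifier choices and parameters are illustrative, not the notebook's final settings):

In [ ]:
# Sketch: score candidate classifiers with 5-fold cross-validation.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb

candidates = {
    "RandomForest": RandomForestClassifier(n_estimators=100, n_jobs=4),
    "ExtraTrees": ExtraTreesClassifier(n_estimators=100, n_jobs=4),
    "XGBoost": xgb.XGBClassifier(objective="multi:softprob"),
}
for name, clf in candidates.items():
    scores = cross_val_score(clf, X, y, cv=5, scoring="neg_log_loss")
    print("{}: log loss = {:.3f} (+/- {:.3f})".format(name, -scores.mean(), scores.std()))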

In [1]:
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
import xgboost as xgb
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


Populating the interactive namespace from numpy and matplotlib

In [2]:
def run_cv(X, y, clf):

    # Construct a k-fold splitter (model_selection API).
    kf = KFold(n_splits=10, shuffle=True)
    n_classes = len(np.unique(y))
    y_prob = np.zeros((len(y), n_classes))
    y_pred = np.zeros(len(y))

    # Iterate through the folds, predicting each held-out fold in turn.
    for train_index, test_index in kf.split(X):
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]

        clf.fit(X_train, y_train.ravel())  # ravel avoids data-conversion warnings

        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)

    return y_prob, y_pred
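
A usage sketch for run_cv, assuming the feature matrix X and label vector y prepared in the cells below: cross-validate a baseline classifier and score the out-of-fold predictions.

In [ ]:
# Sketch: out-of-fold probabilities and labels for an ExtraTrees baseline,
# scored with multi-class log loss and accuracy.
clf = ExtraTreesClassifier(n_estimators=500, n_jobs=4)
y_prob, y_pred = run_cv(X, y, clf)
print("logloss = {:.3f}".format(log_loss(y, y_prob)))
print("score = {:.3f}".format(accuracy_score(y, y_pred)))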

In [ ]:
# Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. Call graph features.
# 6. TrID ID features.
sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs251.csv')
sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs251.csv')
sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs251.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs251.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs251.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs251.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs251.csv') # Select only scalar columns.
sorted_asm_features.head()
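
The next cell uses combined_train_data, which is not built in this section. A minimal sketch of one way to construct it, assuming each feature frame keys on a shared file-name column (the 'file_name' label below is hypothetical):

In [ ]:
# Sketch only: merge the feature frames into a single training matrix.
# Assumes each frame's first column is a shared file-name key; the
# 'file_name' column label is hypothetical.
combined_train_data = sorted_asm_features.merge(
    sorted_entropy_features, on='file_name', suffixes=('', '_ent'))
combined_train_data = combined_train_data.merge(
    sorted_call_graph_features, on='file_name', suffixes=('', '_cg'))
combined_train_data.head()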

In [ ]:
# Assign the combined feature data to X,y for brevity, then hold out 10% of the dataset as a test set.
X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
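
A quick held-out baseline is a useful reference point before tuning; a sketch using the split above, including the confusion matrix across the nine classes:

In [ ]:
# Sketch: ExtraTrees baseline scored on the 10% held-out split.
clf = ExtraTreesClassifier(n_estimators=500, n_jobs=4)
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
print("score = {:.3f}".format(accuracy_score(y_test, y_test_pred)))
print(confusion_matrix(y_test, y_test_pred))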

In [ ]:
# Scatter the 'edx' register feature against the malware class labels,
# coloured by class.
plt.figure(figsize=(15,15))
plt.xlabel("EDX Register")
plt.ylabel("Malware Class")
xa = np.array(X['edx'])
ya = np.array(y)
plt.scatter(xa, ya, c=ya, cmap='brg')

In [ ]:
# XGBoost's multi:softprob objective expects zero-based class labels,
# so shift the labels down by one and flatten to a 1-D array.
X = combined_train_data.iloc[:,1:]
ylabels = sorted_train_labels.iloc[:,1:]
y = np.array(ylabels - 1).flatten()
y
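
A quick sanity check on the label vector (sketch): confirm the range is zero-based and inspect the class balance.

In [ ]:
# Sketch: label range and per-class sample counts.
print(y.min(), y.max())
print(np.bincount(y))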

In [ ]:
xgclf = xgb.XGBClassifier(objective="multi:softprob", n_jobs=4)

params = {"n_estimators": [1000, 2000],
          "max_depth": [5, 10],
          "learning_rate": [0.1, 0.09]}

# Run an exhaustive grid search over the parameter grid.
grid_search = GridSearchCV(xgclf, param_grid=params)
start = time()
grid_search.fit(X, y)

print("XGBoost Classifier - GridSearchCV:")
print(" ")
print("GridSearchCV took {:.2f} seconds.".format(time() - start))
print(" ")
print("Best parameters set found on training set:")
print(" ")
print(grid_search.best_params_)
print(" ")
print("Classification report:")
print(" ")
# Note: the scores below are computed on the training set itself, so they
# are in-sample and optimistic; a held-out check follows.
y_pred = grid_search.predict(X)
print(classification_report(y, y_pred))
print(" ")
y_prob = grid_search.predict_proba(X)
print("logloss = {:.3f}".format(log_loss(y, y_prob)))
print("score = {:.3f}".format(accuracy_score(y, y_pred)))

In [ ]: