In [1]:
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score, KFold, train_test_split  # pre-0.18 module path; now sklearn.model_selection
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV  # pre-0.18 module path; now sklearn.model_selection
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
import xgboost as xgb
%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
Run the model selection functions on the combined ASM training data, using the
top-30% feature set and the call graph feature set.
The data frames will be:
- final-combined-train-data-30percent.csv
- sorted-train-labels.csv
- all-combined-train-data.csv
The files are loaded, and their row alignment sanity-checked, below.
In [2]:
# First load the .asm and .byte training data and training labels
# sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-reduced.csv')
# sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
combined_train_data = pd.read_csv('data/final-combined-train-data-30percent.csv')
combined_test_data = pd.read_csv('data/final-combined-test-data-30percent.csv')
call_graph_features_train = pd.read_csv('data/final-call-graph-features-10percent.csv')
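A quick sanity check (a minimal sketch): the X/y assignments below are only valid if the feature frame and the label frame list the samples in the same order. This assumes the first column of each frame is the sample name, as the iloc[:,1:] slicing used throughout implies.
In [ ]:
# Fail fast if the feature rows and label rows are misaligned.
assert len(combined_train_data) == len(sorted_train_labels)
assert (combined_train_data.iloc[:, 0].values == sorted_train_labels.iloc[:, 0].values).all()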
In [3]:
sorted_train_labels.head()
Out[3]:
In [4]:
combined_train_data.head()
Out[4]:
In [3]:
combined_test_data.head()
Out[3]:
In [2]:
# Utility function to report the best scores from a grid or randomized search
from operator import itemgetter

def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")
In [3]:
def run_cv(X, y, clf):
    """10-fold CV returning out-of-fold class probabilities and predictions."""
    # Construct a k-folds object (pre-0.18 sklearn.cross_validation API)
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))  # 9 malware classes
    y_pred = np.zeros(len(y))
    # Iterate through folds
    for train_index, test_index in kf:
        # KFold yields positional indices, so use iloc; loc breaks when X has
        # a shuffled index (e.g. a partition produced by train_test_split).
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]
        clf.fit(X_train, y_train.flatten())  # flatten avoids data conversion warnings
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    return y_prob, y_pred
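The malware classes in this dataset are imbalanced, so a stratified split usually gives more stable per-fold estimates. A minimal variant of run_cv (a sketch using the same pre-0.18 cross_validation API; the experiments below use plain KFold):
In [ ]:
from sklearn.cross_validation import StratifiedKFold

def run_cv_stratified(X, y, clf, n_classes=9):
    # StratifiedKFold preserves the class proportions in every fold.
    skf = StratifiedKFold(y, n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), n_classes))
    y_pred = np.zeros(len(y))
    for train_index, test_index in skf:
        clf.fit(X.iloc[train_index, :], y[train_index])
        y_prob[test_index] = clf.predict_proba(X.iloc[test_index, :])
        y_pred[test_index] = clf.predict(X.iloc[test_index, :])
    return y_prob, y_pred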
In [7]:
# Assign asm data to X,y for brevity, then hold out 10% of the rows for testing.
X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
In [8]:
X_train.shape
Out[8]:
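Before tuning anything it is worth confirming that the 90/10 split preserved the class balance (a quick check; classes are labeled 1-9 at this point):
In [ ]:
# Per-class sample counts in the train and test partitions.
print(np.bincount(y_train)[1:])
print(np.bincount(y_test)[1:])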
In [35]:
plt.figure(figsize=(15,15))
plt.xlabel("EDX Register")
plt.ylabel("Malware Class")
xa = np.array(X['edx'])
ya = np.array(y)
plt.scatter(xa, ya, c=ya, cmap='brg')
Out[35]:
In [36]:
plt.figure(figsize=(15,15))
plt.xlabel("EDX Register")
plt.ylabel("ESI Register")
xa = np.array(X['edx'])
xb = np.array(X['esi'])
ya = np.array(y)
plt.scatter(xa,xb,c=ya,cmap='brg')
Out[36]:
In [18]:
X_means = X.mean()
X_std = X.std()
X_var = X.var()
X_cov = X.cov()
In [19]:
X_means.head()
Out[19]:
In [23]:
X_std.head()
Out[23]:
In [24]:
X_var.head()
Out[24]:
In [25]:
X_cov.head()
Out[25]:
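The full covariance table is too wide to scan. A correlation heatmap (seaborn is already imported as sns) shows strongly coupled features at a glance; a minimal sketch over the first 20 columns:
In [ ]:
# Correlation is scale-free, so it is easier to compare across features than
# raw covariance. Restrict to 20 columns to keep the plot legible.
corr = X.iloc[:, :20].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', center=0.0)
plt.title("Correlation of the first 20 combined features")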
In [ ]:
# Set the parameters by cross-validation. Unlike GridSearchCV,
# RandomizedSearchCV takes a single dict of distributions, not a list of grids.
param_dist = {'kernel': ['rbf', 'linear'],
              'gamma': [1e-3, 1e-4],
              'C': [1, 10, 100, 1000]}
n_iter_search = 10
print("# Tuning hyper-parameters for SVC")
print()
clfrand = RandomizedSearchCV(SVC(C=1), param_dist, n_iter=n_iter_search, cv=10)
start = time()
clfrand.fit(X_train, y_train)
print("Best parameters set found on training set:")
print()
print(clfrand.best_params_)
print()
print("Grid scores on training set:")
print()
report(clfrand.grid_scores_)
print()
print("SVC took {:.2f} seconds for {:d} candidates.".format(time() - start, n_iter_search))
print()
print("Classification report:")
y_true, y_pred = y_test, clfrand.predict(X_test)
print(classification_report(y_true, y_pred))
print()
In [ ]:
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
print("# Tuning hyper-parameters for SVC")
print()
clfgrid = GridSearchCV(SVC(C=1), tuned_parameters, cv=10, n_jobs=4)
start = time()
clfgrid.fit(X_train, y_train)
print("Best parameters set found on training set:")
print()
print(clfgrid.best_params_)
print()
print("Grid scores on training set:")
print()
report(clfgrid.grid_scores_)
print()
print("SVC took {:.2f} seconds for {:d} candidates.".format(time() - start, len(clfgrid.grid_scores_)))
print()
print("Classification report:")
y_true, y_pred = y_test, clfgrid.predict(X_test)
print(classification_report(y_true, y_pred))
print()
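Note that SVC is sensitive to feature scale, and the opcode/register counts here span several orders of magnitude. A scaled variant (an alternative sketch, not what the searches above used) wraps the scaler and classifier in a Pipeline so scaling is re-fit inside every CV fold:
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Scaling inside the pipeline avoids leaking test-fold statistics into training.
pipe = Pipeline([('scale', StandardScaler()), ('svc', SVC(C=1, kernel='rbf'))])
pipe.fit(X_train, y_train)
print(classification_report(y_test, pipe.predict(X_test)))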
In [14]:
clfextra1 = ExtraTreesClassifier(n_jobs=4)
# Use a randomized search over the parameter space. The most important
# parameters are n_estimators (larger is generally better) and max_features
# (for classification a good default is the square root of the number of features).
# Reference: http://scikit-learn.org/stable/modules/ensemble.html
param_dist = {"n_estimators": [100, 500, 1000],
              "max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clfextra1, param_distributions=param_dist, n_iter=n_iter_search)
start = time()
random_search.fit(X_train, y_train)
print("ExtraTreesClassifier - RandomizedSearchCV:")
print(" ")
print("Best parameters set found on training set:")
print(" ")
print(random_search.best_params_)
print(" ")
print("Grid scores on training set:")
print(" ")
report(random_search.grid_scores_)
print(" ")
print("Classification report:")
print("RandomizedSearchCV took {:.2f} seconds for {:d} candidates.".format((time() - start), n_iter_search))
print(" ")
y_pred = random_search.predict(X_test)
print(classification_report(y_test, y_pred))
print(" ")
y_prob = random_search.predict_proba(X_test)
print("logloss = {:.3f}".format(log_loss(y_test, y_prob)))
print("score = {:.3f}".format(accuracy_score(y_test, y_pred)))
cm = confusion_matrix(y_test, y_pred)
print(cm)
In [ ]:
clfextra2 = ExtraTreesClassifier(n_jobs=4)
# Use a full grid over all parameters. The most important parameters are
# n_estimators (larger is generally better) and max_features (for
# classification a good default is the square root of the number of features).
# Reference: http://scikit-learn.org/stable/modules/ensemble.html
param_grid = {"n_estimators": [100, 500, 1000, 2000],
              "max_depth": [None],
              "max_features": [20],
              "min_samples_split": [1],
              "min_samples_leaf": [1],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# run grid search
grid_search = GridSearchCV(clfextra2, param_grid=param_grid, cv=10)
start = time()
grid_search.fit(X_train, y_train)
print("ExtraTreesClassifier - GridSearchCV:")
print(" ")
print("Best parameters set found on training set:")
print(" ")
print(grid_search.best_params_)
print(" ")
print("Grid scores on training set:")
print(" ")
report(grid_search.grid_scores_)
print(" ")
print("Classification report:")
print("GridSearchCV took {:.2f} seconds.".format((time() - start)))
print(" ")
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))
print(" ")
y_prob = grid_search.predict_proba(X_test)
print("logloss = {:.3f}".format(log_loss(y_test, y_prob)))
print("score = {:.3f}".format(accuracy_score(y_train, y_pred)))
cm = confusion_matrix(y_test, y_pred)
print(cm)
In [13]:
print("Classification report:")
print("GridSearchCV took {:.2f} seconds.".format(time() - start))
print(" ")
y_pred = grid_search.predict(X_test)
print(classification_report(y_test, y_pred))
print(" ")
y_prob = grid_search.predict_proba(X_test)
print("logloss = {:.3f}".format(log_loss(y_test, y_prob)))
print("score = {:.3f}".format(accuracy_score(y_test, y_pred)))
cm = confusion_matrix(y_test, y_pred)
print(cm)
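The raw confusion matrix is easier to read as a heatmap (a small sketch, reusing the cm computed in the previous cell):
In [ ]:
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted class")
plt.ylabel("True class")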
In [37]:
# Load the .byte training data (the read_csv at the top of the notebook is
# commented out), assign it to X,y, then split the dataset in two equal parts.
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
X = sorted_train_data_byte.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
In [44]:
plt.figure(figsize=(15,15))
plt.xlabel("File Entropy")
plt.ylabel("Malware Class")
xa = np.array(X['entropy'])
xb = np.array(X['filesize'])
ya = np.array(y)
plt.scatter(xa,ya,c=ya,cmap='brg')
Out[44]:
In [43]:
plt.figure(figsize=(15,15))
plt.xlabel("File Size")
plt.ylabel("Malware Class")
plt.scatter(xb,ya,c=ya,cmap='brg')
Out[43]:
In [42]:
plt.figure(figsize=(15,15))
plt.xlabel("File Size")
plt.ylabel("Shannon's Entropy")
#colors = cm.rainbow(np.linspace(0, 1, len(ya)))
plt.scatter(xb,xa,c=ya,cmap='brg')
Out[42]:
In [ ]:
clfridge = RidgeClassifierCV(cv=10)
clfridge.fit(X_train, y_train)
y_pred = clfridge.predict(X_test)
print(classification_report(y_test, y_pred))
print(" ")
print("score = {:.3f}".format(accuracy_score(y_train, y_pred)))
cm = confusion_matrix(y_test, y_pred)
print(cm)
In [ ]:
clfextra = ExtraTreesClassifier(n_jobs=4)
# Use a full grid over all parameters. The most important parameters are
# n_estimators (larger is generally better) and max_features (for
# classification a good default is the square root of the number of features).
# Reference: http://scikit-learn.org/stable/modules/ensemble.html
param_grid = {"n_estimators": [1000, 2000],
              "max_depth": [3, None],
              "max_features": [1, 2],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# run grid search
grid_search = GridSearchCV(clfextra, param_grid=param_grid)
start = time()
grid_search.fit(X, y)
print("ExtraTreesClassifier - GridSearchCV:")
print(" ")
print("Best parameters set found on training set:")
print(" ")
print(grid_search.best_params_)
print(" ")
print("Grid scores on training set:")
print(" ")
report(grid_search.grid_scores_)
print(" ")
print("Classification report:")
print("GridSearchCV took {:.2f} seconds.".format((time() - start)))
print(" ")
# NOTE: the model is scored on the same rows it was fit on, so these numbers are optimistic.
y_pred = grid_search.predict(X)
print(classification_report(y, y_pred))
print(" ")
y_prob = grid_search.predict_proba(X)
print("logloss = {:.3f}".format(log_loss(y, y_prob)))
print("score = {:.3f}".format(accuracy_score(y, y_pred)))
cm = confusion_matrix(y, y_pred)
print(cm)
In [6]:
# Assign the combined data to X,y for brevity, then split the dataset in two equal parts.
X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
In [33]:
clfridge = RidgeClassifierCV(cv=10)
clfridge.fit(X_train, y_train)
y_pred = clfridge.predict(X_test)
print(classification_report(y_test, y_pred))
print(" ")
print("score = {:.3f}".format(accuracy_score(y_test, y_pred)))
cm = confusion_matrix(y_test, y_pred)
print(cm)
In [17]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.3f}".format(log_loss(y, p1)))
print("score = {:.3f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)
In [15]:
X = combined_train_data.iloc[:,1:]
ylabels = sorted_train_labels.iloc[:,1:]
y = np.array(ylabels - 1).flatten()  # shift classes 1-9 down to 0-8 for XGBoost
y
Out[15]:
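XGBoost's multi:softprob objective expects zero-based, contiguous class ids, which is why the cell above shifts the 1-9 labels down by one. A cheap assertion catches off-by-one mistakes early:
In [ ]:
# Fail fast if the labels are not 0..8 for this 9-class problem.
assert y.min() == 0 and y.max() == 8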
In [ ]:
xgclf = xgb.XGBClassifier(objective="multi:softprob", nthread=4)
params = {"n_estimators": [1000, 2000],
          "max_depth": [5, 10],
          "learning_rate": [0.1, 0.05]}
# run grid search
grid_search = GridSearchCV(xgclf, param_grid=params)
start = time()
grid_search.fit(X, y)
print("XGBoost Classifier - GridSearchCV:")
print(" ")
print("Best parameters set found on training set:")
print(" ")
print(grid_search.best_params_)
print(" ")
print("Grid scores on training set:")
print(" ")
report(grid_search.grid_scores_)
print(" ")
print("Classification report:")
print("GridSearchCV took {:.2f} seconds.".format((time() - start)))
print(" ")
# NOTE: the model is scored on the same rows it was fit on, so these numbers are optimistic.
y_pred = grid_search.predict(X)
print(classification_report(y, y_pred))
print(" ")
y_prob = grid_search.predict_proba(X)
print("logloss = {:.3f}".format(log_loss(y, y_prob)))
print("score = {:.3f}".format(accuracy_score(y, y_pred)))
cm = confusion_matrix(y, y_pred)
print(cm)
In [18]:
print("GridSearchCV took {:.2f} seconds.".format((time() - start)))
print(" ")
y_pred = grid_search.predict(X)
print(classification_report(y, y_pred))
print(" ")
y_prob = grid_search.predict_proba(X)
print("logloss = {:.3f}".format(log_loss(y, y_prob)))
print("score = {:.3f}".format(accuracy_score(y, y_pred)))
cm = confusion_matrix(y, y_pred)
print(cm)
In [ ]:
# Now try with best parameters and 50/50 train-test split
xgclf = xgb.XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.01, objective="multi:softprob", nthread=4)
prob1, pred1 = run_cv(X_train, y_train, xgclf)
print("logloss = {:.3f}".format(log_loss(y_train, prob1)))
print("score = {:.3f}".format(accuracy_score(y_train, pred1)))
cm = confusion_matrix(y_train, pred1)
print(cm)
In [9]:
# run_cv refits xgclf in place, so these test-set numbers come from the model
# trained on the final fold's training split.
pred2 = xgclf.predict(X_test)
prob2 = xgclf.predict_proba(X_test)
print("logloss = {:.3f}".format(log_loss(y_test, prob2)))
print("score = {:.3f}".format(accuracy_score(y_test, pred2)))
cm = confusion_matrix(y_test, pred2)
print(cm)
In [10]:
xgclf = xgb.XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.1, objective="multi:softprob", nthread=4)
prob1, pred1 = run_cv(X,y,xgclf)
print("logloss = {:.3f}".format(log_loss(y, prob1)))
print("score = {:.3f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)
In [ ]:
help(xgb)
In [ ]:
help(ExtraTreesClassifier)
In [ ]:
ytrain = np.array(y)
In [ ]:
# data_reduced comes from an earlier feature-reduction step (not shown in this notebook)
X = data_reduced.iloc[:,1:]
X.shape
In [ ]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,ytrain,clf1)
print "logloss = %.3f" % log_loss(y, p1)
print "score = %.3f" % accuracy_score(ytrain, pred1)
cm = confusion_matrix(y, pred1)
print(cm)
In [ ]:
clf2 = ExtraTreesClassifier(n_estimators=500, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p2, pred2 = run_cv(X,ytrain,clf2)
print "logloss = %.3f" % log_loss(y, p2)
print "score = %.3f" % accuracy_score(ytrain, pred2)
cm = confusion_matrix(y, pred2)
print(cm)
In [ ]:
clf3 = ExtraTreesClassifier(n_estimators=250, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p3, pred3 = run_cv(X,ytrain,clf3)
print "logloss = %.3f" % log_loss(y, p3)
print "score = %.3f" % accuracy_score(ytrain, pred3)
cm = confusion_matrix(y, pred3)
print(cm)
In [ ]:
clf4 = ExtraTreesClassifier(n_estimators=2000, max_features=None, min_samples_leaf=2, min_samples_split=3, n_jobs=4, criterion='gini')
p4, pred4 = run_cv(X,ytrain,clf4)
print "logloss = %.3f" % log_loss(y, p4)
print "score = %.3f" % accuracy_score(ytrain, pred4)
cm = confusion_matrix(y, pred4)
print(cm)
In [ ]:
clf5 = ExtraTreesClassifier(n_estimators=1000, n_jobs=4, criterion='gini')
p5, pred5 = run_cv(X,ytrain,clf5)
print "logloss = %.4f" % log_loss(y, p5)
print "score = %.4f" % accuracy_score(ytrain, pred5)
cm = confusion_matrix(y, pred5)
print(cm)
In [ ]:
clf6 = ExtraTreesClassifier(n_estimators=2000, n_jobs=4, criterion='gini')
p6, pred6 = run_cv(X,ytrain,clf6)
print "logloss = %.4f" % log_loss(y, p6)
print "score = %.4f" % accuracy_score(ytrain, pred6)
cm = confusion_matrix(y, pred6)
print(cm)
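The six cells above repeat one evaluate-and-print pattern while varying n_estimators and the split parameters. An equivalent compact sweep (a sketch over the n_estimators values actually tried above):
In [ ]:
for n in [250, 500, 1000, 2000]:
    clf = ExtraTreesClassifier(n_estimators=n, max_features=None,
                               min_samples_leaf=1, min_samples_split=9,
                               n_jobs=4, criterion='gini')
    probs, preds = run_cv(X, ytrain, clf)
    print("n_estimators={:4d}: logloss={:.4f} score={:.4f}".format(
        n, log_loss(ytrain, probs), accuracy_score(ytrain, preds)))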
In [4]:
data = pd.read_csv('data/all-combined-train-data-final.csv')
labels = pd.read_csv('data/sorted-train-labels.csv')
data.head(20)
Out[4]:
In [10]:
X = data.iloc[:,1:]
ylabels = labels.iloc[:,1:].values
y = np.array(ylabels - 1).flatten()  # shift classes 1-9 down to 0-8 for XGBoost
y
Out[10]:
In [11]:
labels.head()
Out[11]:
In [12]:
xgclf = xgb.XGBClassifier(objective="multi:softprob", nthread=4)
params = {"n_estimators": [1000, 2000],
          "max_depth": [5, 10],
          "learning_rate": [0.1, 0.05]}
# run grid search
grid_search = GridSearchCV(xgclf, param_grid=params)
start = time()
grid_search.fit(X, y)
print("XGBoost Classifier - GridSearchCV:")
print(" ")
print("Best parameters set found on training set:")
print(" ")
print(grid_search.best_params_)
print(" ")
print("Grid scores on training set:")
print(" ")
report(grid_search.grid_scores_)
print(" ")
print("Classification report:")
print("GridSearchCV took {:.2f} seconds.".format((time() - start)))
print(" ")
y_pred = grid_search.predict(X)
print(classification_report(y, y_pred))
print(" ")
y_prob = grid_search.predict_proba(X)
print("logloss = {:.3f}".format(log_loss(y, y_prob)))
print("score = {:.3f}".format(accuracy_score(y, y_pred)))
cm = confusion_matrix(y, y_pred)
print(cm)
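The report above scores the refit model on the same rows it was trained on, so it is optimistic. A held-out estimate with the chosen parameters (a sketch; expect a noticeably lower score):
In [ ]:
best = xgb.XGBClassifier(objective="multi:softprob", nthread=4, **grid_search.best_params_)
scores = cross_val_score(best, X, y, cv=10)
print("10-fold accuracy: {:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))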
In [ ]:
# TODO:
In [ ]:
# go through the features and drop near-empty columns whose sums fall below a
# small threshold (0, 110 and 100 are tried below)
colsum = X.sum(axis=0, numeric_only=True)
In [ ]:
zerocols = colsum[colsum == 0]
zerocols
In [ ]:
zerocols = colsum[(colsum[:] < 110)]
zerocols.shape
In [ ]:
reduceX = X.copy()  # copy so the column deletions do not mutate X
for col in reduceX.columns:
    if reduceX[col].sum() < 100:
        del reduceX[col]
reduceX.shape
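A vectorized equivalent of the loop above (same threshold of 100, without the per-column Python loop):
In [ ]:
col_sums = X.sum(axis=0, numeric_only=True)
reduceX = X.loc[:, col_sums[col_sums >= 100].index]
reduceX.shape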
In [ ]:
from sklearn.feature_selection import SelectKBest, chi2
skb = SelectKBest(chi2, k=20)  # chi2 requires non-negative features; counts qualify
X_kbestnew = skb.fit_transform(X, y)
X_kbestnew.shape
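SelectKBest returns a bare array, which drops the column names. To see which features survived (a small follow-up sketch, assuming X is still the frame passed to fit_transform above):
In [ ]:
kept = X.columns[skb.get_support()]
print(kept)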
In [ ]:
combined_train_data.loc[combined_train_data['filename'] == '4jKA1GUDv6TMNpPuIxER',:]
# Get an array of labels in the same order as the asm filenames
# y = [0]*labels.shape[0]
# fnames = train_data_asm['filename']
# for i in range(len(y)):
#     fname = train_data_asm.loc[i,'filename']
#     row = labels[labels['Id'] == fname]
#     y[i] = row.iloc[0,1]
In [ ]:
sorted_train_data_byte[sorted_train_data_byte['filename'] == '4jKA1GUDv6TMNpPuIxER']
In [ ]:
count = 0
for i in range(len(y)):
    if y[i] == 0:
        count += 1
print(count)
In [ ]:
count = 0
for i in range(len(sorted_train_labels)):
    if sorted_train_labels.iloc[i,1] == 0:
        count += 1
print(count)
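Both counting loops above reduce to one-line vectorized comparisons:
In [ ]:
print((np.array(y) == 0).sum())
print((sorted_train_labels.iloc[:, 1] == 0).sum())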
In [27]:
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
# get some data
digits = load_digits()
X, y = digits.data, digits.target
In [32]:
type(X)
Out[32]: