In [1]:
# Standard library
import gzip
import pickle
import warnings
from time import time

# Third-party
import numpy as np
import scipy as sp
import pandas as pd
#import sklearn as skl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
from sklearn.feature_selection import SelectPercentile, chi2
#import xgboost as xgb
#import lightgbm as lgb
%pylab inline
warnings.filterwarnings("ignore")
In [2]:
def run_cv(X, y, clf, num_iters):
    """Cross-validated class probabilities and label predictions for clf.

    Fits `clf` on each training fold and fills the held-out fold's rows of
    the probability and prediction arrays.

    NOTE: the softprob assignment assumes every training fold contains all
    label values present in y; with small samples and many classes the
    column count of predict_proba can differ from y.nunique() and the
    assignment fails. In that case comment out the y_prob assignment and
    return value.

    Parameters
    ----------
    X : pd.DataFrame   feature matrix
    y : pd.Series      labels aligned positionally with X
    clf : estimator with fit/predict/predict_proba
    num_iters : int    number of folds

    Returns
    -------
    (y_prob, y_pred) : per-sample class probabilities and predicted labels.
    """
    len_y = len(y)
    num_labels = y.nunique()
    kf = KFold(n_splits=num_iters, shuffle=True)
    y_prob = np.zeros((len_y, num_labels))
    y_pred = np.zeros(len_y)
    # Iterate through folds.
    for train_index, test_index in kf.split(X, y):
        # KFold yields positional indices; iloc keeps this correct even
        # when X/y do not carry a default RangeIndex (loc would mis-select).
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y.iloc[train_index]
        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    return y_prob, y_pred
def print_classification_report(y, y_pred, out_file):
    """Write sklearn's text classification report for (y, y_pred) to data/<out_file>.

    Note: despite the name, this writes to a file rather than printing.
    """
    report = classification_report(y, y_pred)
    # Context manager guarantees the handle is closed even if writing fails;
    # write() is correct here because the report is a single string
    # (writelines() on a str would iterate it character by character).
    with open('data/' + out_file, 'w') as fop:
        fop.write(report)
    return
def run_prediction_cv(X, y, clf, num_iters):
    """Cross-validated label predictions for clf (labels only, no softprob).

    Fits `clf` on each training fold and fills the held-out fold's slots of
    the prediction array. Unlike run_cv(), no probability matrix is built,
    so uneven label distributions across folds are not a problem here.

    Parameters
    ----------
    X : pd.DataFrame   feature matrix
    y : pd.Series      labels aligned positionally with X
    clf : estimator with fit/predict
    num_iters : int    number of folds

    Returns
    -------
    y_pred : np.ndarray of per-sample predicted labels.
    """
    len_y = len(y)
    kf = KFold(n_splits=num_iters, shuffle=True)
    y_pred = np.zeros(len_y)
    # Iterate through folds.
    for train_index, test_index in kf.split(X, y):
        # Positional indexing (iloc) matches the positional indices KFold yields.
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y.iloc[train_index]
        clf.fit(X_train, y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
def write_confusion_matrix(cm, out_file_name):
    """Write confusion matrix `cm` to data/<out_file_name>, one CSV row per matrix row.

    Parameters
    ----------
    cm : 2-D array-like (e.g. the output of sklearn's confusion_matrix)
    out_file_name : str   file name under the data/ directory
    """
    with open('data/' + out_file_name, 'w') as fop:
        for row in cm:
            fop.write(','.join(str(x) for x in row) + "\n")
    return
def reduce_feature_set(X, y, out_filename):
    """Select the 10% best features of X (chi2 score) and write them to data/<out_filename>.

    NOTE(review): the original body referenced undefined names (`idx`,
    `sorted_feature_subset`, `out_file_name`) and could never run. This
    version assumes X is the sorted feature subset and carries a
    'file_name' column in front of the numeric features -- confirm
    against the caller before relying on it.

    Parameters
    ----------
    X : pd.DataFrame   'file_name' column plus numeric feature columns
    y : pd.Series      labels aligned with X
    out_filename : str file name under the data/ directory
    """
    print("Subset shape: {:d} {:d}".format(X.shape[0], X.shape[1]))
    print("Length of y: {:d}".format(len(y)))
    # Now select the 10% best features for this feature subset
    # (the non-numeric file_name column is excluded from scoring).
    features_only = X.drop(columns=['file_name'])
    fsp = SelectPercentile(chi2, percentile=10)
    fsp.fit(features_only, y)
    selected = fsp.get_support(indices=True)
    data_trimmed = features_only.iloc[:, selected]
    data_reduced = pd.DataFrame(X['file_name']).join(data_trimmed)
    # Write to file as we do not have enough memory to keep everything.
    filename = "data/" + out_filename
    data_reduced.to_csv(filename, index=False)
    print("Writing file: {:s}".format(filename))
    return
def pickle_it(clf, file_name, protocol = 4):
    """Pickle `clf` (uncompressed) to data/<file_name>.

    Use protocol 4 with python 3, protocol 2 with python 2.
    Uncompressed pickles of large classifiers are huge -- compress the
    resulting file externally (gzip ~4GB -> ~20MB); the python gzip module
    used too much memory for these objects, so a syscall to gzip/bzip is
    the intended follow-up (TODO).
    """
    # Context manager closes the handle even if dump() raises.
    with open('data/' + file_name, 'wb') as fop:
        pickle.dump(clf, fop, protocol)
    return
def unpickle_it(file_name):
    """Load and return a pickled object from data/<file_name>.

    Fixes the original call, which passed a ('path', 'rb') tuple to open()
    and raised a TypeError.
    WARNING: pickle.load executes arbitrary code -- only unpickle files
    you produced yourself.
    """
    with open('data/' + file_name, 'rb') as fip:
        return pickle.load(fip)
def gzip_pickle_it(object, filename, protocol = 4):
    """Pickle `object` to a gzip-compressed file at `filename` (full path, not data/-relative).

    The original body passed the filename string straight to pickle.dump,
    which expects a file object (TypeError), and never compressed anything.
    NOTE: per the original comments, gzip-compressing very large
    classifiers (ExtraTrees) can exhaust memory -- prefer pickle_it() plus
    external compression for those.
    """
    with gzip.open(filename, 'wb') as fop:
        pickle.dump(object, fop, protocol)
    return
def gzip_unpickle_it(filename):
    """Load and return a pickled object from a gzip-compressed file.

    The original accumulated file.read() bytes into a str accumulator with
    an "" sentinel -- a TypeError (str + bytes) in python 3 before the
    sentinel could ever match. Streaming the file object straight into
    pickle avoids the manual buffering entirely.
    """
    with gzip.open(filename, 'rb') as fip:
        return pickle.load(fip)
def save_predictions(feature_set, y_labels, prediction_labels, file_name):
    """Persist per-sample predictions alongside their file names.

    Writes a CSV with columns file_name, label, prediction to
    data/<file_name>, one row per sample.

    Parameters
    ----------
    feature_set : pd.DataFrame with a 'file_name' column
    y_labels : ground-truth labels
    prediction_labels : classifier outputs, positionally aligned
    file_name : output file name under data/
    """
    results = pd.DataFrame()
    results['file_name'] = feature_set['file_name']
    results['label'] = y_labels
    results['prediction'] = pd.Series(prediction_labels)
    results.to_csv('data/' + file_name, index=False)
    return
def save_function_count_predictions(feature_set, y_labels, prediction_labels, file_name):
    """Persist per-sample predictions for the function-count feature set.

    Same CSV layout as save_predictions() (file_name, label, prediction),
    but reads the source column as 'filename' -- the function-count frames
    apparently name it without the underscore (presumably intentional;
    verify against the CSV headers).
    """
    results = pd.DataFrame()
    results['file_name'] = feature_set['filename']
    results['label'] = y_labels
    results['prediction'] = pd.Series(prediction_labels)
    results.to_csv('data/' + file_name, index=False)
    return
In [ ]:
# cPickle does not exist in python 3 -- the pickle module replaces it
# (with the C implementation used automatically).
help(pickle)
In [ ]:
help(StratifiedKFold)
In [ ]:
help(KFold)
In [ ]:
In [6]:
In [ ]:
In [3]:
# Load in the combined feature set and training labels
# VS251 dataset: combined PE/COFF features plus their sorted label file.
combined_train_features = pd.read_csv('data/combined-pe-features-vs251.csv')
train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-vs251.csv')
train_labels.head()
Out[3]:
In [5]:
combined_train_features.head()
Out[5]:
In [4]:
# X,y = get_training_data(combined_train_features, sorted_train_labels, 'data/sorted-pe-coff-train-labels-vs251.csv')
# Column 0 is presumably the file name, so keep only the feature columns.
X = combined_train_features.iloc[:,1:]
y = train_labels['label']
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [7]:
type(y)
Out[7]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# Getting memory errors :-(
# GridSearchCV needs more memberberries.
clfextra = ExtraTreesClassifier(n_jobs=4)
# use a full grid over all parameters, most important parameters are n_estimators (larger is better) and
# max_features (for classification best value is square root of the number of features)
# Reference: http://scikit-learn.org/stable/modules/ensemble.html
# Fixed: min_samples_split must be an int >= 2 (sklearn rejects 1), so the
# grid starts at 2 instead of the original invalid 1.
param_grid = {"n_estimators": [1000, 2000],
              "max_depth": [3, None],
              "max_features": [1, 2],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
# run grid search
grid_search = GridSearchCV(clfextra, param_grid=param_grid)
start = time()
grid_search.fit(X, y)
print("ExtraTreesClassifier - GridSearchCV:")
print(" ")
print("Best parameters set found on training set:")
print(" ")
print(grid_search.best_params_)
print(" ")
#print("Grid scores on training set:")
#print(" ")
#report(grid_search.grid_scores_)
#print(" ")
print("Classification report:")
print("GridSearchCV took {:.2f} seconds.".format((time() - start)))
print(" ")
# NOTE(review): these metrics are computed on the same data the search was
# fit on, so they are optimistic -- hold out a test set for honest numbers.
y_pred = grid_search.predict(X)
print(classification_report(y, y_pred))
print(" ")
y_prob = grid_search.predict_proba(X)
print("logloss = {:.3f}".format(log_loss(y, y_prob)))
print("score = {:.3f}".format(accuracy_score(y, y_pred)))
cm = confusion_matrix(y, y_pred)
print(cm)
In [ ]:
# Getting memory exhaustion, try a small subset of features.
#X = combined_train_features.iloc[:,1:100]
#y = train_labels['label']
#print("Length of y: {:d}".format(len(y)))
#print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
# 10-fold CV predictions for ExtraTrees on the VS251 combined features.
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print(classification_report(y, pred1))
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs251.txt')
In [9]:
# NOTE(review): this cell re-writes the exact same confusion matrix file
# as the cell above -- redundant on a clean run.
write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs251.txt')
In [ ]:
In [ ]:
In [6]:
# 10-fold CV predictions for RandomForest on the VS251 combined features.
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
# Fixed copy-paste error: this cell times RandomForest, not ExtraTrees.
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-randomforest-vs251.txt')
In [ ]:
In [ ]:
# NOTE(review): `import xgboost as xgb` is commented out in the imports
# cell -- re-enable it before running this cell.
# Presumably shifts 1-based labels to the 0-based labels xgboost's
# multi:softmax objective expects -- confirm the label encoding.
xgblabels = y - 1
xgclf = xgb.XGBClassifier(n_estimators=100, max_depth=10, learning_rate=1.0, objective="multi:softmax", nthread=4)
start = time()
pred1 = run_prediction_cv(X, xgblabels, xgclf, 10)
#print("logloss = {:.3f}".format(log_loss(y, prob1)))
print("score = {:.3f}".format(accuracy_score(xgblabels, pred1)))
print(" ")
# Fixed copy-paste error: this cell times XGBoost, not ExtraTrees.
print("XGBClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(xgblabels, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xgboost-vs251.txt')
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# DEPRECATED: use the LightGBM sklearn API class.
def run_lightgbm_cv(X, y, params, num_iters):
# Construct a kfolds object:
# For softprob prediction this will only work if the distribution of
# label values is even throughout each sub-sample, so only large sample
# sizes will generally work, small sample sizes with a large number of
# label values will generate errors when doing the softprob assignment
# to y_prob because the training set will likely have a different number
# of unique label values from the full sample set.
# In this case comment out the softprob assignment and y_prob return
# value.
len_y = len(y)
num_labels = y.nunique()
kf = KFold(len_y, n_folds=num_iters, shuffle=True)
y_prob = np.zeros((len_y, num_labels))
y_pred = np.zeros(len_y)
# Iterate through folds
for train_index, test_index in kf:
print(test_index, train_index)
X_train = X.loc[train_index,:]
X_test = X.loc[test_index,:]
y_train = y[train_index]
train_data = lgb.Dataset(X_train, label=y_train)
clf = lgb.train(params, train_data, 10)
y_prob[test_index] = clf.predict_proba(X_test)
y_pred[test_index] = clf.predict(X_test)
return y_prob, y_pred
In [ ]:
# DEPRECATED: use sklearn API module below.
# NOTE(review): `train_data` built here is unused -- run_lightgbm_cv builds
# its own per-fold Dataset. Also the confusion-matrix filename below says
# 'xtratrees-vs251' although this cell evaluates LightGBM -- looks like a
# copy-paste leftover; confirm before it overwrites the ExtraTrees file.
train_data = lgb.Dataset(X, label=y)
params = {'learning_rate':1.0, 'num_leaves':31, 'num_trees':100, 'objective':'multiclass', 'num_threads':4 }
prob1, pred1 = run_lightgbm_cv(X, y, params, 10)
print(" ")
print("Classification report:")
print(classification_report(y, pred1))
print("logloss = {:.3f}".format(log_loss(y, prob1)))
print("score = {:.3f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs251.txt')
In [ ]:
In [9]:
In [ ]:
# sklearn type API.
# NOTE(review): `import lightgbm as lgb` is commented out in the imports
# cell -- re-enable it before running this cell.
clf1 = lgb.LGBMClassifier(n_estimators=100, learning_rate=1.0, objective='multiclass')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print(classification_report(y, pred1))
#print("logloss = {:.3f}".format(log_loss(y, prob1)))
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
# Fixed copy-paste error: this cell times LightGBM, not ExtraTrees.
print("LGBMClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-lightgbm-vs264.txt')
In [8]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# python 3 has no cPickle module -- use the pickle_it helper (protocol 4,
# writes under data/, closes the handle safely).
pickle_it(clf1, 'classifier-model-vs251-xtrees-1000.pkl', 4)
In [ ]:
In [3]:
# VS252 dataset: combined PE/COFF features plus their sorted label file.
sorted_train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-vs252.csv')
combined_features = pd.read_csv('data/combined-pe-features-vs252.csv')
combined_features.head()
Out[3]:
In [14]:
sorted_train_labels.head()
Out[14]:
In [4]:
# Column 0 is presumably the file name; keep only the feature columns.
X = combined_features.iloc[:,1:]
y = sorted_train_labels['label']
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [9]:
# MODEL 1: ExtraTrees 100
# 130 features
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
#print_classification_report(y, pred1, 'classification-report-extratrees-vs252.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs252.txt')
pickle_it(clf1, 'classifier-model-vs252-extratrees-100-gini.pkl', 4)
In [5]:
# NOTE(review): this cell is an exact duplicate of the one above
# (out-of-order editing leftover) -- keep only one of the two.
# MODEL 1: ExtraTrees 100
# 130 features
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
#print_classification_report(y, pred1, 'classification-report-extratrees-vs252.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs252.txt')
pickle_it(clf1, 'classifier-model-vs252-extratrees-100-gini.pkl', 4)
In [10]:
# 10-fold CV predictions for RandomForest on the VS252 combined features.
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-randomforest-vs252.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
# Fixed copy-paste error: this cell times RandomForest, not ExtraTrees.
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-randomforest-vs252.txt')
In [ ]:
# python 3 has no cPickle module -- use the pickle_it helper (protocol 4,
# writes under data/, closes the handle safely).
pickle_it(clf1, 'classifier-model-vs252-randomforest-1000.pkl', 4)
In [7]:
# NOTE(review): `import lightgbm as lgb` is commented out in the imports
# cell -- re-enable it before running this cell.
clf1 = lgb.LGBMClassifier(n_estimators=100, learning_rate=1.0, objective='multiclass')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-lightgbm-vs252.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("LightGBM took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-lightgbm-vs252.txt')
In [3]:
# VS252 function-count feature set (10% best features) with the same labels.
sorted_train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-vs252.csv')
function_count_features = pd.read_csv('data/sorted-pe-function-count-features-10percent-vs252.csv')
function_count_features.head()
Out[3]:
In [4]:
# Column 0 is presumably the file name; keep only the feature columns.
# NOTE(review): this rebinds X/y -- cells above that used the combined
# features must be run before this point, not after.
X = function_count_features.iloc[:,1:]
y = sorted_train_labels['label']
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [ ]:
# MODEL 1: 100 estimators (gini)
# NOTE: see clean_out_the_elves() in feature-extraction-validation.ipynb
# for error below.
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
#print_classification_report(y, pred1, 'classification-report-funcounts-extratrees-vs252-100-gini.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-funcounts-xtratrees-vs252-100-gini.txt')
#save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xtratrees-vs252-100-gini.csv')
# NOTE: kernel died while pickling, probably memory exhaustion.
pickle_it(clf1, 'classifier-model-vs252-funcounts-extratrees-100-gini.pkl', 4)
In [ ]:
# MODEL 2:
# 100 estimators (entropy)
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='entropy')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-extratrees-vs252-100-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-xtratrees-vs252-100-entropy.txt')
# Fixed: this section evaluates function_count_features, not the stale
# combined_features frame from the earlier section -- use the
# function-count saver like the sibling MODEL 1 cell does.
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xtratrees-vs252-100-entropy.csv')
pickle_it(clf1, 'classifier-model-vs252-funcounts-extratrees-100-entropy.pkl', 4)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# python 3 has no cPickle module -- use the pickle_it helper (protocol 4,
# writes under data/, closes the handle safely).
pickle_it(clf1, 'classifier-model-vs252-lightgbm-1000.pkl', 4)
In [8]:
In [4]:
# VS263 dataset: combined PE/COFF features plus their sorted label file.
sorted_train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-vs263.csv')
combined_features = pd.read_csv('data/combined-pe-features-vs263.csv')
combined_features.head()
Out[4]:
In [5]:
sorted_train_labels.head()
Out[5]:
In [5]:
# Column 0 is presumably the file name; keep only the feature columns.
X = combined_features.iloc[:,1:]
y = sorted_train_labels['label']
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [6]:
# MODEL 1:
# 100 estimators (gini)
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-extratrees-vs263-100-gini.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs263-100-gini.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-xtratrees-vs263-100-gini.csv')
In [ ]:
# DEPRECATED:
# See if we can get decent sized object serialization, use protocol 2,
# protocol 0 produces huge files and runs out of memory.
# Note: OverflowError: size does not fit in an int, is a bug in old versions of zlib module.
# supposed to have been fixed.
#gzip_pickle_it(clf1, 'data/classifier-model-vs263-extratrees-100-gini.pkl', 2)
# Use python3.x pickle module then compress manually.
pickle_it(clf1, 'classifier-model-vs263-extratrees-100-gini.pkl', 4)
In [ ]:
In [5]:
# MODEL 2:
# 100 estimators (entropy)
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='entropy')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-extratrees-vs263-100-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs263-100-entropy.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-xtratrees-vs263-100-entropy.csv')
pickle_it(clf1, 'classifier-model-vs263-extratrees-100-entropy.pkl', 4)
In [6]:
# MODEL 3:
# RandomForest (gini)
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-randomforest-vs263-100.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-randomforest-vs263-100-gini.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-randomforest-vs263-100-gini.csv')
pickle_it(clf1, 'classifier-model-vs263-randomforest-100-gini.pkl', 4)
In [6]:
# MODEL 4:
# RandomForest (entropy)
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='entropy')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-randomforest-vs263-100-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-randomforest-vs263-100-entropy.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-randomforest-vs263-100-entropy.csv')
pickle_it(clf1, 'classifier-model-vs263-randomforest-100-entropy.pkl', 4)
In [ ]:
In [ ]:
# MODEL 5:
# LightGBM
In [ ]:
In [ ]:
In [ ]:
In [3]:
# VS263 function-count feature set (10% best features) with the same labels.
sorted_train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-vs263.csv')
function_count_features = pd.read_csv('data/sorted-pe-function-count-features-10percent-vs263.csv')
function_count_features.head()
Out[3]:
In [4]:
# Column 0 is presumably the file name; keep only the feature columns.
X = function_count_features.iloc[:,1:]
y = sorted_train_labels['label']
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [5]:
# MODEL 1: 100 estimators (gini)
# NOTE: see clean_out_the_elves() in feature-extraction-validation.ipynb
# for error below.
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-extratrees-vs263-100-gini.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-funcounts-xtratrees-vs263-100-gini.txt')
#save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xtratrees-vs263-100-gini.csv')
pickle_it(clf1, 'classifier-model-vs263-funcounts-extratrees-100-gini.pkl', 4)
In [6]:
# MODEL 2: 100 estimators (entropy)
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='entropy')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-extratrees-vs263-100-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-funcounts-xtratrees-vs263-100-entropy.txt')
#save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xtratrees-vs263-100-entropy.csv')
pickle_it(clf1, 'classifier-model-vs263-funcounts-extratrees-100-entropy.pkl', 4)
In [7]:
# MODEL 3:
# MODEL 3: 100 estimators (gini)
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-randomforest-vs263-100.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-funcounts-randomforest-vs263-100-gini.txt')
#save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-randomforest-vs263-100-gini.csv')
pickle_it(clf1, 'classifier-model-vs263-funcounts-randomforest-100-gini.pkl', 4)
In [ ]:
# MODEL 4:
# MODEL 4: 100 estimators (entropy)
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='entropy')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-randomforest-vs263-100-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-funcounts-randomforest-vs263-100-entropy.txt')
#save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-randomforest-vs263-100-entropy.csv')
pickle_it(clf1, 'classifier-model-vs263-funcounts-randomforest-100-entropy.pkl', 4)
In [7]:
# MODEL 5:
# MODEL 5: 100 estimators 10-fold cross validation
# NOTE(review): the comment above says 10-fold but the call below uses 5
# folds -- confirm which was intended.
clf1 = lgb.LGBMClassifier(n_estimators=100, learning_rate=1.0, objective='multiclass')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 5)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-lightgbm-vs263.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("LightGBM took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-lightgbm-vs263.txt')
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-lightgbm-vs263-100.csv')
In [14]:
# See if we can get decent sized object serialization.
gzip_pickle_it(clf1, 'data/classifier-model-vs263-funcounts-lightgbm-100.pkl', 0)
In [3]:
# VS264 dataset: combined PE/COFF features plus their sorted label file.
sorted_train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-vs264.csv')
combined_features = pd.read_csv('data/combined-pe-features-vs264.csv')
combined_features.head()
Out[3]:
In [3]:
sorted_train_labels.head()
Out[3]:
In [4]:
# Column 0 is presumably the file name; keep only the feature columns.
X = combined_features.iloc[:,1:]
y = sorted_train_labels['label']
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [5]:
type(X)
Out[5]:
In [11]:
# 100 estimators (gini)
# NOTE(review): this cell and the MODEL 1 cell below run the same
# ExtraTrees/gini configuration and pickle to the same file -- keep one.
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-extratrees-vs264-100-gini.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs264-100-gini.txt')
#save_predictions(combined_features, y, pred1, 'classifier-predictions-xtratrees-vs264-100-gini.csv')
pickle_it(clf1, 'classifier-model-vs264-extratrees-100-gini.pkl', 4)
In [6]:
# MODEL 1:
# NOTE: models always have slightly lower predication accuracy in python 3 compared to python 2.
#
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-extratrees-vs264-100-gini.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs264-100-gini.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-xtratrees-vs264-100-gini.csv')
pickle_it(clf1, 'classifier-model-vs264-extratrees-100-gini.pkl', 4)
In [7]:
# MODEL 2:
# 100 estimators (entropy)
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='entropy')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-extratrees-vs264-100-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs264-100-entropy.txt')
#save_predictions(combined_features, y, pred1, 'classifier-predictions-xtratrees-vs264-100-entropy.csv')
pickle_it(clf1, 'classifier-model-vs264-extratrees-100-entropy.pkl', 4)
In [ ]:
In [8]:
In [9]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [5]:
# Model 3:
# RandomForest (gini)
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-randomforest-vs264-100.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-randomforest-vs264-100-gini.txt')
#save_predictions(combined_features, y, pred1, 'classifier-predictions-randomforest-vs264-100-gini.csv')
pickle_it(clf1, 'classifier-model-vs264-randomforest-100-gini.pkl', 4)
In [6]:
# MODEL 4:
# RandomForest (entropy)
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='entropy')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-randomforest-vs264-100-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
# Fixed copy-paste error: this cell times RandomForest, not ExtraTrees.
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-randomforest-vs264-100-entropy.txt')
#save_predictions(combined_features, y, pred1, 'classifier-predictions-randomforest-vs264-100-entropy.csv')
pickle_it(clf1, 'classifier-model-vs264-randomforest-100-entropy.pkl', 4)
In [10]:
In [19]:
len(pred1)
Out[19]:
In [20]:
len(y)
Out[20]:
In [21]:
X.shape
Out[21]:
In [ ]:
In [ ]:
In [ ]:
# DEPRECATED: far too slow.
# NOTE(review): import placed mid-notebook -- move it to the imports cell
# at the top if this cell is ever revived.
from sklearn.ensemble import GradientBoostingClassifier
clf1 = GradientBoostingClassifier(n_estimators=10, max_features=None, min_samples_leaf=1, min_samples_split=9)
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-gradientboost-vs264-10-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("GradientBoost took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-gradientboost-vs264-10-entropy.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-gradientboost-vs264-10-entropy.csv')
In [ ]:
In [7]:
# LightGBM sweep: 50/100/200 estimators with 5- or 10-fold CV.
# NOTE(review): "import lightgbm as lgb" is commented out in the imports cell,
# so these cells fail on a fresh kernel -- re-enable the import before re-running.
# NOTE(review): all four runs write the same classification-report and
# confusion-matrix file names, so each run overwrites the previous results;
# only the predictions CSVs are distinguished by estimator count.
# NOTE(review): learning_rate=1.0 is unusually aggressive for a GBM -- confirm intentional.
clf1 = lgb.LGBMClassifier(n_estimators=50, learning_rate=1.0, objective='multiclass')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-lightgbm-vs264.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("LightGBM took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-lightgbm-vs264.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-lightgbm-vs264-50.csv')
In [11]:
# 100 estimators, 5-fold CV.
clf1 = lgb.LGBMClassifier(n_estimators=100, learning_rate=1.0, objective='multiclass')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 5)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-lightgbm-vs264.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("LightGBM took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-lightgbm-vs264.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-lightgbm-vs264-100.csv')
In [12]:
# 100 estimators, 10-fold CV (same predictions CSV name as the 5-fold run above).
clf1 = lgb.LGBMClassifier(n_estimators=100, learning_rate=1.0, objective='multiclass')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-lightgbm-vs264.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("LightGBM took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-lightgbm-vs264.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-lightgbm-vs264-100.csv')
In [13]:
# 200 estimators, 5-fold CV.
clf1 = lgb.LGBMClassifier(n_estimators=200, learning_rate=1.0, objective='multiclass')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 5)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-lightgbm-vs264.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("LightGBM took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-lightgbm-vs264.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-lightgbm-vs264-200.csv')
In [ ]:
# DEPRECATED - xgboost is a pain in the arse.
# XGBoost multi:softmax requires 0-based class labels, hence the y - 1 shift.
xgblabels = y - 1
xgclf = xgb.XGBClassifier(n_estimators=10, max_depth=10, learning_rate=1.0, objective="multi:softmax", nthread=4)
start = time()
pred1 = run_prediction_cv(X, xgblabels, xgclf, 2)
print(" ")
print("Classification report:")
# FIX: pred1 is in the shifted (0-based) label space, so the report must be
# scored against xgblabels, not y -- accuracy_score and confusion_matrix
# below already use xgblabels.
print_classification_report(xgblabels, pred1, 'classification-report-xgboost-vs264.txt')
print("score = {:.3f}".format(accuracy_score(xgblabels, pred1)))
print(" ")
# FIX: timing message previously said "ExtraTreesClassifier" (copy-paste leftover).
print("XGBoost took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(xgblabels, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xgboost-vs264.txt')
save_predictions(combined_features, y, pred1, 'classifier-predictions-xgboost-vs264-10.csv')
In [3]:
# Load the VS264 labels and the (pre-reduced, 10%) per-sample function-count features.
sorted_train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-vs264.csv')
function_count_features = pd.read_csv('data/sorted-pe-function-count-features-10percent-vs264.csv')
function_count_features.head()
Out[3]:
In [4]:
# Column 0 is the file name; everything after it is a feature.
X = function_count_features.iloc[:,1:]
y = sorted_train_labels['label']
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [5]:
# MODEL 1: 100 estimators (gini)
# ExtraTrees on the function-count features; 10-fold out-of-fold predictions, then pickle.
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-extratrees-vs264-100-gini.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-funcounts-xtratrees-vs264-100-gini.txt')
#save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xtratrees-vs264-100-gini.csv')
pickle_it(clf1, 'classifier-model-vs264-funcounts-extratrees-100-gini.pkl', 4)
In [7]:
# DEPRECATED MODEL 1: 100 estimators (gini) not shuffled
# NOTE(review): the deprecated "not shuffled" / "StratifiedKFolds" variants below
# call the same run_prediction_cv as the live cells; the behavioural difference
# presumably lived in an earlier version of that helper -- confirm before re-running.
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-extratrees-vs264-100-gini.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-xtratrees-vs264-100-gini.txt')
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xtratrees-vs264-100-gini.csv')
In [20]:
# DEPRECATED MODEL 1: 100 estimators (gini) StratifiedKFolds
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-extratrees-vs264-100-gini.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-xtratrees-vs264-100-gini.txt')
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xtratrees-vs264-100-gini-strat.csv')
In [6]:
# MODEL 2: 100 estimators (entropy)
# Live entropy-criterion counterpart of MODEL 1; report + pickle, plots disabled.
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='entropy')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-extratrees-vs264-100-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-funcounts-xtratrees-vs264-100-entropy.txt')
#save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xtratrees-vs264-100-entropy.csv')
pickle_it(clf1, 'classifier-model-vs264-funcounts-extratrees-100-entropy.pkl', 4)
In [8]:
# DEPRECATED MODEL 2: 100 estimators (entropy) not shuffled
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='entropy')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-extratrees-vs264-100-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-xtratrees-vs264-100-entropy.txt')
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xtratrees-vs264-100-entropy.csv')
In [7]:
# MODEL 3: 100 estimators (gini)
# RandomForest on the function-count features; 10-fold out-of-fold predictions, then pickle.
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-randomforest-vs264-100.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-funcounts-randomforest-vs264-100-gini.txt')
#save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-randomforest-vs264-100-gini.csv')
pickle_it(clf1, 'classifier-model-vs264-funcounts-randomforest-100-gini.pkl', 4)
In [19]:
# DEPRECATED - MODEL 3: 100 estimators (gini) StratifiedKFolds
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-randomforest-vs264-100.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-randomforest-vs264-100-gini.txt')
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-randomforest-vs264-100-gini-strat.csv')
In [8]:
# MODEL 4: 100 estimators (entropy)
# Entropy-criterion counterpart of MODEL 3; report + pickle, plots disabled.
clf1 = RandomForestClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='entropy')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-randomforest-vs264-100-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("RandomForestClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
#cm = confusion_matrix(y, pred1)
#write_confusion_matrix(cm, 'confusion-matrix-funcounts-randomforest-vs264-100-entropy.txt')
#save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-randomforest-vs264-100-entropy.csv')
pickle_it(clf1, 'classifier-model-vs264-funcounts-randomforest-100-entropy.pkl', 4)
In [ ]:
# MODEL 5: 100 estimators, 5-fold cross validation
# (header previously said 10-fold; run_prediction_cv is called with 5 folds)
# NOTE(review): "import lightgbm as lgb" is commented out in the imports cell;
# this cell fails on a fresh kernel until it is re-enabled.
clf1 = lgb.LGBMClassifier(n_estimators=100, learning_rate=1.0, objective='multiclass')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 5)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-lightgbm-vs264.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("LightGBM took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-lightgbm-vs264.txt')
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-lightgbm-vs264-100.csv')
In [14]:
# MODEL 6: TODO.
# AdaBoost with 200 estimators, 10-fold CV.
clf1 = AdaBoostClassifier(n_estimators=200)
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-adaboost-vs264-200-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("AdaBoost took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-adaboost-vs264-200-entropy.txt')
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-adaboost-vs264-200.csv')
In [18]:
# DEPRECATED MODEL 6: StratifiedKFolds.
clf1 = AdaBoostClassifier(n_estimators=200)
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-funcounts-adaboost-vs264-200-entropy.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("AdaBoost took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-adaboost-vs264-200-entropy.txt')
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-adaboost-vs264-200-strat.csv')
In [17]:
# MODEL 7:
# XGBoost multi:softmax requires 0-based class labels, hence the y - 1 shift.
xgblabels = y - 1
xgclf = xgb.XGBClassifier(n_estimators=50, max_depth=10, learning_rate=1.0, objective="multi:softmax", nthread=4)
start = time()
pred1 = run_prediction_cv(X, xgblabels, xgclf, 10)
print(" ")
print("Classification report:")
# FIX: pred1 is in the shifted (0-based) label space, so the report must be
# scored against xgblabels, not y -- accuracy_score and confusion_matrix
# below already use xgblabels.
print_classification_report(xgblabels, pred1, 'classification-report-funcounts-xgboost-vs264.txt')
print("score = {:.3f}".format(accuracy_score(xgblabels, pred1)))
print(" ")
print("XGBoost took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(xgblabels, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-xgboost-vs264.txt')
# NOTE(review): the saved CSV mixes original labels (y) with shifted predictions
# (pred1); downstream consumers must account for the off-by-one -- confirm intent.
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xgboost-vs264-50.csv')
In [5]:
# MODEL 7: 5-fold
# Same as the 10-fold run above but with 100 estimators and 5 folds.
xgblabels = y - 1
xgclf = xgb.XGBClassifier(n_estimators=100, max_depth=10, learning_rate=1.0, objective="multi:softmax", nthread=4)
start = time()
pred1 = run_prediction_cv(X, xgblabels, xgclf, 5)
print(" ")
print("Classification report:")
# FIX: pred1 is in the shifted (0-based) label space, so the report must be
# scored against xgblabels, not y -- accuracy_score and confusion_matrix
# below already use xgblabels.
print_classification_report(xgblabels, pred1, 'classification-report-funcounts-xgboost-vs264.txt')
print("score = {:.3f}".format(accuracy_score(xgblabels, pred1)))
print(" ")
print("XGBoost took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(xgblabels, pred1)
write_confusion_matrix(cm, 'confusion-matrix-funcounts-xgboost-vs264.txt')
# NOTE(review): the saved CSV mixes original labels (y) with shifted predictions
# (pred1); downstream consumers must account for the off-by-one -- confirm intent.
save_function_count_predictions(function_count_features, y, pred1, 'classifier-predictions-funcounts-xgboost-vs264-100.csv')
In [7]:
# DEPRECATED: pandas concat reorders the columns if the sets do not have identical column names, there is
# no way to change this behaviour without reverting to using a common subset which would reduce the number of features.
# METHOD: create a dataframe with only the filenames, then merge on 'file_name' with the feature set dataframes.
# Use only feature sets that have common features, exclude PE Header and PE Function Count feature sets as these
# have many different features.
#vs251
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs251-fixed.csv')
sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs251.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs251.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs251.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs251.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs251.csv')
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
vs251 = sorted_asm_features_fixed.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
vs251 = vs251.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
vs251 = vs251.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
vs251 = vs251.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))
vs251 = vs251.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
vs251.to_csv('data/combined-pe-asm-features-vs251.csv', index=False)
vs251.head()
Out[7]:
In [3]:
# VS252
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs252-fixed.csv')
sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs252.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs252.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs252.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs252.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs252.csv')
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
vs252 = sorted_asm_features_fixed.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
vs252 = vs252.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
vs252 = vs252.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
vs252 = vs252.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))
vs252 = vs252.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
vs252.to_csv('data/combined-pe-asm-features-vs252.csv', index=False)
vs252.head()
Out[3]:
In [5]:
# VS263
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs263-fixed.csv')
sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs263.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs263.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs263.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs263.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs263.csv')
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
vs263 = sorted_asm_features_fixed.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
vs263 = vs263.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
vs263 = vs263.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
vs263 = vs263.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))
vs263 = vs263.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
vs263.to_csv('data/combined-pe-asm-features-vs263.csv', index=False)
vs263.head()
Out[5]:
In [6]:
# VS264
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs264-fixed.csv')
sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs264.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs264.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs264.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs264.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs264.csv')
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
vs264 = sorted_asm_features_fixed.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
vs264 = vs264.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
vs264 = vs264.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
vs264 = vs264.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))
vs264 = vs264.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
vs264.to_csv('data/combined-pe-asm-features-vs264.csv', index=False)
vs264.head()
Out[6]:
In [8]:
vs251.shape
Out[8]:
In [3]:
# Start HERE.
# Stack the four per-release combined feature sets into one training frame.
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; build
# the list of frames and concatenate once (also avoids repeated full copies).
all_combined_feature_sets = pd.concat(
    [pd.read_csv('data/combined-pe-asm-features-{:s}.csv'.format(v))
     for v in ('vs251', 'vs252', 'vs263', 'vs264')],
    ignore_index=True)
all_combined_feature_sets.head()
Out[3]:
In [10]:
all_combined_feature_sets.shape
Out[10]:
In [20]:
In [ ]:
In [ ]:
In [4]:
# Stack the per-release label files in the same release order as the feature sets.
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 -> pd.concat.
all_train_labels = pd.concat(
    [pd.read_csv('data/sorted-pe-coff-train-labels-{:s}.csv'.format(v))
     for v in ('vs251', 'vs252', 'vs263', 'vs264')],
    ignore_index=True)
all_train_labels.head()
Out[4]:
In [ ]:
#all_combined_feature_sets.fillna(0, inplace=True)
In [5]:
# Column 0 is the file name; everything after it is a feature.
X = all_combined_feature_sets.iloc[:,1:]
y = all_train_labels['label']
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [ ]:
# MODEL 1: 100 estimators (gini)
# DEPRECATED: memory errors, get more members.
# Kept for the record: ExtraTrees over the full four-release combined set.
clf1 = ExtraTreesClassifier(n_estimators=100, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-extratrees-vs251-252-263-264-100-gini.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
print("ExtraTreesClassifier took {:.2f} minutes.".format(((time() - start) / 60)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs251-252-263-264-100-gini.txt')
save_predictions(all_combined_feature_sets, y, pred1, 'classifier-predictions-xtratrees-vs251-252-263-264-100-gini.csv')
In [ ]:
help(pd.read_csv)
In [ ]:
help(pd.DataFrame.append)
In [ ]:
help(pd.DataFrame.merge)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
help(pd.DataFrame)
In [5]:
# VirusShare APT Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
# 8. TODO: function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all PE/COFF features into feature set ->
# all-combined-pe-features-10perc-vsapt.csv
sorted_train_labels = pd.read_csv('data/sorted-train-labels-apt.csv')
sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-apt.csv')
sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-apt.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-apt.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-apt.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-apt.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-apt.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-apt.csv')
# BROKEN: rerun function counts.
#sorted_function_count_features = pd.read_csv('data/sorted-pe-function-counts-10percent-apt.csv')
sorted_asm_features.head()
Out[5]:
In [8]:
# NOTE(review): get_training_data is defined elsewhere in the notebook --
# presumably aligns the feature rows with the label file; confirm against its definition.
X,y = get_training_data(sorted_asm_features, sorted_train_labels, 'data/sorted-pe-coff-train-labels-apt.csv')
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [9]:
# Keep only the scalar columns (file_name + id values) of each id feature set.
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()
Out[9]:
In [10]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()
Out[10]:
In [11]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()
Out[11]:
In [12]:
# Inner-join all APT feature sets on file_name (includes PE header features,
# unlike the per-release vs25x/vs26x combination above).
combined_train_features = sorted_asm_features.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))
combined_train_features.head()
Out[12]:
In [13]:
combined_train_features.to_csv('data/combined-pe-features-apt.csv', index=False)
In [3]:
# Now reduce the feature set to 10% best features.
sorted_train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-apt.csv')
combined_features = pd.read_csv('data/combined-pe-features-apt.csv')
combined_features.head()
Out[3]:
In [4]:
sorted_train_labels.head()
Out[4]:
In [5]:
# Column 0 is the file name; everything after it is a feature.
X = combined_features.iloc[:,1:]
y = sorted_train_labels['label']
type(y)
Out[5]:
In [7]:
type(X)
Out[7]:
In [ ]:
y
In [2]:
In [20]:
In [21]:
# NOTE(review): reduce_feature_set is defined elsewhere in the notebook --
# presumably performs the 10% feature selection promised above and writes the CSV.
reduce_feature_set(combined_features, X, y, 'data/combined-pe-features-apt-reduced.csv')
In [ ]:
In [ ]:
In [9]:
# Load the reduced APT feature set produced by reduce_feature_set above.
combined_train_features = pd.read_csv('data/combined-pe-features-apt-reduced.csv')
train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-apt.csv')
combined_train_features.head()
Out[9]:
In [10]:
X = combined_train_features.iloc[:,1:]
y = train_labels['label']
X.head()
Out[10]:
In [ ]:
y
In [ ]:
In [7]:
# Now get the reduced combined feature set and try ExtraTreesClassifier and XGBoost.
# 1000-tree ExtraTrees, 10-fold out-of-fold predictions, confusion matrix to stdout.
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
pred1 = run_prediction_cv(X,y,clf1,10)
#print("logloss = {:.3f}".format(log_loss(y, prob1)))
print("score = {:.3f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)
In [ ]:
In [ ]:
hel
In [10]:
cm.shape
Out[10]:
In [5]:
def write_confusion_matrix(cm, out_file_name):
    """Write confusion matrix `cm` to data/<out_file_name> as CSV, one matrix
    row per line, echoing each row to stdout.

    Parameters:
        cm: 2-D array-like (e.g. numpy array from sklearn.confusion_matrix).
        out_file_name: file name created under the 'data/' directory.
    """
    # FIX: use a context manager so the handle is closed even if formatting
    # raises (the original leaked the handle on error); dead commented-out
    # experiments removed.
    with open('data/' + out_file_name, 'w') as fop:
        for a_idx in range(cm.shape[0]):
            line = ','.join(str(x) for x in cm[a_idx])
            print("{:d} -> {:s}".format(a_idx, line))
            fop.write(line + "\n")
    return
In [ ]:
In [ ]:
In [12]:
# XGBoost does not like feature names containing [,<] characters!!!!
# (hence the separate, renamed "-xgboost" copy of the reduced feature CSV)
combined_train_features = pd.read_csv('data/combined-pe-features-apt-reduced-xgboost.csv')
train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-apt.csv')
combined_train_features.head()
Out[12]:
In [13]:
X = combined_train_features.iloc[:,1:]
y = train_labels['label']
X.head()
Out[13]:
In [ ]:
#xgclf = xgb.XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.1,objective="multi:softprob")
xgclf = xgb.XGBClassifier(n_estimators=1000, max_depth=10, learning_rate=0.1, objective="multi:softmax")
# NOTE(review): unlike the earlier xgboost cells, y is NOT shifted to 0-based
# labels here (no y - 1) -- confirm the APT label file is already 0-based.
pred1 = run_prediction_cv(X, y, xgclf, 10)
#print("logloss = {:.3f}".format(log_loss(y, prob1)))
print("score = {:.3f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xgboost-apt.txt')
In [7]:
# Decision-surface plot: RBF SVM on the first two reduced APT features.
# (Local re-imports so the cell is self-contained when run in isolation.)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
combined_train_features = pd.read_csv('data/combined-pe-features-apt-reduced.csv')
train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-apt.csv')
#X = combined_train_features.iloc[:,1:]
y = train_labels['label']
# NOTE(review): a step of 10.0 gives a very coarse mesh -- presumably chosen to
# keep predict() over the mesh tractable on these feature ranges; confirm.
h = 10.0 # step size in the mesh
# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors
C = 1.0 # SVM regularization parameter
print("RBF SVM:")
# Only the first two feature columns (after file_name) are used so the
# decision surface can be drawn in 2-D.
X = combined_train_features.iloc[:,1:3]
#X.head()
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)
# create a mesh to plot in
x_min, x_max = X.iloc[:,0].min() - 1, X.iloc[:,0].max() + 1
y_min, y_max = X.iloc[:,1].min() - 1, X.iloc[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = rbf_svc.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
# Plot also the training points
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=plt.cm.coolwarm)
plt.xlabel('EDI')
plt.ylabel('ESI')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
plt.title('SVM - RBF Kernel')
plt.show()
In [8]:
# Same surface re-plotted at a larger figure size (reuses xx/yy/Z from above).
plt.figure(figsize=(15,15))
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
# Plot also the training points
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=plt.cm.coolwarm)
plt.xlabel('EDI')
plt.ylabel('ESI')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
plt.title('SVM - RBF Kernel')
plt.show()
In [ ]:
In [ ]:
# Compare four SVM kernels on the first two reduced APT features and plot
# their decision surfaces in a 2x2 grid (adapted from the sklearn iris example).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
combined_train_features = pd.read_csv('data/combined-pe-features-apt-reduced.csv')
train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-apt.csv')
#X = combined_train_features.iloc[:,1:]
y = train_labels['label']
h = .02 # step size in the mesh
# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors
C = 1.0 # SVM regularization parameter
# Only the first two feature columns (after file_name) are used so the
# decision surfaces can be drawn in 2-D.
X = combined_train_features.iloc[:,1:3]
print("Linear SVM")
svc = svm.SVC(kernel='linear', C=C).fit(X, y)
print("RBF SVM")
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X, y)
print("Poly SVM")
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)
print("Linear SVM 2")
lin_svc = svm.LinearSVC(C=C).fit(X, y)
# create a mesh to plot in
# FIX: X is a DataFrame, so X[:, 0] raises a TypeError -- positional access
# must go through .iloc (as the single-kernel RBF cell above already does).
x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# title for the plots
titles = ['SVC with linear kernel',
'LinearSVC (linear kernel)',
'SVC with RBF kernel',
'SVC with polynomial (degree 3) kernel']
for i, clf in enumerate((svc, lin_svc, rbf_svc, poly_svc)):
    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    plt.subplot(2, 2, i + 1)
    plt.subplots_adjust(wspace=0.4, hspace=0.4)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
    # Plot also the training points (FIX: .iloc, see above)
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=plt.cm.coolwarm)
    # FIX: axis labels were left over from the sklearn iris example ("Sepal
    # length/width"); these columns are the EDI/ESI features per the RBF cell above.
    plt.xlabel('EDI')
    plt.ylabel('ESI')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())
    plt.title(titles[i])
plt.show()
In [ ]:
help(plt)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# Lets do some plots and have look.
# Quick visual sanity checks of individual features against the labels.
# NOTE(review): assumes the global X still holds the full feature frame;
# 'vertex_count'/'edge_count' are not in the 2-column slice the SVM
# cells assigned to X — confirm execution order before re-running.

# Vertex count vs. edge count, coloured by class.
plt.figure(figsize=(15, 15))
plt.xlabel("Vertex Count")
plt.ylabel("Edge Count")
xa = X['vertex_count'].to_numpy()
xb = X['edge_count'].to_numpy()
ya = np.asarray(y)
plt.scatter(xa, xb, c=ya, cmap='brg')

# EDX register count plotted directly against the class label.
plt.figure(figsize=(15, 15))
plt.xlabel("EDX Register")
plt.ylabel("Malware Class")
xa = X['edx'].to_numpy()
xb = X['esi'].to_numpy()
ya = np.asarray(y)
plt.scatter(xa, ya, c=ya, cmap='brg')

# EDX vs. ESI register counts, coloured by class.
plt.figure(figsize=(15, 15))
plt.xlabel("EDX Register")
plt.ylabel("ESI Register")
plt.scatter(xa, xb, c=ya, cmap='brg')
In [ ]:
In [ ]:
In [ ]:
# Fix ASM feature file_name values.
# BUG FIX: the original cell's `if idx > 0:` body contained only a
# comment, which is a SyntaxError in Python — `pass` keeps the cell
# runnable until the renaming logic is implemented.
with open('data/sorted-pe-asm-features-vs251.csv', 'r') as in_file:
    in_lines = in_file.readlines()
for line in in_lines:
    # Locate the '.pe' extension inside the file_name field.
    idx = line.find('.pe')
    if idx > 0:
        # TODO: normalise the '.pe' suffix and write the corrected
        # rows back out.
        pass
In [ ]:
In [ ]:
In [ ]:
# TEST CODE ONLY:
In [ ]:
# 500 estimators
# Cross-validated ExtraTrees run with 500 estimators; writes the
# classification report and confusion matrix to data/ and prints the
# accuracy plus wall-clock time.
clf1 = ExtraTreesClassifier(
    n_estimators=500,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=9,
    n_jobs=4,
    criterion='gini',
)
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-extratrees-vs264-500.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
elapsed_minutes = (time() - start) / 60
print("ExtraTreesClassifier took {:.2f} minutes.".format(elapsed_minutes))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs264-500.txt')
In [ ]:
# 1000 estimators
# memory/kernel fail
# Same evaluation with 1000 estimators (previously crashed the kernel
# with a memory failure — see the cell's original note above).
clf1 = ExtraTreesClassifier(
    n_estimators=1000,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=9,
    n_jobs=4,
    criterion='gini',
)
start = time()
pred1 = run_prediction_cv(X, y, clf1, 10)
print(" ")
print("Classification report:")
print_classification_report(y, pred1, 'classification-report-extratrees-vs264-1000.txt')
print("score = {:.3f}".format(accuracy_score(y, pred1)))
print(" ")
elapsed_minutes = (time() - start) / 60
print("ExtraTreesClassifier took {:.2f} minutes.".format(elapsed_minutes))
cm = confusion_matrix(y, pred1)
write_confusion_matrix(cm, 'confusion-matrix-xtratrees-vs264-1000.txt')
In [ ]:
In [ ]: