In [1]:
# Import everything we might need up front; it saves time later.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# Note: sklearn.cross_validation and sklearn.grid_search were merged into
# sklearn.model_selection in scikit-learn 0.18 and removed in 0.20.
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC
import seaborn as sns
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
In [4]:
import sklearn
print(sklearn.__version__)
In [2]:
# First load the .asm and .byte training/test data and training labels
sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-30percent.csv')
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_test_data_asm = pd.read_csv('data/sorted-test-malware-features-asm-30percent.csv')
sorted_test_data_byte = pd.read_csv('data/sorted-test-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
#combined_train_data = pd.read_csv('data/sorted-train-features-combined.csv')
# Load the asm image features for the training and test data
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm-30percent.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm-30percent.csv')
#sorted_train_image_byte = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')
# Now load the row statistics for the original feature set; these will be combined with the new features
all_train_rowstats = pd.read_csv('data/all-train-asm-rowstats.csv')
all_test_rowstats = pd.read_csv('data/all-test-asm-rowstats.csv')
#all_train_image_asm_rowstats = pd.read_csv('data/all-train-image-asm-rowstats.csv')
#all_test_image_asm_rowstats = pd.read_csv('data/all-test-image-asm-rowstats.csv')
In [3]:
sorted_train_data_asm.head()
Out[3]:
In [4]:
sorted_train_labels.head()
Out[4]:
In [4]:
sorted_train_image_asm.head()
Out[4]:
In [5]:
sorted_test_image_asm.head()
Out[5]:
In [7]:
sorted_train_data_byte.head()
Out[7]:
In [6]:
sorted_test_data_asm.head()
Out[6]:
In [3]:
# Assign asm data to X,y for brevity.
X = sorted_train_data_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]
# X_image_byte = sorted_train_image_byte.iloc[:,1:]
In [7]:
X_train_image_asm.shape
Out[7]:
In [8]:
X_test_image_asm.shape
Out[8]:
In [10]:
# Train feature stats
X_means = X.mean()
X_std = X.std()
X_cor = X.corr()
X_cov = X.cov()
# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()
# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()
# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()
# Not using byte image features
#X_image_byte_means = X_image_byte.mean()
#X_image_byte_std = X_image_byte.std()
#X_image_byte_cor = X_image_byte.corr()
#X_image_byte_cov = X_image_byte.cov()
In [11]:
X_train_image_asm_means.head()
Out[11]:
In [12]:
X_train_image_asm_std.head()
Out[12]:
In [13]:
X_test_image_asm_means.head()
Out[13]:
In [14]:
# The byte image data has very low variance in both its means and standard deviations,
# so it is not very useful for learning and will not be used in further analysis.
X_test_image_asm_std.head()
Out[14]:
In [ ]:
X_means.head()
X_std.head()
X_cor.head()
X_cov.head()
X_means.min()
X_means.max()
X_std.min()
X_std.max()
X_means[X_means == X_means.min()]
X_means[X_means == X_means.max()]
X_std[X_std == X_std.min()]
X_std[X_std == X_std.max()]
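In [ ]:
# Optional sketch, not something this notebook relies on: the column statistics
# above can be used to flag near-constant features, in the same spirit as the
# low-variance argument used against the byte image data below. The 1e-6
# threshold is an arbitrary illustrative choice.
low_var_cols = X_std[X_std < 1e-6].index
print("near-constant columns:", list(low_var_cols))
# These could be dropped from both train and test before modelling:
#X_reduced = X.drop(low_var_cols, axis=1)
#X_test_reduced = X_test.drop(low_var_cols, axis=1)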
In [4]:
# Row stats for train and test data
X_train_rowstats = pd.DataFrame(index=np.arange(X.shape[0]), columns=['filename','mean','std','min','max','total','logtotal'], dtype=np.float64)
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','e_mean','e_std','e_min','e_max','e_total','e_logtotal'], dtype=np.float64)
X_train_rowstats['filename'] = sorted_train_data_asm['filename']
for i in range(0, X.shape[0]):
    X_train_rowstats.loc[i,'mean'] = X.iloc[i,:].mean()
    X_train_rowstats.loc[i,'std'] = X.iloc[i,:].std()
    X_train_rowstats.loc[i,'min'] = X.iloc[i,:].min()
    X_train_rowstats.loc[i,'max'] = X.iloc[i,:].max()
X_train_rowstats['total'] = X_train_rowstats['max'] * X_train_rowstats['mean'] * X_train_rowstats['std']
X_train_rowstats['logtotal'] = np.log(X_train_rowstats['total']) # natural logarithm
X_test_rowstats['filename'] = sorted_test_data_asm['filename']
for i in range(0, X_test.shape[0]):
    X_test_rowstats.loc[i,'e_mean'] = X_test.iloc[i,:].mean()
    X_test_rowstats.loc[i,'e_std'] = X_test.iloc[i,:].std()
    X_test_rowstats.loc[i,'e_min'] = X_test.iloc[i,:].min()
    X_test_rowstats.loc[i,'e_max'] = X_test.iloc[i,:].max()
X_test_rowstats['e_total'] = X_test_rowstats['e_max'] * X_test_rowstats['e_mean'] * X_test_rowstats['e_std']
X_test_rowstats['e_logtotal'] = np.log(X_test_rowstats['e_total']) # natural logarithm
X_train_rowstats.head()
Out[4]:
In [5]:
X_test_rowstats.head()
Out[5]:
In [6]:
# Image row stats
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','tr_mean','tr_std','tr_min','tr_max','tr_total','tr_logtotal'], dtype=np.float64)
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','te_mean','te_std','te_min','te_max','te_total','te_logtotal'], dtype=np.float64)
X_train_image_asm_rowstats['filename'] = sorted_train_data_asm['filename']
for i in range(0, X_train_image_asm.shape[0]):
    X_train_image_asm_rowstats.loc[i,'tr_mean'] = X_train_image_asm.iloc[i,:].mean()
    X_train_image_asm_rowstats.loc[i,'tr_std'] = X_train_image_asm.iloc[i,:].std()
    X_train_image_asm_rowstats.loc[i,'tr_min'] = X_train_image_asm.iloc[i,:].min()
    X_train_image_asm_rowstats.loc[i,'tr_max'] = X_train_image_asm.iloc[i,:].max()
X_train_image_asm_rowstats['tr_total'] = X_train_image_asm_rowstats['tr_max'] * X_train_image_asm_rowstats['tr_mean'] * X_train_image_asm_rowstats['tr_std']
X_train_image_asm_rowstats['tr_logtotal'] = np.log(X_train_image_asm_rowstats['tr_total']) # natural logarithm
X_test_image_asm_rowstats['filename'] = sorted_test_data_asm['filename']
for i in range(0, X_test_image_asm.shape[0]):
    X_test_image_asm_rowstats.loc[i,'te_mean'] = X_test_image_asm.iloc[i,:].mean()
    X_test_image_asm_rowstats.loc[i,'te_std'] = X_test_image_asm.iloc[i,:].std()
    X_test_image_asm_rowstats.loc[i,'te_min'] = X_test_image_asm.iloc[i,:].min()
    X_test_image_asm_rowstats.loc[i,'te_max'] = X_test_image_asm.iloc[i,:].max()
X_test_image_asm_rowstats['te_total'] = X_test_image_asm_rowstats['te_max'] * X_test_image_asm_rowstats['te_mean'] * X_test_image_asm_rowstats['te_std']
X_test_image_asm_rowstats['te_logtotal'] = np.log(X_test_image_asm_rowstats['te_total']) # natural logarithm
X_train_image_asm_rowstats.head()
Out[6]:
In [7]:
X_test_image_asm_rowstats.head()
Out[7]:
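In [ ]:
# Optional sketch: the per-row loops above can also be written as vectorized
# pandas operations over axis=1, which computes the same statistics but is much
# faster on large frames. Shown for the train asm features only; the other
# frames follow the same pattern.
X_train_rowstats_fast = pd.DataFrame({
    'filename': sorted_train_data_asm['filename'],
    'mean': X.mean(axis=1),
    'std': X.std(axis=1),
    'min': X.min(axis=1),
    'max': X.max(axis=1),
})
X_train_rowstats_fast['total'] = X_train_rowstats_fast['max'] * X_train_rowstats_fast['mean'] * X_train_rowstats_fast['std']
X_train_rowstats_fast['logtotal'] = np.log(X_train_rowstats_fast['total'])
X_train_rowstats_fast.head()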
In [8]:
# Write the row stats for the reduced feature set to file
X_train_rowstats.to_csv('data/train-asm-rowstats-30percent.csv', index=False)
X_test_rowstats.to_csv('data/test-asm-rowstats-30percent.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/train-image-asm-rowstats-30percent.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/test-image-asm-rowstats-30percent.csv', index=False)
In [9]:
# Generate some polynomial features
X_train_polyize = sorted_train_data_asm[['edi','esi','eax']]
X_test_polyize = sorted_test_data_asm[['edi','esi','eax']]
poly = PolynomialFeatures(3)
#X_train_byte_poly = DataFrame(poly.fit_transform(sorted_train_data_byte[['entropy','filesize']]), columns=['p1','p2','p3'])
X_train_byte_poly = poly.fit_transform(sorted_train_data_byte[['entropy','filesize']])
X_test_byte_poly = poly.fit_transform(sorted_test_data_byte[['entropy','filesize']])
X_train_asm_poly = poly.fit_transform(X_train_polyize)
X_test_asm_poly = poly.fit_transform(X_test_polyize)
X_train_asm_poly
Out[9]:
In [10]:
X_train_asm_poly.shape
Out[10]:
In [11]:
X_train_byte_poly.shape
Out[11]:
In [22]:
X_train_byte_poly_df = pd.DataFrame(X_train_byte_poly, columns=[ 'train_byte_p{:d}'.format(i) for i in range(1,11) ])
X_test_byte_poly_df = pd.DataFrame(X_test_byte_poly, columns=[ 'test_byte_p{:d}'.format(i) for i in range(1,11) ])
X_train_asm_poly_df = pd.DataFrame(X_train_asm_poly, columns=[ 'train_asm_p{:d}'.format(i) for i in range(1,21) ])
X_test_asm_poly_df = pd.DataFrame(X_test_asm_poly, columns=[ 'test_asm_p{:d}'.format(i) for i in range(1,21) ])
X_train_asm_poly_df['filename'] = sorted_train_data_asm['filename']
X_train_byte_poly_df['filename'] = sorted_train_data_asm['filename']
X_test_byte_poly_df['filename'] = sorted_test_data_asm['filename']
X_test_asm_poly_df['filename'] = sorted_test_data_asm['filename']
X_train_asm_poly_df.head()
Out[22]:
In [23]:
X_test_byte_poly_df.head()
Out[23]:
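In [ ]:
# Optional sketch: why 10 and 20 columns? PolynomialFeatures(3) on n input
# columns emits all monomials of total degree <= 3 (including the bias term),
# i.e. C(n + 3, 3) features: n = 2 -> 10, n = 3 -> 20, matching the shapes above.
from math import factorial

def n_poly_features(n_inputs, degree):
    # number of monomials of total degree <= degree in n_inputs variables
    return factorial(n_inputs + degree) // (factorial(n_inputs) * factorial(degree))

print(n_poly_features(2, 3), n_poly_features(3, 3))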
In [24]:
#TODO:
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')
combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')
combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))
combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')
combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')
# merge polynomial features
combined_train_data = combined_train_data.merge(X_train_asm_poly_df, on='filename')
combined_train_data = combined_train_data.merge(X_train_byte_poly_df, on='filename')
combined_train_data.to_csv('data/final-combined-train-data-30percent-poly.csv', index=False)
combined_train_data.head()
Out[24]:
In [25]:
#TODO:
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')
combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')
combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))
combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')
combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')
# merge polynomial features
combined_test_data = combined_test_data.merge(X_test_asm_poly_df, on='filename')
combined_test_data = combined_test_data.merge(X_test_byte_poly_df, on='filename')
combined_test_data.to_csv('data/final-combined-test-data-30percent-poly.csv', index=False)
combined_test_data.head()
Out[25]:
In [20]:
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')
combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')
combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))
combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')
combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')
# Results were better without the row stats for the full image feature set
#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')
combined_train_data.to_csv('data/final-combined-train-data-30percent.csv', index=False)
combined_train_data.head()
Out[20]:
In [21]:
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')
combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')
combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))
combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')
combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')
# Results were better without the row stats for the full image feature set
#combined_test_data = combined_test_data.merge(all_test_image_asm_rowstats, on='filename')
combined_test_data.to_csv('data/final-combined-test-data-30percent.csv', index=False)
combined_test_data.head()
Out[21]:
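In [ ]:
# Optional sanity check: pandas merge defaults to an inner join, which silently
# drops rows whose filenames do not match across feature files. Assuming each
# file appears exactly once in every feature table, the row counts should match.
print(sorted_train_data_asm.shape[0], combined_train_data.shape[0])
print(sorted_test_data_asm.shape[0], combined_test_data.shape[0])
assert combined_train_data.shape[0] == sorted_train_data_asm.shape[0]
assert combined_test_data.shape[0] == sorted_test_data_asm.shape[0]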
In [26]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi-class version of the Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True class labels, integers in [0, n_classes).
    y_pred : array, shape = [n_samples, n_classes]
        Predicted class probabilities.

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)
    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]
    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss
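In [ ]:
# Optional sketch: a tiny toy check of multiclass_log_loss against sklearn's
# log_loss. With three samples, each predicted correctly with probability 0.8,
# both should return about -ln(0.8) = 0.2231.
toy_true = np.array([0, 1, 2])
toy_prob = np.array([[0.8, 0.1, 0.1],
                     [0.1, 0.8, 0.1],
                     [0.1, 0.1, 0.8]])
print(multiclass_log_loss(toy_true, toy_prob))
print(log_loss(toy_true, toy_prob))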
In [27]:
def run_cv(X, y, clf):
    # Construct a k-folds object
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))
    # Iterate through folds
    for train_index, test_index in kf:
        print(test_index, train_index)
        X_train = X.loc[train_index,:]
        X_test = X.loc[test_index,:]
        y_train = y[train_index]
        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    return y_prob, y_pred
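In [ ]:
# Optional sketch: on scikit-learn >= 0.18 the same out-of-fold probabilities
# can be produced with sklearn.model_selection instead of the deprecated
# cross_validation module. This is an alternative, not what the results below
# were produced with, so it is left commented out.
#from sklearn.model_selection import cross_val_predict
#y_prob = cross_val_predict(clf1, X, y, cv=10, method='predict_proba')
#y_pred = y_prob.argmax(axis=1) + 1   # classes are labelled 1..9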
In [28]:
# Set our X,y for the classifiers
X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1
In [29]:
# combined train data plus polynomial features
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)
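In [ ]:
# Optional sketch: the raw confusion matrix is easier to read as a heatmap,
# using the seaborn import from the first cell (class labels 1..9 assumed).
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=range(1,10), yticklabels=range(1,10))
plt.xlabel("predicted class")
plt.ylabel("true class")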
In [25]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)
In [24]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)
In [ ]:
# Per-class feature summaries: slice the training data by malware class, then
# summarise each class by the sum of its column means and standard deviations.
combined_train_data['class'] = sorted_train_labels.iloc[:,1]
class1 = X[combined_train_data['class'] == 1]
class2 = X[combined_train_data['class'] == 2]
class3 = X[combined_train_data['class'] == 3]
class4 = X[combined_train_data['class'] == 4]
class5 = X[combined_train_data['class'] == 5]
class6 = X[combined_train_data['class'] == 6]
class7 = X[combined_train_data['class'] == 7]
class8 = X[combined_train_data['class'] == 8]
class9 = X[combined_train_data['class'] == 9]
columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
    classx = X[combined_train_data['class'] == i]
    class_stats.loc[i,'mean'] = classx.mean().sum()
    class_stats.loc[i,'std'] = classx.std().sum()
    #class_stats.loc[i,'corr'] = classx.corr().sum().sum()
    #class_stats.loc[i,'cov'] = classx.cov().sum().sum()
class_stats.head()
plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[ 1,2,3,4,5,6,7,8,9 ], cmap='brg')
In [17]:
['filename',[ 'train_asm_p{:d}'.format(i) for i in range(1,11) ]]
Out[17]:
In [ ]:
help(pd.DataFrame)