In [34]:
import pandas as pd
import itertools
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
pd.options.mode.chained_assignment = None
Next, we have to load the data into a dataframe. In order to have a balanced dataset, we will use 10,000 records from Alexa, which represent the non-malicious domains, and 10,000 records from gameoverdga, which represent the malicious domains.
The value counts at the end confirm that we have 10,000 of each.
In [2]:
df = pd.read_csv( '../../data/dga-full.csv' )
# Filter to the alexa and gameoverdga sources
df = df[df['dsrc'].isin(['alexa','gameoverdga'])]
df.dsrc.value_counts()
Out[2]:
In [3]:
df['isMalicious'] = df['dsrc'].apply( lambda x: 0 if x == "alexa" else 1 )
For this, let's create a rather small training set, as it will reduce the time needed to train a model. Feel free to try a 15%, 20%, or even a 30% portion for the training data (lower percentages for slower machines).
In this example, we will use 30% of the data for training and 70% for testing.
Normally you would want most of the data in the training set, but more training data can considerably extend the time needed to train a model.
We're also going to need a list of column names for the feature columns as well as the target column.
In [39]:
train, test = train_test_split(df, test_size = 0.7)
features = ['length', 'dicts', 'entropy','numbers', 'ngram']
target = 'isMalicious'
The next step is to create the classifiers. What you'll see is that scikit-learn maintains a consistent interface for every machine learning algorithm. For a supervised model, the steps are:
Call the .fit() method with the training features and the target, then call the .predict() method on new data to get predictions.
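Because the interface is uniform, swapping algorithms only changes the constructor. As a minimal sketch (LogisticRegression is used purely as an illustration here; it is not part of this walkthrough):
from sklearn.linear_model import LogisticRegression

# 1. Construct the estimator with its hyperparameters
example_clf = LogisticRegression()
# 2. Fit it on the training features and target
example_clf = example_clf.fit(train[features], train[target])
# 3. Predict labels for unseen data
example_preds = example_clf.predict(test[features])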
In [42]:
# Create the Random Forest classifier
random_forest_clf = RandomForestClassifier(n_estimators=10,
                                           max_depth=None,
                                           min_samples_split=2,
                                           random_state=0)
random_forest_clf = random_forest_clf.fit( train[features], train[target])
In [40]:
#Next, create the SVM classifier
svm_classifier = svm.SVC()
svm_classifier = svm_classifier.fit(train[features], train[target])
In [44]:
scores = cross_val_score(random_forest_clf, train[features], train[target])
scores.mean()
Out[44]:
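cross_val_score above splits the training data into folds, fits the classifier on all but one fold, and scores it on the held-out fold, repeating for each fold (the default number of folds is 3 in older scikit-learn releases and 5 in newer ones). If you want to control the number of folds or the scoring metric, both can be passed explicitly; a quick sketch, with the cv and scoring values chosen purely for illustration:
# 5-fold cross-validation, scored on accuracy, for both classifiers
rf_scores = cross_val_score(random_forest_clf, train[features], train[target],
                            cv=5, scoring='accuracy')
svm_scores = cross_val_score(svm_classifier, train[features], train[target],
                             cv=5, scoring='accuracy')
print(rf_scores.mean(), svm_scores.mean())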
We'll need to get the predictions from both classifiers, so we add columns to the test and training sets for the predictions.
In [46]:
test['predictions'] = random_forest_clf.predict( test[features] )
train['predictions'] = random_forest_clf.predict( train[features] )
test['svm-predictions'] = svm_classifier.predict( test[features])
train['svm-predictions'] = svm_classifier.predict( train[features])
In [47]:
test.head()
Out[47]:
In [48]:
confusion_matrix( test['isMalicious'], test['predictions'])
Out[48]:
The code below generates a nicer presentation of the confusion matrix for the random forest classifier (rows are true labels, columns are predicted labels).
In [50]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # Normalize first so the heat map and the cell labels use the same values
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Write the count (or rate) into each cell of the matrix
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Compute confusion matrix
cnf_matrix = confusion_matrix( test['isMalicious'], test['predictions'])
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Not Malicious', 'Malicious'],
                      title='RF Confusion matrix, without normalization')
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=['Not Malicious', 'Malicious'], normalize=True,
                      title='RF Normalized confusion matrix')
plt.show()
And again for the SVM classifier.
In [49]:
# Compute confusion matrix
svm_cnf_matrix = confusion_matrix( test['isMalicious'], test['svm-predictions'])
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(svm_cnf_matrix, classes=['Not Malicious', 'Malicious'],
                      title='SVM Confusion matrix, without normalization')
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(svm_cnf_matrix, classes=['Not Malicious', 'Malicious'], normalize=True,
                      title='SVM Normalized confusion matrix')
plt.show()
In [52]:
importances = random_forest_clf.feature_importances_
In [21]:
importances
Out[21]:
You can also visualize this with the following code, adapted from http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
In [25]:
std = np.std([tree.feature_importances_ for tree in random_forest_clf.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(test[features].shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(test[features].shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(test[features].shape[1]), indices)
plt.xlim([-1, test[features].shape[1]])
plt.show()
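The bar chart above labels the bars by column index rather than by name. If you prefer to see the feature names directly, a small sketch using the features list defined earlier:
# Pair each feature name with its importance and sort, highest first
ranked = sorted(zip(features, importances), key=lambda pair: pair[1], reverse=True)
for name, score in ranked:
    print("%s: %f" % (name, score))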
You can calculate the accuracy with the metrics.accuracy_score() method, and finally, there is metrics.classification_report(), which calculates the precision, recall, and F1 score for each class at once.
In [56]:
pscore = metrics.accuracy_score(test['isMalicious'], test['predictions'])
pscore_train = metrics.accuracy_score(train['isMalicious'], train['predictions'])
In [57]:
print( metrics.classification_report(test['isMalicious'], test['predictions'], target_names=['Not Malicious', 'Malicious'] ) )
In [58]:
svm_pscore = metrics.accuracy_score(test['isMalicious'], test['svm-predictions'])
svm_pscore_train = metrics.accuracy_score(train['isMalicious'], train['svm-predictions'])
print( metrics.classification_report(test['isMalicious'], test['svm-predictions'], target_names=['Not Malicious', 'Malicious'] ) )
In [59]:
print( svm_pscore, svm_pscore_train)
In [60]:
print( pscore, pscore_train)
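As a sanity check, accuracy is simply the fraction of correct predictions, which is the sum of the confusion matrix diagonal divided by the total. A quick sketch using the random forest matrix computed earlier:
# Accuracy = (true negatives + true positives) / all predictions
manual_accuracy = np.trace(cnf_matrix) / float(cnf_matrix.sum())
print(manual_accuracy)  # should match pscore above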