In [1]:
from __future__ import division
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
import os
In [2]:
def get_data_csv(path='intrusions.csv'):
    """Load the intrusion-detection dataset from a CSV file.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file (default: 'intrusions.csv' in the
        current working directory, matching the original behavior).

    Returns
    -------
    pandas.DataFrame
        The raw intrusions table as read by ``pd.read_csv``.
    """
    return pd.read_csv(path)
def generate_confusion_matrix(y_test, y_pred):
    """Plot a row-normalized confusion matrix as a seaborn heatmap.

    Parameters
    ----------
    y_test : pandas.Series
        True class labels (must support ``.value_counts()``).
    y_pred : array-like
        Predicted class labels, same length as ``y_test``.

    Fixes over the original:
    - ``labels`` is now passed by keyword: in current scikit-learn the
      third positional slot of ``confusion_matrix`` is keyword-only, so
      the old positional call fails.
    - The DataFrame index/columns now use the same sorted label order
      that the matrix rows/columns follow; the original labeled the axes
      with the unsorted ``value_counts`` order, misaligning the heatmap.
    """
    labels = sorted(y_test.value_counts().index.tolist())
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    # Normalize each row so cells show the fraction of the actual class.
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(df_cm, annot=False, cmap="GnBu")
    plt.setp(ax.get_yticklabels(), rotation=0)
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    # plt.savefig('app/static/fig/cm-10.png')
In [ ]:
guess_passwd. 12
buffer_overflow. 8
land. 6
warezmaster. 5
imap. 4
ftp_write. 4
multihop. 4
loadmodule. 3
rootkit. 2
perl. 1
spy. 1
phf. 1
In [3]:
# Load the intrusions dataset and tidy it up.
intrusions = get_data_csv()
intrusions.set_index('key_column', inplace=True)
del intrusions['Unnamed: 0']
# Collapse the rare attack types into a single 'others.' bucket so the
# classifier is not asked to learn classes with only a handful of rows.
rare_attacks = [
    'buffer_overflow.', 'land.', 'guess_passwd.', 'warezmaster.',
    'imap.', 'ftp_write.', 'multihop.', 'loadmodule.',
    'rootkit.', 'perl.', 'spy.', 'phf.',
]
intrusions['attack'].replace(rare_attacks, 'others.', inplace=True)
intrusions['attack'].value_counts()
Out[3]:
In [4]:
# Encode the target as a categorical column and one-hot encode the
# three categorical feature columns.
intrusions['attack_cat'] = intrusions['attack'].astype('category')
del intrusions['attack']
dummy = pd.get_dummies(intrusions, prefix=['Protocol', 'Service', 'Flag'], prefix_sep='_', columns=['protocol_type', 'service', 'flag'])
# Train/test split. NOTE: the original used `dummy.ix[...]`, but the
# `.ix` indexer has been removed from pandas; `.loc` with a boolean
# column mask selects the same feature columns.
X = dummy.loc[:, dummy.columns != 'attack_cat']
y = dummy['attack_cat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=10)
In [5]:
# Fit the random forest. A fixed random_state (matching the seed used in
# train_test_split above) makes the stochastic fit reproducible across
# kernel restarts; the original had no seed here.
rf = RandomForestClassifier(n_estimators=100, random_state=10)
rf.fit(X_train, y_train)
Out[5]:
In [6]:
# Hard class predictions on the held-out set (used for the confusion matrix).
y_pred_hard = rf.predict(X_test)
In [7]:
# Per-class probability estimates; columns follow rf.classes_ order
# (used below to sweep thresholds for precision-recall curves).
y_pred = rf.predict_proba(X_test)
In [8]:
# Display the class label order used by predict_proba's columns.
rf.classes_
Out[8]:
In [9]:
# Attack classes in lexicographic order, matching the column order of
# rf.classes_ / predict_proba.
target_names = sorted([
    'portsweep.',
    'normal.',
    'warezclient.',
    'ipsweep.',
    'teardrop.',
    'satan.',
    'smurf.',
    'pod.',
    'neptune.',
    'back.',
    'nmap.',
    'others.',
])
In [10]:
# Display the sorted class names for reference.
target_names
Out[10]:
In [11]:
# Build one precision-recall curve per class by sweeping a decision
# threshold over the class's predicted probability column.
class_precisions = []
class_recalls = []
for idx, name in enumerate(target_names):
    scores = y_pred[:, idx]
    # One threshold per distinct (rounded) probability value.
    thresholds = np.unique(np.round(scores, 6))
    actual_pos = (y_test == name)
    precisions = []
    recalls = []
    for threshold in thresholds:
        predicted_pos = scores > threshold
        tp = np.sum(actual_pos & predicted_pos)
        fp = np.sum(~actual_pos & predicted_pos)
        fn = np.sum(actual_pos & ~predicted_pos)
        # Convention from the original: undefined precision/recall
        # (empty denominator) is reported as 1.
        precisions.append(1.0 * tp / (tp + fp) if (tp + fp) > 0 else 1)
        recalls.append(1.0 * tp / (tp + fn) if (tp + fn) > 0 else 1)
    class_precisions.append(precisions)
    class_recalls.append(recalls)
In [12]:
%matplotlib inline
plt.figure(figsize=(8,6))
# for c in [5,6]:
for c in range(len(target_names)):
if c != 5:
plt.plot(class_recalls[c], class_precisions[c], label=target_names[c])
plt.plot(class_recalls[5], class_precisions[5], label=target_names[5], color='red')
# plt.title('Precision-Recall Curve for Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim(0,1.05)
plt.xlim(0,1.05)
plt.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)
Out[12]:
In [69]:
# Raw (unnormalized) confusion matrix counts for the hard predictions.
confusion_matrix(y_test, y_pred_hard)
Out[69]:
In [13]:
# Render the normalized confusion-matrix heatmap defined above.
generate_confusion_matrix(y_test, y_pred_hard)
In [144]:
def generate_log_loss(X_train, y_train, X_test, y_test, estimator_counts=(25, 50, 75)):
    """Fit a RandomForest for each forest size and record train/test log loss.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels.
    estimator_counts : iterable of int, optional
        n_estimators values to evaluate (default (25, 50, 75), matching
        the original hard-coded list).

    Returns
    -------
    (lls_train, lls_test) : two lists of floats, parallel to
        ``estimator_counts``.

    Fixes over the original:
    - ``log_loss`` was an unresolved name (only ``from sklearn import
      metrics`` is imported); now called as ``metrics.log_loss``.
    - The Python-2 ``print`` statement is replaced by a form valid in
      both Python 2 and 3.
    - A dead ``np.linspace(...)`` assignment that was immediately
      overwritten has been removed.
    """
    lls_train = []
    lls_test = []
    for n_estimators in estimator_counts:
        rf = RandomForestClassifier(n_estimators=int(n_estimators))
        rf.fit(X_train, y_train)
        ll_train = metrics.log_loss(y_train, rf.predict_proba(X_train))
        ll_test = metrics.log_loss(y_test, rf.predict_proba(X_test))
        lls_train.append(ll_train)
        lls_test.append(ll_test)
        print('{} {} {}'.format(int(n_estimators), ll_train, ll_test))
    return lls_train, lls_test
In [145]:
# Compute log loss for the default forest sizes (prints one line per size).
lls_train, lls_test = generate_log_loss(X_train, y_train, X_test, y_test)
25 0.000302684176182 0.00105213304396 50 0.00028989262852 0.000956214574294 75 0.000295615044477 0.00101434253832 100 0.000284924302275 0.000972275829683 200 0.000287291808115 0.000960061103854 300 0.000286136543844 0.000968485447115
In [149]:
# Inspect the held-out log-loss values collected so far.
lls_test
Out[149]:
In [146]:
# Manually append the log-loss values for n_estimators = 100, 200, 300.
# These numbers match the printed output of an earlier, longer run of
# generate_log_loss (see the 100/200/300 entries in the output above),
# pasted in so the plot below covers all six forest sizes without
# re-fitting the larger models.
lls_train.append(0.000284924302275)
lls_train.append(0.000287291808115)
lls_train.append(0.000286136543844)
lls_test.append(0.000972275829683)
lls_test.append(0.000960061103854)
lls_test.append(0.000968485447115)
In [150]:
# n_estimators values corresponding to the six entries in
# lls_train/lls_test (three computed above, three appended from the
# earlier run). The original also assigned an unused `epochs` variable
# and a dead `np.linspace(..., retstep=10)` result that was immediately
# overwritten; both removed.
x_axis = [25, 50, 75, 100, 200, 300]
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x_axis, lls_train, label='Train', alpha=0.6)
ax.plot(x_axis, lls_test, label='Test', alpha=0.6)
ax.set_yscale('log')  # losses are tiny; log scale keeps the gap visible
ax.legend()
plt.ylabel('Log Loss')
plt.xlabel('n_estimators')
plt.title('Random Forest Log Loss')
plt.show()
In [ ]: