In [2]:
from __future__ import division
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
%matplotlib inline
In [7]:
def get_data_csv():
    """Load the raw intrusion-detection dataset from intrusions.csv in the working directory."""
    return pd.read_csv('intrusions.csv')

# Get intrusions data
intrusions = get_data_csv()

# Clean up data frame: promote the key column to the index and drop the
# leftover positional-index column written out by a previous to_csv.
intrusions.set_index('key_column', inplace=True)
del intrusions['Unnamed: 0']

# Collapse the rare attack categories into a single 'others.' bucket so the
# multiclass target is less imbalanced. Series.replace accepts a single
# scalar replacement for a list of values, so no repeated list is needed.
rare_attacks = ['buffer_overflow.', 'land.', 'guess_passwd.', 'warezmaster.',
                'imap.', 'ftp_write.', 'multihop.', 'loadmodule.', 'rootkit.',
                'perl.', 'spy.', 'phf.']
intrusions['attack'].replace(rare_attacks, 'others.', inplace=True)
intrusions['attack'].value_counts()
Out[7]:
In [8]:
# Integer-encode the attack labels (kept available for estimators that need
# a numeric target; the tree models below accept the string labels directly).
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(intrusions['attack'])
label_encoded_y = label_encoder.transform(intrusions['attack'])
# Map every encoded class id back to its name; derive the id range from the
# fitted encoder instead of hard-coding 0..11.
label_key = label_encoder.inverse_transform(np.arange(len(label_encoder.classes_)))

# One-hot encode the categorical features.
dummy = pd.get_dummies(intrusions, prefix=['Protocol', 'Service', 'Flag'],
                       prefix_sep='_', columns=['protocol_type', 'service', 'flag'])

# Split data. DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0;
# .loc with a boolean column mask is the supported equivalent.
X = dummy.loc[:, dummy.columns != 'attack']
# y = label_encoded_y
y = dummy['attack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
In [10]:
import os

# Multiclass XGBoost model. NOTE: `silent` was deprecated in xgboost 1.0 in
# favour of `verbosity`; kept here for compatibility with older versions.
xg2 = xgb.XGBClassifier(learning_rate=0.3, objective='multi:softmax', silent=True)
# Record error and log-loss on both splits every boosting round; stop early
# once the last eval set's metric has not improved for 5 rounds.
eval_set = [(X_train, y_train), (X_test, y_test)]
xg2.fit(X_train, y_train, eval_metric=["merror", "mlogloss"], eval_set=eval_set,
        verbose=True, early_stopping_rounds=5)

# Ensure the output directory exists before persisting the fitted model
# (joblib.dump raises if 'models/' is missing).
if not os.path.isdir('models'):
    os.makedirs('models')
joblib.dump(xg2, 'models/xg_scikit_model_2.pkl')
Out[10]:
In [5]:
# xg2 = joblib.load('models/xg_scikit_model.pkl')
In [11]:
# Pull the per-round evaluation history recorded during fit(): one entry per
# eval_set element, keyed 'validation_0' (train) and 'validation_1' (test).
results = xg2.evals_result()
epochs = len(results['validation_0']['merror'])
x_axis = range(0, epochs)

# Multiclass log loss per boosting round, train vs. test, on a log scale.
fig, ax = plt.subplots(figsize=(8, 6))
for split, label in (('validation_0', 'Train'), ('validation_1', 'Test')):
    ax.plot(x_axis, results[split]['mlogloss'], label=label, alpha=0.6)
ax.set_yscale('log')
ax.legend()
plt.ylabel('Log Loss')
plt.xlabel('Epochs')
plt.title('XGBoost Log Loss')
plt.show()
In [6]:
# Multiclass classification error per boosting round, train vs. test,
# reusing `results`/`x_axis` computed in the previous cell.
fig, ax = plt.subplots(figsize=(8, 6))
for split, label in (('validation_0', 'Train'), ('validation_1', 'Test')):
    ax.plot(x_axis, results[split]['merror'], label=label)
ax.set_yscale('log')
ax.legend()
plt.ylabel('Classification Error')
plt.xlabel('Epochs')
plt.title('XGBoost Classification Error')
plt.show()
In [19]:
# Class-probability matrix for the held-out test set: one row per sample,
# one column per encoded class (columns indexed by class position below).
y_pred = xg2.predict_proba(X_test)
In [22]:
target_names = [
    'portsweep.',
    'normal.',
    'warezclient.',
    'ipsweep.',
    'teardrop.',
    'satan.',
    'smurf.',
    'pod.',
    'neptune.',
    'back.',
    'nmap.',
    'others.'
]
# Sort so class index matches the alphabetical order used elsewhere.
target_names = sorted(target_names)

# Build one precision-recall curve per class by sweeping a threshold over
# that class's predicted probabilities (one-vs-rest).
class_precisions = []
class_recalls = []
for idx, name in enumerate(target_names):
    scores = y_pred[:, idx]
    # Candidate thresholds: the distinct (rounded) scores for this class.
    thresholds = np.unique(np.round(scores, 4))
    actual_pos = (y_test == name)
    precisions = []
    recalls = []
    for threshold in thresholds:
        predicted_pos = scores > threshold
        tp = np.sum(actual_pos & predicted_pos)
        fp = np.sum(~actual_pos & predicted_pos)
        fn = np.sum(actual_pos & ~predicted_pos)
        # Degenerate denominators (no predicted / no actual positives)
        # default to 1, matching the conventional PR-curve endpoints.
        precisions.append(tp / (tp + fp) if (tp + fp) > 0 else 1)
        recalls.append(tp / (tp + fn) if (tp + fn) > 0 else 1)
    class_precisions.append(precisions)
    class_recalls.append(recalls)
In [23]:
%matplotlib inline
plt.figure(figsize=(8,6))
# for c in [5,6]:
for c in range(len(target_names)):
if c != 5:
plt.plot(class_recalls[c], class_precisions[c], label=target_names[c])
plt.plot(class_recalls[5], class_precisions[5], label=target_names[5], color='red')
# plt.title('Precision-Recall Curve for Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim(0,1.05)
plt.xlim(0,1.05)
plt.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)
Out[23]:
In [ ]: