In [2]:
from __future__ import division
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [7]:
def get_data_csv():
    """Load the raw intrusion-detection records from the working directory.

    Returns:
        pd.DataFrame: one row per connection record, including the string
        'attack' label column and a 'key_column' identifier.
    """
    return pd.read_csv('intrusions.csv')

# Get intrusions data
intrusions = get_data_csv()

# Clean up data frame: index by the record key and drop the stray CSV index column.
intrusions.set_index('key_column', inplace=True)
del intrusions['Unnamed: 0']

# Collapse the rarest attack classes into a single 'others.' bucket so the
# multi-class model isn't asked to learn classes with only a handful of rows.
# Series.replace(list, scalar) maps every listed label to the one value,
# replacing the original pair of parallel 12-element lists.
rare_attacks = ['buffer_overflow.', 'land.', 'guess_passwd.', 'warezmaster.',
                'imap.', 'ftp_write.', 'multihop.', 'loadmodule.', 'rootkit.',
                'perl.', 'spy.', 'phf.']
intrusions['attack'].replace(rare_attacks, 'others.', inplace=True)
intrusions['attack'].value_counts()


Out[7]:
smurf.          280790
neptune.        107201
normal.          97277
back.             2203
satan.            1589
ipsweep.          1247
portsweep.        1040
warezclient.      1020
teardrop.          979
pod.               264
nmap.              231
others.            179
Name: attack, dtype: int64

In [8]:
# Integer-encode the string attack labels (kept for reference; the split
# below uses the string labels directly).
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(intrusions['attack'])
label_encoded_y = label_encoder.transform(intrusions['attack'])
# Recover the string label for each integer class id 0..11 for later reporting.
label_key = label_encoder.inverse_transform([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])

# One-hot encode the categorical features; numeric columns pass through unchanged.
dummy = pd.get_dummies(intrusions, prefix=['Protocol', 'Service', 'Flag'], prefix_sep='_', columns=['protocol_type', 'service', 'flag'])


# Split data: every column except the label is a feature.
# NOTE: DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0 — .loc
# with a boolean column mask is the supported equivalent.
X = dummy.loc[:, dummy.columns != 'attack']
y = dummy['attack']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=10)

In [10]:
# Gradient-boosted tree classifier over the one-hot encoded features.
# Early stopping halts training once the test-set log-loss has not improved
# for 5 consecutive boosting rounds; both error rate and log-loss are logged.
xg2 = xgb.XGBClassifier(learning_rate=0.3, objective='multi:softmax', silent=True)
watchlist = [(X_train, y_train), (X_test, y_test)]
xg2.fit(
    X_train,
    y_train,
    eval_metric=["merror", "mlogloss"],
    eval_set=watchlist,
    verbose=True,
    early_stopping_rounds=5,
)
# Persist the fitted model so later sessions can load it instead of retraining.
joblib.dump(xg2, 'models/xg_scikit_model_2.pkl')


[0]	validation_0-merror:0.003554	validation_0-mlogloss:0.949649	validation_1-merror:0.003448	validation_1-mlogloss:0.949496
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 5 rounds.
[1]	validation_0-merror:0.002932	validation_0-mlogloss:0.654898	validation_1-merror:0.002928	validation_1-mlogloss:0.654896
[2]	validation_0-merror:0.001695	validation_0-mlogloss:0.468377	validation_1-merror:0.001673	validation_1-mlogloss:0.468456
[3]	validation_0-merror:0.001527	validation_0-mlogloss:0.340996	validation_1-merror:0.001586	validation_1-mlogloss:0.341151
[4]	validation_0-merror:0.001313	validation_0-mlogloss:0.250687	validation_1-merror:0.00141	validation_1-mlogloss:0.250905
[5]	validation_0-merror:0.001047	validation_0-mlogloss:0.185573	validation_1-merror:0.001147	validation_1-mlogloss:0.18586
[6]	validation_0-merror:0.001015	validation_0-mlogloss:0.137971	validation_1-merror:0.001127	validation_1-mlogloss:0.13831
[7]	validation_0-merror:0.000882	validation_0-mlogloss:0.102836	validation_1-merror:0.001012	validation_1-mlogloss:0.10324
[8]	validation_0-merror:0.000755	validation_0-mlogloss:0.077054	validation_1-merror:0.000911	validation_1-mlogloss:0.077502
[9]	validation_0-merror:0.000711	validation_0-mlogloss:0.057836	validation_1-merror:0.000877	validation_1-mlogloss:0.058335
[10]	validation_0-merror:0.00068	validation_0-mlogloss:0.043618	validation_1-merror:0.00083	validation_1-mlogloss:0.044145
[11]	validation_0-merror:0.000656	validation_0-mlogloss:0.033045	validation_1-merror:0.00083	validation_1-mlogloss:0.03359
[12]	validation_0-merror:0.00059	validation_0-mlogloss:0.025175	validation_1-merror:0.000769	validation_1-mlogloss:0.025736
[13]	validation_0-merror:0.000515	validation_0-mlogloss:0.019259	validation_1-merror:0.000681	validation_1-mlogloss:0.019825
[14]	validation_0-merror:0.000483	validation_0-mlogloss:0.014782	validation_1-merror:0.000648	validation_1-mlogloss:0.015352
[15]	validation_0-merror:0.00044	validation_0-mlogloss:0.01145	validation_1-merror:0.000567	validation_1-mlogloss:0.012018
[16]	validation_0-merror:0.000405	validation_0-mlogloss:0.008896	validation_1-merror:0.000547	validation_1-mlogloss:0.009442
[17]	validation_0-merror:0.000367	validation_0-mlogloss:0.006974	validation_1-merror:0.000533	validation_1-mlogloss:0.007504
[18]	validation_0-merror:0.00035	validation_0-mlogloss:0.00549	validation_1-merror:0.000513	validation_1-mlogloss:0.005965
[19]	validation_0-merror:0.000298	validation_0-mlogloss:0.00436	validation_1-merror:0.000499	validation_1-mlogloss:0.004809
[20]	validation_0-merror:0.00028	validation_0-mlogloss:0.003525	validation_1-merror:0.000466	validation_1-mlogloss:0.003963
[21]	validation_0-merror:0.000272	validation_0-mlogloss:0.002853	validation_1-merror:0.000452	validation_1-mlogloss:0.003268
[22]	validation_0-merror:0.000269	validation_0-mlogloss:0.002355	validation_1-merror:0.000432	validation_1-mlogloss:0.002764
[23]	validation_0-merror:0.00026	validation_0-mlogloss:0.001963	validation_1-merror:0.000405	validation_1-mlogloss:0.002358
[24]	validation_0-merror:0.000208	validation_0-mlogloss:0.001658	validation_1-merror:0.000364	validation_1-mlogloss:0.002065
[25]	validation_0-merror:0.000188	validation_0-mlogloss:0.001419	validation_1-merror:0.000317	validation_1-mlogloss:0.001817
[26]	validation_0-merror:0.000168	validation_0-mlogloss:0.001225	validation_1-merror:0.00029	validation_1-mlogloss:0.001621
[27]	validation_0-merror:0.00015	validation_0-mlogloss:0.001061	validation_1-merror:0.00027	validation_1-mlogloss:0.001468
[28]	validation_0-merror:0.000142	validation_0-mlogloss:0.000942	validation_1-merror:0.000256	validation_1-mlogloss:0.001355
[29]	validation_0-merror:0.000133	validation_0-mlogloss:0.000841	validation_1-merror:0.00025	validation_1-mlogloss:0.001246
[30]	validation_0-merror:0.000127	validation_0-mlogloss:0.000755	validation_1-merror:0.00025	validation_1-mlogloss:0.00117
[31]	validation_0-merror:0.00013	validation_0-mlogloss:0.000683	validation_1-merror:0.000256	validation_1-mlogloss:0.001102
[32]	validation_0-merror:0.000107	validation_0-mlogloss:0.000623	validation_1-merror:0.000243	validation_1-mlogloss:0.00104
[33]	validation_0-merror:0.000101	validation_0-mlogloss:0.000576	validation_1-merror:0.000243	validation_1-mlogloss:0.000995
[34]	validation_0-merror:9.5e-05	validation_0-mlogloss:0.000537	validation_1-merror:0.000229	validation_1-mlogloss:0.000959
[35]	validation_0-merror:8.1e-05	validation_0-mlogloss:0.0005	validation_1-merror:0.000223	validation_1-mlogloss:0.000917
[36]	validation_0-merror:7.5e-05	validation_0-mlogloss:0.000473	validation_1-merror:0.000223	validation_1-mlogloss:0.000893
[37]	validation_0-merror:6.4e-05	validation_0-mlogloss:0.000441	validation_1-merror:0.000175	validation_1-mlogloss:0.000855
[38]	validation_0-merror:6.1e-05	validation_0-mlogloss:0.000413	validation_1-merror:0.000182	validation_1-mlogloss:0.000831
[39]	validation_0-merror:5.8e-05	validation_0-mlogloss:0.000387	validation_1-merror:0.000182	validation_1-mlogloss:0.000806
[40]	validation_0-merror:5.2e-05	validation_0-mlogloss:0.000363	validation_1-merror:0.000182	validation_1-mlogloss:0.00078
[41]	validation_0-merror:5.2e-05	validation_0-mlogloss:0.000345	validation_1-merror:0.000175	validation_1-mlogloss:0.000756
[42]	validation_0-merror:4.9e-05	validation_0-mlogloss:0.000329	validation_1-merror:0.000162	validation_1-mlogloss:0.000744
[43]	validation_0-merror:4e-05	validation_0-mlogloss:0.000315	validation_1-merror:0.000175	validation_1-mlogloss:0.000731
[44]	validation_0-merror:4e-05	validation_0-mlogloss:0.000298	validation_1-merror:0.000175	validation_1-mlogloss:0.000715
[45]	validation_0-merror:3.8e-05	validation_0-mlogloss:0.000287	validation_1-merror:0.000175	validation_1-mlogloss:0.000704
[46]	validation_0-merror:3.8e-05	validation_0-mlogloss:0.000275	validation_1-merror:0.000169	validation_1-mlogloss:0.0007
[47]	validation_0-merror:3.8e-05	validation_0-mlogloss:0.000268	validation_1-merror:0.000169	validation_1-mlogloss:0.000689
[48]	validation_0-merror:3.8e-05	validation_0-mlogloss:0.000254	validation_1-merror:0.000175	validation_1-mlogloss:0.000672
[49]	validation_0-merror:3.5e-05	validation_0-mlogloss:0.000242	validation_1-merror:0.000169	validation_1-mlogloss:0.000662
[50]	validation_0-merror:3.5e-05	validation_0-mlogloss:0.000234	validation_1-merror:0.000169	validation_1-mlogloss:0.000653
[51]	validation_0-merror:3.5e-05	validation_0-mlogloss:0.000224	validation_1-merror:0.000169	validation_1-mlogloss:0.000644
[52]	validation_0-merror:3.5e-05	validation_0-mlogloss:0.000217	validation_1-merror:0.000169	validation_1-mlogloss:0.000634
[53]	validation_0-merror:3.5e-05	validation_0-mlogloss:0.000211	validation_1-merror:0.000155	validation_1-mlogloss:0.000629
[54]	validation_0-merror:3.2e-05	validation_0-mlogloss:0.000205	validation_1-merror:0.000155	validation_1-mlogloss:0.000623
[55]	validation_0-merror:2.9e-05	validation_0-mlogloss:0.000196	validation_1-merror:0.000148	validation_1-mlogloss:0.000616
[56]	validation_0-merror:2.9e-05	validation_0-mlogloss:0.00019	validation_1-merror:0.000142	validation_1-mlogloss:0.000611
[57]	validation_0-merror:2.6e-05	validation_0-mlogloss:0.000183	validation_1-merror:0.000142	validation_1-mlogloss:0.000602
[58]	validation_0-merror:2e-05	validation_0-mlogloss:0.000177	validation_1-merror:0.000142	validation_1-mlogloss:0.000594
[59]	validation_0-merror:1.7e-05	validation_0-mlogloss:0.00017	validation_1-merror:0.000142	validation_1-mlogloss:0.000591
[60]	validation_0-merror:2e-05	validation_0-mlogloss:0.000166	validation_1-merror:0.000142	validation_1-mlogloss:0.000589
[61]	validation_0-merror:2.3e-05	validation_0-mlogloss:0.00016	validation_1-merror:0.000148	validation_1-mlogloss:0.000587
[62]	validation_0-merror:2.3e-05	validation_0-mlogloss:0.000156	validation_1-merror:0.000148	validation_1-mlogloss:0.000587
[63]	validation_0-merror:1.7e-05	validation_0-mlogloss:0.000151	validation_1-merror:0.000142	validation_1-mlogloss:0.000582
[64]	validation_0-merror:1.4e-05	validation_0-mlogloss:0.000145	validation_1-merror:0.000148	validation_1-mlogloss:0.000578
[65]	validation_0-merror:1.4e-05	validation_0-mlogloss:0.000142	validation_1-merror:0.000148	validation_1-mlogloss:0.000576
[66]	validation_0-merror:1.4e-05	validation_0-mlogloss:0.000138	validation_1-merror:0.000142	validation_1-mlogloss:0.000571
[67]	validation_0-merror:1.4e-05	validation_0-mlogloss:0.000135	validation_1-merror:0.000155	validation_1-mlogloss:0.000571
[68]	validation_0-merror:1.4e-05	validation_0-mlogloss:0.00013	validation_1-merror:0.000142	validation_1-mlogloss:0.00057
[69]	validation_0-merror:1.4e-05	validation_0-mlogloss:0.000128	validation_1-merror:0.000142	validation_1-mlogloss:0.000565
[70]	validation_0-merror:1.4e-05	validation_0-mlogloss:0.000125	validation_1-merror:0.000148	validation_1-mlogloss:0.000567
[71]	validation_0-merror:1.2e-05	validation_0-mlogloss:0.00012	validation_1-merror:0.000148	validation_1-mlogloss:0.000566
[72]	validation_0-merror:1.2e-05	validation_0-mlogloss:0.000117	validation_1-merror:0.000148	validation_1-mlogloss:0.000566
[73]	validation_0-merror:9e-06	validation_0-mlogloss:0.000114	validation_1-merror:0.000148	validation_1-mlogloss:0.000566
[74]	validation_0-merror:9e-06	validation_0-mlogloss:0.00011	validation_1-merror:0.000148	validation_1-mlogloss:0.000564
[75]	validation_0-merror:9e-06	validation_0-mlogloss:0.000108	validation_1-merror:0.000155	validation_1-mlogloss:0.000565
[76]	validation_0-merror:9e-06	validation_0-mlogloss:0.000105	validation_1-merror:0.000148	validation_1-mlogloss:0.000563
[77]	validation_0-merror:9e-06	validation_0-mlogloss:0.000103	validation_1-merror:0.000142	validation_1-mlogloss:0.00056
[78]	validation_0-merror:9e-06	validation_0-mlogloss:0.0001	validation_1-merror:0.000148	validation_1-mlogloss:0.000561
[79]	validation_0-merror:9e-06	validation_0-mlogloss:9.8e-05	validation_1-merror:0.000142	validation_1-mlogloss:0.000558
[80]	validation_0-merror:9e-06	validation_0-mlogloss:9.6e-05	validation_1-merror:0.000148	validation_1-mlogloss:0.00056
[81]	validation_0-merror:9e-06	validation_0-mlogloss:9.4e-05	validation_1-merror:0.000142	validation_1-mlogloss:0.000561
[82]	validation_0-merror:9e-06	validation_0-mlogloss:9.2e-05	validation_1-merror:0.000142	validation_1-mlogloss:0.000562
[83]	validation_0-merror:9e-06	validation_0-mlogloss:9e-05	validation_1-merror:0.000142	validation_1-mlogloss:0.000562
[84]	validation_0-merror:9e-06	validation_0-mlogloss:8.7e-05	validation_1-merror:0.000142	validation_1-mlogloss:0.000563
Stopping. Best iteration:
[79]	validation_0-merror:9e-06	validation_0-mlogloss:9.8e-05	validation_1-merror:0.000142	validation_1-mlogloss:0.000558

Out[10]:
['models/xg_scikit_model_2.pkl']

In [5]:
# xg2 = joblib.load('models/xg_scikit_model.pkl')

In [11]:
# retrieve performance metrics recorded during training (per boosting round)
results = xg2.evals_result()
epochs = len(results['validation_0']['merror'])
x_axis = range(0, epochs)

# Plot train vs. test log-loss per round; log scale makes the late, small
# improvements visible.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x_axis, results['validation_0']['mlogloss'], label='Train', alpha=0.6)
ax.plot(x_axis, results['validation_1']['mlogloss'], label='Test', alpha=0.6)
ax.set_yscale('log')
ax.legend()
ax.set_ylabel('Log Loss')
ax.set_xlabel('Epochs')
ax.set_title('XGBoost Log Loss')
plt.show()



In [6]:
# plot classification error
fig, ax = plt.subplots(figsize = (8,6))
ax.plot(x_axis, results['validation_0']['merror'], label='Train')
ax.plot(x_axis, results['validation_1']['merror'], label='Test')
ax.set_yscale('log')
ax.legend()
plt.ylabel('Classification Error')
plt.xlabel('Epochs')
plt.title('XGBoost Classification Error')
plt.show()



In [19]:
# Per-class membership probabilities for each test row (n_samples x n_classes).
# NOTE(review): columns are presumably ordered by xg2.classes_ — the PR-curve
# cell below assumes this matches sorted(target_names); verify against the model.
y_pred = xg2.predict_proba(X_test)

In [22]:
# Class labels, sorted so index i lines up with probability column y_pred[:, i]
# (assumes the model's class ordering is the sorted label order).
target_names = sorted([
    'portsweep.', 'normal.', 'warezclient.', 'ipsweep.', 'teardrop.', 'satan.',
    'smurf.', 'pod.', 'neptune.', 'back.', 'nmap.', 'others.',
])

# Build a one-vs-rest precision/recall curve per class by sweeping every
# distinct (rounded to 4 d.p.) predicted probability as a decision threshold.
class_precisions = []
class_recalls = []
for i in range(len(target_names)):
    thresh_vec = np.unique(np.round(y_pred[:, i], 4))
    y_test_b = (y_test == target_names[i])  # binary truth: is this class?
    precisions = []
    recalls = []

    for th in thresh_vec:
        y_pred_b = y_pred[:, i] > th
        tp = np.sum(np.logical_and(y_test_b == 1, y_pred_b == 1))
        fp = np.sum(np.logical_and(y_test_b == 0, y_pred_b == 1))
        fn = np.sum(np.logical_and(y_test_b == 1, y_pred_b == 0))
        # Define the metric as 1 when its denominator is zero (no predicted /
        # no actual positives) so the curve reaches the precision=1 endpoint.
        precision = 1
        recall = 1
        if (tp + fp) > 0:
            precision = 1.0 * tp / (tp + fp)
        if (tp + fn) > 0:
            recall = 1.0 * tp / (tp + fn)
        precisions.append(precision)
        recalls.append(recall)

    class_precisions.append(precisions)
    class_recalls.append(recalls)

In [23]:
%matplotlib inline
plt.figure(figsize=(8,6))
# for c in [5,6]:
for c in range(len(target_names)):
    if c != 5:
        plt.plot(class_recalls[c], class_precisions[c], label=target_names[c])

plt.plot(class_recalls[5], class_precisions[5], label=target_names[5], color='red')        

# plt.title('Precision-Recall Curve for Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim(0,1.05)
plt.xlim(0,1.05)
plt.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)


Out[23]:
<matplotlib.legend.Legend at 0x7f6e009b8b50>

In [ ]: