In [1]:
from __future__ import division
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
import os



In [2]:
def get_data_csv():
    return pd.read_csv('intrusions.csv')


def generate_confusion_matrix(y_test, y_pred):
    """Plot a row-normalized confusion matrix as a heatmap."""
    # y_test = joblib.load('models/rf_y_test.pkl')
    # y_pred = joblib.load('models/rf_y_pred.pkl')

    labels = sorted(y_test.value_counts().index.tolist())
    cm = confusion_matrix(y_test, y_pred, labels=labels)

    # Normalize each row so cells show the fraction of each actual class
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    # Index and columns must follow the same label order passed to confusion_matrix
    df_cm = pd.DataFrame(cm, index=labels, columns=labels)
    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(df_cm, annot=False, cmap="GnBu")
    plt.setp(ax.get_yticklabels(), rotation=0)
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
#     plt.savefig('app/static/fig/cm-10.png')

In [ ]:
# Reference: rare attack classes (value counts) merged into 'others.' below
# guess_passwd.          12
# buffer_overflow.        8
# land.                   6
# warezmaster.            5
# imap.                   4
# ftp_write.              4
# multihop.               4
# loadmodule.             3
# rootkit.                2
# perl.                   1
# spy.                    1
# phf.                    1

In [3]:
# Get intrusions data
intrusions = get_data_csv()

# Clean up data frame
intrusions.set_index('key_column', inplace=True)
del intrusions['Unnamed: 0']
rare_attacks = ['buffer_overflow.', 'land.', 'guess_passwd.', 'warezmaster.', 'imap.', 'ftp_write.',
                'multihop.', 'loadmodule.', 'rootkit.', 'perl.', 'spy.', 'phf.']
intrusions['attack'].replace(rare_attacks, 'others.', inplace=True)
intrusions['attack'].value_counts()


Out[3]:
smurf.          280790
neptune.        107201
normal.          97277
back.             2203
satan.            1589
ipsweep.          1247
portsweep.        1040
warezclient.      1020
teardrop.          979
pod.               264
nmap.              231
others.            179
Name: attack, dtype: int64

In [4]:
intrusions['attack_cat'] = intrusions['attack'].astype('category')
del intrusions['attack']
dummy = pd.get_dummies(intrusions, prefix=['Protocol', 'Service', 'Flag'], prefix_sep='_', columns=['protocol_type', 'service', 'flag'])

# Train/test split
X = dummy.loc[:, dummy.columns != 'attack_cat']
y = dummy['attack_cat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=10)
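
The classes are heavily imbalanced (smurf. alone has ~280k rows against 179 for others.), so a stratified split is worth considering. A minimal sketch using the stratify keyword of train_test_split; it is not used in the cells below:

In [ ]:
# Sketch: stratify on y so rare classes such as 'others.' keep their
# proportions in both splits (alternative to the split above)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, random_state=10, stratify=y)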

In [5]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)


Out[5]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
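
joblib is imported at the top but unused in this section; a minimal sketch for persisting the fitted forest and test labels, assuming a models/ directory exists (the commented-out loads in generate_confusion_matrix expect such files):

In [ ]:
# Sketch: persist the fitted model and test labels for later reuse
# (assumes a models/ directory already exists)
joblib.dump(rf, 'models/rf.pkl')
joblib.dump(y_test, 'models/rf_y_test.pkl')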

In [6]:
y_pred_hard = rf.predict(X_test)

In [7]:
y_pred = rf.predict_proba(X_test)

In [8]:
rf.classes_


Out[8]:
array(['back.', 'ipsweep.', 'neptune.', 'nmap.', 'normal.', 'others.',
       'pod.', 'portsweep.', 'satan.', 'smurf.', 'teardrop.',
       'warezclient.'], dtype=object)

In [9]:
# Class labels in the column order of rf.predict_proba output
# (rf.classes_ is already sorted lexicographically)
target_names = list(rf.classes_)

In [10]:
target_names


Out[10]:
['back.',
 'ipsweep.',
 'neptune.',
 'nmap.',
 'normal.',
 'others.',
 'pod.',
 'portsweep.',
 'satan.',
 'smurf.',
 'teardrop.',
 'warezclient.']

In [11]:
class_precisions = []
class_recalls = []
for i in range(len(target_names)):
    thresh_vec = np.unique(np.round(y_pred[:,i],6))
    y_test_b = (y_test == target_names[i])
    precisions = []
    recalls = []

    for th in thresh_vec:
        y_pred_b = y_pred[:,i] > th
        tp = np.sum(np.logical_and(y_test_b==1, y_pred_b==1))
        fp = np.sum(np.logical_and(y_test_b==0, y_pred_b==1))
        tn = np.sum(np.logical_and(y_test_b==0, y_pred_b==0))
        fn = np.sum(np.logical_and(y_test_b==1, y_pred_b==0))
#         print target_names[i], tp, fp, tn, fn
        precision = 1
        recall = 1
        if (tp + fp) > 0:
            precision = 1.0 * tp /  (tp + fp)
        if (tp + fn) > 0:
            recall = 1.0 * tp / (tp + fn)
        precisions.append(precision)
        recalls.append(recall)
    
    class_precisions.append(precisions)
    class_recalls.append(recalls)
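
The loop above reimplements per-class precision-recall curves by sweeping thresholds manually; sklearn's precision_recall_curve computes an equivalent curve directly (endpoint handling may differ slightly), sketched here for a single class:

In [ ]:
from sklearn.metrics import precision_recall_curve

# Sketch: the same curve for class index 0 ('back.') via sklearn
precision, recall, thresholds = precision_recall_curve(
    (y_test == target_names[0]).astype(int), y_pred[:, 0])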

In [12]:
%matplotlib inline
plt.figure(figsize=(8,6))

# Draw the rare 'others.' class (index 5) last, in red, so it stands out
for c in range(len(target_names)):
    if c != 5:
        plt.plot(class_recalls[c], class_precisions[c], label=target_names[c])

plt.plot(class_recalls[5], class_precisions[5], label=target_names[5], color='red')

# plt.title('Precision-Recall Curve for Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim(0,1.05)
plt.xlim(0,1.05)
plt.legend(bbox_to_anchor=(1, 1), loc='best', ncol=1)


Out[12]:
<matplotlib.legend.Legend at 0x7f61e1351190>

In [69]:
confusion_matrix(y_test, y_pred_hard)


Out[69]:
array([[  654,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0],
       [    0,   354,     0,     0,     1,     0,     0,     0,     0,
            0,     0,     0],
       [    0,     0, 31975,     0,     0,     0,     0,     0,     0,
            0,     0,     0],
       [    0,     2,     0,    65,     2,     0,     0,     0,     0,
            0,     0,     0],
       [    0,     0,     0,     0, 29279,     1,     0,     0,     0,
            0,     0,     2],
       [    0,     0,     0,     0,    11,    40,     0,     0,     0,
            0,     0,     0],
       [    0,     0,     0,     0,     2,     0,    75,     0,     0,
            0,     0,     0],
       [    0,     0,     0,     0,     0,     0,     0,   329,     0,
            0,     0,     0],
       [    0,     0,     0,     0,     5,     0,     0,     0,   484,
            0,     0,     0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
        84309,     0,     0],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,   303,     0],
       [    0,     0,     0,     0,    10,     0,     0,     0,     0,
            0,     0,   303]])
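
The metric helpers imported at the top can summarize the hard predictions in one place; a quick sketch using weighted averages to reflect the class imbalance:

In [ ]:
# Sketch: aggregate metrics for the hard predictions;
# 'weighted' averaging accounts for the class imbalance
print 'accuracy :', accuracy_score(y_test, y_pred_hard)
print 'precision:', precision_score(y_test, y_pred_hard, average='weighted')
print 'recall   :', recall_score(y_test, y_pred_hard, average='weighted')
print 'f1       :', f1_score(y_test, y_pred_hard, average='weighted')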

In [13]:
generate_confusion_matrix(y_test, y_pred_hard)



In [144]:
from sklearn.metrics import log_loss

def generate_log_loss(X_train, y_train, X_test, y_test):
    """Fit forests of increasing size and record train/test log loss."""
    lls_train = []
    lls_test = []
    space = [25, 50, 75]
    for est in space:
        rf = RandomForestClassifier(n_estimators=est)
        rf.fit(X_train, y_train)
        y_train_pred = rf.predict_proba(X_train)
        y_pred = rf.predict_proba(X_test)
        ll_train = log_loss(y_train, y_train_pred)
        ll_test = log_loss(y_test, y_pred)
        lls_train.append(ll_train)
        lls_test.append(ll_test)
        print est, ll_train, ll_test
    return lls_train, lls_test
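
Refitting a fresh forest for every size is the dominant cost here; with warm_start=True a single forest can instead be grown incrementally, a sketch:

In [ ]:
from sklearn.metrics import log_loss

# Sketch: grow one forest incrementally rather than refitting each size
rf_ws = RandomForestClassifier(n_estimators=25, warm_start=True)
for n in [25, 50, 75]:
    rf_ws.set_params(n_estimators=n)
    rf_ws.fit(X_train, y_train)
    print n, log_loss(y_test, rf_ws.predict_proba(X_test))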

In [145]:
lls_train, lls_test = generate_log_loss(X_train, y_train, X_test, y_test)


25 0.000302684176182 0.00105213304396
50 0.00028989262852 0.000956214574294
75 0.000295615044477 0.00101434253832



In [149]:
lls_test


Out[149]:
[0.0010521330439552088,
 0.00095621457429422767,
 0.0010143425383172405,
 0.000972275829683,
 0.000960061103854,
 0.000968485447115]

In [146]:
# Append results for n_estimators = 100, 200 and 300 recorded from a longer run
lls_train.append(0.000284924302275)
lls_train.append(0.000287291808115)
lls_train.append(0.000286136543844)
lls_test.append(0.000972275829683)
lls_test.append(0.000960061103854)
lls_test.append(0.000968485447115)

In [150]:
# n_estimators values matching the six log-loss measurements
x_axis = [25, 50, 75, 100, 200, 300]
fig, ax = plt.subplots(figsize = (8,6))
ax.plot(x_axis, lls_train, label='Train', alpha=0.6)
ax.plot(x_axis, lls_test, label='Test', alpha=0.6)
ax.set_yscale('log')
ax.legend()
plt.ylabel('Log Loss')
plt.xlabel('n_estimators')
plt.title('Random Forest Log Loss')
plt.show()
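
export_graphviz is imported at the top but unused; a minimal sketch that writes one tree of the forest to a .dot file for inspection (rendering to PNG assumes the Graphviz dot binary is on the PATH):

In [ ]:
# Sketch: export the first tree of the forest, truncated to depth 3
export_graphviz(rf.estimators_[0], out_file='tree0.dot',
                feature_names=list(X.columns), class_names=list(rf.classes_),
                filled=True, max_depth=3)
os.system('dot -Tpng tree0.dot -o tree0.png')  # assumes Graphviz installed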



In [ ]: