In [1]:
from __future__ import division
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import export_graphviz
import os

In [2]:
def get_data_csv():
    return pd.read_csv('intrusions.csv')


def generate_confusion_matrix(y_test, y_pred):
    """Create the plot for our confusion matrix"""
    # y_test = joblib.load('models/rf_y_test.pkl')
    # y_pred = joblib.load('models/rf_y_pred.pkl')

    cat = y_test.value_counts().index.tolist()
    cm = confusion_matrix(y_test, y_pred, sorted(cat))

    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    df_cm = pd.DataFrame(cm, index = [i for i in cat],
                    columns = [i for i in cat])
    plt.figure(figsize = (8,6))
    ax = sns.heatmap(df_cm, annot=False, cmap="GnBu")
    plt.setp(ax.get_yticklabels(), rotation=0)
    plt.setp(ax.get_xticklabels(), rotation=90)
    plt.tight_layout()
    plt.savefig('app/static/fig/cm-10.png')

In [3]:
# Get intrusions data
intrusions = get_data_csv()

# Clean up data frame
intrusions.set_index('key_column', inplace=True)
del intrusions['Unnamed: 0']
intrusions['attack_cat'] = intrusions['attack'].astype('category')
del intrusions['attack']
dummy = pd.get_dummies(intrusions, prefix=['Protocol', 'Service', 'Flag'], prefix_sep='_', columns=['protocol_type', 'service', 'flag'])

# # Train model
X = dummy.ix[:,(dummy.columns != 'attack_cat')]
y = dummy['attack_cat']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=10)

In [13]:
rf = RandomForestClassifier(n_estimators=1000)
rf.fit(X_train, y_train)


Out[13]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [14]:
y_pred = rf.predict_proba(X_test)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-14-2cda46f10bf6> in <module>()
----> 1 y_pred = rf.predict_proba(X_test)

c:\program files\anaconda2\lib\site-packages\sklearn\ensemble\forest.pyc in predict_proba(self, X)
    581             delayed(parallel_helper)(e, 'predict_proba', X,
    582                                       check_input=False)
--> 583             for e in self.estimators_)
    584 
    585         # Reduce

c:\program files\anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self, iterable)
    756             # was dispatched. In particular this covers the edge
    757             # case of Parallel used with an exhausted iterator.
--> 758             while self.dispatch_one_batch(iterator):
    759                 self._iterating = True
    760             else:

c:\program files\anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in dispatch_one_batch(self, iterator)
    606                 return False
    607             else:
--> 608                 self._dispatch(tasks)
    609                 return True
    610 

c:\program files\anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in _dispatch(self, batch)
    569         dispatch_timestamp = time.time()
    570         cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571         job = self._backend.apply_async(batch, callback=cb)
    572         self._jobs.append(job)
    573 

c:\program files\anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.pyc in apply_async(self, func, callback)
    107     def apply_async(self, func, callback=None):
    108         """Schedule a func to be run"""
--> 109         result = ImmediateResult(func)
    110         if callback:
    111             callback(result)

c:\program files\anaconda2\lib\site-packages\sklearn\externals\joblib\_parallel_backends.pyc in __init__(self, batch)
    324         # Don't delay the application, to avoid keeping the input
    325         # arguments in memory
--> 326         self.results = batch()
    327 
    328     def get(self):

c:\program files\anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.pyc in __call__(self)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
    132 
    133     def __len__(self):

c:\program files\anaconda2\lib\site-packages\sklearn\utils\fixes.pyc in parallel_helper(obj, methodname, *args, **kwargs)
    313 def parallel_helper(obj, methodname, *args, **kwargs):
    314     """Helper to workaround Python 2 limitations of pickling instance methods"""
--> 315     return getattr(obj, methodname)(*args, **kwargs)
    316 
    317 

c:\program files\anaconda2\lib\site-packages\sklearn\tree\tree.pyc in predict_proba(self, X, check_input)
    766         """
    767         X = self._validate_X_predict(X, check_input)
--> 768         proba = self.tree_.predict(X)
    769 
    770         if self.n_outputs_ == 1:

sklearn\tree\_tree.pyx in sklearn.tree._tree.Tree.predict (sklearn\tree\_tree.c:9416)()

sklearn\tree\_tree.pyx in sklearn.tree._tree.Tree.predict (sklearn\tree\_tree.c:9263)()

MemoryError: 

In [12]:
y_pred[0]


Out[12]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [ ]: