In [1]:
# for data manipulation
import pandas as pd
import numpy as np
#for graphs
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#models to run
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_val_score
#train_test_split
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit, GridSearchCV
#metrics
from sklearn.metrics import confusion_matrix,precision_recall_fscore_support
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
Use sns.countplot to understand the prominent values of a feature in attack vs normal, and sns.boxplot to understand the distribution of a feature in attack vs normal.
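For reference, a minimal sketch of how both plot types are used further down. It assumes raw_data is loaded and that the label_attack_type column created in a later cell exists; 'src_bytes' is an assumed continuous column name used purely for illustration.
In [ ]:
# Sketch only: the two plot types described above.
# Assumes raw_data is loaded and label_attack_type (created in a later cell) exists;
# 'src_bytes' is an assumed continuous column name.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# countplot: how often each protocol_type value occurs in attack vs normal
sns.countplot(x='protocol_type', hue='label_attack_type', data=raw_data, palette='Set2', ax=ax1)
# boxplot: distribution of a continuous feature in attack vs normal
sns.boxplot(x='label_attack_type', y='src_bytes', data=raw_data, palette='Set3', ax=ax2)
plt.show()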
In [2]:
# read raw data
raw_data = pd.read_csv('/home/phoenix/Documents/session_1_data_train.csv')
In [3]:
test_data = pd.read_csv('/home/phoenix/Documents/session_1_data_test.csv')
test_data.columns = raw_data.columns
In [4]:
raw_data.head()
Out[4]:
In [5]:
raw_data.label.value_counts().keys()
Out[5]:
In [6]:
test_data.label.value_counts().keys()
Out[6]:
In [7]:
# remove ". " from labels
raw_data['label'] = raw_data['label'].apply(lambda x: x[:-1])
test_data['label'] = test_data['label'].apply(lambda x: x[:-1])
In [8]:
pd.isnull(raw_data).sum()
Out[8]:
In [9]:
raw_data = raw_data.drop_duplicates()
raw_data = raw_data.dropna()
In [14]:
# distribution of labels
sns.set_color_codes()
fig, ax1 = plt.subplots(1,1, figsize = (18,6))
sns.countplot(x='label', data=raw_data, palette="Set2", ax=ax1)
plt.xticks(rotation=30)
Out[14]:
In [10]:
# combining labels as normal and attack
# normal is 1 , attack is 0
def get_label_grouping(label):
    if label == 'normal':
        return 'normal'
    else:
        return 'attack'
raw_data['label_attack_type']= raw_data['label'].apply(get_label_grouping)
test_data['label_attack_type']= test_data['label'].apply(get_label_grouping)
In [11]:
raw_data['label_attack_type'].value_counts()
Out[11]:
In [12]:
test_data['label_attack_type'].value_counts()
Out[12]:
In [13]:
# distribution of label_attack_type
fig, ax1 = plt.subplots(1,1, figsize = (18,6))
sns.countplot(x='label_attack_type', data=raw_data, palette="Set2", ax=ax1)
# plt.xticks(rotation=30)
Out[13]:
In [14]:
raw_data.columns
Out[14]:
In [15]:
# distribution of categorical variables with 'label_attack_type'
sns.set()
categorical_cols = ['protocol_type','flag','land','logged_in','is_host_login','is_guest_login']
for column in categorical_cols:
plt.figure()
sns.countplot(x=column, hue="label_attack_type",data=raw_data, palette="Set2")
In [21]:
## Checking distributions of continuous variables with label_attack_type by plotting boxplots
for column in raw_data.columns:
    if column not in categorical_cols + ['index', 'service', 'label', 'label_attack_type']:
        plt.figure()
        sns.boxplot(x="label_attack_type", y=column, data=raw_data, palette="Set3")
Outcomes of sprint 1:
Keep a 70-30 training-testing split using train_test_split (a sketch follows below), and evaluate each model with get_performance_metrics(y_test, model_predictions).
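A minimal sketch of that stratified 70-30 split on the training file itself; it is hypothetical here, since this notebook keeps the separate pre-split train and test CSVs, and it assumes the predictors list and final_label column built in the cells below.
In [ ]:
# Sketch only: stratified 70-30 split, mirroring the commented-out cell further down.
# Assumes `predictors` and the `final_label` column defined later in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    raw_data[predictors], raw_data['final_label'],
    test_size=0.30, random_state=3, stratify=raw_data['final_label'])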
In [16]:
# converting label_attack_type to 0 and 1
raw_data.loc[raw_data['label_attack_type'] == 'normal', 'final_label'] = 0
raw_data.loc[raw_data['label_attack_type'] == 'attack', 'final_label'] = 1
In [17]:
# converting label_attack_type to 0 and 1
test_data.loc[test_data['label_attack_type'] == 'normal', 'final_label'] = 0
test_data.loc[test_data['label_attack_type'] == 'attack', 'final_label'] = 1
In [18]:
#one hot encoding of categorical variables
flag_encoding_raw = pd.get_dummies(raw_data['flag'],prefix = 'flag')
protocol_encoding_raw = pd.get_dummies(raw_data['protocol_type'],prefix = 'protocol')
# concat with the raw_data dataframe
raw_data = pd.concat([raw_data, flag_encoding_raw,protocol_encoding_raw], axis =1 )
In [19]:
#one hot encoding of categorical variables
flag_encoding_test = pd.get_dummies(test_data['flag'],prefix = 'flag')
protocol_encoding_test = pd.get_dummies(test_data['protocol_type'],prefix = 'protocol')
# concat with the test_data dataframe
test_data = pd.concat([test_data, flag_encoding_test,protocol_encoding_test], axis =1 )
In [20]:
predictors = [c for c in raw_data.columns if c not in ['label', 'label_attack_type', 'index', 'protocol_type',
'flag','service','is_host_login','final_label']]
X_train = raw_data[predictors]
y_train = raw_data['final_label']
X_test = test_data[predictors]
y_test = test_data['final_label']
In [21]:
# X_train, X_test, y_train, y_test = train_test_split(test_data[predictors], test_data['final_label'],
# test_size=0.30, random_state=3, stratify = test_data['final_label'])
In [22]:
print(y_test.value_counts())
In [23]:
print(y_train.value_counts())
In [24]:
def get_performance_metrics(y_test, model_predictions):
    # Accuracy
    model_accuracy = accuracy_score(y_test, model_predictions)
    print("Accuracy is ", model_accuracy)
    # precision, recall, f1 score
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_test, model_predictions)
    print('Precision for each class is ', model_precision)
    print('Recall/sensitivity for each class is ', model_recall)
    print('F1 Score for each class is ', model_f1)
    # roc_auc
    model_roc_auc = roc_auc_score(y_test, model_predictions)
    print('AUC-ROC score is ', model_roc_auc)
    # confusion matrix
    model_confusion_matrix = confusion_matrix(y_test, model_predictions)
    print('confusion matrix is :-->')
    print(model_confusion_matrix)
In [31]:
# create instance of Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
Out[31]:
In [32]:
#making predictions
nb_predictions = nb_model.predict(X_test)
In [33]:
get_performance_metrics(y_test,nb_predictions)
In [43]:
# create instance of logistic model
lr_model = LogisticRegression(random_state = 3)
lr_model.fit(X_train, y_train)
Out[43]:
In [44]:
# predictions
lr_predictions = lr_model.predict(X_test)
In [45]:
get_performance_metrics(y_test,lr_predictions)
In [46]:
svc_model = SVC()
In [ ]:
svc_model.fit(X_train,y_train)
In [ ]:
svc_predictions = svc_model.predict(X_test)
get_performance_metrics(y_test,svc_predictions)
In [495]:
# choose set of parameters to tune
# note: in newer scikit-learn versions the 'l1' penalty requires a solver that
# supports it (e.g. LogisticRegression(solver='liblinear') or 'saga')
params = {'C': [0.001, 0.01, 0.1, 1, 10],
          'penalty': ['l1', 'l2'],
          'fit_intercept': [True, False]}
# create instance of GridSearch and fit the data
grid = GridSearchCV(estimator = lr_model, param_grid = params)
grid.fit(X_train, y_train)
In [468]:
# evaluate the best grid searched model on the testing data
grid_search_accuracy = grid.score(X_test, y_test)
print('Grid Search Accuracy is {0}'.format(grid_search_accuracy))
print("grid search best parameters: {}".format(grid.best_params_))
In [34]:
#create instance of decision tree
dt_model = DecisionTreeClassifier(random_state = 3)
dt_model.fit(X_train, y_train)
Out[34]:
In [35]:
# predictions
dt_predictions = dt_model.predict(X_test)
In [36]:
get_performance_metrics(y_test,dt_predictions)
In [37]:
#create instance of random forest model
rf_model = RandomForestClassifier(n_estimators=500,n_jobs = -1, random_state=3)
In [38]:
# fitting data to random forest model
rf_model.fit(X_train,y_train)
Out[38]:
In [39]:
# predictions
rf_predictions = rf_model.predict(X_test)
In [40]:
get_performance_metrics(y_test,rf_predictions)
In [479]:
# Class weights can be customized in this format: class_weight={0: 100, 1: 1}
# Class weights can be given 'balanced': class_weight= 'balanced'
rf_model = RandomForestClassifier(n_estimators=500,n_jobs = -1, random_state=3, class_weight={0: 100,1: 1})
rf_model.fit(X_train,y_train)
rf_predictions = rf_model.predict(X_test)
In [480]:
get_performance_metrics(y_test,rf_predictions)
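For comparison, the 'balanced' option mentioned above sets class weights inversely proportional to the class frequencies in y_train; a sketch using the same random forest settings.
In [ ]:
# Sketch: same random forest but with class_weight='balanced', which weights
# classes inversely proportional to their frequencies in y_train.
rf_balanced = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=3,
                                     class_weight='balanced')
rf_balanced.fit(X_train, y_train)
get_performance_metrics(y_test, rf_balanced.predict(X_test))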
In [ ]:
## Adjust the default decision threshold of 0.5
## Finding the threshold cutoff probability value
def predict_label_given_cutoff(clf_rf, X_train, cutoff_prob):
    # predict class 1 when the predicted probability exceeds cutoff_prob
    return (clf_rf.predict_proba(X_train)[:, 1] > cutoff_prob)

scores = []
cutoff_prob_list = []

def f1_for_given_cutoff(cutoff_prob):
    # returns a scorer with the (estimator, X, y) signature expected by cross_val_score
    def f1_cutoff(clf, X_train, y_train):
        y_predict = predict_label_given_cutoff(clf, X_train, cutoff_prob)
        return f1_score(y_train, y_predict)
    return f1_cutoff

clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
for cutoff_prob in np.arange(0.1, 0.9, 0.1):
    lb = LabelBinarizer()
    y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
    validated = cross_val_score(clf_rf, X_train, y_train, cv=10, scoring=f1_for_given_cutoff(cutoff_prob))
    scores.append(validated)
    cutoff_prob_list.append(cutoff_prob)
    print(cutoff_prob)

# one box of cross-validated F1 scores per cutoff probability
sns.boxplot(data=scores)
plt.xticks(range(len(cutoff_prob_list)), np.round(cutoff_prob_list, 1))
plt.xlabel('cutoff_probability')
plt.ylabel('Classification F1 scores')
plt.title('Classification F1 score for each cutoff probability')
plt.show()
In [44]:
## Observed cutoff_prob from above graph
cutoff_prob = 0.1
## Model
rf_model = RandomForestClassifier(n_estimators= 50,n_jobs = -1)
rf_model.fit(X_train,y_train)
# predictions, for given cutoff prob value
rf_predictions = predict_label_given_cutoff(rf_model,X_test,cutoff_prob)
In [45]:
get_performance_metrics(y_test,rf_predictions)
In [ ]: