In [1]:
# for data manipulation
import pandas as pd
import numpy as np

#for graphs
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#models to run
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_val_score

#train_test_split
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit, GridSearchCV

#metrics
from sklearn.metrics import confusion_matrix,precision_recall_fscore_support
from sklearn.metrics import accuracy_score, roc_auc_score



Objective of sprint 1 (~30 minutes):

  1. Read the csv file.
  2. Look at the shape of the dataframe.
  3. Clean the labels.
  4. Check for duplicates and remove them.
  5. A bar plot of the distribution of labels, i.e. attack types, using sns.countplot.
  6. Convert labels into binary labels: normal and attack.
  7. A bar plot of the distribution of binary labels (normal and attack).
  8. Distribution of features:
    1. Categorical features: use sns.countplot to understand prominent values of a feature in attack vs normal.
    2. Continuous features: use sns.boxplot to understand the distribution of a feature in attack vs normal.

In [2]:
# read raw data
raw_data = pd.read_csv('/home/phoenix/Documents/session_1_data_train.csv')

In [3]:
test_data = pd.read_csv('/home/phoenix/Documents/session_1_data_test.csv')
test_data.columns = raw_data.columns

In [4]:
raw_data.head()


Out[4]:
duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot ... dst_host_srv_count dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate label
0 0 tcp http SF 181 5450 0 0 0 0 ... 9 1.0 0.0 0.11 0.0 0.0 0.0 0.0 0.0 normal.
1 0 tcp http SF 239 486 0 0 0 0 ... 19 1.0 0.0 0.05 0.0 0.0 0.0 0.0 0.0 normal.
2 0 tcp http SF 235 1337 0 0 0 0 ... 29 1.0 0.0 0.03 0.0 0.0 0.0 0.0 0.0 normal.
3 0 tcp http SF 219 1337 0 0 0 0 ... 39 1.0 0.0 0.03 0.0 0.0 0.0 0.0 0.0 normal.
4 0 tcp http SF 217 2032 0 0 0 0 ... 49 1.0 0.0 0.02 0.0 0.0 0.0 0.0 0.0 normal.

5 rows × 42 columns


In [5]:
raw_data.label.value_counts().keys()


Out[5]:
Index(['smurf.', 'neptune.', 'normal.', 'back.', 'satan.', 'ipsweep.',
       'portsweep.', 'warezclient.', 'teardrop.', 'pod.', 'nmap.',
       'guess_passwd.', 'buffer_overflow.', 'land.', 'warezmaster.', 'others.',
       'imap.', 'rootkit.', 'loadmodule.', 'ftp_write.', 'multihop.', 'phf.',
       'perl.', 'spy.'],
      dtype='object')

In [6]:
test_data.label.value_counts().keys()


Out[6]:
Index(['smurf.', 'normal.', 'neptune.', 'snmpgetattack.', 'mailbomb.',
       'guess_passwd.', 'snmpguess.', 'satan.', 'warezmaster.', 'back.',
       'mscan.', 'apache2.', 'processtable.', 'saint.', 'portsweep.',
       'ipsweep.', 'httptunnel.', 'pod.', 'nmap.', 'buffer_overflow.',
       'multihop.', 'sendmail.', 'named.', 'ps.', 'xterm.', 'rootkit.',
       'teardrop.', 'land.', 'xlock.', 'xsnoop.', 'ftp_write.', 'perl.',
       'loadmodule.', 'sqlattack.', 'phf.', 'worm.', 'udpstorm.', 'imap.'],
      dtype='object')

In [7]:
# remove ". " from labels
raw_data['label'] = raw_data['label'].apply(lambda x: x[:-1])
test_data['label'] = test_data['label'].apply(lambda x: x[:-1])

In [8]:
pd.isnull(raw_data).sum()


Out[8]:
duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      0
dst_bytes                      0
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
su_attempted                   0
num_root                       0
num_file_creations             0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          0
srv_count                      0
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_host_rate             0
dst_host_count                 0
dst_host_srv_count             0
dst_host_same_srv_rate         0
dst_host_diff_srv_rate         0
dst_host_same_src_port_rate    0
dst_host_srv_diff_host_rate    0
dst_host_serror_rate           0
dst_host_srv_serror_rate       0
dst_host_rerror_rate           0
dst_host_srv_rerror_rate       0
label                          0
dtype: int64

In [9]:
raw_data = raw_data.drop_duplicates()
raw_data = raw_data.dropna()

In [14]:
# distribution of labels
sns.set_color_codes()

fig, ax1 = plt.subplots(1,1, figsize = (18,6))

sns.countplot('label', data = raw_data,palette="Set2", ax = ax1)
plt.xticks(rotation=30)


Out[14]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23]), <a list of 24 Text xticklabel objects>)

In [10]:
# grouping labels into normal and attack
# (numeric encoding comes later in sprint 2: normal -> 0, attack -> 1)

def get_label_grouping(label):
    if label == 'normal':
        return 'normal'
    else:
        return 'attack'

raw_data['label_attack_type']= raw_data['label'].apply(get_label_grouping)
test_data['label_attack_type']= test_data['label'].apply(get_label_grouping)

In [11]:
raw_data['label_attack_type'].value_counts()


Out[11]:
normal    87814
attack    57772
Name: label_attack_type, dtype: int64

In [12]:
test_data['label_attack_type'].value_counts()


Out[12]:
attack    250436
normal     60592
Name: label_attack_type, dtype: int64

In [13]:
# distribution of label_attack_type

fig, ax1 = plt.subplots(1,1, figsize = (18,6))

sns.countplot('label_attack_type', data = raw_data,palette="Set2", ax = ax1)
# plt.xticks(rotation=30)


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6ba9cb38d0>

In [14]:
raw_data.columns


Out[14]:
Index(['duration', 'protocol_type', 'service', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'label', 'label_attack_type'],
      dtype='object')

In [15]:
# distribution of categorical variables with 'label_attack_type'

sns.set()

categorical_cols = ['protocol_type','flag','land','logged_in','is_host_login','is_guest_login']

for column in categorical_cols:
    plt.figure()
    sns.countplot(x=column, hue="label_attack_type",data=raw_data, palette="Set2")



In [21]:
## Checking distributions of continuous variables against label_attack_type by plotting boxplots
for column in raw_data.columns:
    if column not in categorical_cols+['index','service','label','label_attack_type']:
        plt.figure()
        sns.boxplot(x="label_attack_type", y=column,data=raw_data , palette = "Set3")


/home/phoenix/anaconda2/lib/python2.7/site-packages/matplotlib/pyplot.py:524: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)

Outcomes of sprint 1:

  1. The service column has many distinct values; we need a way to consolidate them into fewer classes (a sketch follows this list). Proceeding without the service column for now.
  2. is_host_login always has a value of 0, so drop the column.
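
A minimal sketch (not run in this notebook) of one possible consolidation of the service column, assuming we keep only the most frequent services and bucket the long tail into an 'other' value; the cutoff of 10 services and the service_grouped column name are illustrative choices, not part of the pipeline below:

In [ ]:
# hypothetical: keep the 10 most frequent services, bucket the rest as 'other'
top_services = raw_data['service'].value_counts().nlargest(10).index
raw_data['service_grouped'] = raw_data['service'].where(raw_data['service'].isin(top_services), 'other')
test_data['service_grouped'] = test_data['service'].where(test_data['service'].isin(top_services), 'other')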

Objective of sprint 2 (~30 minutes):

  1. Convert data labels to 0/1.
  2. One-hot encoding of categorical features, i.e. flag and protocol_type.
  3. Stratified sampling: use train_test_split with a 70-30 train-test split (a sketch follows this list).
  4. Write an evaluation function to calculate accuracy, precision, recall, roc_auc score, and the confusion matrix. Call it get_performance_metrics(y_test, model_predictions).
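
Item 3 is listed as an objective, but since a separate test file is available, the cells below use that file as the hold-out set instead. A minimal sketch of the stratified 70-30 split as described, assuming it were applied to the training file only (the X_tr/X_val names are illustrative):

In [ ]:
# hypothetical stratified 70-30 split of the training file (not used below)
X_tr, X_val, y_tr, y_val = train_test_split(
    raw_data.drop(['label', 'label_attack_type'], axis=1),
    raw_data['label_attack_type'],
    test_size=0.30, random_state=3,
    stratify=raw_data['label_attack_type'])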

In [16]:
# converting label_attack_type to 0 and 1
raw_data.loc[raw_data['label_attack_type'] == 'normal', 'final_label'] = 0
raw_data.loc[raw_data['label_attack_type'] == 'attack', 'final_label'] = 1

In [17]:
# converting label_attack_type to 0 and 1
test_data.loc[test_data['label_attack_type'] == 'normal', 'final_label'] = 0
test_data.loc[test_data['label_attack_type'] == 'attack', 'final_label'] = 1

In [18]:
#one hot encoding of categorical variables

flag_encoding_raw = pd.get_dummies(raw_data['flag'],prefix = 'flag')
protocol_encoding_raw = pd.get_dummies(raw_data['protocol_type'],prefix = 'protocol')

# concat the encodings with the raw_data dataframe

raw_data = pd.concat([raw_data, flag_encoding_raw,protocol_encoding_raw], axis =1 )

In [19]:
#one hot encoding of categorical variables

flag_encoding_test = pd.get_dummies(test_data['flag'],prefix = 'flag')
protocol_encoding_test = pd.get_dummies(test_data['protocol_type'],prefix = 'protocol')

# concat the encodings with the test_data dataframe

test_data = pd.concat([test_data, flag_encoding_test,protocol_encoding_test], axis =1 )

Train test split


In [20]:
predictors = [c for c in raw_data.columns if c not in ['label', 'label_attack_type', 'index', 'protocol_type',
                                                   'flag','service','is_host_login','final_label']]

X_train = raw_data[predictors]
y_train = raw_data['final_label']
X_test = test_data[predictors]
y_test = test_data['final_label']

In [21]:
# X_train, X_test, y_train, y_test = train_test_split(test_data[predictors], test_data['final_label'], 
#                                                     test_size=0.30, random_state=3, stratify = test_data['final_label'])

In [22]:
print(y_test.value_counts())


1.0    250436
0.0     60592
Name: final_label, dtype: int64

In [23]:
print(y_train.value_counts())


0.0    87814
1.0    57772
Name: final_label, dtype: int64

Evaluation function

Metrics - Accuracy, precision, recall, F1 score, Confusion matrix, roc_auc


In [24]:
def get_performance_metrics(y_test,model_predictions):
    # Accuracy
    model_accuracy = accuracy_score(y_test,model_predictions)
    print("Accuracy is ", model_accuracy)

    # precision, recall, f1 score
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_test,model_predictions)
    print('Precision for each class is ', model_precision)
    print('Recall/sensitivity for each class is ', model_recall)
    print('F1 Score for each class is ', model_f1)

    # roc_auc
    model_roc_auc = roc_auc_score(y_test,model_predictions)
    print('AUC-ROC score is ', model_roc_auc)

    # confusion matrix
    model_confusion_matrix = confusion_matrix(y_test,model_predictions)
    print('confusion matrix is :-->')
    print(model_confusion_matrix)

Objective of Sprint 3:

  1. Fit different models and calculate performance metrics.
  2. Use Naive Bayes, Logistic Regression.
  3. Hyperparameter tuning of Logistic Regression using GridSearchCV.
  4. Use tree-based classifiers: DecisionTreeClassifier, RandomForestClassifier.

Naive Bayes


In [31]:
# create instance of Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)


Out[31]:
GaussianNB(priors=None)

In [32]:
#making predictions
nb_predictions = nb_model.predict(X_test)

In [33]:
get_performance_metrics(y_test,nb_predictions)


('Accuracy is ', 0.71099708064868761)
('Precision for each class is ', array([ 0.40167544,  0.99535951]))
('Recall/sensitivity for each class is ', array([ 0.98758912,  0.64407673]))
('F1 Score for each class is ', array([ 0.57107955,  0.78208336]))
('AUC-ROC score is ', 0.81583292542927921)
confusion matrix is :-->
[[ 59840    752]
 [ 89136 161300]]

Logistic Regression


In [43]:
# create instance of logistic model
lr_model = LogisticRegression(random_state = 3)
lr_model.fit(X_train, y_train)


Out[43]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=3, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [44]:
# predictions
lr_predictions = lr_model.predict(X_test)

In [45]:
get_performance_metrics(y_test,lr_predictions)


('Accuracy is ', 0.81034183417570127)
('Precision for each class is ', array([ 0.50691324,  0.99066077]))
('Recall/sensitivity for each class is ', array([ 0.96993002,  0.7717301 ]))
('F1 Score for each class is ', array([ 0.66583961,  0.86759725]))
('AUC-ROC score is ', 0.87083006323320156)
confusion matrix is :-->
[[ 58770   1822]
 [ 57167 193269]]

Support Vector Machine


In [46]:
svc_model = SVC()

In [ ]:
svc_model.fit(X_train,y_train)

In [ ]:
svc_predictions = svc_model.predict(X_test)
get_performance_metrics(y_test,svc_predictions)
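
The two SVC cells above are left without output: a kernel SVC scales poorly with the number of training samples, so fitting it on the full ~145k-row training set takes a long time. A minimal sketch, assuming we fit on a random subsample just to get an indicative score (the 20,000-row sample size is an arbitrary choice):

In [ ]:
# hypothetical: fit the SVC on a subsample to keep training time manageable
sample_idx = X_train.sample(n=20000, random_state=3).index
svc_small = SVC()
svc_small.fit(X_train.loc[sample_idx], y_train.loc[sample_idx])
get_performance_metrics(y_test, svc_small.predict(X_test))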

Hyperparameter tuning - using GridSearchCV for Logistic Regression


In [495]:
# choose set of parameters to tune
params = {  'C' : [0.001, 0.01, 0.1, 1, 10],
            'penalty': ['l1', 'l2'], 
            'fit_intercept': [True, False]}

# create instance of GridSearch and fit the data
grid = GridSearchCV(estimator = lr_model, param_grid = params)
grid.fit(X_train, y_train)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-495-7fadb15e0e12> in <module>()
----> 8 grid.fit(X_train, y_train)

[traceback truncated: the fit was interrupted by hand]

KeyboardInterrupt: 
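
A minimal sketch of one way to make the search cheaper, assuming a smaller parameter grid, 3-fold CV, and all cores; these settings are illustrative and are not the ones behind the result shown below:

In [ ]:
# hypothetical, cheaper grid search: fewer candidate values, 3-fold CV, parallel jobs
small_params = {'C': [0.01, 1, 10], 'penalty': ['l1', 'l2']}
small_grid = GridSearchCV(estimator=LogisticRegression(random_state=3),
                          param_grid=small_params, cv=3, n_jobs=-1)
small_grid.fit(X_train, y_train)
print(small_grid.best_params_)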

In [468]:
# evaluate the best grid searched model on the testing data
grid_search_accuracy = grid.score(X_test, y_test)
print('Grid Search Accuracy is {0}'.format(grid_search_accuracy))
print("grid search best parameters: {}".format(grid.best_params_))


Grid Search Accuracy is 0.908825089596
grid search best parameters: {'penalty': 'l1', 'C': 0.001, 'fit_intercept': True}
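
The best parameters found can be reused directly; a minimal sketch of refitting logistic regression with them and scoring it with the evaluation helper (assuming the parameters printed above):

In [ ]:
# refit logistic regression with the grid-searched parameters and evaluate on the test file
best_lr = LogisticRegression(penalty='l1', C=0.001, fit_intercept=True, random_state=3)
best_lr.fit(X_train, y_train)
get_performance_metrics(y_test, best_lr.predict(X_test))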

Decision Tree


In [34]:
#create instance of decision tree
dt_model = DecisionTreeClassifier(random_state = 3)
dt_model.fit(X_train, y_train)


Out[34]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=3, splitter='best')

In [35]:
# predictions
dt_predictions = dt_model.predict(X_test)

In [36]:
get_performance_metrics(y_test,dt_predictions)


('Accuracy is ', 0.92998701081574653)
('Precision for each class is ', array([ 0.73929769,  0.99725127]))
('Recall/sensitivity for each class is ', array([ 0.98956958,  0.91557124]))
('F1 Score for each class is ', array([ 0.84631888,  0.95466733]))
('AUC-ROC score is ', 0.95257041194674574)
confusion matrix is :-->
[[ 59960    632]
 [ 21144 229292]]

Random Forest Model


In [37]:
#create instance of random forest model
rf_model = RandomForestClassifier(n_estimators=500,n_jobs = -1, random_state=3)

In [38]:
# fitting data to random forest model
rf_model.fit(X_train,y_train)


Out[38]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=-1, oob_score=False, random_state=3,
            verbose=0, warm_start=False)

In [39]:
# predictions
rf_predictions = rf_model.predict(X_test)

In [40]:
get_performance_metrics(y_test,rf_predictions)


('Accuracy is ', 0.92698406574327719)
('Precision for each class is ', array([ 0.72906589,  0.99865551]))
('Recall/sensitivity for each class is ', array([ 0.99493332,  0.91054401]))
('F1 Score for each class is ', array([ 0.84149916,  0.95256654]))
('AUC-ROC score is ', 0.95273866788784045)
confusion matrix is :-->
[[ 60285    307]
 [ 22403 228033]]

Objective of Sprint 4:

  1. Adjust the misclassification cost (class weights) in the Random Forest classifier.
  2. Adjust the decision threshold. Run a cross-validation procedure (e.g. 10-fold cross-validation) to get a distribution of scores for each decision threshold (0.1, 0.2, ..., 0.9) and select the threshold with the best score (F1 is used below).

Addressing imbalanced data at the algorithm level

1. Adjust the class weights (misclassification costs) for Random Forest


In [479]:
# Class weights can be customized in this format: class_weight={0: 100, 1: 1}
# Class weights can also be set automatically: class_weight='balanced'

rf_model = RandomForestClassifier(n_estimators=500,n_jobs = -1, random_state=3, class_weight={0: 100,1: 1})
rf_model.fit(X_train,y_train)
rf_predictions = rf_model.predict(X_test)

In [480]:
get_performance_metrics(y_test,rf_predictions)


('Accuracy is ', 0.93748301872145523)
('Precision for each class is ', array([ 0.91818905,  0.97602979]))
('Recall/sensitivity for each class is ', array([ 0.98710162,  0.85655933]))
('F1 Score for each class is ', array([ 0.95139909,  0.91240029]))
('AUC-ROC score is ', 0.92183047590013967)
confusion matrix is :-->
[[47295   618]
 [ 4214 25164]]

Adjust the decision threshold


In [ ]:
## Adjust the decision threshold from its default value of 0.5
## by cross-validating the F1 score for each candidate cutoff probability

from sklearn.metrics import f1_score

def predict_label_given_cutoff(clf, X, cutoff_prob):
    # predict attack (1) when the probability of class 1 exceeds the cutoff
    return (clf.predict_proba(X)[:, 1] > cutoff_prob)

def f1_for_given_cutoff(cutoff_prob):
    # build a scorer with the (estimator, X, y) signature expected by cross_val_score
    def f1_cutoff(clf, X, y):
        y_predict = predict_label_given_cutoff(clf, X, cutoff_prob)
        return f1_score(y, y_predict)
    return f1_cutoff

scores = []
cutoff_prob_list = []

clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
lb = LabelBinarizer()
y_train = np.array([number[0] for number in lb.fit_transform(y_train)])

for cutoff_prob in np.arange(0.1, 0.9, 0.1):
    validated = cross_val_score(clf_rf, X_train, y_train, cv=10,
                                scoring=f1_for_given_cutoff(cutoff_prob))
    scores.append(validated)
    cutoff_prob_list.append(cutoff_prob)
    print(cutoff_prob)

# one box per cutoff probability, showing the spread of the 10 cross-validated F1 scores
sns.boxplot(data=scores)
plt.xticks(range(len(cutoff_prob_list)), [round(p, 1) for p in cutoff_prob_list])
plt.xlabel('cutoff probability')
plt.ylabel('Cross-validated F1 score')
plt.title('Classification F1 score vs decision threshold')
plt.show()


0.1

In [44]:
## Observed cutoff_prob from above graph 
cutoff_prob = 0.1

## Model 
rf_model = RandomForestClassifier(n_estimators= 50,n_jobs = -1)
rf_model.fit(X_train,y_train)

# predictions, for given cutoff prob value
rf_predictions = predict_label_given_cutoff(rf_model,X_test,cutoff_prob)

In [45]:
get_performance_metrics(y_test,rf_predictions)


('Accuracy is ', 0.93556207158197979)
('Precision for each class is ', array([ 0.75520479,  0.99743503]))
('Recall/sensitivity for each class is ', array([ 0.99019673,  0.92234343]))
('F1 Score for each class is ', array([ 0.8568817 ,  0.95842064]))
('AUC-ROC score is ', 0.95627007934655228)
confusion matrix is :-->
[[ 59998    594]
 [ 19448 230988]]

In [ ]: