In [1]:
# for data manipulation
import pandas as pd
import numpy as np
#for graphs
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#models to run
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_val_score
#train_test_split
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit, GridSearchCV
#metrics
from sklearn.metrics import confusion_matrix,precision_recall_fscore_support
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
Use sns.countplot to understand the prominent values of a feature in attack vs normal, and sns.boxplot to understand the distribution of a feature in attack vs normal.
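For reference, a minimal sketch of how both plot types are used further down. It assumes raw_data is loaded and that the label_attack_type column created in a later cell exists; 'src_bytes' is an assumed continuous column name used purely for illustration.
In [ ]:
# Sketch only: the two plot types described above.
# Assumes raw_data is loaded and label_attack_type (created in a later cell) exists;
# 'src_bytes' is an assumed continuous column name.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# countplot: how often each protocol_type value occurs in attack vs normal
sns.countplot(x='protocol_type', hue='label_attack_type', data=raw_data, palette='Set2', ax=ax1)
# boxplot: distribution of a continuous feature in attack vs normal
sns.boxplot(x='label_attack_type', y='src_bytes', data=raw_data, palette='Set3', ax=ax2)
plt.show()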
In [2]:
# read raw data
raw_data = pd.read_csv('/home/phoenix/Documents/session_1_data_train.csv')
In [3]:
test_data = pd.read_csv('/home/phoenix/Documents/session_1_data_test.csv')
test_data.columns = raw_data.columns
In [4]:
raw_data.head()
Out[4]:
In [5]:
raw_data.label.value_counts().keys()
Out[5]:
In [6]:
test_data.label.value_counts().keys()
Out[6]:
In [7]:
# remove ". " from labels
raw_data['label'] = raw_data['label'].apply(lambda x: x[:-1])
test_data['label'] = test_data['label'].apply(lambda x: x[:-1])
In [8]:
pd.isnull(raw_data).sum()
Out[8]:
In [9]:
raw_data = raw_data.drop_duplicates()
raw_data = raw_data.dropna()
In [14]:
# distribution of labels
sns.set_color_codes()
fig, ax1 = plt.subplots(1,1, figsize = (18,6))
sns.countplot(x='label', data=raw_data, palette="Set2", ax=ax1)
plt.xticks(rotation=30)
Out[14]:
In [10]:
# combining labels as normal and attack
# normal is 1 , attack is 0
def get_label_grouping(label):
    if label == 'normal':
        return 'normal'
    else:
        return 'attack'
raw_data['label_attack_type']= raw_data['label'].apply(get_label_grouping)
test_data['label_attack_type']= test_data['label'].apply(get_label_grouping)
In [11]:
raw_data['label_attack_type'].value_counts()
Out[11]:
In [12]:
test_data['label_attack_type'].value_counts()
Out[12]:
In [13]:
# distribution of label_attack_type
fig, ax1 = plt.subplots(1,1, figsize = (18,6))
sns.countplot(x='label_attack_type', data=raw_data, palette="Set2", ax=ax1)
# plt.xticks(rotation=30)
Out[13]:
In [14]:
raw_data.columns
Out[14]:
In [15]:
# distribution of categorical variables with 'label_attack_type'
sns.set()
categorical_cols = ['protocol_type','flag','land','logged_in','is_host_login','is_guest_login']
for column in categorical_cols:
plt.figure()
sns.countplot(x=column, hue="label_attack_type",data=raw_data, palette="Set2")
In [21]:
## Checking distributions of continuous variables with label_attack_type by plotting boxplots
for column in raw_data.columns:
    if column not in categorical_cols + ['index', 'service', 'label', 'label_attack_type']:
        plt.figure()
        sns.boxplot(x="label_attack_type", y=column, data=raw_data, palette="Set3")
Outcomes of sprint 1:
Keep a 70-30 training-testing split using train_test_split (a sketch follows below), and evaluate each model with get_performance_metrics(y_test, model_predictions).
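A minimal sketch of that stratified 70-30 split on the training file itself; it is hypothetical here, since this notebook keeps the separate pre-split train and test CSVs, and it assumes the predictors list and final_label column built in the cells below.
In [ ]:
# Sketch only: stratified 70-30 split, mirroring the commented-out cell further down.
# Assumes `predictors` and the `final_label` column defined later in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    raw_data[predictors], raw_data['final_label'],
    test_size=0.30, random_state=3, stratify=raw_data['final_label'])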
In [16]:
# converting label_attack_type to 0 and 1
raw_data.loc[raw_data['label_attack_type'] == 'normal', 'final_label'] = 0
raw_data.loc[raw_data['label_attack_type'] == 'attack', 'final_label'] = 1
In [17]:
# converting label_attack_type to 0 and 1
test_data.loc[test_data['label_attack_type'] == 'normal', 'final_label'] = 0
test_data.loc[test_data['label_attack_type'] == 'attack', 'final_label'] = 1
In [18]:
#one hot encoding of categorical variables
flag_encoding_raw = pd.get_dummies(raw_data['flag'],prefix = 'flag')
protocol_encoding_raw = pd.get_dummies(raw_data['protocol_type'],prefix = 'protocol')
# concat with the raw_data dataframe
raw_data = pd.concat([raw_data, flag_encoding_raw,protocol_encoding_raw], axis =1 )
In [19]:
#one hot encoding of categorical variables
flag_encoding_test = pd.get_dummies(test_data['flag'],prefix = 'flag')
protocol_encoding_test = pd.get_dummies(test_data['protocol_type'],prefix = 'protocol')
# concat with the test_data dataframe
test_data = pd.concat([test_data, flag_encoding_test,protocol_encoding_test], axis =1 )
In [20]:
predictors = [c for c in raw_data.columns if c not in ['label', 'label_attack_type', 'index', 'protocol_type',
'flag','service','is_host_login','final_label']]
X_train = raw_data[predictors]
y_train = raw_data['final_label']
X_test = test_data[predictors]
y_test = test_data['final_label']
In [21]:
# X_train, X_test, y_train, y_test = train_test_split(test_data[predictors], test_data['final_label'],
# test_size=0.30, random_state=3, stratify = test_data['final_label'])
In [22]:
print(y_test.value_counts())
In [23]:
print(y_train.value_counts())
In [24]:
def get_performance_metrics(y_test, model_predictions):
    # Accuracy
    model_accuracy = accuracy_score(y_test, model_predictions)
    print("Accuracy is ", model_accuracy)
    # precision, recall, f1 score
    model_precision, model_recall, model_f1, _ = precision_recall_fscore_support(y_test, model_predictions)
    print('Precision for each class is ', model_precision)
    print('Recall/sensitivity for each class is ', model_recall)
    print('F1 Score for each class is ', model_f1)
    # roc_auc
    model_roc_auc = roc_auc_score(y_test, model_predictions)
    print('AUC-ROC score is ', model_roc_auc)
    # confusion matrix
    model_confusion_matrix = confusion_matrix(y_test, model_predictions)
    print('confusion matrix is :-->')
    print(model_confusion_matrix)
In [31]:
# create instance of Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
Out[31]:
In [32]:
#making predictions
nb_predictions = nb_model.predict(X_test)
In [33]:
get_performance_metrics(y_test,nb_predictions)
In [43]:
# create instance of logistic model
lr_model = LogisticRegression(random_state = 3)
lr_model.fit(X_train, y_train)
Out[43]:
In [44]:
# predictions
lr_predictions = lr_model.predict(X_test)
In [45]:
get_performance_metrics(y_test,lr_predictions)
In [46]:
svc_model = SVC()
In [ ]:
svc_model.fit(X_train,y_train)
In [ ]:
svc_predictions = svc_model.predict(X_test)
get_performance_metrics(y_test,svc_predictions)
In [495]:
# choose set of parameters to tune
# note: in newer scikit-learn versions the 'l1' penalty requires a solver that
# supports it (e.g. LogisticRegression(solver='liblinear') or 'saga')
params = {'C': [0.001, 0.01, 0.1, 1, 10],
          'penalty': ['l1', 'l2'],
          'fit_intercept': [True, False]}
# create instance of GridSearch and fit the data
grid = GridSearchCV(estimator = lr_model, param_grid = params)
grid.fit(X_train, y_train)
In [468]:
# evaluate the best grid searched model on the testing data
grid_search_accuracy = grid.score(X_test, y_test)
print('Grid Search Accuracy is {0}'.format(grid_search_accuracy))
print("grid search best parameters: {}".format(grid.best_params_))
In [34]:
#create instance of decision tree
dt_model = DecisionTreeClassifier(random_state = 3)
dt_model.fit(X_train, y_train)
Out[34]:
In [35]:
# predictions
dt_predictions = dt_model.predict(X_test)
In [36]:
get_performance_metrics(y_test,dt_predictions)
In [37]:
#create instance of random forest model
rf_model = RandomForestClassifier(n_estimators=500,n_jobs = -1, random_state=3)
In [38]:
# fitting data to random forest model
rf_model.fit(X_train,y_train)
Out[38]:
In [39]:
# predictions
rf_predictions = rf_model.predict(X_test)
In [40]:
get_performance_metrics(y_test,rf_predictions)
In [479]:
# Class weights can be customized in this format: class_weight={0: 100, 1: 1}
# Class weights can be given 'balanced': class_weight= 'balanced'
rf_model = RandomForestClassifier(n_estimators=500,n_jobs = -1, random_state=3, class_weight={0: 100,1: 1})
rf_model.fit(X_train,y_train)
rf_predictions = rf_model.predict(X_test)
In [480]:
get_performance_metrics(y_test,rf_predictions)
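For comparison, the 'balanced' option mentioned above sets class weights inversely proportional to the class frequencies in y_train; a sketch using the same random forest settings.
In [ ]:
# Sketch: same random forest but with class_weight='balanced', which weights
# classes inversely proportional to their frequencies in y_train.
rf_balanced = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=3,
                                     class_weight='balanced')
rf_balanced.fit(X_train, y_train)
get_performance_metrics(y_test, rf_balanced.predict(X_test))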
In [ ]:
## Adjust the default decision threshold of 0.5
## Finding the threshold cutoff probability value
def predict_label_given_cutoff(clf_rf, X_train, cutoff_prob):
    # predict class 1 when the predicted probability exceeds cutoff_prob
    return (clf_rf.predict_proba(X_train)[:, 1] > cutoff_prob)

scores = []
cutoff_prob_list = []

def f1_for_given_cutoff(cutoff_prob):
    # returns a scorer with the (estimator, X, y) signature expected by cross_val_score
    def f1_cutoff(clf, X_train, y_train):
        y_predict = predict_label_given_cutoff(clf, X_train, cutoff_prob)
        return f1_score(y_train, y_predict)
    return f1_cutoff

clf_rf = RandomForestClassifier(n_estimators=50, n_jobs=-1)
for cutoff_prob in np.arange(0.1, 0.9, 0.1):
    lb = LabelBinarizer()
    y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
    validated = cross_val_score(clf_rf, X_train, y_train, cv=10, scoring=f1_for_given_cutoff(cutoff_prob))
    scores.append(validated)
    cutoff_prob_list.append(cutoff_prob)
    print(cutoff_prob)

# one box of cross-validated F1 scores per cutoff probability
sns.boxplot(data=scores)
plt.xticks(range(len(cutoff_prob_list)), np.round(cutoff_prob_list, 1))
plt.xlabel('cutoff_probability')
plt.ylabel('Classification F1 scores')
plt.title('Classification F1 score for each cutoff probability')
plt.show()
In [44]:
## Observed cutoff_prob from above graph
cutoff_prob = 0.1
## Model
rf_model = RandomForestClassifier(n_estimators= 50,n_jobs = -1)
rf_model.fit(X_train,y_train)
# predictions, for given cutoff prob value
rf_predictions = predict_label_given_cutoff(rf_model,X_test,cutoff_prob)
In [45]:
get_performance_metrics(y_test,rf_predictions)
In [ ]: