In [54]:
# for data manipulation
import pandas as pd
import numpy as np
# plotting graphs
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#models to run
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import LabelBinarizer
# train_test split
from sklearn.model_selection import train_test_split,StratifiedKFold, cross_val_score
#save model to file
from sklearn.externals import joblib
#metrics
from sklearn.metrics import confusion_matrix,accuracy_score,precision_recall_fscore_support
We use `sns.countplot` to understand the prominent values of a feature in each class, and `sns.boxplot` to understand the distribution of a feature in each class.
In [55]:
# read raw data
# NOTE(review): relative path assumes a specific working directory -- confirm
raw_data = pd.read_csv('../../../session_1_data_train.csv')
In [56]:
# read the test split and reuse the training header names
# NOTE(review): presumably the test CSV's header differs from the train file's
# (or its first row is data) -- verify no row is silently consumed as a header
test_data = pd.read_csv('../../../session_1_data_test.csv')
test_data.columns = raw_data.columns
In [57]:
# preview the first rows of the training data
raw_data.head()
Out[57]:
In [58]:
# distinct raw label values present in the training data
raw_data.label.value_counts().keys()
Out[58]:
In [59]:
# distinct raw label values present in the test data
test_data.label.value_counts().keys()
Out[59]:
In [60]:
# every raw label carries a trailing "." -- strip the last character
# (vectorized string slice instead of a per-row apply)
raw_data['label'] = raw_data['label'].str[:-1]
test_data['label'] = test_data['label'].str[:-1]
In [61]:
# drop duplicate records; reassignment rather than inplace mutation keeps
# each cell idempotent on re-run
raw_data = raw_data.drop_duplicates()
test_data = test_data.drop_duplicates()
In [62]:
# distribution of labels
sns.set_color_codes()
fig, ax1 = plt.subplots(1, 1, figsize=(18, 6))
# FIX: pass the column by keyword -- positional use of `x` in countplot was
# deprecated in seaborn 0.12 and later removed
sns.countplot(x='label', data=raw_data, palette="Set2", ax=ax1)
plt.xticks(rotation=30)
Out[62]:
In [63]:
# combining labels as normal, denial of service, user to root, remote to local, probes
def label_grouping(label):
    """Map a raw KDD'99 attack label to its coarse attack category.

    Returns one of 'dos', 'utr', 'rtl', 'probes', 'normal', or 'others'
    for any label outside the known vocabulary.
    """
    # BUG FIX: the rtl list previously used 'Ftp_write', but the KDD'99 labels
    # are lowercase ('ftp_write'), so those records silently fell into 'others'.
    if label in ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop']:
        return 'dos'
    elif label in ['buffer_overflow', 'loadmodule', 'perl', 'rootkit']:
        return 'utr'
    elif label in ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy', 'warezclient', 'warezmaster']:
        return 'rtl'
    elif label in ['satan', 'ipsweep', 'nmap', 'portsweep']:
        return 'probes'
    elif label == 'normal':
        return 'normal'
    else:
        return 'others'
# derive the coarse attack-type column on both splits
for _frame in (raw_data, test_data):
    _frame['label_attack_type'] = _frame['label'].apply(label_grouping)
In [64]:
# class balance of the coarse attack types in the training data
raw_data['label_attack_type'].value_counts()
Out[64]:
In [65]:
# class balance of the coarse attack types in the test data
test_data['label_attack_type'].value_counts()
Out[65]:
In [66]:
# distribution of label_attack_type
fig, ax1 = plt.subplots(1, 1, figsize=(18, 6))
# FIX: pass the column by keyword -- positional use of `x` in countplot was
# deprecated in seaborn 0.12 and later removed
sns.countplot(x='label_attack_type', data=raw_data, palette="Set2", ax=ax1)
plt.xticks(rotation=10)
Out[66]:
In [67]:
# list every column so the predictor set below can be chosen
raw_data.columns
Out[67]:
In [68]:
# distribution of categorical variables with 'label_attack_type'
sns.set()
categorical_cols = ['protocol_type','flag','land','logged_in','is_host_login','is_guest_login']
# one count plot per categorical column, split by attack type
for col in categorical_cols:
    plt.figure()
    sns.countplot(x=col, hue="label_attack_type",data=raw_data, palette="Set2")
We use `train_test_split` to keep a 70-30 train-test split, and evaluate each model with `get_performance_metrics(y_test, model_predictions)`.
In [69]:
def get_label_encoding(label):
    """Encode a coarse attack-type string as an integer.

    'dos'->1, 'utr'->2, 'rtl'->3, 'probes'->4, 'others'->5;
    anything else (including 'normal') -> 0.
    """
    codes = {'dos': 1, 'utr': 2, 'rtl': 3, 'probes': 4, 'others': 5}
    return codes.get(label, 0)
# numeric target column for the models, on both splits
raw_data['label_encoding']= raw_data['label_attack_type'].apply(get_label_encoding)
test_data['label_encoding']= test_data['label_attack_type'].apply(get_label_encoding)
# class balance of the encoded target
raw_data['label_encoding'].value_counts()
Out[69]:
In [70]:
#one hot encoding of categorical variables
# NOTE(review): dummies are built separately for train and test, so the two
# frames can end up with different flag_/protocol_ columns if a category is
# missing from one split -- consider building dummies on the combined frame,
# or reindexing the test dummies to the training columns.
flag_encoding_test = pd.get_dummies(test_data['flag'],prefix = 'flag')
protocol_encoding_test = pd.get_dummies(test_data['protocol_type'],prefix = 'protocol')
# concat with original dataframe
test_data = pd.concat([test_data, flag_encoding_test,protocol_encoding_test],axis =1)
In [71]:
#one hot encoding of categorical variables
flag_encoding_raw = pd.get_dummies(raw_data['flag'],prefix = 'flag')
protocol_encoding_raw = pd.get_dummies(raw_data['protocol_type'],prefix = 'protocol')
# concat with original dataframe
raw_data = pd.concat([raw_data, flag_encoding_raw,protocol_encoding_raw],axis =1)
In [72]:
# preview with the new one-hot columns appended
raw_data.head()
Out[72]:
In [73]:
# columns to exclude from the feature set: targets, identifiers, raw
# categoricals (already one-hot encoded), and features dropped during analysis.
# BUG FIX: 'dst_host_serror_rate' was missing a trailing comma, so Python's
# implicit string-literal concatenation fused it with 'srv_serror_rate' into
# one bogus name ('dst_host_serror_ratesrv_serror_rate') and NEITHER column
# was actually excluded.
excluded_cols = ['label', 'label_attack_type', 'index', 'protocol_type',
                 'flag', 'service', 'is_host_login', 'label_encoding',
                 'count', 'same_srv_rate', 'diff_srv_rate', 'src_bytes', 'flag_SF',
                 'dst_host_same_srv_rate', 'dst_host_srv_count',
                 'dst_bytes', 'dst_host_srv_serror_rate',
                 'dst_host_diff_srv_rate', 'dst_host_serror_rate',
                 'srv_serror_rate', 'flag_S0', 'serror_rate', 'logged_in',
                 'dst_host_same_src_port_rate', 'dst_host_count']
predictors = [c for c in raw_data.columns if c not in excluded_cols]
X_train = raw_data[predictors]
y_train = raw_data['label_encoding']
X_test = test_data[predictors]
y_test = test_data['label_encoding']
In [75]:
# class balance of the training target
# NOTE(review): positional Series argument was deprecated in newer seaborn
# versions -- confirm the installed version accepts it
sns.countplot(y_train)
Out[75]:
In [22]:
# class balance of the test target
sns.countplot(y_test)
Out[22]:
In [82]:
def get_performance_metrics(y_test, model_predictions):
    """Print accuracy, per-class precision/recall/F1, and the confusion matrix.

    y_test            -- true integer labels
    model_predictions -- predicted integer labels, same length as y_test
    Returns None; all metrics are printed.
    """
    # overall accuracy
    acc = accuracy_score(y_test, model_predictions)
    print("Accuracy is ", acc)
    # per-class precision / recall / F1 (support is computed but not shown)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, model_predictions)
    print('Precision for each class is ', prec)
    print('Recall/sensitivity for each class is ', rec)
    print('F1 Score for each class is ', f1)
    # confusion matrix: rows are true classes, columns are predicted classes
    cm = confusion_matrix(y_test, model_predictions)
    print('confusion matrix is :-->')
    print(cm)
In [24]:
#create instance of decision tree
dt_model = DecisionTreeClassifier(random_state = 3)
dt_model.fit(X_train, y_train)
Out[24]:
In [25]:
#making predictions
dt_predictions = dt_model.predict(X_test)
In [26]:
# evaluate the decision tree on the held-out test file
get_performance_metrics(y_test,dt_predictions)
In [27]:
# support-vector classifier with default hyper-parameters
# NOTE(review): SVC defaults (e.g. gamma) changed across sklearn versions --
# results may differ between environments
svc_model = SVC()
In [ ]:
# NOTE: SVC training scales poorly with sample count; this cell can be slow
svc_model.fit(X_train,y_train)
In [ ]:
# evaluate the SVC
svc_predictions = svc_model.predict(X_test)
get_performance_metrics(y_test,svc_predictions)
In [28]:
# 3-fold stratified cross-validation of the decision tree on the training data
dt_model = DecisionTreeClassifier(random_state = 3)
skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 2)
predictions = []
score = []  # per-fold accuracy, averaged after the loop
for i, (train_index,test_index) in enumerate(skf.split(X_train,y_train)):
    # split the training data into this fold's train/validation parts
    train_predictors = X_train.iloc[train_index]
    train_target = y_train.iloc[train_index]
    test_predictors = X_train.iloc[test_index]
    test_target = y_train.iloc[test_index]
    #fit the model
    dt_model.fit(train_predictors,train_target)
    #make predictions
    test_pred = dt_model.predict(test_predictors)
    accuracy = accuracy_score(test_target, test_pred)
    score.append(accuracy)
    # per-class precision / recall / F1 for this fold
    p, r, f1, support = precision_recall_fscore_support(test_target, test_pred)
    confusion = confusion_matrix(test_target, test_pred)
    print('------------------------------')
    print('Accuracy at {0}-Fold is'.format(i),accuracy)
    print('Precision at {0}-Fold is'.format(i), p)
    print('Recall at {0}-Fold is'.format(i), r)
    print('F1 score at {0}-Fold is'.format(i), f1)
    print('Confusion matrix at {0}-Fold is :-->'.format(i))
    print(confusion)
    print('------------------------------')
#calculating average accuracy , precision, recall, F1 score
print('Average accuracy is', np.mean(score))
In [29]:
# random forest with 1000 trees, parallelised across all cores
rf_model = RandomForestClassifier(n_estimators=1000,n_jobs = -1, random_state=3)
In [30]:
# fitting data to random forest model
rf_model.fit(X_train,y_train)
Out[30]:
In [46]:
# predictions
rf_predictions = rf_model.predict(X_test)
In [47]:
# evaluate the random forest
get_performance_metrics(y_test,rf_predictions)
In [64]:
# one-vs-rest wrapper around the decision tree (one binary tree per class)
ovr_dt_model = OneVsRestClassifier(DecisionTreeClassifier(random_state = 3))
ovr_dt_model.fit(X_train, y_train)
Out[64]:
In [65]:
#making predictions
ovr_dt_predictions = ovr_dt_model.predict(X_test)
In [66]:
# evaluate the one-vs-rest decision tree
get_performance_metrics(y_test,ovr_dt_predictions)
In [80]:
# one-vs-one wrapper (one binary tree per pair of classes)
ovo_dt_model = OneVsOneClassifier(DecisionTreeClassifier(random_state = 3))
ovo_dt_model.fit(X_train, y_train)
Out[80]:
In [81]:
#making predictions
ovo_dt_predictions = ovo_dt_model.predict(X_test)
In [82]:
# evaluate the one-vs-one decision tree
get_performance_metrics(y_test,ovo_dt_predictions)
In [90]:
# binarise the target: one 0/1 indicator column per class, for a
# multi-output (cost-sensitive) random-forest fit
y_train_columns = pd.get_dummies(y_train,prefix = 'label')
In [91]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=3, stratify = y)
# per-output class weights for the cost-sensitive forest
# NOTE(review): six weight dicts are supplied, but get_dummies above produces
# one column per class actually present in y_train -- confirm the counts match.
Cost_weights = [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 100}, {0: 1, 1: 50},{0: 1, 1: 10},{0: 1, 1: 10}]
rf_model = RandomForestClassifier(n_estimators=100,n_jobs = -1, random_state=3, class_weight=Cost_weights)
# fitting data to random forest model
rf_model.fit(X_train,y_train_columns)
# predictions
rf_predictions_df = pd.DataFrame(rf_model.predict(X_test))
rf_predictions_df.head()
Out[91]:
In [92]:
# collapse the multi-output 0/1 columns back to a single class id
# NOTE(review): if more than one indicator fires (or none) for a row, this
# weighted sum does not correspond to a single class -- verify the predicted
# rows are one-hot before trusting the decoded labels.
rf_predictions_df['predict'] = rf_predictions_df.apply(
    lambda row: row[0]*0 + row[1]*1+row[2]*2 + row[3]*3 + row[4]*4 , axis=1)
rf_predictions=rf_predictions_df['predict']
get_performance_metrics(y_test,rf_predictions)
In [114]:
# binary target for the 'utr' class (encoding 2): 1 = utr, 0 = everything else
y_train_utr = (y_train == 2).astype(int)
y_test_utr = (y_test == 2).astype(int)
In [115]:
# baseline: random forest on the (heavily imbalanced) binary utr target
rf_model_utr = RandomForestClassifier(n_estimators=1000,n_jobs = -1, random_state=3)
rf_model_utr.fit(X_train,y_train_utr)
rf_predictions = rf_model_utr.predict(X_test)
get_performance_metrics(y_test_utr,rf_predictions)
In [116]:
# oversample the minority 'utr' class with SMOTE to balance the binary target
from imblearn.over_sampling import SMOTE
# seed the sampler so the oversampled training set is reproducible across runs
sm = SMOTE(random_state=3)
# FIX: fit_sample() was deprecated and removed in imbalanced-learn 0.8;
# fit_resample() is the supported API (available since 0.4).
X_train_utr_smote, y_train_utr_smote = sm.fit_resample(X_train, y_train_utr)
In [117]:
# random forest retrained on the SMOTE-balanced data; evaluated on the
# untouched (un-resampled) test set so the metrics remain honest
rf_model_utr_smote = RandomForestClassifier(n_estimators=1000,n_jobs = -1, random_state=3)
rf_model_utr_smote.fit(X_train_utr_smote,y_train_utr_smote)
rf_predictions = rf_model_utr_smote.predict(X_test)
get_performance_metrics(y_test_utr,rf_predictions)
In [ ]: