In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.pylab as pylab
import numpy as np
import pandas as pd
# Column names applied to the CSV when it is loaded, in file order.
# The last entry, "outcome", is the classification target.
columns = [
    "duration",
    "protocol_type",
    "service",
    "flag",
    "src_bytes",
    "dst_bytes",
    "land",
    "wrong_fragment",
    "urgent",
    "hot",
    "num_failed_logins",
    "logged_in",
    "num_compromised",
    "root_shell",
    "su_attempted",
    "num_root",
    "num_file_creations",
    "num_shells",
    "num_access_files",
    "num_outbound_cmds",
    "is_host_login",
    "is_guest_login",
    "count",
    "srv_count",
    "serror_rate",
    "srv_serror_rate",
    "rerror_rate",
    "srv_rerror_rate",
    "same_srv_rate",
    "diff_srv_rate",
    "srv_diff_host_rate",
    "dst_host_count",
    "dst_host_srv_count",
    "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate",
    "dst_host_srv_serror_rate",
    "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "outcome",
]
# Read at most the first 1,000,000 rows. Column names come from `columns`
# (this assumes the file itself has no header row - TODO confirm).
dataset = pd.read_csv('./kdd/kddcup.dat', names=columns, nrows=1000000)
In [2]:
# Preview the first rows. Leaving the frame as the cell's last expression lets
# the notebook render it as a table instead of wrapped plain text.
dataset.head()
In [3]:
# (rows, columns) of the loaded frame.
dataset.shape
Out[3]:
In [4]:
# Per-column dtypes as inferred by pandas while reading the CSV.
dataset.dtypes
Out[4]:
In [5]:
# Distinct values of the target column, in sorted order.
sorted(dataset['outcome'].unique())
Out[5]:
In [6]:
from sklearn.preprocessing import LabelEncoder
# Turn the outcome strings into integer class ids.
labels_enc = LabelEncoder()
labels_enc.fit(dataset['outcome'])
labels = labels_enc.transform(dataset['outcome'])

# labels_map[i] is the original outcome string behind integer id i.
labels_map = labels_enc.classes_
labels_map
Out[6]:
In [7]:
# The target has already been encoded into `labels`; keep only the feature columns.
dataset = dataset.drop('outcome', axis=1)
In [8]:
# Expand the frame with pandas' dummy/indicator encoding; sparse=True requests
# sparse-backed dummy columns.
observations = pd.get_dummies(dataset, sparse=True)
# Release the raw frame; the visible cells below only use `observations`.
del dataset
In [9]:
# (rows, columns) after dummy encoding.
observations.shape
Out[9]:
In [10]:
# `sklearn.cross_validation` only exists in older scikit-learn releases; newer
# ones expose the same function from `sklearn.model_selection`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# 50/50 split with a fixed seed. Both sizes are spelled out so the split does
# not depend on how a given scikit-learn version defaults `test_size`.
# `.values` yields the same array as the older `as_matrix()` call and is
# available across pandas versions.
X_train, X_test, y_train, y_test = train_test_split(
    observations.values, labels,
    train_size=0.5, test_size=0.5, random_state=101)

# The encoded frame is no longer needed once the arrays exist.
del observations
In [11]:
def plot_normalised_confusion_matrix(cm, labels_str, title='Normalised confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix whose rows are each scaled to sum to 1.

    Parameters
    ----------
    cm : 2-D array of counts; rows are plotted as true labels and columns as
        predicted labels.
    labels_str : sequence of tick labels, one per row/column of `cm`.
    title : figure title.
    cmap : matplotlib colormap used for the heat map.

    Side effects
    ------------
    Sets the global default figure size to 6x6 inches and shows the figure.
    """
    pylab.rcParams['figure.figsize'] = (6.0, 6.0)

    cm = np.asarray(cm)
    row_totals = cm.sum(axis=1, keepdims=True)
    # A class with no true samples has a row total of 0; leave that row at 0
    # instead of producing NaN (and a RuntimeWarning) from a 0/0 division.
    cm_normalized = np.divide(cm.astype('float'), row_totals,
                              out=np.zeros(cm.shape, dtype=float),
                              where=row_totals != 0)

    plt.imshow(cm_normalized, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(labels_str))
    plt.xticks(tick_marks, labels_str, rotation=90)
    plt.yticks(tick_marks, labels_str)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass only after the axis labels exist so they are part of it.
    plt.tight_layout()
    plt.show()
In [12]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Baseline: linear classifier trained with SGD and log loss on the raw split.
clf = SGDClassifier(loss='log', random_state=101)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Score against every encoded class id. Without an explicit `labels=` the
# metrics only cover classes that occur in the evaluated arrays, so a class
# missing from a split would shift rows away from the names in `labels_map`.
all_label_ids = np.arange(len(labels_map))

for split_name, y_true, y_pred in (
        ("TRAIN SET", y_train, y_train_pred),
        ("TEST SET", y_test, y_test_pred)):
    print(split_name)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion matrix:")
    plot_normalised_confusion_matrix(
        confusion_matrix(y_true, y_pred, labels=all_label_ids), labels_map)
    print("Classification report:")
    print(classification_report(y_true, y_pred, labels=all_label_ids,
                                target_names=labels_map))
In [ ]:
In [13]:
import random
# Seed Python's `random` module so the resampling drawn from it is repeatable.
random.seed(101)
def sample_class_with_replacement(X, y, label, min_samples_out, max_samples_out):
    """Draw row indices of a single class, sampling with replacement.

    The number of indices returned is the class size clamped to the range
    [min_samples_out, max_samples_out]: small classes are over-sampled and
    large ones are capped. Draws use the global `random` module state.

    Parameters
    ----------
    X : not used; kept so the signature stays unchanged for callers.
    y : array of class labels, compared element-wise with `label`.
    label : class value to sample.
    min_samples_out : lower bound on the number of indices returned.
    max_samples_out : upper bound on the number of indices returned.

    Returns
    -------
    list of indices into `y`, each pointing at an entry equal to `label`.

    Raises
    ------
    ValueError
        If no entry of `y` equals `label`.
    """
    rows = np.where(y == label)[0]
    if len(rows) == 0:
        raise ValueError("no samples with label %r found in y" % (label,))
    n_extraction = min(max(len(rows), min_samples_out), max_samples_out)
    # One `random.choice` call per draw keeps the sequence of draws identical
    # to earlier runs made with the same seed.
    return [random.choice(rows) for _ in range(n_extraction)]
# Build a resampled training set: each class contributes its row count clamped
# to [500, 20000], drawn with replacement.
train_idx = []
# Only classes that actually occur in y_train are visited. Classes that fell
# entirely into the test split are skipped this way, with no catch-all
# `except` that would also hide genuine errors.
for label in np.unique(y_train):
    idx = sample_class_with_replacement(X_train, y_train, label, 500, 20000)
    train_idx.extend(idx)

X_train_sampled_balanced = X_train[train_idx, :]
y_train_sampled_balanced = y_train[train_idx]
In [14]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Same SGD/log-loss model as the baseline, trained on the resampled training set.
clf = SGDClassifier(loss='log', random_state=101)
clf.fit(X_train_sampled_balanced, y_train_sampled_balanced)
y_train_pred = clf.predict(X_train_sampled_balanced)
y_test_pred = clf.predict(X_test)

# Score against every encoded class id. Without an explicit `labels=` the
# metrics only cover classes that occur in the evaluated arrays, so a class
# missing from a split would shift rows away from the names in `labels_map`.
all_label_ids = np.arange(len(labels_map))

for split_name, y_true, y_pred in (
        ("TRAIN SET", y_train_sampled_balanced, y_train_pred),
        ("TEST SET", y_test, y_test_pred)):
    print(split_name)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion matrix:")
    plot_normalised_confusion_matrix(
        confusion_matrix(y_true, y_pred, labels=all_label_ids), labels_map)
    print("Classification report:")
    print(classification_report(y_true, y_pred, labels=all_label_ids,
                                target_names=labels_map))
In [15]:
# `sklearn.grid_search` only exists in older scikit-learn releases; newer ones
# expose GridSearchCV from `sklearn.model_selection`.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# Search over the loss function and the regularisation strength of the SGD model.
parameters = {
    'loss': ('log', 'hinge'),
    'alpha': [0.1, 0.01, 0.001, 0.0001]
}
# NOTE(review): the training set was drawn with replacement, so identical rows
# can land in both the fitting and the validation part of a CV fold; the CV
# accuracy is probably optimistic. Judge the model by the TEST SET numbers.
clfgs = GridSearchCV(SGDClassifier(random_state=101, n_jobs=1),
                     param_grid=parameters,
                     cv=3,
                     n_jobs=1,
                     scoring='accuracy'
                     )
clfgs.fit(X_train_sampled_balanced, y_train_sampled_balanced)
clf = clfgs.best_estimator_
print(clfgs.best_estimator_)
y_train_pred = clf.predict(X_train_sampled_balanced)
y_test_pred = clf.predict(X_test)

# Score against every encoded class id. Without an explicit `labels=` the
# metrics only cover classes that occur in the evaluated arrays, so a class
# missing from a split would shift rows away from the names in `labels_map`.
all_label_ids = np.arange(len(labels_map))

for split_name, y_true, y_pred in (
        ("TRAIN SET", y_train_sampled_balanced, y_train_pred),
        ("TEST SET", y_test, y_test_pred)):
    print(split_name)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion matrix:")
    plot_normalised_confusion_matrix(
        confusion_matrix(y_true, y_pred, labels=all_label_ids), labels_map)
    print("Classification report:")
    print(classification_report(y_true, y_pred, labels=all_label_ids,
                                target_names=labels_map))
In [ ]:
In [16]:
from sklearn.multiclass import OneVsOneClassifier
# `sklearn.grid_search` only exists in older scikit-learn releases; newer ones
# expose GridSearchCV from `sklearn.model_selection`.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# The `estimator__` prefix routes each parameter to the SGDClassifier wrapped
# by OneVsOneClassifier.
parameters = {
    'estimator__loss': ('log', 'hinge'),
    'estimator__alpha': [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001]
}
# NOTE(review): the training set was drawn with replacement, so identical rows
# can land in both the fitting and the validation part of a CV fold; the CV
# accuracy is probably optimistic. Judge the model by the TEST SET numbers.
clfgs = GridSearchCV(OneVsOneClassifier(SGDClassifier(random_state=101, n_jobs=1)),
                     param_grid=parameters,
                     cv=3,
                     n_jobs=1,
                     scoring='accuracy'
                     )
clfgs.fit(X_train_sampled_balanced, y_train_sampled_balanced)
clf = clfgs.best_estimator_
y_train_pred = clf.predict(X_train_sampled_balanced)
y_test_pred = clf.predict(X_test)

# Score against every encoded class id. Without an explicit `labels=` the
# metrics only cover classes that occur in the evaluated arrays, so a class
# missing from a split would shift rows away from the names in `labels_map`.
all_label_ids = np.arange(len(labels_map))

for split_name, y_true, y_pred in (
        ("TRAIN SET", y_train_sampled_balanced, y_train_pred),
        ("TEST SET", y_test, y_test_pred)):
    print(split_name)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion matrix:")
    plot_normalised_confusion_matrix(
        confusion_matrix(y_true, y_pred, labels=all_label_ids), labels_map)
    print("Classification report:")
    print(classification_report(y_true, y_pred, labels=all_label_ids,
                                target_names=labels_map))
In [17]:
from sklearn.linear_model import LogisticRegression
# One-vs-one ensemble of logistic regressions, trained on the resampled training set.
clf = OneVsOneClassifier(LogisticRegression(random_state=101))
clf.fit(X_train_sampled_balanced, y_train_sampled_balanced)
y_train_pred = clf.predict(X_train_sampled_balanced)
y_test_pred = clf.predict(X_test)

# Score against every encoded class id. Without an explicit `labels=` the
# metrics only cover classes that occur in the evaluated arrays, so a class
# missing from a split would shift rows away from the names in `labels_map`.
all_label_ids = np.arange(len(labels_map))

for split_name, y_true, y_pred in (
        ("TRAIN SET", y_train_sampled_balanced, y_train_pred),
        ("TEST SET", y_test, y_test_pred)):
    print(split_name)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion matrix:")
    plot_normalised_confusion_matrix(
        confusion_matrix(y_true, y_pred, labels=all_label_ids), labels_map)
    print("Classification report:")
    print(classification_report(y_true, y_pred, labels=all_label_ids,
                                target_names=labels_map))
In [ ]: