In [1]:
import pandas as pd
import numpy as np
from sklearn import *
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
%matplotlib inline
In [5]:
# Raw KDD Cup 1999 network-connection records; the file itself has no header
# row, so columns are attached later from the schema text.
# NOTE(review): absolute path — prefer a configurable data directory.
DATA_PATH = "/data/kddcup.data"
df = pd.read_csv(DATA_PATH, header=None)
df.head()
Out[5]:
In [6]:
# Build the 41 feature names from the KDD schema text: the part before ":"
# on every non-empty line, then append the label column at the end.
schema_text = """
duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: continuous.
dst_host_same_srv_rate: continuous.
dst_host_diff_srv_rate: continuous.
dst_host_same_src_port_rate: continuous.
dst_host_srv_diff_host_rate: continuous.
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.
"""
columns = []
for line in schema_text.split("\n"):
    if len(line) > 0:
        columns.append(line.split(":")[0])
columns.append("Category")
print(columns)
In [7]:
df.columns = columns
In [8]:
df.info()
In [11]:
X = df.select_dtypes(include=[np.float64, np.int64]).values
In [13]:
# Raw connection labels and their class distribution ("normal." vs. the
# various attack labels — see the binary mapping in the In [16] cell below).
y = df.Category
y.value_counts()
Out[13]:
In [16]:
%%time
y = np.where(df.Category == "normal.", 0, 1)
X = df.select_dtypes(include=[np.float64, np.int64]).values
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
test_size = 0.3, random_state = 12345)
scaler = preprocessing.StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
print("X_train", X_train.shape, "X_test", X_test.shape)
In [18]:
pd.Series(y_train).value_counts()/len(y_train)
Out[18]:
In [22]:
%%time
# Fit PCA with all components on the standardized training features, so the
# per-component explained-variance ratios can be inspected before choosing
# how many components to keep.
pca = decomposition.PCA(random_state=1)
pca.fit(X_train_std)
In [27]:
# Scree plot: per-component explained variance (bars) plus the cumulative
# total (line). Use the explicit Axes interface throughout instead of mixing
# it with the pyplot state machine, and give the figure a title so it stands
# alone when the notebook is skimmed.
fig, ax = plt.subplots()
n_features = X_train_std.shape[1]
ax.bar(range(n_features), pca.explained_variance_ratio_)
ax.plot(range(n_features), np.cumsum(pca.explained_variance_ratio_))
ax.set_xlabel("PC components")
ax.set_ylabel("Variance retention")
ax.set_title("PCA explained variance (per component and cumulative)")
plt.show()  # suppress the bare Text repr that Out[27] displayed
Out[27]:
In [40]:
# Cumulative variance retained by the first k components; display the first
# three component counts whose retention exceeds 99%.
retention = pd.DataFrame({"retention": np.cumsum(pca.explained_variance_ratio_)})
retention.query("retention>0.99").head(3)
Out[40]:
In [30]:
%%time
pca = decomposition.PCA(random_state=1, n_components=23)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)
In [34]:
X_train_pca.shape, X_test_pca.shape, y_train.shape, y_test.shape
Out[34]:
In [36]:
%%time
est = linear_model.LogisticRegression()
est.fit(X_train_std, y_train)
print("Training accuracy:", est.score(X_train_std, y_train),
"Test accuracy:", est.score(X_test_std, y_test))
In [35]:
%%time
est = linear_model.LogisticRegression()
est.fit(X_train_pca, y_train)
print("Training accuracy:", est.score(X_train_pca, y_train),
"Test accuracy:", est.score(X_test_pca, y_test))
In [ ]: