This simple example shows how to load a non-trivial dataset from CSV and train a neural network. The dataset is the KDD99 dataset. This dataset is used to detect between normal and malicious network activity.
In [ ]:
# Imports for this Notebook
In [28]:
# Imports
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
import tensorflow.contrib.learn as skflow
from sklearn import metrics
In [24]:
# These are several handy functions that I use in my class:
# Encode a text field to dummy variables
def encode_text_dummy(df,name):
dummies = pd.get_dummies(df[name])
for x in dummies.columns:
dummy_name = "{}-{}".format(name,x)
df[dummy_name] = dummies[x]
df.drop(name, axis=1, inplace=True)
# Encode a text field to a single index value
def encode_text_index(df,name):
le = preprocessing.LabelEncoder()
df[name] = le.fit_transform(df[name])
return le.classes_
# Encode a numeric field to Z-Scores
def encode_numeric_zscore(df,name,mean=None,sd=None):
if mean is None:
mean = df[name].mean()
if sd is None:
sd = df[name].std()
df[name] = (df[name]-mean)/sd
# Encode a numeric field to fill missing values with the median.
def missing_median(df, name):
med = df[name].median()
df[name] = df[name].fillna(med)
# Convert a dataframe to x/y suitable for training.
def to_xy(df,target):
result = []
for x in df.columns:
if x != target:
result.append(x)
return df.as_matrix(result),df[target]
In [25]:
# This file is a CSV, just no CSV extension or headers
# Download from: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
df = pd.read_csv("/Users/jeff/Downloads/data/kddcup.data_10_percent", header=None)
print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
df.dropna(inplace=True,axis=1) # For now, just drop NA's (rows with missing values)
# The CSV file has no column heads, so add them
df.columns = [
'duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',
'outcome'
]
# display 5 rows
df[0:5]
Out[25]:
In [26]:
# Now encode the feature vector
encode_numeric_zscore(df, 'duration')
encode_text_dummy(df, 'protocol_type')
encode_text_dummy(df, 'service')
encode_text_dummy(df, 'flag')
encode_numeric_zscore(df, 'src_bytes')
encode_numeric_zscore(df, 'dst_bytes')
encode_text_dummy(df, 'land')
encode_numeric_zscore(df, 'wrong_fragment')
encode_numeric_zscore(df, 'urgent')
encode_numeric_zscore(df, 'hot')
encode_numeric_zscore(df, 'num_failed_logins')
encode_text_dummy(df, 'logged_in')
encode_numeric_zscore(df, 'num_compromised')
encode_numeric_zscore(df, 'root_shell')
encode_numeric_zscore(df, 'su_attempted')
encode_numeric_zscore(df, 'num_root')
encode_numeric_zscore(df, 'num_file_creations')
encode_numeric_zscore(df, 'num_shells')
encode_numeric_zscore(df, 'num_access_files')
encode_numeric_zscore(df, 'num_outbound_cmds')
encode_text_dummy(df, 'is_host_login')
encode_text_dummy(df, 'is_guest_login')
encode_numeric_zscore(df, 'count')
encode_numeric_zscore(df, 'srv_count')
encode_numeric_zscore(df, 'serror_rate')
encode_numeric_zscore(df, 'srv_serror_rate')
encode_numeric_zscore(df, 'rerror_rate')
encode_numeric_zscore(df, 'srv_rerror_rate')
encode_numeric_zscore(df, 'same_srv_rate')
encode_numeric_zscore(df, 'diff_srv_rate')
encode_numeric_zscore(df, 'srv_diff_host_rate')
encode_numeric_zscore(df, 'dst_host_count')
encode_numeric_zscore(df, 'dst_host_srv_count')
encode_numeric_zscore(df, 'dst_host_same_srv_rate')
encode_numeric_zscore(df, 'dst_host_diff_srv_rate')
encode_numeric_zscore(df, 'dst_host_same_src_port_rate')
encode_numeric_zscore(df, 'dst_host_srv_diff_host_rate')
encode_numeric_zscore(df, 'dst_host_serror_rate')
encode_numeric_zscore(df, 'dst_host_srv_serror_rate')
encode_numeric_zscore(df, 'dst_host_rerror_rate')
encode_numeric_zscore(df, 'dst_host_srv_rerror_rate')
outcomes = encode_text_index(df, 'outcome')
num_classes = len(outcomes)
# display 5 rows
df.dropna(inplace=True,axis=1)
df[0:5]
# This is the numeric feature vector, as it goes to the neural net
Out[26]:
In [30]:
# Break into X (predictors) & y (prediction)
x, y = to_xy(df,'outcome')
# Create a test/train split. 25% test
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
x, y, test_size=0.25, random_state=42)
# Create a deep neural network with 3 hidden layers of 10, 20, 10
classifier = skflow.TensorFlowDNNClassifier(hidden_units=[10, 20, 10],
n_classes=num_classes, steps=500)
# Early stopping
early_stop = skflow.monitors.ValidationMonitor(x_test, y_test,
early_stopping_rounds=200,
n_classes=num_classes,
print_steps=50)
# Fit/train neural network
classifier.fit(x, y, early_stop)
Out[30]:
In [31]:
# Measure accuracy
pred = classifier.predict(x_test)
score = metrics.accuracy_score(y_test, pred)
print("Validation score: {}".format(score))
In [ ]: