In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics
In [2]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils import shuffle
In [3]:
# Load the CSIC 2010 HTTP dataset (UTF-8, escaped, full version, with
# duplications) -- one row per request parameter.
csv_path = 'output_http_csic_2010_weka_with_duplications_utf8_escd_v02_full.csv'
df = pd.read_csv(csv_path)
df.head()
Out[3]:
In [4]:
# Remove columns that contain the same value in every row.
# FIX: the positional `axis` argument (df.drop(cols, 1)) was deprecated and
# removed in pandas 2.0 -- use the explicit `columns=` keyword instead.
df = df.drop(columns=['userAgent', 'pragma', 'cacheControl', 'acceptEncoding',
                      'acceptCharset', 'acceptLanguage'])
df = df.drop(columns=['connection', 'cookie', 'accept', 'protocol'])
# Since everything is localhost, keep only the port and drop the host part.
df['port'] = df['host'].str.split(':', expand=True)[1]
df = df.drop(columns=['host'])
df.head()
Out[4]:
In [5]:
# Split the dataset in two to avoid mixed indices.
df_anom = df[df['label'] == 'anom']
df_norm = df[df['label'] == 'norm']
# FIX: `print df_anom.describe()` is Python 2 statement syntax and is a
# SyntaxError on a Python 3 kernel -- print is a function.
print(df_anom.describe())
print(df_norm.describe())
In [6]:
# Keep only the columns needed for payload modelling, discarding rows
# whose payload (or index/label) is missing.
payload_cols = ['index', 'payload', 'label']
df2_anom = df_anom[payload_cols].dropna()
print(df2_anom.head())
df2_norm = df_norm[payload_cols].dropna()
print(df2_norm.head())
In [7]:
# Collapse the per-parameter rows of each anomalous request into one row
# per request index, joining the distinct payload fragments with spaces.
# NOTE(review): ' '.join(set(x)) has nondeterministic fragment order across
# runs (string hash randomization) -- consider sorted(set(x)) for
# reproducible payload strings.
df3_anom = df2_anom[['payload', 'label']].groupby(df2_anom['index']).agg(
    lambda x: ' '.join(set(x)))
# Replace '=' with a space so "param=value" tokenizes as two words for the
# vectorizer; .str.replace is the vectorized pandas idiom (was apply+lambda).
df3_anom['payload'] = df3_anom['payload'].str.replace('=', ' ', regex=False)
print(df3_anom.head())
df3_anom['label'] = 1  # anomalous class
print(df3_anom.head())
In [8]:
# Same aggregation for normal traffic: one row per request index, distinct
# payload fragments space-joined.
# NOTE(review): ' '.join(set(x)) has nondeterministic fragment order across
# runs -- consider sorted(set(x)) for reproducibility.
df3_norm = df2_norm[['payload', 'label']].groupby(df2_norm['index']).agg(
    lambda x: ' '.join(set(x)))
# Vectorized replacement of '=' so "param=value" splits into two tokens
# (.str.replace is the pandas idiom; was apply+lambda).
df3_norm['payload'] = df3_norm['payload'].str.replace('=', ' ', regex=False)
print(df3_norm.head())
df3_norm['label'] = 0  # normal class
print(df3_norm.head())
In [9]:
# Stack normal (label 0) and anomalous (label 1) requests into one frame.
df4 = pd.concat((df3_norm, df3_anom))
print(df4.head())
print(df4.describe())
print(df4['label'].value_counts())
In [10]:
# Vectorize the payload with TF-IDF over word 3-grams.
# (The old comment said "character n-grams", but analyzer='word' builds
# word n-grams -- use analyzer='char' if character grams are intended.)
vec = TfidfVectorizer(analyzer='word', ngram_range=(3, 3))
# FIX: the original built y from *all* rows but X only from the non-null
# payloads -- any NaN payload would silently misalign X and y row-wise.
# Filter both through the same mask so they always stay aligned.
payload_ok = df4['payload'].notna()
y = df4.loc[payload_ok, 'label']
X = vec.fit_transform(df4.loc[payload_ok, 'payload'])
In [11]:
# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)
In [12]:
# Use a chi-squared test to extract features
# Keep the 600 n-gram features most associated with the label.
# NOTE(review): k=600 is a magic number -- worth tuning or justifying.
# NOTE(review): the name X_train is misleading -- this is the *full*
# selected feature matrix; the train/test split happens later via `offset`.
# NOTE(review): selecting features on all rows (including the future test
# split) leaks label information into the evaluation -- ideally fit chi2
# on the training portion only.
ch2 = SelectKBest(chi2, k=600)
X_train = ch2.fit_transform(X, y)
print(X_train.shape)
In [13]:
# Shuffle before splitting so the 80/20 split is not ordered by class
# (df4 was built as all-normal rows followed by all-anomalous rows).
# FIX: pin random_state so the split -- and every score computed below --
# is reproducible across kernel restarts.
X1, y1 = shuffle(X_train, y, random_state=42)
offset = int(X1.shape[0] * 0.8)  # boundary index: 80% train / 20% test
In [14]:
# Random Forest Classifier
# random_state pins bootstrap/feature sampling so the scores below are
# reproducible; n_jobs=-1 parallelizes over all cores (same results).
clf = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:, :])
# Evaluate on the held-out 20%.
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)
In [15]:
# Gradient Boosting Classifier
# random_state pins tie-breaking so the scores below are reproducible.
clf = GradientBoostingClassifier(n_estimators=1000, random_state=42)
clf.fit(X1[:offset], y1[:offset])
# FIX: no need to densify -- GradientBoostingClassifier.predict accepts
# sparse CSR input, and .toarray() on a wide TF-IDF matrix wastes memory.
pred = clf.predict(X1[offset:, :])
# Evaluate on the held-out 20%.
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)
In [17]:
# Linear SVM
# FIX: tol=2 is ~4 orders of magnitude looser than the library default
# (1e-4) and makes the solver stop almost immediately, before converging.
# Restore the default tolerance; max_iter=10000 still bounds runtime.
clf = LinearSVC(penalty="l2", dual=False, tol=1e-4, C=1, max_iter=10000)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:, :])
# Evaluate on the held-out 20%.
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)
In [21]:
# k-Nearest Neighbours
# FIX: KNeighborsClassifier handles sparse CSR input natively -- the
# .toarray() calls densified a wide TF-IDF matrix for no gain and could
# exhaust memory on larger feature sets.
clf = KNeighborsClassifier(n_neighbors=20)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:, :])
# Evaluate on the held-out 20%.
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)
In [19]:
# Linear model trained with stochastic gradient descent
# FIX: SGDClassifier's n_iter parameter was renamed max_iter in
# scikit-learn 0.19 and removed in 0.21 -- n_iter=10000 raises TypeError
# on any modern sklearn.
clf = SGDClassifier(alpha=.01, max_iter=10000)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:, :])
# Evaluate on the held-out 20%.
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)
In [20]:
# Passive-Aggressive classifier
# FIX: n_iter was renamed max_iter in scikit-learn 0.19 and removed in
# 0.21 -- n_iter=50 raises TypeError on any modern sklearn. Also dropped
# the .toarray() densification: the estimator accepts sparse CSR input.
clf = PassiveAggressiveClassifier(max_iter=50)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:, :])
# Evaluate on the held-out 20%.
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)
In [ ]: