In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics

In [2]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils import shuffle

In [3]:
df = pd.read_csv('output_http_csic_2010_weka_with_duplications_utf8_escd_v02_full.csv')
df.head()


/home/marik0/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (13) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[3]:
index method url protocol userAgent pragma cacheControl accept acceptEncoding acceptCharset acceptLanguage host connection contentLength contentType cookie payload label
0 0 GET http://localhost:8080/tienda1/publico/anadir.jsp HTTP/1.1 Mozilla/5.0 (compatible; Konqueror/3.5; Linux)... no-cache no-cache text/xml,application/xml,application/xhtml+xml... x-gzip, x-deflate, gzip, deflate utf-8, utf-8;q=0.5, *;q=0.5 en localhost:8080 close null null JSESSIONID=B92A8B48B9008CD29F622A994E0F650D id=2 anom
1 0 GET http://localhost:8080/tienda1/publico/anadir.jsp HTTP/1.1 Mozilla/5.0 (compatible; Konqueror/3.5; Linux)... no-cache no-cache text/xml,application/xml,application/xhtml+xml... x-gzip, x-deflate, gzip, deflate utf-8, utf-8;q=0.5, *;q=0.5 en localhost:8080 close null null JSESSIONID=B92A8B48B9008CD29F622A994E0F650D nombre=Jamón Ibérico anom
2 0 GET http://localhost:8080/tienda1/publico/anadir.jsp HTTP/1.1 Mozilla/5.0 (compatible; Konqueror/3.5; Linux)... no-cache no-cache text/xml,application/xml,application/xhtml+xml... x-gzip, x-deflate, gzip, deflate utf-8, utf-8;q=0.5, *;q=0.5 en localhost:8080 close null null JSESSIONID=B92A8B48B9008CD29F622A994E0F650D precio=85 anom
3 0 GET http://localhost:8080/tienda1/publico/anadir.jsp HTTP/1.1 Mozilla/5.0 (compatible; Konqueror/3.5; Linux)... no-cache no-cache text/xml,application/xml,application/xhtml+xml... x-gzip, x-deflate, gzip, deflate utf-8, utf-8;q=0.5, *;q=0.5 en localhost:8080 close null null JSESSIONID=B92A8B48B9008CD29F622A994E0F650D cantidad='; DROP TABLE usuarios; SELECT * FROM... anom
4 0 GET http://localhost:8080/tienda1/publico/anadir.jsp HTTP/1.1 Mozilla/5.0 (compatible; Konqueror/3.5; Linux)... no-cache no-cache text/xml,application/xml,application/xhtml+xml... x-gzip, x-deflate, gzip, deflate utf-8, utf-8;q=0.5, *;q=0.5 en localhost:8080 close null null JSESSIONID=B92A8B48B9008CD29F622A994E0F650D B1=Añadir al carrito anom
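The DtypeWarning above is harmless: pandas found mixed types in one column (column 13, presumably contentLength, which mixes numbers with the string "null") while reading the file in chunks. A rough sketch of two ways to avoid it, assuming that column name:

# Infer dtypes from the whole file at once instead of chunk by chunk...
df = pd.read_csv('output_http_csic_2010_weka_with_duplications_utf8_escd_v02_full.csv',
                 low_memory=False)
# ...or pin the mixed column to a string dtype explicitly:
# df = pd.read_csv('output_http_csic_2010_weka_with_duplications_utf8_escd_v02_full.csv',
#                  dtype={'contentLength': str})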

In [4]:
# Remove columns that contain the same value
df = df.drop(['userAgent', 'pragma', 'cacheControl', 'acceptEncoding', 'acceptCharset', 'acceptLanguage'], axis=1)
df = df.drop(['connection', 'cookie', 'accept', 'protocol'], axis=1)

# Since everything is localhost, keep only the port and drop the host part
df['port'] = df['host'].str.split(':', expand=True)[1]
df = df.drop(['host'], axis=1)
df.head()


Out[4]:
index method url contentLength contentType payload label port
0 0 GET http://localhost:8080/tienda1/publico/anadir.jsp null null id=2 anom 8080
1 0 GET http://localhost:8080/tienda1/publico/anadir.jsp null null nombre=Jamón Ibérico anom 8080
2 0 GET http://localhost:8080/tienda1/publico/anadir.jsp null null precio=85 anom 8080
3 0 GET http://localhost:8080/tienda1/publico/anadir.jsp null null cantidad='; DROP TABLE usuarios; SELECT * FROM... anom 8080
4 0 GET http://localhost:8080/tienda1/publico/anadir.jsp null null B1=Añadir al carrito anom 8080
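Rather than hard-coding the column names above, the constant columns could also be detected programmatically. A small sketch, assuming those columns really do hold a single value each (dropna=False makes an all-null column count as constant too):

# Find and drop every column with at most one distinct value.
constant_cols = [c for c in df.columns if df[c].nunique(dropna=False) <= 1]
print(constant_cols)
df = df.drop(constant_cols, axis=1)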

In [5]:
# Split the dataset by label first, so anomalous and normal requests that share an index value are not merged later
df_anom = df[df['label']=='anom']
df_norm = df[df['label']=='norm']
print(df_anom.describe())
print(df_norm.describe())


               index
count  119585.000000
mean    13631.637613
std      7276.230752
min         0.000000
25%      7373.000000
50%     14668.000000
75%     19944.000000
max     25064.000000
               index
count  104000.000000
mean    21268.153846
std     10914.638765
min         0.000000
25%     11741.000000
50%     23483.000000
75%     31047.000000
max     35999.000000

In [6]:
df2_anom = df_anom[['index', 'payload', 'label']]
df2_anom = df2_anom.dropna()
print(df2_anom.head())

df2_norm = df_norm[['index', 'payload', 'label']]
df2_norm = df2_norm.dropna()
print(df2_norm.head())


   index                                            payload label
0      0                                               id=2  anom
1      0                               nombre=Jamón Ibérico  anom
2      0                                          precio=85  anom
3      0  cantidad='; DROP TABLE usuarios; SELECT * FROM...  anom
4      0                               B1=Añadir al carrito  anom
        index               payload label
119586      1                  id=3  norm
119587      1     nombre=Vino Rioja  norm
119588      1            precio=100  norm
119589      1           cantidad=55  norm
119590      1  B1=Añadir al carrito  norm

In [7]:
#df3 = df2[['payload','label']].groupby(df2['index']).sum()
#df3 = df2[['payload','label']].groupby(df2['index']).agg(lambda x: ' '.join(set(x)))
df3_anom = df2_anom[['payload','label']].groupby(df2_anom['index']).agg(lambda x: ' '.join(set(x)))
df3_anom["payload"] = df3_anom['payload'].apply(lambda x: x.replace("=", " "))
print(df3_anom.head())

df3_anom['label'] = 1
print(df3_anom.head())


                                                 payload label
index                                                         
0      cantidad '; DROP TABLE usuarios; SELECT * FROM...  anom
1      cantidad 49 id 2/ B1 Añadir al carrito nombre ...  anom
3      modo entrar B1 Entrar remember on pwd 84m3ri15...  anom
4      modo entrar login grimshaw B1 Entrar remember ...  anom
5      modo entrar login grimshaw rememberA on pwd 84...  anom
                                                 payload  label
index                                                          
0      cantidad '; DROP TABLE usuarios; SELECT * FROM...      1
1      cantidad 49 id 2/ B1 Añadir al carrito nombre ...      1
3      modo entrar B1 Entrar remember on pwd 84m3ri15...      1
4      modo entrar login grimshaw B1 Entrar remember ...      1
5      modo entrar login grimshaw rememberA on pwd 84...      1
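To see what the groupby/agg step is doing, here is a toy sketch on hypothetical data: all payload fragments that share the same request index are de-duplicated and joined into a single space-separated document, one row per request.

# Toy example of the aggregation used above.
toy = pd.DataFrame({'index':   [0, 0, 1],
                    'payload': ['id=2', 'precio=85', 'id=3'],
                    'label':   ['anom', 'anom', 'norm']})
print(toy[['payload', 'label']].groupby(toy['index'])
         .agg(lambda x: ' '.join(set(x))))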

In [8]:
df3_norm = df2_norm[['payload','label']].groupby(df2_norm['index']).agg(lambda x: ' '.join(set(x)))
df3_norm["payload"] = df3_norm['payload'].apply(lambda x: x.replace("=", " "))
print(df3_norm.head())

df3_norm['label'] = 0
print(df3_norm.head())


                                                 payload label
index                                                         
1      nombre Vino Rioja B1 Añadir al carrito precio ...  norm
2      modo entrar B1 Entrar pwd d1se3ción login choo...  norm
3                                                   id 2  norm
5                      errorMsg Credenciales incorrectas  norm
7            B1 Pasar por caja precio 2672 modo insertar  norm
                                                 payload  label
index                                                          
1      nombre Vino Rioja B1 Añadir al carrito precio ...      0
2      modo entrar B1 Entrar pwd d1se3ción login choo...      0
3                                                   id 2      0
5                      errorMsg Credenciales incorrectas      0
7            B1 Pasar por caja precio 2672 modo insertar      0

In [9]:
df4 = pd.concat([df3_norm, df3_anom])
print(df4.head())
print(df4.describe())
print(df4.label.value_counts())


                                                 payload  label
index                                                          
1      nombre Vino Rioja B1 Añadir al carrito precio ...      0
2      modo entrar B1 Entrar pwd d1se3ción login choo...      0
3                                                   id 2      0
5                      errorMsg Credenciales incorrectas      0
7            B1 Pasar por caja precio 2672 modo insertar      0
              label
count  35574.000000
mean       0.550233
std        0.497477
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
1    19574
0    16000
Name: label, dtype: int64

In [10]:
# Vectorize the payload by creating word n-grams (trigrams)
vec = TfidfVectorizer(analyzer='word', ngram_range=(3,3))
#vec = HashingVectorizer(analyzer='char',ngram_range=(6,6))

y = df4['label']
X = vec.fit_transform(df4['payload'].dropna())
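A quick, hedged sanity check on what the vectorizer learned (vocabulary_ is available after fit on TfidfVectorizer in both old and recent scikit-learn releases):

# Size of the word-trigram vocabulary and a few sample features.
print(len(vec.vocabulary_))
print(sorted(vec.vocabulary_)[:5])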

In [11]:
print(X.shape, y.shape)


((35574, 111699), (35574,))

In [12]:
# Use a chi-squared test to select the 600 highest-scoring features
ch2 = SelectKBest(chi2, k=600)
X_train = ch2.fit_transform(X, y)
print(X_train.shape)


(35574, 600)
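The selected columns can be mapped back to the n-grams they came from, which helps when inspecting what chi2 considered informative. A sketch (on recent scikit-learn the method is get_feature_names_out() instead of get_feature_names()):

# Names of a few of the 600 selected word trigrams.
mask = ch2.get_support()
feature_names = np.asarray(vec.get_feature_names())
print(feature_names[mask][:10])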

In [13]:
X1, y1 = shuffle(X_train, y)
offset = int(X1.shape[0] * 0.8)
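An alternative, reproducible split would be train_test_split with a fixed random_state and stratification on the label, sketched below (the import path is sklearn.cross_validation on very old releases like the one this notebook seems to use):

# 80/20 split that preserves the anom/norm ratio in both parts.
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X_train, y, test_size=0.2,
                                           stratify=y, random_state=42)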

In [14]:
# Random Forest Classifier
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:,:])

accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)


(0.78580463808854528, 0.83285808291291952)
[[1794 1360]
 [ 164 3797]]

In [15]:
# Gradient Boosting Classifier
clf = GradientBoostingClassifier(n_estimators=1000)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:,:].toarray())

accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)


(0.77610681658468028, 0.82519477669263686)
[[1762 1392]
 [ 201 3760]]

In [17]:
# Linear SVC
clf = LinearSVC(penalty="l2", dual=False, tol=2, C=1, max_iter=10000)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:,:])
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)


(0.71988756148981026, 0.77788922322523124)
[[1632 1522]
 [ 471 3490]]

In [21]:
# k-Nearest Neighbors Classifier
clf = KNeighborsClassifier(n_neighbors=20)
clf.fit(X1[:offset].toarray(), y1.values[:offset])
pred = clf.predict(X1[offset:,:].toarray())
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)


(0.76444132115249475, 0.81340458695168105)
[[1786 1368]
 [ 308 3653]]

In [19]:
# SGD Classifier (linear model trained with stochastic gradient descent)
clf = SGDClassifier(alpha=.01, n_iter=10000)
clf.fit(X1[:offset], y1[:offset])
pred = clf.predict(X1[offset:,:])
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)


(0.66479269149683762, 0.76765708718947878)
[[ 790 2364]
 [  21 3940]]

In [20]:
# Passive Aggressive Classifier
clf = PassiveAggressiveClassifier(n_iter=50)
clf.fit(X1[:offset].toarray(), y1[:offset])
pred = clf.predict(X1[offset:,:].toarray())
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print(accuracy, f1_score)
print(conf_matrix)


(0.74378074490513002, 0.79710628825820806)
[[1711 1443]
 [ 380 3581]]
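The per-classifier cells above could also be collapsed into one loop that reports accuracy and F1 side by side. A sketch using the same split and the same estimators (hyperparameters as above; iteration-count defaults and parameter names differ across scikit-learn versions, and GradientBoosting is left out because it may need dense input on older releases):

# Fit each classifier on the first 80% and score it on the remaining 20%.
classifiers = {
    'RandomForest': RandomForestClassifier(n_estimators=1000),
    'LinearSVC': LinearSVC(penalty='l2', dual=False, C=1),
    'SGD': SGDClassifier(alpha=.01),
    'PassiveAggressive': PassiveAggressiveClassifier(),
    'kNN (k=20)': KNeighborsClassifier(n_neighbors=20),
}
for name, clf in classifiers.items():
    clf.fit(X1[:offset], y1[:offset])
    pred = clf.predict(X1[offset:])
    print(name,
          metrics.accuracy_score(y1[offset:], pred),
          metrics.f1_score(y1[offset:], pred))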

In [ ]: