notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics



In [45]:

    
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils import shuffle



In [151]:

    
df = pd.read_csv('output_http_csic_2010_weka_with_duplications_utf8_escd_v02_full.csv')
df.head()









    Out[151]:






  
    
      
      index
      method
      url
      protocol
      userAgent
      pragma
      cacheControl
      accept
      acceptEncoding
      acceptCharset
      acceptLanguage
      host
      connection
      contentLength
      contentType
      cookie
      payload
      label
    
  
  
    
      0
      0
      GET
      http://localhost:8080/tienda1/publico/anadir.jsp
      HTTP/1.1
      Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...
      no-cache
      no-cache
      text/xml,application/xml,application/xhtml+xml...
      x-gzip, x-deflate, gzip, deflate
      utf-8, utf-8;q=0.5, *;q=0.5
      en
      localhost:8080
      close
      null
      null
      JSESSIONID=B92A8B48B9008CD29F622A994E0F650D
      id=2
      anom
    
    
      1
      0
      GET
      http://localhost:8080/tienda1/publico/anadir.jsp
      HTTP/1.1
      Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...
      no-cache
      no-cache
      text/xml,application/xml,application/xhtml+xml...
      x-gzip, x-deflate, gzip, deflate
      utf-8, utf-8;q=0.5, *;q=0.5
      en
      localhost:8080
      close
      null
      null
      JSESSIONID=B92A8B48B9008CD29F622A994E0F650D
      nombre=Jam�n Ib�rico
      anom
    
    
      2
      0
      GET
      http://localhost:8080/tienda1/publico/anadir.jsp
      HTTP/1.1
      Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...
      no-cache
      no-cache
      text/xml,application/xml,application/xhtml+xml...
      x-gzip, x-deflate, gzip, deflate
      utf-8, utf-8;q=0.5, *;q=0.5
      en
      localhost:8080
      close
      null
      null
      JSESSIONID=B92A8B48B9008CD29F622A994E0F650D
      precio=85
      anom
    
    
      3
      0
      GET
      http://localhost:8080/tienda1/publico/anadir.jsp
      HTTP/1.1
      Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...
      no-cache
      no-cache
      text/xml,application/xml,application/xhtml+xml...
      x-gzip, x-deflate, gzip, deflate
      utf-8, utf-8;q=0.5, *;q=0.5
      en
      localhost:8080
      close
      null
      null
      JSESSIONID=B92A8B48B9008CD29F622A994E0F650D
      cantidad='; DROP TABLE usuarios; SELECT * FROM...
      anom
    
    
      4
      0
      GET
      http://localhost:8080/tienda1/publico/anadir.jsp
      HTTP/1.1
      Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...
      no-cache
      no-cache
      text/xml,application/xml,application/xhtml+xml...
      x-gzip, x-deflate, gzip, deflate
      utf-8, utf-8;q=0.5, *;q=0.5
      en
      localhost:8080
      close
      null
      null
      JSESSIONID=B92A8B48B9008CD29F622A994E0F650D
      B1=A�adir al carrito
      anom



In [152]:

    
# Remove columns that contain the same value
df = df.drop(['userAgent', 'pragma', 'cacheControl', 'acceptEncoding', 'acceptCharset', 'acceptLanguage'], 1)
df = df.drop(['connection', 'cookie', 'accept', 'protocol'], 1)

# Since everything is localhost let's keep only the port and remove the host part 
df['port'] = df['host'].str.split(':', expand=True)[1]
df = df.drop(['host'], 1)
df.head()









    Out[152]:






  
    
      
      index
      method
      url
      contentLength
      contentType
      payload
      label
      port
    
  
  
    
      0
      0
      GET
      http://localhost:8080/tienda1/publico/anadir.jsp
      null
      null
      id=2
      anom
      8080
    
    
      1
      0
      GET
      http://localhost:8080/tienda1/publico/anadir.jsp
      null
      null
      nombre=Jam�n Ib�rico
      anom
      8080
    
    
      2
      0
      GET
      http://localhost:8080/tienda1/publico/anadir.jsp
      null
      null
      precio=85
      anom
      8080
    
    
      3
      0
      GET
      http://localhost:8080/tienda1/publico/anadir.jsp
      null
      null
      cantidad='; DROP TABLE usuarios; SELECT * FROM...
      anom
      8080
    
    
      4
      0
      GET
      http://localhost:8080/tienda1/publico/anadir.jsp
      null
      null
      B1=A�adir al carrito
      anom
      8080



In [153]:

    
# Split the dataset in two to avoid mixed indices
df_anom = df[df['label']=='anom']
df_norm = df[df['label']=='norm']
print df_anom.describe()
print df_norm.describe()









    



               index
count  119585.000000
mean    13631.637613
std      7276.230752
min         0.000000
25%      7373.000000
50%     14668.000000
75%     19944.000000
max     25064.000000
               index
count  104000.000000
mean    21268.153846
std     10914.638765
min         0.000000
25%     11741.000000
50%     23483.000000
75%     31047.000000
max     35999.000000



In [154]:

    
df2_anom = df_anom[['index', 'payload', 'label']]
df2_anom = df2_anom.dropna()
print(df2_anom.head())

df2_norm = df_norm[['index', 'payload', 'label']]
df2_norm = df2_norm.dropna()
print(df2_norm.head())









    



   index                                            payload label
0      0                                               id=2  anom
1      0                               nombre=Jam�n Ib�rico  anom
2      0                                          precio=85  anom
3      0  cantidad='; DROP TABLE usuarios; SELECT * FROM...  anom
4      0                               B1=A�adir al carrito  anom
        index               payload label
119586      1                  id=3  norm
119587      1     nombre=Vino Rioja  norm
119588      1            precio=100  norm
119589      1           cantidad=55  norm
119590      1  B1=A�adir al carrito  norm



In [155]:

    
#df3 = df2[['payload','label']].groupby(df2['index']).sum()
#df3 = df2[['payload','label']].groupby(df2['index']).agg(lambda x: ' '.join(set(x)))
df3_anom = df2_anom[['payload','label']].groupby(df2_anom['index']).agg(lambda x: ' '.join(set(x)))
df3_anom["payload"] = df3_anom['payload'].apply(lambda x: x.replace("=", " "))
print(df3_anom.head())

df3_anom['label'] = 1
print(df3_anom.head())









    



                                                 payload label
index                                                         
0      cantidad '; DROP TABLE usuarios; SELECT * FROM...  anom
1      cantidad 49 id 2/ B1 A�adir al carrito nombre ...  anom
3      modo entrar B1 Entrar remember on pwd 84m3ri15...  anom
4      modo entrar login grimshaw B1 Entrar remember ...  anom
5      modo entrar login grimshaw rememberA on pwd 84...  anom
                                                 payload  label
index                                                          
0      cantidad '; DROP TABLE usuarios; SELECT * FROM...      1
1      cantidad 49 id 2/ B1 A�adir al carrito nombre ...      1
3      modo entrar B1 Entrar remember on pwd 84m3ri15...      1
4      modo entrar login grimshaw B1 Entrar remember ...      1
5      modo entrar login grimshaw rememberA on pwd 84...      1



In [156]:

    
df3_norm = df2_norm[['payload','label']].groupby(df2_norm['index']).agg(lambda x: ' '.join(set(x)))
df3_norm["payload"] = df3_norm['payload'].apply(lambda x: x.replace("=", " "))
print(df3_norm.head())

df3_norm['label'] = 0
print(df3_norm.head())









    



                                                 payload label
index                                                         
1      nombre Vino Rioja B1 A�adir al carrito precio ...  norm
2      modo entrar B1 Entrar pwd d1se3ci�n login choo...  norm
3                                                   id 2  norm
5                      errorMsg Credenciales incorrectas  norm
7            B1 Pasar por caja precio 2672 modo insertar  norm
                                                 payload  label
index                                                          
1      nombre Vino Rioja B1 A�adir al carrito precio ...      0
2      modo entrar B1 Entrar pwd d1se3ci�n login choo...      0
3                                                   id 2      0
5                      errorMsg Credenciales incorrectas      0
7            B1 Pasar por caja precio 2672 modo insertar      0



In [157]:

    
df4 = pd.concat([df3_norm, df3_anom])
print(df4.head())
print(df4.describe())
print(df4.label.value_counts())









    



                                                 payload  label
index                                                          
1      nombre Vino Rioja B1 A�adir al carrito precio ...      0
2      modo entrar B1 Entrar pwd d1se3ci�n login choo...      0
3                                                   id 2      0
5                      errorMsg Credenciales incorrectas      0
7            B1 Pasar por caja precio 2672 modo insertar      0
              label
count  35574.000000
mean       0.550233
std        0.497477
min        0.000000
25%        0.000000
50%        1.000000
75%        1.000000
max        1.000000
1    19574
0    16000
Name: label, dtype: int64



In [272]:

    
# Vectorize the payload by creating character n-grams
vec = TfidfVectorizer(analyzer='word',ngram_range=(3,3))
#vec = HashingVectorizer(analyzer='char',ngram_range=(6,6))

y = df4['label']
X = vec.fit_transform(df4['payload'].dropna())



In [273]:

    
print(X.shape, y.shape)









    



((35574, 111699), (35574,))



In [274]:

    
# Use a chi-squared test to extract features
ch2 = SelectKBest(chi2, k=600)
X_train = ch2.fit_transform(X, y)
print(X_train.shape)









    



(35574, 600)



In [275]:

    
X1, y1 = shuffle(X_train, y)
offset = int(X1.shape[0] * 0.8)



In [178]:

    
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout



In [295]:

    
model = Sequential()
model.add(Dense(input_dim=X1.shape[1], output_dim=600))
model.add(Activation('relu'))
model.add(Dense(600))
model.add(Activation('relu'))
#model.add(Dropout(0.5))
model.add(Dense(300))
model.add(Activation('relu'))
#model.add(Dropout(0.5))
#model.add(Dense(100))
#model.add(Activation('relu'))
model.add(Dense(10))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])



In [296]:

    
model.fit(X1[:offset].toarray(), y1[:offset], batch_size=100, nb_epoch=25, verbose=1, callbacks=[],
          validation_data=(X1[offset:].toarray(), y1[offset:]),
          shuffle=False, class_weight=None, sample_weight=None)









    



Train on 28459 samples, validate on 7115 samples
Epoch 1/25
28459/28459 [==============================] - 1s - loss: 0.4421 - acc: 0.7255 - val_loss: 0.4181 - val_acc: 0.7396
Epoch 2/25
28459/28459 [==============================] - 1s - loss: 0.4213 - acc: 0.7370 - val_loss: 0.4182 - val_acc: 0.7421
Epoch 3/25
28459/28459 [==============================] - 1s - loss: 0.4188 - acc: 0.7379 - val_loss: 0.4186 - val_acc: 0.7434
Epoch 4/25
28459/28459 [==============================] - 1s - loss: 0.4178 - acc: 0.7384 - val_loss: 0.4187 - val_acc: 0.7390
Epoch 5/25
28459/28459 [==============================] - 1s - loss: 0.4173 - acc: 0.7388 - val_loss: 0.4197 - val_acc: 0.7396
Epoch 6/25
28459/28459 [==============================] - 1s - loss: 0.4170 - acc: 0.7385 - val_loss: 0.4194 - val_acc: 0.7411
Epoch 7/25
28459/28459 [==============================] - 1s - loss: 0.4167 - acc: 0.7391 - val_loss: 0.4199 - val_acc: 0.7397
Epoch 8/25
28459/28459 [==============================] - 1s - loss: 0.4164 - acc: 0.7387 - val_loss: 0.4200 - val_acc: 0.7400
Epoch 9/25
28459/28459 [==============================] - 1s - loss: 0.4165 - acc: 0.7376 - val_loss: 0.4203 - val_acc: 0.7390
Epoch 10/25
28459/28459 [==============================] - 1s - loss: 0.4162 - acc: 0.7382 - val_loss: 0.4205 - val_acc: 0.7417
Epoch 11/25
28459/28459 [==============================] - 1s - loss: 0.4226 - acc: 0.7379 - val_loss: 0.4206 - val_acc: 0.7413
Epoch 12/25
28459/28459 [==============================] - 1s - loss: 0.4164 - acc: 0.7391 - val_loss: 0.4204 - val_acc: 0.7415
Epoch 13/25
28459/28459 [==============================] - 1s - loss: 0.4160 - acc: 0.7385 - val_loss: 0.4203 - val_acc: 0.7415
Epoch 14/25
28459/28459 [==============================] - 1s - loss: 0.4158 - acc: 0.7358 - val_loss: 0.4203 - val_acc: 0.7442
Epoch 15/25
28459/28459 [==============================] - 1s - loss: 0.4158 - acc: 0.7398 - val_loss: 0.4205 - val_acc: 0.7436
Epoch 16/25
28459/28459 [==============================] - 1s - loss: 0.4157 - acc: 0.7392 - val_loss: 0.4207 - val_acc: 0.7441
Epoch 17/25
28459/28459 [==============================] - 1s - loss: 0.4154 - acc: 0.7396 - val_loss: 0.4205 - val_acc: 0.7411
Epoch 18/25
28459/28459 [==============================] - 1s - loss: 0.4153 - acc: 0.7395 - val_loss: 0.4208 - val_acc: 0.7445
Epoch 19/25
28459/28459 [==============================] - 1s - loss: 0.4151 - acc: 0.7405 - val_loss: 0.4210 - val_acc: 0.7446
Epoch 20/25
28459/28459 [==============================] - 1s - loss: 0.4152 - acc: 0.7401 - val_loss: 0.4203 - val_acc: 0.7441
Epoch 21/25
28459/28459 [==============================] - 1s - loss: 0.4150 - acc: 0.7403 - val_loss: 0.4208 - val_acc: 0.7439
Epoch 22/25
28459/28459 [==============================] - 1s - loss: 0.4150 - acc: 0.7395 - val_loss: 0.4202 - val_acc: 0.7432
Epoch 23/25
28459/28459 [==============================] - 1s - loss: 0.4152 - acc: 0.7388 - val_loss: 0.4205 - val_acc: 0.7441
Epoch 24/25
28459/28459 [==============================] - 1s - loss: 0.4150 - acc: 0.7391 - val_loss: 0.4209 - val_acc: 0.7432
Epoch 25/25
28459/28459 [==============================] - 1s - loss: 0.4148 - acc: 0.7397 - val_loss: 0.4204 - val_acc: 0.7448






    Out[296]:





<keras.callbacks.History at 0x7f192fe49390>



In [297]:

    
pred = model.predict_classes(X1[offset:].toarray())
accuracy = metrics.accuracy_score(y1[offset:], pred)
f1_score = metrics.f1_score(y1[offset:], pred)
conf_matrix = metrics.confusion_matrix(y1[offset:], pred)
print("\n")
print(accuracy, f1_score)
print(conf_matrix)









    



6528/7115 [==========================>...] - ETA: 0s

(0.74476458186929018, 0.79659498207885304)
[[1743 1500]
 [ 316 3556]]



In [ ]:

	method	url	protocol	userAgent	pragma	cacheControl	accept	acceptEncoding	acceptCharset	acceptLanguage	host	connection	contentLength	contentType	cookie	payload	label
0	GET	http://localhost:8080/tienda1/publico/anadir.jsp	HTTP/1.1	Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...	no-cache	no-cache	text/xml,application/xml,application/xhtml+xml...	x-gzip, x-deflate, gzip, deflate	utf-8, utf-8;q=0.5, *;q=0.5	en	localhost:8080	close	null	null	JSESSIONID=B92A8B48B9008CD29F622A994E0F650D	id=2	anom
1	GET	http://localhost:8080/tienda1/publico/anadir.jsp	HTTP/1.1	Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...	no-cache	no-cache	text/xml,application/xml,application/xhtml+xml...	x-gzip, x-deflate, gzip, deflate	utf-8, utf-8;q=0.5, *;q=0.5	en	localhost:8080	close	null	null	JSESSIONID=B92A8B48B9008CD29F622A994E0F650D	nombre=Jam�n Ib�rico	anom
2	GET	http://localhost:8080/tienda1/publico/anadir.jsp	HTTP/1.1	Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...	no-cache	no-cache	text/xml,application/xml,application/xhtml+xml...	x-gzip, x-deflate, gzip, deflate	utf-8, utf-8;q=0.5, *;q=0.5	en	localhost:8080	close	null	null	JSESSIONID=B92A8B48B9008CD29F622A994E0F650D	precio=85	anom
3	GET	http://localhost:8080/tienda1/publico/anadir.jsp	HTTP/1.1	Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...	no-cache	no-cache	text/xml,application/xml,application/xhtml+xml...	x-gzip, x-deflate, gzip, deflate	utf-8, utf-8;q=0.5, *;q=0.5	en	localhost:8080	close	null	null	JSESSIONID=B92A8B48B9008CD29F622A994E0F650D	cantidad='; DROP TABLE usuarios; SELECT * FROM...	anom
4	GET	http://localhost:8080/tienda1/publico/anadir.jsp	HTTP/1.1	Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...	no-cache	no-cache	text/xml,application/xml,application/xhtml+xml...	x-gzip, x-deflate, gzip, deflate	utf-8, utf-8;q=0.5, *;q=0.5	en	localhost:8080	close	null	null	JSESSIONID=B92A8B48B9008CD29F622A994E0F650D	B1=A�adir al carrito	anom