Todo:
In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
In [2]:
# Peek at the first five lines of the raw CSV before loading it.
!head -n 5 "../data/output_http_csic_2010_weka_with_duplications_utf8_escd_v02_full.csv"
In [7]:
# Print lines 12770-12775 of the raw CSV for manual inspection.
!sed -n "12770,12775p" < "../data/output_http_csic_2010_weka_with_duplications_utf8_escd_v02_full.csv"
In [4]:
# NOTE(review): `sql` is not referenced in any of the cells visible here.
from pandas.io import sql
import sqlite3
# Open the SQLite database file (sqlite3 creates it if it does not exist yet).
conn = sqlite3.connect("../data/anomaly.db")
In [6]:
# create table
# "if not exists" keeps this cell re-runnable: a plain "create table" raises
# sqlite3.OperationalError as soon as the database file already has the table
# (i.e. on every run after the first).
# To rebuild the schema from scratch, first run: conn.execute("drop table records")
# Only `uid` has a declared type; the remaining 18 columns are untyped.
conn.execute('''
create table if not exists records (uid integer,idx,method,url,protocol,user_agent,pragma,cache_control,accept,
accept_encoding,accept_charset,accept_language,host,connection,content_length,
content_type,cookie,payload,label)
''')
Out[6]:
In [7]:
# delete all records
# Empties the table but keeps its schema; presumably run so the insert cell
# can be executed again without duplicating rows — TODO confirm.
c = conn.cursor()
c.execute("delete from records")
# Persist the deletion.
conn.commit()
In [8]:
# insert into table
# Load every data row of the CSV into `records`, using the row's position in
# the file (0-based, header excluded) as `uid`.
import pdb  # not used in this cell; kept in case another cell relies on it
c = conn.cursor()
# Values are bound through "?" placeholders instead of being formatted into
# the SQL text. The CSV holds raw (partly malicious) HTTP request data, so
# string-built SQL was both injectable and broke on any field containing a
# double quote. 19 placeholders = uid + the 18 CSV fields.
INSERT_STATEMENT = "insert into records values (" + ",".join(["?"] * 19) + ")"
with open("../data/output_http_csic_2010_weka_with_duplications_utf8_escd_v02_full.csv", "r") as f:
    rdr = csv.reader(f)
    header = next(rdr)  # consume the header line so it is not inserted as data
    for n, row in enumerate(rdr):
        try:
            # Same payload normalisation as before (trim, " -> ') so stored
            # payload text is unchanged relative to earlier loads.
            payload = row[16].strip().replace('"', "'")
            # Indexing happens inside the try so a short or blank row is
            # reported and skipped instead of aborting the whole load.
            params = [n] + row[:16] + [payload, row[17]]
            c.execute(INSERT_STATEMENT, params)
        except Exception as ex:
            # Best effort: report the offending row and continue with the rest.
            print("error %d, %s" % (n, row[0] if row else ""))
            print("exception: %s" % ex)
# One commit for the whole load.
conn.commit()
In [15]:
# Print lines 126513-126517 of the raw CSV for manual inspection.
!sed -n "126513,126517p" < "../data/output_http_csic_2010_weka_with_duplications_utf8_escd_v02_full.csv"
In [ ]:
# Show the last three lines of the raw CSV.
!tail -n 3 "../data/output_http_csic_2010_weka_with_duplications_utf8_escd_v02_full.csv"
In [9]:
# read into df
# Loads every row and column of `records` through the open connection.
df = pd.read_sql("select * from records", con=conn)
In [11]:
# close connection
# The cells visible below work on the in-memory `df` only.
conn.close()
In [12]:
# Quick overview of the loaded frame. A notebook cell renders only its last
# expression, so the earlier pieces are passed to display() (IPython's
# built-in rich display) explicitly; otherwise they are silently discarded.
display(df.head())
display(df.tail())
display(df.describe())  # was `df.describe`: a bare method reference that computed nothing
display(df.shape)
# Per-column count of missing values (rendered as the cell output).
df.isnull().sum()
Out[12]:
In [23]:
# Column labels of the loaded frame.
df.columns
Out[23]:
In [24]:
# Number of records per value of the `method` column.
df["method"].value_counts()
Out[24]:
In [25]:
# Number of records per value of the `label` column.
df["label"].value_counts()
Out[25]:
In [30]:
# How many distinct values the `url` column holds.
len(df["url"].unique())
Out[30]:
In [31]:
# How many distinct values the `payload` column holds.
len(df["payload"].unique())
Out[31]:
In [32]:
# How many distinct values the `user_agent` column holds.
len(df["user_agent"].unique())
Out[32]:
In [39]:
# Bar chart of record counts per `method` value.
df["method"].value_counts().plot.bar()
Out[39]:
In [15]:
# Encode `label` as a binary target column: "norm" -> 0, "anom" -> 1.
# Any label not listed in the mapping becomes NaN.
# NOTE(review): the integer key 135 is also mapped to 0 — its origin is not
# visible here; confirm it is intentional.
df["is_anom"] = df["label"].map({"norm": 0, 135: 0, "anom": 1})
In [19]:
# Count the rows whose `is_anom` value is missing.
df["is_anom"].isnull().sum()
Out[19]:
In [133]:
# Show the rows whose `is_anom` value is missing.
df.loc[df["is_anom"].isnull()]
Out[133]:
In [20]:
# Drop only the rows without a usable target (is_anom is NaN). A bare
# dropna() would also discard rows that merely have a missing value in some
# unrelated column.
df = df.dropna(subset=["is_anom"])
In [50]:
# Bar chart of record counts per `url` value.
df["url"].value_counts().plot.bar()
Out[50]:
In [58]:
# Contingency table: one row per `url` value, one column per `is_anom` value.
ad = pd.crosstab(df["url"], df["is_anom"])
ad
Out[58]:
In [134]:
# Contingency table: one row per `method` value, one column per `is_anom` value.
pd.crosstab(df["method"], df["is_anom"])
Out[134]:
In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Import from the public package path; the `sklearn.linear_model.logistic`
# submodule is private and no longer importable in current scikit-learn.
from sklearn.linear_model import LogisticRegression
try:
    # Home of these helpers since scikit-learn 0.18.
    from sklearn.model_selection import train_test_split, cross_val_score
except ImportError:
    # Fallback for scikit-learn < 0.18, where only the old module exists.
    from sklearn.cross_validation import train_test_split, cross_val_score
In [21]:
# Split payload text and target into train and held-out parts (library
# default split size). A fixed random_state makes the split, and every metric
# computed from it, reproducible across kernel restarts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df["payload"], df["is_anom"], random_state=42)
In [22]:
# Fit the TF-IDF vectorizer on the training payloads only, then apply that
# same fitted vectorizer to the held-out payloads.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)
In [23]:
# Train a default-configured logistic regression on the TF-IDF features and
# predict the held-out set in one chain (fit() returns the estimator itself).
classifier = LogisticRegression()
predictions = classifier.fit(X_train, y_train).predict(X_test)
In [149]:
# Eyeball the first five predictions next to the first five held-out payloads.
print(predictions[:5])
print(X_test_raw.iloc[:5])
In [150]:
# Column labels of the working frame at this point in the notebook.
df.columns
Out[150]:
In [154]:
# Print the shapes of the held-out labels and of the predictions side by side.
print(y_test.shape)
print(predictions.shape)
In [156]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
# Confusion matrix of the held-out labels against the predictions.
mtrx = confusion_matrix(y_test, predictions)
print(mtrx)
# Draw the same matrix as a colour-coded image with a colour scale.
# These pyplot calls act on the current figure/axes, so their order matters.
plt.matshow(mtrx)
plt.title("confusion matrix")
plt.colorbar()
plt.ylabel("true label")
plt.xlabel("predicted label")
plt.show()
In [24]:
from sklearn.metrics import accuracy_score
# Accuracy of the predictions on the held-out set.
print("accuracy ", accuracy_score(y_test, predictions))
In [26]:
# null classifier
# One minus the mean of y_test: the accuracy of always predicting 0,
# assuming y_test holds only 0/1 values — TODO confirm.
1 - y_test.mean()
Out[26]:
In [159]:
# 5-fold cross-validation on the training features, run once per metric
# (precision, recall, F1, in that order); the mean over the folds is printed.
precisions, recalls, f1s = [
    cross_val_score(classifier, X_train, y_train, cv=5, scoring=metric)
    for metric in ("precision", "recall", "f1")
]
print("precision ", precisions.mean())
print("recall ", recalls.mean())
print("F1", f1s.mean())
In [171]:
# Print the shapes of the training labels and of the raw training payloads.
print(y_train.shape)
print(X_train_raw.shape)
In [182]:
# Put the raw training payloads and their labels side by side, aligned on the index.
aa = pd.concat([X_train_raw, y_train], axis="columns")
In [183]:
# First rows of the combined payload/label frame.
aa.head()
Out[183]:
In [184]:
# Training rows whose `is_anom` value is 1.
aa[aa["is_anom"] == 1]
Out[184]:
In [185]:
# Full record for uid 17926 (hard-coded; presumably picked from an earlier
# output — TODO confirm).
df[df["uid"] == 17926]
Out[185]: