Approach:
In [1]:
import sklearn.feature_extraction
sklearn.__version__
Out[1]:
In [2]:
import pandas as pd
pd.__version__
Out[2]:
In [3]:
# Plotting defaults
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 12.0, 5.0
In [4]:
# A plotting helper method
def plot_it(df,label_x,label_y):
fig, ax = plt.subplots(subplot_kw={'axisbg':'#EEEEE5'})
ax.grid(color='grey', linestyle='solid')
df.T.plot(kind='bar', logx=True, rot=0, ax=ax, colormap='PiYG')
ax.legend(loc=0, prop={'size':14})
plt.subplots_adjust(left=0, right=1, bottom=0, top=1, wspace=0, hspace=0)
plt.xlabel(label_x)
plt.ylabel(label_y)
return ax
In [5]:
# Read in a set of SQL statements from various sources
import os
basedir = 'data'
filelist = os.listdir(basedir)
df_list = []
for file in filelist:
df = pd.read_csv(os.path.join(basedir,file), sep='|||', names=['raw_sql'], header=None)
df['type'] = 'legit' if file.split('.')[0] == 'legit' else 'malicious'
df_list.append(df)
dataframe = pd.concat(df_list, ignore_index=True)
dataframe.dropna(inplace=True)
print dataframe['type'].value_counts()
dataframe.head()
Out[5]:
In [6]:
!which python
In [7]:
# Use the SQLParse module: sqlparse is a non-validating SQL parser module for Python
# https://github.com/andialbrecht/sqlparse
import sqlparse
def parse_it(raw_sql):
parsed = sqlparse.parse(unicode(raw_sql,'utf-8'))
return [token._get_repr_name() for parse in parsed for token in parse.tokens if token._get_repr_name() != 'Whitespace']
dataframe['parsed_sql'] = dataframe['raw_sql'].map(lambda x: parse_it(x))
In [8]:
dataframe.head()
Out[8]:
In [9]:
# Looking at the SQL tokens is 'kinda' interesting but sequences of tokens and transitions
# between tokens seems more meaningful so we're also going to compute sequences by
# computing NGrams for every SQL statement...
def ngrams(lst, N):
ngrams = []
for n in xrange(0,N):
ngrams += zip(*(lst[i:] for i in xrange(n+1)))
return [str(tuple) for tuple in ngrams]
In [10]:
dataframe['sequences'] = dataframe['parsed_sql'].map(lambda x: ngrams(x, 3))
In [11]:
# Helper method
def token_expansion(series, types):
_tokens, _types = zip(*[(token,token_type) for t_list,token_type in zip(series,types) for token in t_list])
return pd.Series(_tokens), pd.Series(_types)
In [12]:
dataframe['sequences']
Out[12]:
In [13]:
# The data hacking repository has a simple stats module we're going to use
import data_hacking.simple_stats as ss
# Spin up our g_test class
g_test = ss.GTest()
# Here we'd like to see how various sql tokens and transitions are related.
# Is there an association with particular token sets and malicious SQL statements.
tokens, types = token_expansion(dataframe['sequences'], dataframe['type'])
df_ct, df_cd, df_stats = g_test.highest_gtest_scores(tokens, types, matches=0, N=0)
df_stats.sort('malicious_g', ascending=0).head(10)
# The table below shows raw counts, conditional distributions, expected counts, and g-test score.
Out[13]:
In [14]:
# Now plot the the head() and the tail() of the dataframe to see who's been naughty or nice
sorted_df = df_stats.sort('malicious_g', ascending=0)
naughty = sorted_df.head(7)
nice = sorted_df.tail(7).sort('malicious_g', ascending=0)
naughty_and_nice = pd.concat([naughty, nice])
ax = plot_it(naughty_and_nice[['malicious_g']],'SQL Command Types','G-Test Scores')
ax.set_xlim(.2, 1.4)
Out[14]:
In [15]:
# Documentation in sqlparse for the mapping can be found here:
# https://github.com/andialbrecht/sqlparse/blob/master/sqlparse/keywords.py
# or here
# https://github.com/andialbrecht/sqlparse/blob/master/sqlparse/lexer.py
# Here we look at example of the SQL sequence that G-Test has indicated are good
# indicators of SQL injections.
dataframe[dataframe['sequences'].map(lambda x: "('Single', 'Identifier')" in x)].head()
Out[15]:
In [16]:
dataframe[dataframe['sequences'].map(lambda x: "('Punctuation',)" in x)].head()
Out[16]:
In [17]:
# Generating additional feature dimensions for the machine learning to expand its mind into...
# We're basically building up features to include into our 'feature vector' for ML
import math
from collections import Counter
def entropy(s):
p, lns = Counter(s), float(len(s))
return -sum( count/lns * math.log(count/lns, 2) for count in p.values())
dataframe['length'] = dataframe['parsed_sql'].map(lambda x: len(x))
dataframe['entropy'] = dataframe['raw_sql'].map(lambda x: entropy(x))
dataframe.head()
Out[17]:
In [18]:
# For each SQL statement aggregate the malicious and legit g-test scores as features
import numpy as np
def g_aggregate(sequence, name):
try:
g_scores = [df_stats.ix[item][name] for item in sequence]
except KeyError:
return 0
return sum(g_scores)/len(g_scores) if g_scores else 0 # Average
dataframe['malicious_g'] = dataframe['sequences'].map(lambda x: g_aggregate(x, 'malicious_g'))
dataframe['legit_g'] = dataframe['sequences'].map(lambda x: g_aggregate(x, 'legit_g'))
In [19]:
dataframe.head()
Out[19]:
In [20]:
# Boxplots show you the distribution of the data (spread).
# http://en.wikipedia.org/wiki/Box_plot
# Plot the length and entropy of SQL statements
# Fixme Brian: make these pretty
dataframe.boxplot('length','type')
plt.ylabel('SQL Statement Length')
dataframe.boxplot('entropy','type')
plt.ylabel('SQL Statement Entropy')
Out[20]:
In [21]:
# Split the classes up so we can set colors, size, labels
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEE5'))
ax.grid(color='grey', linestyle='solid')
cond = dataframe['type'] == 'malicious'
evil = dataframe[cond]
legit = dataframe[~cond]
plt.scatter(legit['length'], legit['entropy'], s=140, c='#aaaaff', label='Legit', alpha=.7)
plt.scatter(evil['length'], evil['entropy'], s=40, c='r', label='Injections', alpha=.3)
plt.legend()
plt.xlabel('SQL Statement Length')
plt.ylabel('SQL Statement Entropy')
Out[21]:
In [22]:
# Split the classes up so we can set colors, size, labels
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEE5'))
ax.grid(color='grey', linestyle='solid')
plt.scatter(legit['malicious_g'], legit['legit_g'], s=140, c='#aaaaff', label='Legit', alpha=.7)
plt.scatter(evil['malicious_g'], evil['legit_g'], s=40, c='r', label='Injections', alpha=.3)
plt.legend()
plt.ylabel('Legit SQL G-Test Score')
plt.xlabel('Malicious SQL G-Test Score')
Out[22]:
In [23]:
# In preparation for using scikit learn we're just going to use
# some handles that help take us from pandas land to scikit land
# List of feature vectors (scikit learn uses 'X' for the matrix of feature vectors)
X = dataframe.as_matrix(['length', 'entropy','legit_g','malicious_g'])
# Labels (scikit learn uses 'y' for classification labels)
y = np.array(dataframe['type'].tolist()) # Yes, this is weird but it needs
# to be an np.array of strings
In [24]:
# Random Forest is a popular ensemble machine learning classifier.
# http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#
import sklearn.ensemble
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20) # Trees in the forest
In [25]:
# Now we can use scikit learn's cross validation to assess predictive performance.
scores = sklearn.cross_validation.cross_val_score(clf, X, y, cv=10, n_jobs=4)
print scores
In [26]:
# Wow 99% accurate! There is an issue though...
# Recall that we have ~13k 'malicious SQL statements and
# we only have about 1k 'legit' SQL statements, so we dive
# in a bit and look at the predictive performance more deeply.
# Train on a 80/20 split
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test, index_train, index_test = train_test_split(X, y, dataframe.index, test_size=0.2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
In [27]:
# Now plot the results of the 80/20 split in a confusion matrix
from sklearn.metrics import confusion_matrix
labels = ['legit', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
def plot_cm(cm, labels):
# Compute percentanges
percent = (cm*100.0)/np.array(np.matrix(cm.sum(axis=1)).T) # Derp, I'm sure there's a better way
print 'Confusion Matrix Stats'
for i, label_i in enumerate(labels):
for j, label_j in enumerate(labels):
print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())
# Show confusion matrix
# Thanks kermit666 from stackoverflow :)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.grid(b=False)
cax = ax.matshow(percent, cmap='coolwarm')
plt.title('Confusion matrix of the classifier')
fig.colorbar(cax)
ax.set_xticklabels([''] + labels)
ax.set_yticklabels([''] + labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
plot_cm(cm, labels)
In [28]:
# Compute the precition probabilities and use them to mimimize our false positives
# Note: This is simply a trade off, it means we'll miss a few of the malicious
# ones but typically false alarms are a death blow to any new 'fancy stuff' so
# we definitely want to mimimize the false alarms.
y_probs = clf.predict_proba(X_test)[:,1]
thres = .9 # This can be set to whatever you'd like
y_pred[y_probs<thres] = 'legit'
y_pred[y_probs>=thres] = 'malicious'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
In [29]:
# We can also look at what features the learning algorithm thought were the most important
importances = zip(['length', 'entropy', 'legit_g', 'malicious_g'], clf.feature_importances_)
importances
# From the list below we see our feature importance scores. There's a lot of feature selection,
# sensitivity study, etc stuff that you could do if you wanted at this point.
Out[29]:
In [30]:
# Now were going to just do some post analysis on how the ML algorithm performed.
# Lets look at the legit samples that were misclassified as malicious
test_set = dataframe.ix[index_test]
test_set['pred'] = y_pred
misclassified = test_set[(test_set['type']=='legit') & (test_set['pred']=='malicious')]
misclassified.head()
Out[30]:
In [31]:
# Discussion for how to use the resulting models.
# Typically Machine Learning comes in two phases
# - Training of the Model
# - Evaluation of new observations against the Model
# This notebook is about exploration of the data and training the model.
# After you have a model that you are satisfied with, just 'pickle' it
# at the end of the your training script and then in a separate
# evaluation script 'unpickle' it and evaluate/score new observations
# coming in (through a file, or ZeroMQ, or whatever...)
#
# In this case we'd have to pickle the RandomForest classifier.
# See 'test_it' below for how to use them in evaluation mode.
# test_it shows how to do evaluation, also fun for manual testing below :)
def test_it(sql):
parsed_sql = parse_it(sql)
ngram_list = ngrams(parsed_sql, 3)
malicious_g = g_aggregate(ngram_list, 'malicious_g')
legit_g = g_aggregate(ngram_list, 'legit_g')
_X = [len(parsed_sql), entropy(sql), legit_g, malicious_g]
print '%-40s: %s' % (sql, clf.predict(_X)[0])
In [32]:
test_it('select * from employees')
test_it("'; exec master..xp_cmdshell")
test_it("'any 'x'='x'")
test_it('from dorseys mom xp_cmdshell biache')
test_it('select * from your_mom')
The combination of IPython, Pandas and Scikit Learn let us pull in some junky SQL data, clean it up, plot it, understand it and slap it with some machine learning!
Clearly a lot more formality could be used, plotting learning curves, adjusting for overfitting, feature selection, on and on... there are some really great machine learning resources that cover this deeper material. In particular we highly recommend the work and presentations of Olivier Grisel at INRIA Saclay. http://ogrisel.com/