This is the main notebook, performing all feature engineering, model selection, training, and evaluation.
The main steps are: engineering a custom feature space, building bag-of-words feature spaces with n-gram vectorizers, visualizing the feature spaces, training and tuning a range of classifiers, and evaluating and comparing the results.
In [5]:
%matplotlib inline
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn
import string
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import learning_curve
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import NearestCentroid
from sklearn.tree import DecisionTreeClassifier
import sklearn.gaussian_process.kernels as kernels
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from scipy.stats import expon
In [6]:
payloads = pd.read_csv("data/payloads.csv",index_col='index')
display(payloads.head(30))
We will create our own feature space with features that might be important for this task. These include the length of the payload, the number of non-printable and punctuation characters, byte-value statistics (minimum, maximum, mean and standard deviation), the number of distinct bytes, and the number of SQL and JavaScript keywords.
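To make these definitions concrete before the code below, here is a small illustration (added for this write-up, using a made-up example payload) of what a few of the planned features compute for a single input string:
import string

example_payload = "<script>alert(1)</script>"                         # hypothetical malicious input
print(len(example_payload))                                           # 'length' feature: 25
print(sum(1 for c in example_payload if c in string.punctuation))     # 'punctuation' feature
print(sum(1 for c in example_payload if c not in string.printable))   # 'non-printable' feature
print(len(set(bytearray(example_payload, 'utf8'))))                   # 'distinct-bytes' feature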
In [7]:
def plot_feature_distribution(features):
    print('Properties of feature: ' + features.name)
    print(features.describe())
    f, ax = plt.subplots(1, figsize=(10, 6))
    ax.hist(features, bins=features.max() - features.min() + 1, density=True)
    ax.set_xlabel('value')
    ax.set_ylabel('fraction')
    plt.show()
In [8]:
def create_feature_length(payloads):
    '''
    Feature describing the length of the input
    '''
    payloads['length'] = [len(str(row)) for row in payloads['payload']]
    return payloads
payloads = create_feature_length(payloads)
display(payloads.head())
plot_feature_distribution(payloads['length'])
In [10]:
def create_feature_non_printable_characters(payloads):
    '''
    Feature
    Number of non-printable characters within payload
    '''
    payloads['non-printable'] = [len([1 for letter in str(row) if letter not in string.printable]) for row in payloads['payload']]
    return payloads
create_feature_non_printable_characters(payloads)
display(payloads.head())
plot_feature_distribution(payloads['non-printable'])
In [11]:
def create_feature_punctuation_characters(payloads):
    '''
    Feature
    Number of punctuation characters within payload
    '''
    payloads['punctuation'] = [len([1 for letter in str(row) if letter in string.punctuation]) for row in payloads['payload']]
    return payloads
create_feature_punctuation_characters(payloads)
display(payloads.head())
plot_feature_distribution(payloads['punctuation'])
In [12]:
def create_feature_min_byte_value(payloads):
    '''
    Feature
    Minimum byte value in payload
    '''
    payloads['min-byte'] = [min(bytearray(str(row), 'utf8')) for row in payloads['payload']]
    return payloads
create_feature_min_byte_value(payloads)
display(payloads.head())
plot_feature_distribution(payloads['min-byte'])
In [13]:
def create_feature_max_byte_value(payloads):
    '''
    Feature
    Maximum byte value in payload
    '''
    payloads['max-byte'] = [max(bytearray(str(row), 'utf8')) for row in payloads['payload']]
    return payloads
create_feature_max_byte_value(payloads)
display(payloads.head())
plot_feature_distribution(payloads['max-byte'])
In [14]:
def create_feature_mean_byte_value(payloads):
    '''
    Feature
    Mean byte value in payload
    '''
    payloads['mean-byte'] = [np.mean(bytearray(str(row), 'utf8')) for row in payloads['payload']]
    return payloads
create_feature_mean_byte_value(payloads)
display(payloads.head())
plot_feature_distribution(payloads['mean-byte'].astype(int))
In [15]:
def create_feature_std_byte_value(payloads):
    '''
    Feature
    Standard deviation of byte values in payload
    '''
    payloads['std-byte'] = [np.std(bytearray(str(row), 'utf8')) for row in payloads['payload']]
    return payloads
create_feature_std_byte_value(payloads)
display(payloads.head())
plot_feature_distribution(payloads['std-byte'].astype(int))
In [ ]:
def create_feature_distinct_bytes(payloads):
    '''
    Feature
    Number of distinct bytes in payload
    '''
    payloads['distinct-bytes'] = [len(set(bytearray(str(row), 'utf8'))) for row in payloads['payload']]
    return payloads
create_feature_distinct_bytes(payloads)
display(payloads.head())
plot_feature_distribution(payloads['distinct-bytes'])
In [ ]:
sql_keywords = pd.read_csv('data/SQLKeywords.txt', index_col=False)
def create_feature_sql_keywords(payloads):
    '''
    Feature
    Number of SQL keywords within payload
    '''
    payloads['sql-keywords'] = [len([1 for keyword in sql_keywords['Keyword'] if str(keyword).lower() in str(row).lower()]) for row in payloads['payload']]
    return payloads
create_feature_sql_keywords(payloads)
display(type(sql_keywords))
display(payloads.head())
plot_feature_distribution(payloads['sql-keywords'])
In [ ]:
js_keywords = pd.read_csv('data/JavascriptKeywords.txt', index_col=False)
def create_feature_javascript_keywords(payloads):
    '''
    Feature
    Number of Javascript keywords within payload
    '''
    payloads['js-keywords'] = [len([1 for keyword in js_keywords['Keyword'] if str(keyword).lower() in str(row).lower()]) for row in payloads['payload']]
    return payloads
create_feature_javascript_keywords(payloads)
display(payloads.head())
plot_feature_distribution(payloads['js-keywords'])
Define a function that builds the complete feature vector from the payload using the custom features above
In [ ]:
def create_features(payloads):
    features = create_feature_length(payloads)
    features = create_feature_non_printable_characters(features)
    features = create_feature_punctuation_characters(features)
    features = create_feature_max_byte_value(features)
    features = create_feature_min_byte_value(features)
    features = create_feature_mean_byte_value(features)
    features = create_feature_std_byte_value(features)
    features = create_feature_distinct_bytes(features)
    features = create_feature_sql_keywords(features)
    features = create_feature_javascript_keywords(features)
    del features['payload']
    return features
In [15]:
Y = payloads['is_malicious']
X = create_features(pd.DataFrame(payloads['payload'].copy()))
test = SelectKBest(score_func=chi2, k='all')
fit = test.fit(X, Y)
features = fit.transform(X)
# summarize scores
np.set_printoptions(precision=2)
print(fit.scores_)
# Get the feature indices sorted from most important to least important
indices = np.argsort(fit.scores_)[::-1]
# Get the top 10 feature names
featuress = [X.columns[i] for i in indices[:10]]
display(featuress)
display([X.columns[i] + ' ' + str(fit.scores_[i]) for i in indices[:10]])
plt.rcdefaults()
fig, ax = plt.subplots()
y_pos = np.arange(len(featuress))
ax.barh(y_pos, fit.scores_[indices[:10]], align='center',
        color='green', ecolor='black')
ax.set_yticks(y_pos)
ax.set_yticklabels(featuress)
ax.set_xscale('log')
#ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('chi2 score')
ax.set_title('SelectKBest()')
plt.show()
In addition to our custom feature space, we will create six more feature spaces using bag-of-words techniques.
The vectorizers below are another way of creating features from text input.
We will test the performance of these techniques independently from our custom features in Step 3A.
We will create vectorizers for every combination of vectorizer type (count or TF-IDF) and n-gram size (1, 2 or 3).
The n-gram function determines how the actual "words" are created from the payload string (a short sanity check follows the vectorizer definitions below).
Each vectorizer is later used in Step 4 inside Pipeline objects before training.
See the report for further explanation.
In [4]:
def get1Grams(payload_obj):
    '''Divides a string into 1-grams
    Example: input - payload: "<script>"
             output- ["<","s","c","r","i","p","t",">"]
    '''
    payload = str(payload_obj)
    ngrams = []
    for i in range(len(payload)):
        ngrams.append(payload[i:i+1])
    return ngrams
tfidf_vectorizer_1grams = TfidfVectorizer(tokenizer=get1Grams)
count_vectorizer_1grams = CountVectorizer(min_df=1, tokenizer=get1Grams)
In [5]:
def get2Grams(payload_obj):
    '''Divides a string into 2-grams
    Example: input - payload: "<script>"
             output- ["<s","sc","cr","ri","ip","pt","t>"]
    '''
    payload = str(payload_obj)
    ngrams = []
    for i in range(len(payload) - 1):
        ngrams.append(payload[i:i+2])
    return ngrams
tfidf_vectorizer_2grams = TfidfVectorizer(tokenizer=get2Grams)
count_vectorizer_2grams = CountVectorizer(min_df=1, tokenizer=get2Grams)
In [6]:
def get3Grams(payload_obj):
    '''Divides a string into 3-grams
    Example: input - payload: "<script>"
             output- ["<sc","scr","cri","rip","ipt","pt>"]
    '''
    payload = str(payload_obj)
    ngrams = []
    for i in range(len(payload) - 2):
        ngrams.append(payload[i:i+3])
    return ngrams
tfidf_vectorizer_3grams = TfidfVectorizer(tokenizer=get3Grams)
count_vectorizer_3grams = CountVectorizer(min_df=1, tokenizer=get3Grams)
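As a quick sanity check (an added illustration on two made-up payloads, not an original cell), we can look at what a tokenizer produces and fit a throw-away CountVectorizer on it:
# What the 2-gram tokenizer produces, and the bag-of-words matrix built from it
toy_payloads = ["<script>alert(1)</script>", "john.doe@example.com"]
print(get2Grams(toy_payloads[0])[:5])           # first few 2-grams of the first payload
toy_vectorizer = CountVectorizer(min_df=1, tokenizer=get2Grams)
toy_matrix = toy_vectorizer.fit_transform(toy_payloads)
print(toy_matrix.shape)                         # (2 payloads, number of distinct 2-grams)
print(sorted(toy_vectorizer.vocabulary_)[:5])   # a few entries of the learned 2-gram vocabulary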
After creating the different feature spaces to later train each classifier on,
we first examine them visually by projecting each feature space into two dimensions using Principal Component Analysis.
Graphs below display the data in 3 of our 7 feature spaces.
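The projection below is implemented with TruncatedSVD rather than sklearn's PCA, since the bag-of-words matrices are scipy sparse matrices, which TruncatedSVD can decompose directly. A minimal added sketch (toy matrix, not notebook data) of that behaviour:
from scipy.sparse import csr_matrix

toy_sparse = csr_matrix(np.array([[1., 0., 2., 0.],
                                  [0., 1., 0., 3.],
                                  [1., 1., 1., 1.]]))
toy_svd = TruncatedSVD(n_components=2, random_state=0)
print(toy_svd.fit_transform(toy_sparse))    # 3 samples projected onto 2 components
print(toy_svd.explained_variance_ratio_)    # variance captured by each component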
In [9]:
def visualize_feature_space_by_projection(X, Y, title='PCA'):
    '''Plot a two-dimensional projection of the dataset in the specified feature space
    input: X - data
           Y - labels
           title - title of plot
    '''
    pca = TruncatedSVD(n_components=2)
    X_r = pca.fit(X).transform(X)
    # Percentage of variance explained by each component
    print('explained variance ratio (first two components): %s'
          % str(pca.explained_variance_ratio_))
    plt.figure()
    colors = ['blue', 'darkorange']
    lw = 2
    #Plot malicious and non-malicious samples separately with different colors
    for color, i in zip(colors, [0, 1]):
        plt.scatter(X_r[Y == i, 0], X_r[Y == i, 1], color=color, alpha=.3, lw=lw,
                    label=i)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title(title)
    plt.show()
In [10]:
X = count_vectorizer_1grams.fit_transform(payloads['payload'])
Y = payloads['is_malicious']
visualize_feature_space_by_projection(X,Y,title='PCA visualization of 1-grams CountVectorizer feature space')
In [11]:
X = tfidf_vectorizer_3grams.fit_transform(payloads['payload'])
Y = payloads['is_malicious']
visualize_feature_space_by_projection(X,Y,title='PCA visualization of 3-grams TFIDFVectorizer feature space')
In [34]:
X = create_features(pd.DataFrame(payloads['payload'].copy()))
Y = payloads['is_malicious']
visualize_feature_space_by_projection(X,Y,title='PCA visualization of custom feature space')
First, we will automate hyperparameter tuning and out-of-sample testing using the train_model function below (a minimal usage sketch follows its definition).
In [6]:
def train_model(clf, param_grid, X, Y):
    '''Trains and evaluates the model clf from input
    The function selects the best model of clf by optimizing for the validation data,
    then evaluates its performance using the out-of-sample test data.
    input - clf: the model to train
            param_grid: a dict of hyperparameters to use for optimization
            X: features
            Y: labels
    output - the best estimator (trained model)
             the confusion matrix from classifying the test data
    '''
    #First, partition into train and test data
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    n_iter = 5
    #If the number of possible parameter combinations is less than the preferred number of iterations,
    #set n_iter to the number of possible combinations.
    #The number of combinations is never treated as smaller if any argument is expon(),
    #because expon() is continuous (100 is used as a stand-in, it could be any large number)
    n_iter = min(n_iter, np.prod([
        100 if type(xs) == type(expon())
        else len(xs)
        for xs in param_grid.values()
    ]))
    #Perform a randomized search for the best parameters on the training data.
    #Cross validation is used to select the parameters, so the training data is actually split into
    #a new training set and a validation set, K times
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)  #DEBUG: n_splits=10
    #cv = KFold(n_splits=10)
    random_grid_search = RandomizedSearchCV(
        clf,
        param_distributions=param_grid,
        cv=cv,
        scoring='f1',
        n_iter=n_iter,  #DEBUG 1
        random_state=5,
        refit=True,
        verbose=10
    )
    '''Randomized search used instead. We have limited computing power
    grid_search = GridSearchCV(
        clf,
        param_grid=param_grid,
        cv=cv,
        scoring='f1',  #accuracy/f1/f1_weighted all give same result?
        verbose=10,
        n_jobs=-1
    )
    grid_search.fit(X_train, Y_train)
    '''
    random_grid_search.fit(X_train, Y_train)
    #Evaluate the best model on the test data
    Y_test_predicted = random_grid_search.best_estimator_.predict(X_test)
    Y_test_predicted_prob = random_grid_search.best_estimator_.predict_proba(X_test)[:, 1]
    confusion = confusion_matrix(Y_test, Y_test_predicted)
    TP = confusion[1, 1]
    TN = confusion[0, 0]
    FP = confusion[0, 1]
    FN = confusion[1, 0]
    #Calculate recall (sensitivity) from the confusion matrix
    sensitivity = TP / float(TP + FN)
    #Calculate specificity from the confusion matrix
    specificity = TN / float(TN + FP)
    #Calculate accuracy
    accuracy = (TP + TN) / float(confusion.sum())
    #Calculate the axes of the ROC curve
    fpr, tpr, thresholds = roc_curve(Y_test, Y_test_predicted_prob)
    #Area under the ROC curve
    auc = roc_auc_score(Y_test, Y_test_predicted_prob)
    return {
        'conf_matrix': confusion,
        'accuracy': accuracy,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'auc': auc,
        'params': random_grid_search.best_params_,
        'model': random_grid_search.best_estimator_,
        'roc': {'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds}
    }
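A minimal usage sketch of train_model (added for clarity, not executed in the original notebook; it assumes the payloads DataFrame and create_features defined in the cells above):
example_result = train_model(
    LogisticRegression(),
    {'C': [0.1, 1, 10]},                                        # hyperparameter values to sample from
    create_features(pd.DataFrame(payloads['payload'].copy())),  # X: custom feature space
    payloads['is_malicious']                                    # Y: labels
)
print(example_result['accuracy'], example_result['auc'], example_result['params'])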
Then, we will use the train_model function to train, optimize and retrieve out-of-sample test results from a range of classifiers.
Classifiers tested using our custom feature space: AdaBoost, SGD, Random Forest, Decision Tree, Logistic Regression and Multinomial Naive Bayes.
Classifiers tested using the bag-of-words feature spaces: MLP, Multinomial Naive Bayes, Random Forest, Logistic Regression and SVM.
Some classifiers were unable to train using a bag-of-words feature space because they cannot handle sparse matrices (a short illustration of this follows below).
The best parameters of each classifier, together with its performance, are stored in a DataFrame called classifier_results.
Make a dictionary of models with parameters to optimize using the bag-of-words feature spaces
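A short added illustration of the sparsity issue mentioned above (using a small slice of the payloads and the 2-gram tokenizer defined earlier):
from scipy import sparse

toy_X = CountVectorizer(min_df=1, tokenizer=get2Grams).fit_transform(payloads['payload'].head(100))
print(type(toy_X), sparse.issparse(toy_X))   # the vectorizer output is a scipy sparse matrix
# Estimators that only accept dense input would need toy_X.toarray(), which becomes very
# large for the full n-gram vocabulary, so those classifiers are only trained on the
# compact custom feature space.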
In [34]:
def create_classifier_inputs_using_vectorizers(vectorizer, subscript):
    '''make pipelines of the specified vectorizer with the classifiers to train
    input - vectorizer: the vectorizer to add to the pipelines
            subscript: subscript name for the dictionary key
    output - A dict of inputs to use for train_model(); a pipeline and a dict of params to optimize
    '''
    classifier_inputs = {}
    classifier_inputs[subscript + ' MLPClassifier'] = {
        'pipeline': Pipeline([('vect', vectorizer), ('clf', MLPClassifier(
            activation='relu',
            solver='adam',
            early_stopping=False,
            verbose=True
        ))]),
        'dict_params': {
            'vect__min_df': [1, 2, 5, 10, 20, 40],
            'clf__hidden_layer_sizes': [(500, 250, 125, 62)],
            'clf__alpha': [0.0005, 0.001, 0.01, 0.1, 1],
            'clf__learning_rate': ['constant', 'invscaling'],
            'clf__learning_rate_init': [0.001, 0.01, 0.1, 1],
            'clf__momentum': [0, 0.9],
        }
    }
    '''
    classifier_inputs[subscript + ' MultinomialNB'] = {
        'pipeline': Pipeline([('vect', vectorizer), ('clf', MultinomialNB())]),
        'dict_params': {
            'vect__min_df': [1, 2, 5, 10, 20, 40]
        }
    }
    classifier_inputs[subscript + ' RandomForest'] = {
        'pipeline': Pipeline([('vect', vectorizer), ('clf', RandomForestClassifier(
            max_depth=None, min_samples_split=2, random_state=0))]),
        'dict_params': {
            'vect__min_df': [1, 2, 5, 10, 20, 40],
            'clf__n_estimators': [10, 20, 40, 60]
        }
    }
    classifier_inputs[subscript + ' Logistic'] = {
        'pipeline': Pipeline([('vect', vectorizer), ('clf', LogisticRegression())]),
        'dict_params': {
            'vect__min_df': [1, 2, 5, 10, 20, 40],
            'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        }
    }
    classifier_inputs[subscript + ' SVM'] = {
        'pipeline': Pipeline([('vect', vectorizer), ('clf', SVC(probability=True))]),
        'dict_params': {
            'vect__min_df': [1, 2, 5, 10, 20, 40],
            'clf__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'clf__gamma': [0.001, 0.0001, 'auto'],
            'clf__kernel': ['rbf']
        }
    }
    '''
    return classifier_inputs
Make a dictionary of models with parameters to optimize using the custom feature space
In [49]:
def create_classifier_inputs(subscript):
    classifier_inputs = {}
    '''classifier_inputs[subscript + ' GPC'] = {
        'pipeline': GaussianProcessClassifier(),
        'dict_params': {
            'kernel': [
                1.0*kernels.RBF(1.0),
                1.0*kernels.Matern(),
                1.0*kernels.RationalQuadratic(),
                1.0*kernels.DotProduct()
            ]
        }
    }'''
    classifier_inputs[subscript + ' AdaBoostClassifier'] = {
        'pipeline': AdaBoostClassifier(n_estimators=100),
        'dict_params': {
            'n_estimators': [10, 20, 50, 100],
            'learning_rate': [0.1, 0.5, 1.0, 2.0]
        }
    }
    classifier_inputs[subscript + ' SGD'] = {
        'pipeline': SGDClassifier(loss="log", penalty="l2"),
        'dict_params': {
            'learning_rate': ['optimal']
        }
    }
    classifier_inputs[subscript + ' RandomForest'] = {
        'pipeline': RandomForestClassifier(
            max_depth=None, min_samples_split=2, random_state=0),
        'dict_params': {
            'n_estimators': [10, 20, 40, 60]
        }
    }
    classifier_inputs[subscript + ' DecisionTree'] = {
        'pipeline': DecisionTreeClassifier(max_depth=5),
        'dict_params': {
            'min_samples_split': [2]
        }
    }
    '''classifier_inputs[subscript + ' MLPClassifier'] = {
        'pipeline': MLPClassifier(
            activation='relu',
            solver='adam',
            early_stopping=False,
            verbose=True
        ),
        'dict_params': {
            'hidden_layer_sizes': [(300, 200, 150, 150), (30, 30, 30), (150, 30, 30, 150),
                                   (400, 250, 100, 100), (150, 200, 300)],
            'alpha': [0.0005, 0.001, 0.01, 0.1, 1],
            'learning_rate': ['constant', 'invscaling'],
            'learning_rate_init': [0.0005, 0.001, 0.01, 0.1, 1],
            'momentum': [0, 0.9],
        }
    }'''
    classifier_inputs[subscript + ' Logistic'] = {
        'pipeline': LogisticRegression(),
        'dict_params': {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
        }
    }
    classifier_inputs[subscript + ' MultinomialNB'] = {
        'pipeline': MultinomialNB(),
        'dict_params': {
            'alpha': [1.0]
        }
    }
    '''classifier_inputs[subscript + ' SVM'] = {
        'pipeline': SVC(probability=True),
        'dict_params': {
            'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            'gamma': [0.001, 0.0001, 'auto'],
            'kernel': ['rbf']
        }
    }'''
    return classifier_inputs
Create a new result table
In [35]:
classifier_results = pd.DataFrame(columns=['accuracy','sensitivity','specificity','auc','conf_matrix','params','model','roc'])#,index=classifier_inputs.keys())
In [ ]:
classifier_inputs = {}
classifier_inputs.update(create_classifier_inputs_using_vectorizers(count_vectorizer_1grams,'count 1grams'))
classifier_inputs.update(create_classifier_inputs_using_vectorizers(count_vectorizer_2grams,'count 2grams'))
classifier_inputs.update(create_classifier_inputs_using_vectorizers(count_vectorizer_3grams,'count 3grams'))
classifier_inputs.update(create_classifier_inputs_using_vectorizers(tfidf_vectorizer_1grams,'tfidf 1grams'))
classifier_inputs.update(create_classifier_inputs_using_vectorizers(tfidf_vectorizer_2grams,'tfidf 2grams'))
classifier_inputs.update(create_classifier_inputs_using_vectorizers(tfidf_vectorizer_3grams,'tfidf 3grams'))
X = payloads['payload']
Y = payloads['is_malicious']
for classifier_name, inputs in classifier_inputs.items():
    display(inputs['dict_params'])
    if classifier_name in classifier_results.index.values.tolist():
        print('Skipping ' + classifier_name + ', already trained')
    else:
        result_dict = train_model(inputs['pipeline'], inputs['dict_params'], X, Y)
        classifier_results.loc[classifier_name] = result_dict
display(classifier_results)
In [74]:
display(pd.DataFrame(payloads['payload'].copy()))
In [ ]:
classifier_inputs_custom = {}
#Get classifiers and parameters to optimize
classifier_inputs_custom.update(create_classifier_inputs('custom'))
#Extract payloads and labels
Y = payloads['is_malicious']
X = create_features(pd.DataFrame(payloads['payload'].copy()))
#Select the best features (computed here but not used below; all custom features are passed to train_model)
X_new = SelectKBest(score_func=chi2, k=4).fit_transform(X, Y)
#Call train_model for every classifier and save results to classifier_results
for classifier_name, inputs in classifier_inputs_custom.items():
    if classifier_name in classifier_results.index.values.tolist():
        print('Skipping ' + classifier_name + ', already trained')
    else:
        result_dict = train_model(inputs['pipeline'], inputs['dict_params'], X, Y)
        classifier_results.loc[classifier_name] = result_dict
display(classifier_results)
#pickle.dump( classifier_results, open( "data/trained_classifiers_custom_all_features.p", "wb" ) )
In [39]:
#Save classifiers in a pickle file to be able to re-use them without re-training
pickle.dump( classifier_results, open( "data/trained_classifiers.p", "wb" ) )
In [30]:
#Display the results for the classifiers that were trained using our custom feature space
custom_features_classifiers = pickle.load( open("data/trained_classifier_custom_all_features.p", "rb"))
display(custom_features_classifiers)
In [31]:
#Display the results for the classifiers that were using bag of words feature spaces
classifier_results = pickle.load( open( "data/trained_classifiers.p", "rb" ) )
display(classifier_results)
In [32]:
#Combine the two tables into one table
classifier_results = pd.concat([classifier_results, custom_features_classifiers])
classifier_results = classifier_results.sort_values(['sensitivity','accuracy'], ascending=[False,False])
display(classifier_results)
Calculate the F1-score of each classifier and add it to the classifiers table (a small worked example follows the cell below).
(We didn't implement this inside the train_model function, as with the other performance metrics, because we had already run an 82-hour training session and didn't want to re-run the entire training just to add the F1-score there.)
In [31]:
def f1_score(conf_matrix):
    #Computed from the confusion matrix (rows = true label, columns = predicted label);
    #the [0][...] entries refer to the first row/column, so this is the F1-score with respect to the non-malicious (0) class
    precision = conf_matrix[0][0] / (conf_matrix[0][0] + conf_matrix[0][1])
    recall = conf_matrix[0][0] / (conf_matrix[0][0] + conf_matrix[1][0])
    return (2 * precision * recall) / (precision + recall)
#Load the classifier table if not yet loaded
classifier_results = pickle.load( open( "data/trained_classifiers.p", "rb" ) )
#Calculate F1-scores
classifier_results['F1-score'] = [ f1_score(conf_matrix) for conf_matrix in classifier_results['conf_matrix']]
#Re-arrange columns
classifier_results = classifier_results[['F1-score','accuracy','sensitivity','specificity','auc','conf_matrix','params','model','roc']]
#Re-sort on F1-score
classifier_results = classifier_results.sort_values(['F1-score','accuracy'], ascending=[False,False])
display(classifier_results)
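A worked example of the calculation above, using a made-up confusion matrix (rows are true labels, columns are predicted labels):
example_confusion = np.array([[90, 10],
                              [ 5, 95]])
p = 90 / float(90 + 10)                 # 0.900
r = 90 / float(90 + 5)                  # ~0.947
print((2 * p * r) / (p + r))            # ~0.923
print(f1_score(example_confusion))      # same value, computed by the helper above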
Final formatting
Convert numeric columns to float
Round numeric columns to 4 decimals
In [42]:
classifier_results[['F1-score','accuracy','sensitivity','specificity','auc']] = classifier_results[['F1-score','accuracy','sensitivity','specificity','auc']].apply(pd.to_numeric)
classifier_results = classifier_results.round({'F1-score':4,'accuracy':4,'sensitivity':4,'specificity':4,'auc':4})
#classifier_results[['F1-score','accuracy','sensitivity','specificity','auc','conf_matrix','params']].to_csv('data/classifiers_result_table.csv')
display(classifier_results.dtypes)
In [43]:
#save complete list of classifiers to 'trained_classifiers'
pickle.dump( classifier_results, open( "data/trained_classifiers.p", "wb" ) )
In [ ]:
#In this case, we are going to deploy the 'tfidf 2grams RandomForest' model in our dummy server
classifier = classifier_results['model'].loc['tfidf 2grams RandomForest']
print(classifier)
#Save classifiers in a pickle file to be able to re-use them without re-training
pickle.dump( classifier, open( "data/tfidf_2grams_randomforest.p", "wb" ) )
In [ ]:
classifier_results = pickle.load( open( "data/trained_classifiers.p", "rb" ) )
First, make a histogram of classifier performance measured by F1-score.
The same classifier trained on different feature spaces is grouped together in the graph.
Also, print the table of F1-scores and compute the averages along both axes,
i.e. the average F1-score for each classifier and the average F1-score for each feature space (see the small sketch below for how the row names are split into feature space and classifier).
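The grouping below relies on the row index names of classifier_results, which have the form '<feature space> <classifier>'. A small added sketch (with a hypothetical index entry) of how such a name is split apart:
name = 'tfidf 2grams RandomForest'                     # hypothetical result index entry
classifier = name.split()[-1]                          # -> 'RandomForest'
feature_space = name.replace(classifier, '').strip()   # -> 'tfidf 2grams'
print(classifier, '|', feature_space)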
In [151]:
def get_classifier_name(index):
    '''
    Returns the name of the classifier at the given index name,
    e.g. 'tfidf 2grams RandomForest' -> 'RandomForest'
    '''
    return index.split()[-1]
#Group rows together that use the same classifier
grouped = classifier_results.groupby(get_classifier_name)
hist_df = pd.DataFrame(columns=['custom','count 1grams','count 2grams','count 3grams','tfidf 1grams','tfidf 2grams','tfidf 3grams'])
for classifier, indices in grouped.groups.items():
    #Make a list of feature spaces
    feature_spaces = indices.tolist()
    feature_spaces = [feature_space.replace(classifier, '') for feature_space in feature_spaces]
    feature_spaces = [feature_space.strip() for feature_space in feature_spaces]
    #If no result exists, it will stay as 0
    hist_df.loc[classifier] = {
        'custom': 0,
        'count 1grams': 0,
        'count 2grams': 0,
        'count 3grams': 0,
        'tfidf 1grams': 0,
        'tfidf 2grams': 0,
        'tfidf 3grams': 0
    }
    #Copy each F1-score from classifier_results to the corresponding entry in hist_df
    for fs in feature_spaces:
        hist_df[fs].loc[classifier] = classifier_results['F1-score'].loc[fs + ' ' + classifier]
#Plot the bar plot
f, ax = plt.subplots()
ax.set_ylim([0.989, 1])
hist_df.plot(kind='bar', figsize=(12,7), title='F1-score of all models grouped by classifiers', ax=ax, width=0.8)
#Compute average F1-score row and column for the table (averaging only over non-zero entries) and print the table
hist_df_nonzero = hist_df.copy()
hist_df_nonzero[hist_df > 0] = True
hist_df['Avg Feature'] = (hist_df.sum(axis=1) / np.array(hist_df_nonzero.sum(axis=1)))
hist_df_nonzero = hist_df.copy()
hist_df_nonzero[hist_df > 0] = True
hist_df.loc['Avg Classifier'] = (hist_df.sum(axis=0) / np.array(hist_df_nonzero.sum(axis=0)))
hist_df = hist_df.round(4)
display(hist_df)
In [44]:
def plot_learning_curve(df_row, X, Y):
    '''Plots the learning curve of a classifier with its parameters
    input - df_row: row of classifier_results
            X: payload data
            Y: labels
    '''
    #The classifier to plot the learning curve for
    estimator = df_row['model']
    title = 'Learning curves for classifier ' + df_row.name
    train_sizes = np.linspace(0.1, 1.0, 5)
    cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
    #plot settings
    plt.figure()
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    print('learning curve in progress...')
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, Y, cv=cv, n_jobs=-1, train_sizes=train_sizes, verbose=0)  #Change to verbose=10 to print progress
    print('Learning curve done!')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
Three examples of learning curves from the trained classifiers.
All learning curves have an upward-sloping cross-validation score at the end,
which means that adding more data would potentially increase the accuracy further.
In [45]:
#plot learning curve for tfidf 1grams RandomForest
X = payloads['payload']
Y = payloads['is_malicious']
plot_learning_curve(classifier_results.iloc[0],X,Y)
#plot learning curve for count 3grams MultinomialNB
X = payloads['payload']
Y = payloads['is_malicious']
plot_learning_curve(classifier_results.iloc[6],X,Y)
#plot learning curve for custom svm
X = create_features(pd.DataFrame(payloads['payload'].copy()))
Y = payloads['is_malicious']
plot_learning_curve(classifier_results.iloc[5],X,Y)
In [240]:
def visualize_result(classifier_list):
    '''Plot the ROC curve for a list of classifiers in the same graph
    input - classifier_list: a subset of classifier_results
    '''
    f, (ax1, ax2) = plt.subplots(1, 2)
    f.set_figheight(6)
    f.set_figwidth(15)
    #Subplot 1, ROC curve
    for classifier in classifier_list:
        ax1.plot(classifier['roc']['fpr'], classifier['roc']['tpr'])
        ax1.scatter(1 - classifier['specificity'], classifier['sensitivity'], edgecolor='k')
    ax1.set_xlim([0, 1])
    ax1.set_ylim([0, 1.0])
    ax1.set_title('ROC curve for top 3 and bottom 3 classifiers')
    ax1.set_xlabel('False Positive Rate (1 - Specificity)')
    ax1.set_ylabel('True Positive Rate (Sensitivity)')
    ax1.grid(True)
    #Subplot 2, ROC curve zoomed
    for classifier in classifier_list:
        ax2.plot(classifier['roc']['fpr'], classifier['roc']['tpr'])
        ax2.scatter(1 - classifier['specificity'], classifier['sensitivity'], edgecolor='k')
    ax2.set_xlim([0, 0.3])
    ax2.set_ylim([0.85, 1.0])
    ax2.set_title('ROC curve for top 3 and bottom 3 classifiers (Zoomed)')
    ax2.set_xlabel('False Positive Rate (1 - Specificity)')
    ax2.set_ylabel('True Positive Rate (Sensitivity)')
    ax2.grid(True)
    #Add a further zoomed inset
    left, bottom, width, height = [0.7, 0.27, 0.15, 0.15]
    ax3 = f.add_axes([left, bottom, width, height])
    for classifier in classifier_list:
        ax3.plot(classifier['roc']['fpr'], classifier['roc']['tpr'])
        ax3.scatter(1 - classifier['specificity'], classifier['sensitivity'], edgecolor='k')
    ax3.set_xlim([0, 0.002])
    ax3.set_ylim([0.983, 1.0])
    ax3.set_title('Zoomed even further')
    ax3.grid(True)
    plt.show()
Plot ROC curves for the top 3 and bottom 3 classifiers, sorted by F1-score.
Left: standard-scale ROC curve.
Right: a zoomed-in version of the same graph, to make the region where the curves cluster easier to see (a further zoomed inset is added on top).
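A small self-contained illustration (toy labels and scores, not notebook data) of what goes into these plots: roc_curve returns the (FPR, TPR) pairs that are drawn as curves, while the scatter marker for each classifier is its default-threshold operating point (1 - specificity, sensitivity):
toy_true = np.array([0, 0, 1, 1, 1, 0])
toy_prob = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])
toy_fpr, toy_tpr, toy_thresholds = roc_curve(toy_true, toy_prob)
print(list(zip(np.round(toy_fpr, 2), np.round(toy_tpr, 2))))   # points along the toy ROC curve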
In [241]:
indices = [0,1,2, len(classifier_results)-1,len(classifier_results)-2,len(classifier_results)-3]
visualize_result([classifier_results.iloc[index] for index in indices])
In [16]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
def get2Grams(payload_obj):
    '''Divides a string into 2-grams
    Example: input - payload: "<script>"
             output- ["<s","sc","cr","ri","ip","pt","t>"]
    '''
    payload = str(payload_obj)
    ngrams = []
    for i in range(len(payload) - 1):
        ngrams.append(payload[i:i+2])
    return ngrams
classifier = pickle.load( open("data/tfidf_2grams_randomforest.p", "rb"))
def injection_test(inputs):
    #Split the query string into its parameters, classify each value, and flag the request
    #as malicious if any value is classified as malicious
    variables = inputs.split('&')
    values = [variable.split('=')[1] for variable in variables]
    print(values)
    return 'MALICIOUS' if classifier.predict(values).sum() > 0 else 'NOT_MALICIOUS'
In [24]:
#test injection_test
display(injection_test("val1=%3Cscript%3Ekiddie"))
In [38]:
#Error analysis: re-fit a simple pipeline (using a vectorizer, X and Y defined in earlier cells)
pipe = Pipeline([('vect', vectorizer), ('clf', LogisticRegression(C=10))])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=0)  #DEBUG: n_splits=10
random_grid_search = RandomizedSearchCV(
    pipe,
    param_distributions={
        'clf__C': [10]
    },
    cv=cv,
    scoring='roc_auc',
    n_iter=1,
    random_state=5,
    refit=True
)
random_grid_search.fit(X_train, Y_train)
#Evaluate the best model on the test data
Y_test_predicted = random_grid_search.best_estimator_.predict(X_test)
In [45]:
#Payloads classified incorrectly
pd.options.display.max_colwidth = 200
print('False positives')
print(X_test[(Y_test == 0) & (Y_test_predicted == 1)])
print('False negatives')
print(X_test[(Y_test == 1) & (Y_test_predicted == 0)])