In this notebook we're going to explore, understand, and classify Shockwave Flash (SWF) files as being 'benign' or 'malicious'. We will explore the data, apply machine learning algorithms to it, add new features, and then apply machine learning again. Finally, we will test our classifier on a large number of files to measure its effectiveness.
**DISCLAIMER:** This exercise is for illustrative purposes only; it uses about 500 samples, which is too few to produce a generalizable model.
In [1]:
import pandas as pd
print 'pandas version is', pd.__version__
import numpy as np
print 'numpy version is', np.__version__
import sklearn
print 'scikit-learn version is', sklearn.__version__
import matplotlib
print 'matplotlib version is', matplotlib.__version__
import matplotlib.pyplot as plt
In [2]:
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# Notebook-wide plotting defaults: larger font and a wide, short figure.
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0
In [3]:
def plot_cm(cm, labels):
    """Print per-cell confusion-matrix statistics and draw the matrix as a heat map.

    Args:
        cm: Square confusion matrix; rows are plotted on the 'True' axis and
            columns on the 'Predicted' axis.
        labels: Class names, in the same order as the rows/columns of ``cm``.
    """
    cm = np.asarray(cm)

    # Compute row-wise percentages.  A class with no true samples has a zero
    # row total; divide by 1 there so the row reads 0% instead of NaN.
    row_totals = cm.sum(axis=1, keepdims=True)
    percent = (cm * 100.0) / np.where(row_totals == 0, 1, row_totals)

    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, percent[i][j], cm[i][j], cm[i].sum()))

    # Show confusion matrix
    # Thanks to kermit666 from stackoverflow
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Positional form works on every matplotlib release; the ``b=`` keyword
    # was renamed to ``visible=`` and later removed.
    ax.grid(False)
    cax = ax.matshow(percent, cmap='coolwarm', vmin=0, vmax=100)
    plt.title('')
    fig.colorbar(cax)
    # Pin one tick per class so each label lines up with its row/column
    # instead of relying on the default tick locations.
    ticks = list(range(len(labels)))
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(list(labels))
    ax.set_yticklabels(list(labels))
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
In [4]:
def extract_features(data):
    """Flatten one parsed SWF analysis report into a feature dictionary.

    Args:
        data: Dict parsed from a ``.results`` JSON report.  The keys read are
            ``metadata``, ``characteristics -> swf`` and ``verbose -> swf``.

    Returns:
        Dict of features: file metadata, SWF header values, derived stage
        area/perimeter, one ``1`` flag per tag name seen, the total tag count
        and, when the file contains DoABC/DoABCDefine tags, ActionScript
        bytecode-name and string features.  If a required key is missing the
        error is printed and the features collected so far are returned.

    Note:
        Bytecode names and strings are accumulated over *all* ABC tags.  The
        previous code reset both lists on every tag, so the counts only ever
        reflected the last tag and 'first abc bytecode name' was really the
        last one.  Features precomputed with the old behaviour will differ.
    """
    import binascii

    features = {}
    try:
        metadata = data['metadata']
        features['sha256'] = metadata['sha256']
        features['size'] = metadata['file_size']
        features['entropy'] = metadata['entropy']

        swf_meta = data['characteristics']['swf']['swf metadata']
        features['version'] = swf_meta['version']
        features['frame count'] = swf_meta['framecount']
        features['frame rate'] = swf_meta['framerate']
        x_min = swf_meta['xmin']
        x_max = swf_meta['xmax']
        y_min = swf_meta['ymin']
        y_max = swf_meta['ymax']
        x_length = x_max - x_min
        y_length = y_max - y_min
        features['swf area'] = x_length * y_length
        features['swf perimeter'] = 2 * (x_length + y_length)

        # One presence flag per tag type, plus the total number of tags.
        features['tag count'] = 0
        for tag_info in data['characteristics']['swf']['tag types']:
            features[tag_info['tag name']] = 1
            features['tag count'] += tag_info['count']

        # Collect bytecode names and strings across every ABC tag.
        bytecode_names = []
        abc_strings = []
        for tag_info in data['verbose']['swf']['tags']:
            if 'DoABCDefine' in tag_info:
                key = 'DoABCDefine'
            elif 'DoABC' in tag_info:
                key = 'DoABC'
            else:
                continue
            try:
                bytecode_names.append(tag_info[key]['bytecodename'])
            except KeyError:
                # Tags without a name are recorded under this placeholder.
                bytecode_names.append('DoABCDefine')
            abc_strings.extend(tag_info[key]['abc']['strings'])

        if bytecode_names:
            # Encode the first bytecode name as a small categorical value;
            # anything not listed here maps to 4.
            first_name_codes = {'': 1, 'DoABCDefine': 2, 'frame1': 3}
            features['first abc bytecode name'] = first_name_codes.get(bytecode_names[0], 4)
            features['abc bytecode name'] = bytecode_names
            features['bytecode name count'] = len(bytecode_names)
            features['unique bytecode name count'] = len(set(bytecode_names))
            features['abc strings'] = abc_strings
            features['abc string count'] = len(abc_strings)

            # Flag files holding a long (> 100 chars) string that decodes
            # cleanly as hexadecimal.
            features['long hex string'] = 0
            for s in abc_strings:
                if len(s) > 100:
                    try:
                        binascii.unhexlify(s)
                    except (TypeError, ValueError):
                        # Not valid hex (TypeError on Python 2,
                        # binascii.Error/ValueError on Python 3).
                        continue
                    features['long hex string'] = 1
                    break

            # NOTE(review): the original indentation was lost; this ratio is
            # assumed to belong with the ABC features -- confirm.
            try:
                features['abc string m/m ratio'] = float(
                    data['verbose']['swf']['SWF String Statistical Analysis']
                    ['ActionScript String Length Mean to Median Ratio'])
            except (KeyError, TypeError, ValueError):
                features['abc string m/m ratio'] = 0.0
    except KeyError as ke:
        # Report via the already-extracted hash: re-indexing data['metadata']
        # here would raise again when that is the missing key.
        print('ERROR: %s %s' % (ke, features.get('sha256', '<unknown sha256>')))
    return features
In [5]:
def load_files(file_list):
    """Parse each JSON report in ``file_list`` and return its extracted features.

    Args:
        file_list: Iterable of paths to JSON report files.

    Returns:
        List with one ``extract_features`` result per path, in input order.
    """
    import json

    def _features_from(path):
        # Parse straight from the open handle, then flatten the report.
        with open(path, 'rb') as handle:
            return extract_features(json.load(handle))

    return [_features_from(path) for path in file_list]
In [6]:
# Good files
import glob
# Sort the paths: glob order is filesystem-dependent, and row order feeds the
# later train/test splits.
good_list = sorted(glob.glob('data/clean/*.results'))
good_features = load_files(good_list)
print("Files: %d" % len(good_list))
In [7]:
# Bad files
import glob
# Sort the paths: glob order is filesystem-dependent, and row order feeds the
# later train/test splits.
bad_list = sorted(glob.glob('data/malicious/*.results'))
bad_features = load_files(bad_list)
print("Files: %d" % len(bad_list))
In [8]:
# Benign samples: one row per file; features a file lacks become 0.
df_good = pd.DataFrame.from_records(good_features).fillna(0)
df_good['label'] = 'benign'
df_good.head()
Out[8]:
In [9]:
# Malicious samples: one row per file; features a file lacks become 0.
df_bad = pd.DataFrame.from_records(bad_features).fillna(0)
df_bad['label'] = 'malicious'
df_bad.head()
Out[9]:
In [10]:
# Stack both classes into one frame; columns present in only one class are
# filled with 0 for the other.
df = pd.concat([df_bad, df_good], ignore_index=True).fillna(0)
In [11]:
# Stacked bar chart of sample counts per SWF version, split by label.
(
    df.groupby(['label', 'version'])['version']
    .count()
    .unstack('label')
    .fillna(0)
    .plot(kind='bar', stacked=True, grid=False, colormap='GnBu')
)
Out[11]:
In [12]:
# File size per class; the y-axis is capped at 200000.
df.boxplot('size', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('File Size')
plt.ylim(0, 200000)
Out[12]:
In [13]:
# Entropy per class.
df.boxplot(column='entropy', by='label')
plt.title('')
plt.xlabel('')
plt.ylabel('Entropy')
plt.suptitle('')
Out[13]:
In [14]:
# Frame count per class; the y-axis is capped at 5000.
df.boxplot('frame count', by='label')
plt.suptitle('')
plt.title('')
plt.xlabel('')
plt.ylabel('Frame Count')
plt.ylim(0, 5000)
Out[14]:
In [15]:
# Ten most common frame counts among the benign files.
df_good['frame count'].value_counts().iloc[:10]
Out[15]:
In [16]:
# Frame rate per class.
df.boxplot('frame rate', by='label')
plt.title('')
plt.xlabel('')
plt.ylabel('Frame Rate')
plt.suptitle('')
Out[16]:
In [17]:
# Stage area per class; the y-axis is capped at 750000.
df.boxplot(column='swf area', by='label')
plt.suptitle('')
plt.title('')
plt.ylabel('Frame Area')
plt.xlabel('')
plt.ylim(0, 750000)
Out[17]:
In [18]:
# Stage perimeter per class.
df.boxplot(column='swf perimeter', by='label')
plt.title('')
plt.ylabel('Frame Perimeter')
plt.xlabel('')
plt.suptitle('')
Out[18]:
In [19]:
# Presumably the shared random seed for the experiments -- confirm intent.
my_seed = 1022
# Presumably the held-out test fraction (0.2 matches the 80/20 splits used
# in the hold-out cells) -- confirm intent.
my_tsize = .2
In [20]:
import sklearn.ensemble
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import cross_val_score
except ImportError:  # older releases only ship sklearn.cross_validation
    from sklearn.cross_validation import cross_val_score

# Baseline model on file metadata and SWF header features only.  The forest
# is seeded so the cross-validation scores are reproducible.
clf_simple = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
simple_features = ['entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version']
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
X = df[simple_features].values
y = np.array(df['label'].tolist())
scores = cross_val_score(clf_simple, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [21]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:  # older releases only ship sklearn.cross_validation
    from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test, using the notebook-level test fraction
# and seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_simple.fit(X_train, y_train)
y_pred = clf_simple.predict(X_test)
labels = ['benign', 'malicious']
# ``labels`` is passed by keyword: newer scikit-learn rejects it positionally.
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)
In [22]:
# Feature Selection
# Rank features by importance, highest first.  sorted() works whether zip
# returns a list (Python 2) or an iterator (Python 3).
importances = sorted(zip(simple_features, clf_simple.feature_importances_),
                     key=lambda k: k[1], reverse=True)
for idx, im in enumerate(importances[0:10]):
    print('%s %s %s' % ((str(idx + 1) + ':').ljust(4), im[0].ljust(20), round(im[1], 5)))
In [23]:
# Number of tags per class; the y-axis is capped at 400.
df.boxplot(column='tag count', by='label')
plt.suptitle('')
plt.title('')
plt.ylabel('Number of Tags')
plt.xlabel('')
plt.ylim(0,400)
Out[23]:
In [24]:
# Sample counts per label, grouped by the PlaceObject2 flag value.
p = (
    df.groupby(['PlaceObject2', 'label'])['PlaceObject2']
    .count()
    .unstack('PlaceObject2')
    .fillna(0)
    .plot(kind='bar', grid=False, stacked=False)
)
p.set_xlabel('')
p.plot()
Out[24]:
In [25]:
# Sample counts per label, grouped by the DoABC flag value.
p = (
    df.groupby(['DoABC', 'label'])['DoABC']
    .count()
    .unstack('DoABC')
    .fillna(0)
    .plot(kind='bar', grid=False, stacked=False)
)
p.set_xlabel('')
p.plot()
Out[25]:
In [26]:
# Sample counts per label, grouped by the DoABCDefine flag value.
p = (
    df.groupby(['DoABCDefine', 'label'])['DoABCDefine']
    .count()
    .unstack('DoABCDefine')
    .fillna(0)
    .plot(kind='bar', grid=False, stacked=False)
)
p.set_xlabel('')
p.plot()
Out[26]:
In [27]:
# Sample counts per label, grouped by the DefineBitsJPEG2 flag value.
p = (
    df.groupby(['DefineBitsJPEG2', 'label'])['DefineBitsJPEG2']
    .count()
    .unstack('DefineBitsJPEG2')
    .fillna(0)
    .plot(kind='bar', grid=False, stacked=False)
)
p.set_xlabel('')
p.plot()
Out[27]:
In [28]:
# Sample counts per label, grouped by the End flag value.
p = (
    df.groupby(['End', 'label'])['End']
    .count()
    .unstack('End')
    .fillna(0)
    .plot(kind='bar', grid=False, stacked=False)
)
p.set_xlabel('')
p.plot()
Out[28]:
In [29]:
import sklearn.ensemble
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import cross_val_score
except ImportError:  # older releases only ship sklearn.cross_validation
    from sklearn.cross_validation import cross_val_score

# Second model: the baseline features plus one presence flag per tag type
# and the total tag count.  Seeded so the scores are reproducible.
clf_tags = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
tag_features = [
    'entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
]
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
X = df[tag_features].values
y = np.array(df['label'].tolist())
scores = cross_val_score(clf_tags, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [30]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:  # older releases only ship sklearn.cross_validation
    from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test, using the notebook-level test fraction
# and seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_tags.fit(X_train, y_train)
y_pred = clf_tags.predict(X_test)
labels = ['benign', 'malicious']
# ``labels`` is passed by keyword: newer scikit-learn rejects it positionally.
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)
In [31]:
# Rank the tag-model features by importance, highest first.  sorted() works
# whether zip returns a list (Python 2) or an iterator (Python 3).
importances = sorted(zip(tag_features, clf_tags.feature_importances_),
                     key=lambda k: k[1], reverse=True)
for idx, im in enumerate(importances[0:25]):
    print('%s %s %s' % ((str(idx + 1) + ':').ljust(4), im[0].ljust(40), round(im[1], 5)))
In [32]:
# ActionScript string count per class; the y-axis is capped at 1000.
df.boxplot(column='abc string count', by='label')
plt.suptitle('')
plt.title('')
plt.ylabel('Number of ActionScript Strings')
plt.xlabel('')
plt.ylim(0, 1000)
Out[32]:
In [33]:
# ActionScript string-length mean/median ratio per class; y-axis capped at 15.
df.boxplot(column='abc string m/m ratio', by='label')
plt.suptitle('')
plt.title('')
plt.ylabel('ActionScript Mean/Median Ratio')
plt.xlabel('')
plt.ylim(0, 15)
Out[33]:
In [34]:
import sklearn.ensemble
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import cross_val_score
except ImportError:  # older releases only ship sklearn.cross_validation
    from sklearn.cross_validation import cross_val_score

# Third model: baseline and tag features plus the ActionScript (ABC) string
# and bytecode-name features.  Seeded so the scores are reproducible.
clf_abc = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
abc_features = [
    'entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
    'abc string count', 'abc string m/m ratio',
    'bytecode name count', 'first abc bytecode name', 'long hex string',
    'unique bytecode name count',
]
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
X = df[abc_features].values
y = np.array(df['label'].tolist())
scores = cross_val_score(clf_abc, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [35]:
#### Again, not a real improvement.
In [36]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:  # older releases only ship sklearn.cross_validation
    from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test, using the notebook-level test fraction
# and seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_abc.fit(X_train, y_train)
y_pred = clf_abc.predict(X_test)
labels = ['benign', 'malicious']
# ``labels`` is passed by keyword: newer scikit-learn rejects it positionally.
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)
In [37]:
# Rank the ABC-model features by importance, highest first, and print a
# running total of the (rounded) importances alongside each one.  sorted()
# works whether zip returns a list (Python 2) or an iterator (Python 3).
importances = sorted(zip(abc_features, clf_abc.feature_importances_),
                     key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print('%s %s %s %s' % ((str(idx + 1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total))
In [38]:
import sklearn.ensemble

# Train on every labelled sample in ``df`` (no hold-out) for scoring the
# large corpus.  Seeded so the fitted model is reproducible.
clf_everything = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
abc_features = [
    'entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
    'abc string count', 'abc string m/m ratio',
    'bytecode name count', 'first abc bytecode name', 'long hex string',
    'unique bytecode name count',
]
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
X_all = df[abc_features].values
y_all = np.array(df['label'].tolist())
clf_everything.fit(X_all, y_all)
Out[38]:
In [39]:
# Load the stored malware feature table; every row is labelled malicious.
swf_malware_df = pd.read_hdf('data/swf_malware_df.hd5', key='table')
swf_malware_df['label'] = 'malicious'
swf_malware_df.shape
Out[39]:
In [40]:
# Load the stored "big pile" feature table; every row is labelled benign.
swf_bigpile_df = pd.read_hdf('data/swf_bigpile_df.hd5', key='table')
swf_bigpile_df['label'] = 'benign'
swf_bigpile_df.shape
Out[40]:
In [41]:
# Score every big-pile file with the small-sample model and bucket it by the
# probability in column 1 of predict_proba (the second entry of
# clf.classes_ -- 'malicious' if classes sort as benign/malicious; confirm).
clean = 0
gray = 0
bad = 0
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
for x in swf_bigpile_df[abc_features].values:
    try:
        # predict_proba wants a 2-D (n_samples, n_features) array, so a
        # single row is reshaped to (1, n_features).
        score = clf_everything.predict_proba(x.reshape(1, -1))[:, 1][0]
    except Exception as e:
        # Stop at the first row that cannot be scored and show why.
        print("Sad")
        print(e)
        print(x)
        break
    if score < 0.5:
        clean += 1
    elif score < 0.8:
        gray += 1
    else:
        bad += 1
print(swf_bigpile_df.shape)
print(clean)
print(gray)
print(bad)
In [42]:
# Shuffle the big pile with the notebook seed so the 5k training sample, and
# every result derived from it, is reproducible from run to run.
swf_random_df = swf_bigpile_df.reindex(
    np.random.RandomState(my_seed).permutation(swf_bigpile_df.index))
# The first 5000 shuffled rows join the training data; the rest are held
# back for scoring.
swf_random_5k_df = swf_random_df[0:5000]
swf_random_the_rest_df = swf_random_df[5000:]
In [43]:
# Larger training set: all malware plus the 5k random big-pile files, with
# missing features filled with 0.
swf_bigger_df = pd.concat([swf_malware_df, swf_random_5k_df], ignore_index=True).fillna(0)
In [44]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:  # older releases only ship sklearn.cross_validation
    from sklearn.cross_validation import cross_val_score, train_test_split

# Same feature set as before, trained on the larger malware + 5k data set.
# Seeded so the scores are reproducible.
clf_5k = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
abc_features = [
    'entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
    'abc string count', 'abc string m/m ratio',
    'bytecode name count', 'first abc bytecode name', 'long hex string',
    'unique bytecode name count',
]
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
X = swf_bigger_df[abc_features].values
y = np.array(swf_bigger_df['label'].tolist())
scores = cross_val_score(clf_5k, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [45]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:  # older releases only ship sklearn.cross_validation
    from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test, using the notebook-level test fraction
# and seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_5k.fit(X_train, y_train)
# Predict with the model that was just fitted.  The previous code called
# clf_abc.predict here, so the matrix described the wrong classifier.
y_pred = clf_5k.predict(X_test)
labels = ['benign', 'malicious']
# ``labels`` is passed by keyword: newer scikit-learn rejects it positionally.
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)
In [46]:
# Rank the clf_5k features by importance, highest first, and print a running
# total of the (rounded) importances alongside each one.  sorted() works
# whether zip returns a list (Python 2) or an iterator (Python 3).
importances = sorted(zip(abc_features, clf_5k.feature_importances_),
                     key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print('%s %s %s %s' % ((str(idx + 1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total))
In [47]:
#### Next we train over all the data again, and test on the large corpus of files.
In [48]:
import sklearn.ensemble

# Train on the whole malware + 5k data set (no hold-out) for scoring the
# remaining big-pile files.  Seeded so the fitted model is reproducible.
clf_everything_2 = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
abc_features = [
    'entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
    'abc string count', 'abc string m/m ratio',
    'bytecode name count', 'first abc bytecode name', 'long hex string',
    'unique bytecode name count',
]
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
X_all_2 = swf_bigger_df[abc_features].values
y_all_2 = np.array(swf_bigger_df['label'].tolist())
clf_everything_2.fit(X_all_2, y_all_2)
Out[48]:
In [49]:
# Score the big-pile files that were NOT used for training and bucket them by
# the probability in column 1 of predict_proba (the second entry of
# clf.classes_ -- 'malicious' if classes sort as benign/malicious; confirm).
clean = 0
gray = 0
bad = 0
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
for x in swf_random_the_rest_df[abc_features].values:
    try:
        # predict_proba wants a 2-D (n_samples, n_features) array, so a
        # single row is reshaped to (1, n_features).
        score = clf_everything_2.predict_proba(x.reshape(1, -1))[:, 1][0]
    except Exception as e:
        # Stop at the first row that cannot be scored and show why.
        print("Sad")
        print(e)
        print(x)
        break
    if score < 0.5:
        clean += 1
    elif score < 0.8:
        gray += 1
    else:
        bad += 1
# Report the shape of the frame that was actually scored.  The previous code
# printed swf_bigpile_df.shape, which still includes the 5k training rows.
print(swf_random_the_rest_df.shape)
print(clean)
print(gray)
print(bad)
In [50]:
# Keep only the training rows whose DoABC or DoABCDefine flag is set.
df_abc_only = swf_bigger_df[
    (swf_bigger_df['DoABC'] == 1)
    | (swf_bigger_df['DoABCDefine'] == 1)
]
df_abc_only.shape
Out[50]:
In [51]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:  # older releases only ship sklearn.cross_validation
    from sklearn.cross_validation import cross_val_score, train_test_split

# Same feature set, restricted to files that contain ActionScript (ABC)
# tags.  Seeded so the scores are reproducible.
clf_abc_only = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
abc_features = [
    'entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
    'abc string count', 'abc string m/m ratio',
    'bytecode name count', 'first abc bytecode name', 'long hex string',
    'unique bytecode name count',
]
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
X = df_abc_only[abc_features].values
y = np.array(df_abc_only['label'].tolist())
scores = cross_val_score(clf_abc_only, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [53]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
try:  # scikit-learn >= 0.18
    from sklearn.model_selection import train_test_split
except ImportError:  # older releases only ship sklearn.cross_validation
    from sklearn.cross_validation import train_test_split

# 80/20 Split for predictive test, using the notebook-level test fraction
# and seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=my_tsize, random_state=my_seed)
clf_abc_only.fit(X_train, y_train)
# Predict with the model that was just fitted.  The previous code called
# clf_abc.predict here, so the matrix described the wrong classifier.
y_pred = clf_abc_only.predict(X_test)
labels = ['benign', 'malicious']
# ``labels`` is passed by keyword: newer scikit-learn rejects it positionally.
cm = confusion_matrix(y_test, y_pred, labels=labels)
plot_cm(cm, labels)
In [54]:
# Rank the features of the ABC-only model by importance, highest first, with
# a running total of the (rounded) importances.  The previous code read
# clf_abc.feature_importances_, i.e. the wrong model.  sorted() works
# whether zip returns a list (Python 2) or an iterator (Python 3).
importances = sorted(zip(abc_features, clf_abc_only.feature_importances_),
                     key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances[0:20]):
    total += round(im[1], 5)
    print('%s %s %s %s' % ((str(idx + 1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total))
In [55]:
# Keep only the held-back big-pile rows whose DoABC or DoABCDefine flag is set.
swf_abc_only_the_rest_df = swf_random_the_rest_df[
    (swf_random_the_rest_df['DoABC'] == 1)
    | (swf_random_the_rest_df['DoABCDefine'] == 1)
]
In [56]:
import sklearn.ensemble

# Train on every ABC-containing training row (no hold-out) for scoring the
# held-back ABC-containing files.  Seeded so the fitted model is reproducible.
clf_everything_abc = sklearn.ensemble.RandomForestClassifier(n_estimators=50, random_state=my_seed)
abc_features = [
    'entropy', 'frame count', 'frame rate', 'size', 'swf area', 'swf perimeter', 'version',
    'CSMTextSettings', 'DebugID', 'DefineBinaryData', 'DefineBits',
    'DefineBitsJPEG2', 'DefineBitsJPEG3', 'DefineBitsLossless',
    'DefineBitsLossless2', 'DefineButton', 'DefineButton2',
    'DefineButtonSound', 'DefineEditText', 'DefineFont', 'DefineFont2',
    'DefineFont3', 'DefineFont4', 'DefineFontAlignZones', 'DefineFontInfo',
    'DefineFontInfo2', 'DefineFontName', 'DefineMorphShape',
    'DefineMorphShape2', 'DefineScalingGrid', 'DefineSceneAndFrameLabelData',
    'DefineShape', 'DefineShape2', 'DefineShape3', 'DefineShape4',
    'DefineSound', 'DefineSprite', 'DefineText', 'DefineText2',
    'DefineVideoStream', 'DoABC', 'DoABCDefine', 'DoAction', 'DoInitAction',
    'EnableDebugger2', 'End', 'ExportAssets', 'FileAttributes', 'FrameLabel',
    'ImportAssets2', 'JPEGTables', 'Metadata', 'PlaceObject', 'PlaceObject2',
    'PlaceObject3', 'ProductInfo', 'Protect', 'RemoveObject2', 'ScriptLimits',
    'SetBackgroundColor', 'ShowFrame', 'SoundStreamBlock', 'SoundStreamHead',
    'SoundStreamHead2', 'SymbolClass', 'Unknown', 'tag count',
    'abc string count', 'abc string m/m ratio',
    'bytecode name count', 'first abc bytecode name', 'long hex string',
    'unique bytecode name count',
]
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
X_all_3 = df_abc_only[abc_features].values
y_all_3 = np.array(df_abc_only['label'].tolist())
clf_everything_abc.fit(X_all_3, y_all_3)
Out[56]:
In [57]:
# Score the held-back ABC-containing files and bucket them by the probability
# in column 1 of predict_proba (the second entry of clf.classes_ --
# 'malicious' if classes sort as benign/malicious; confirm).
clean = 0
gray = 0
bad = 0
# ``.values`` replaces DataFrame.as_matrix, which newer pandas no longer has.
for x in swf_abc_only_the_rest_df[abc_features].values:
    try:
        # predict_proba wants a 2-D (n_samples, n_features) array, so a
        # single row is reshaped to (1, n_features).
        score = clf_everything_abc.predict_proba(x.reshape(1, -1))[:, 1][0]
    except Exception as e:
        # Stop at the first row that cannot be scored and show why.
        print("Sad")
        print(e)
        print(x)
        break
    if score < 0.5:
        clean += 1
    elif score < 0.8:
        gray += 1
    else:
        bad += 1
print(swf_abc_only_the_rest_df.shape)
print(clean)
print(gray)
print(bad)