In this notebook we're going to explore, understand, and classify Java class files as either 'benign' or 'malicious'. We'll explore the data, apply machine learning algorithms, engineer new features, and then apply more machine learning. Finally we'll test the classifier against a large pile of files to measure its effectiveness.
In [117]:
import pandas as pd
print 'pandas version is', pd.__version__
import numpy as np
print 'numpy version is', np.__version__
import sklearn
print 'scikit-learn version is', sklearn.__version__
import matplotlib
print 'matplotlib version is', matplotlib.__version__
import matplotlib.pyplot as plt
In [118]:
%matplotlib inline
plt.rcParams['font.size'] = 18.0
plt.rcParams['figure.figsize'] = 16.0, 5.0
In [119]:
def plot_cm(cm, labels):
    # Compute percentages
    percent = (cm * 100.0) / np.array(np.matrix(cm.sum(axis=1)).T)
    print 'Confusion Matrix Stats'
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())

    # Show confusion matrix
    # Thanks to kermit666 from stackoverflow
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.grid(b=False)
    cax = ax.matshow(percent, cmap='coolwarm', vmin=0, vmax=100)
    plt.title('')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()
In [120]:
def character_runs(flags):
    # Given a sequence of booleans, return the lengths of each
    # consecutive run of True values.
    runs = []
    current_length = 0
    for flag in flags:
        if flag:
            current_length += 1
        elif current_length:
            runs.append(current_length)
            current_length = 0
    if current_length:
        runs.append(current_length)
    return runs

def extract_character_info(string):
    # Run lengths of lowercase letters, uppercase letters, and digits in a string
    s = str(string)
    lowercase_runs = character_runs(map(str.islower, s))
    uppercase_runs = character_runs(map(str.isupper, s))
    digit_runs = character_runs(map(str.isdigit, s))
    return lowercase_runs, uppercase_runs, digit_runs
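A quick sanity check of the run extraction on a made-up name (the input string is hypothetical, just to show the return values):
In [ ]:
lc, uc, d = extract_character_info('HelloWorld123')
print 'lowercase runs:', lc   # [4, 4] -> 'ello', 'orld'
print 'uppercase runs:', uc   # [1, 1] -> 'H', 'W'
print 'digit runs:', d        # [3]    -> '123'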
In [121]:
def extract_features(data):
    features = {}
    try:
        features['sha256'] = data['metadata']['sha256']
        features['size'] = data['metadata']['file_size']
        features['entropy'] = data['metadata']['entropy']
        if 'sourcefile' in data['characteristics']['java']:
            features['source file'] = data['characteristics']['java']['sourcefile']
        else:
            features['source file'] = 'No Source File'
        if 'access_permissions' in data['characteristics']['java']:
            features['ap_count'] = len(data['characteristics']['java']['access_permissions'])
            for ap in data['characteristics']['java']['access_permissions']:
                features[str.lower(str(ap).replace(" ", "_"))] = 1
        features['class name'] = data['characteristics']['java']['class_name']
        features['class_name_slash_count'] = features['class name'].count('/')
        features['class_name_length'] = len(features['class name'])
        cn_lowercase_runs, cn_uppercase_runs, cn_digit_runs = extract_character_info(features['class name'])
        cn_lowercase_run_longest = 0
        cn_lowercase_run_average = 0
        cn_uppercase_run_longest = 0
        cn_uppercase_run_average = 0
        cn_digit_run_longest = 0
        cn_digit_run_average = 0
        if cn_lowercase_runs:
            cn_lowercase_run_longest = np.max(cn_lowercase_runs)
            cn_lowercase_run_average = np.mean(cn_lowercase_runs)
        features['class_name_lowercase_run_longest'] = cn_lowercase_run_longest
        features['class_name_lowercase_run_avg'] = cn_lowercase_run_average
        if cn_uppercase_runs:
            cn_uppercase_run_longest = np.max(cn_uppercase_runs)
            cn_uppercase_run_average = np.mean(cn_uppercase_runs)
        features['class_name_uppercase_run_longest'] = cn_uppercase_run_longest
        features['class_name_uppercase_run_avg'] = cn_uppercase_run_average
        if cn_digit_runs:
            cn_digit_run_longest = np.max(cn_digit_runs)
            cn_digit_run_average = np.mean(cn_digit_runs)
        features['class_name_digit_run_longest'] = cn_digit_run_longest
        features['class_name_digit_run_avg'] = cn_digit_run_average
        features['major version'] = data['characteristics']['java']['major_version']
        features['minor version'] = data['characteristics']['java']['minor_version']
        if 'method_names' in data['characteristics']['java']:
            features['method names'] = data['characteristics']['java']['method_names']
        else:
            features['method names'] = []
        features['methods_count'] = len(features['method names'])
        lowercase_run_longest = 0
        lowercase_run_average = 0
        lowercase_runs = []
        uppercase_run_longest = 0
        uppercase_run_average = 0
        uppercase_runs = []
        digit_run_longest = 0
        digit_run_average = 0
        digit_runs = []
        for method in features['method names']:
            lc, uc, d = extract_character_info(method)
            lowercase_runs.extend(lc)
            uppercase_runs.extend(uc)
            digit_runs.extend(d)
        if lowercase_runs:
            lowercase_run_longest = np.max(lowercase_runs)
            lowercase_run_average = np.mean(lowercase_runs)
        features['method_name_lowercase_run_longest'] = lowercase_run_longest
        features['method_name_lowercase_run_avg'] = lowercase_run_average
        if uppercase_runs:
            uppercase_run_longest = np.max(uppercase_runs)
            uppercase_run_average = np.mean(uppercase_runs)
        features['method_name_uppercase_run_longest'] = uppercase_run_longest
        features['method_name_uppercase_run_avg'] = uppercase_run_average
        if digit_runs:
            digit_run_longest = np.max(digit_runs)
            digit_run_average = np.mean(digit_runs)
        features['method_name_digit_run_longest'] = digit_run_longest
        features['method_name_digit_run_avg'] = digit_run_average
        if 'interfaces' in data['characteristics']['java']:
            features['interfaces'] = data['characteristics']['java']['interfaces']
        else:
            features['interfaces'] = []
        features['interface_count'] = len(features['interfaces'])
        features['constant_pool_count'] = data['characteristics']['java']['const_pool_count']
    except KeyError as ke:
        print 'ERROR:', ke, data['metadata']['sha256']
    return features
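For reference, extract_features expects a parsed JSON record shaped roughly like the one below (every value here is made up, purely to illustrate the structure the function reads):
In [ ]:
sample = {'metadata': {'sha256': 'deadbeef', 'file_size': 4096, 'entropy': 5.1},
          'characteristics': {'java': {'class_name': 'com/example/Foo',
                                       'sourcefile': 'Foo.java',
                                       'access_permissions': ['ACC PUBLIC', 'ACC SUPER'],
                                       'major_version': 51, 'minor_version': 0,
                                       'method_names': ['<init>', 'run'],
                                       'interfaces': [],
                                       'const_pool_count': 42}}}
extract_features(sample)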
In [122]:
def load_files(file_list):
    import json
    features_list = []
    for filename in file_list:
        with open(filename, 'rb') as f:
            features = extract_features(json.loads(f.read()))
        features_list.append(features)
    return features_list
In [123]:
# Good files
import glob
good_list = glob.glob('data/clean/*.results')
good_features = load_files(good_list)
print "Files:", len(good_list)
In [124]:
# Bad files
bad_list = glob.glob('data/malicious/*.results')
bad_features = load_files(bad_list)
print "Files:", len(bad_list)
In [125]:
df_good = pd.DataFrame.from_records(good_features)
df_good.fillna(0, inplace=True)
df_good['label'] = 'benign'
df_good.head()
Out[125]:
In [126]:
df_bad = pd.DataFrame.from_records(bad_features)
df_bad.fillna(0, inplace=True)
df_bad['label'] = 'malicious'
df_bad.head()
Out[126]:
In [127]:
df = pd.concat([df_bad, df_good], ignore_index=True)
df.fillna(0, inplace=True)
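Before plotting, a quick check of the class balance in the combined frame:
In [ ]:
# How many benign vs. malicious samples are we working with?
print df['label'].value_counts()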
In [129]:
df.boxplot(column='size', by='label')
plt.ylabel('File Size')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[129]:
In [130]:
df.boxplot(column='size', by='label')
plt.ylabel('File Size')
plt.xlabel('')
plt.title('')
plt.suptitle('')
plt.ylim(0, 15000)
Out[130]:
In [131]:
df.boxplot('entropy', 'label')
plt.ylabel('Entropy')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[131]:
In [132]:
df.boxplot(column='constant_pool_count', by='label')
plt.ylabel('Constant Pool Count')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[132]:
In [133]:
df.boxplot(column='constant_pool_count', by='label')
plt.xlabel('')
plt.ylabel('Constant Pool Count')
plt.title('')
plt.suptitle('')
plt.ylim(0, 1000)
Out[133]:
In [134]:
df.boxplot(column='methods_count', by='label')
plt.ylabel('Number of Methods')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[134]:
In [135]:
df.boxplot(column='interface_count', by='label')
plt.ylabel('Number of Interfaces')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[135]:
In [136]:
my_seed = 1022
my_tsize = .2
In [137]:
import sklearn.ensemble
import sklearn.cross_validation
clf_simple = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
simple_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
'entropy', 'size', 'interface_count', 'major version', 'methods_count', 'minor version']
X = df.as_matrix(simple_features)
y = np.array(df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_simple, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [138]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_simple.fit(X_train, y_train)
y_pred = clf_simple.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
In [139]:
importances = zip(simple_features, clf_simple.feature_importances_)
importances.sort(key=lambda k:k[1], reverse=True)
for idx, im in enumerate(importances):
    print (str(idx+1) + ':').ljust(4), im[0].ljust(20), round(im[1], 5)
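The same ranking is easier to scan as a chart; here's a quick sketch of a horizontal bar plot built from the sorted importances list above:
In [ ]:
# Horizontal bar chart of the feature importances from clf_simple
names = [im[0] for im in importances]
values = [im[1] for im in importances]
plt.barh(range(len(values)), values, align='center')
plt.yticks(range(len(names)), names)
plt.gca().invert_yaxis()  # most important feature at the top
plt.xlabel('Importance')
plt.show()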
In [145]:
bad = []
good = []
for strings, label in zip(df['method names'], df['label']):
    for name in strings:
        d = {'method name': name}
        if label == 'malicious' and d not in bad:
            bad.append(d)
        elif label == 'benign' and d not in good:
            good.append(d)
df_method_names_bad = pd.DataFrame.from_records(bad)
df_method_names_good = pd.DataFrame.from_records(good)
In [146]:
df_method_names_bad.head(50)
Out[146]:
In [147]:
df_method_names_good.head(50)
Out[147]:
In [148]:
df.boxplot('method_name_lowercase_run_longest', 'label')
plt.ylabel('Max length of lower case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[148]:
In [149]:
df.boxplot('method_name_lowercase_run_avg', 'label')
plt.ylabel('Avg length of lower case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[149]:
In [150]:
df.boxplot('method_name_uppercase_run_longest', 'label')
plt.ylabel('Max length of upper case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[150]:
In [151]:
df.boxplot('method_name_uppercase_run_avg', 'label')
plt.ylabel('Avg length of upper case letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[151]:
In [152]:
df.boxplot('method_name_digit_run_longest', 'label')
plt.ylabel('Max length of digits')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[152]:
In [153]:
df.boxplot('method_name_digit_run_avg', 'label')
plt.ylabel('Avg length of digits')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[153]:
In [154]:
import sklearn.ensemble
clf_methods = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
method_name_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
'entropy', 'size', 'interface_count', 'major version', 'methods_count',
'minor version',
'method_name_digit_run_avg', 'method_name_digit_run_longest',
'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest']
X = df.as_matrix(method_name_features)
y = np.array(df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_methods, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [155]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_methods.fit(X_train, y_train)
y_pred = clf_methods.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
In [156]:
importances = zip(method_name_features, clf_methods.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
for idx, im in enumerate(importances[0:15]):
    print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5)
In [157]:
for idx, gcn in enumerate(df_good['class name']):
    print gcn
    if idx == 19:
        break
In [158]:
for idx, gcn in enumerate(df_bad['class name']):
    print gcn
    if idx == 19:
        break
In [159]:
df.boxplot('class_name_length', 'label')
plt.ylabel('Class Name Length')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[159]:
In [160]:
df.boxplot('class_name_slash_count', 'label')
plt.ylabel('Class Name Slash Count')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[160]:
In [161]:
df.boxplot('class_name_lowercase_run_longest', 'label')
plt.ylabel('Max Run of Lower Case Letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[161]:
In [162]:
df.boxplot('class_name_lowercase_run_avg', 'label')
plt.ylabel('Avg Run of Lower Case Letters')
plt.xlabel('')
plt.title('')
plt.suptitle('')
Out[162]:
In [163]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
clf_all = sklearn.ensemble.RandomForestClassifier(n_estimators=75)
all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
'entropy', 'interface_count', 'major version', 'methods_count',
'size', 'minor version',
'method_name_digit_run_avg', 'method_name_digit_run_longest',
'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
'class_name_digit_run_avg', 'class_name_digit_run_longest',
'class_name_length', 'class_name_lowercase_run_avg',
'class_name_lowercase_run_longest', 'class_name_slash_count',
'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest']
X = df.as_matrix(all_features)
y = np.array(df['label'].tolist())
labels = ['benign', 'malicious']
scores = sklearn.cross_validation.cross_val_score(clf_all, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [172]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_all.fit(X_train, y_train)
y_pred = clf_all.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
In [173]:
y_probs = clf_all.predict_proba(X_test)[:,1]
thres = .80 # This can be set to whatever you'd like
y_pred[y_probs>thres] = 'malicious'
y_pred[y_probs<=thres] = 'benign'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
We do the same, but set the threshold lower, to only 20%.
In [174]:
y_probs = clf_all.predict_proba(X_test)[:,1]
thres = .20 # This can be set to whatever you'd like
y_pred[y_probs>thres] = 'malicious'
y_pred[y_probs<=thres] = 'benign'
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
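Rather than probing thresholds one at a time, an ROC curve shows the false-positive/true-positive trade-off across every possible threshold at once. A sketch using scikit-learn's roc_curve on the held-out probabilities computed above:
In [ ]:
from sklearn.metrics import roc_curve, auc
# 'malicious' is the positive class (column 1 of predict_proba)
fpr, tpr, thresholds = roc_curve(y_test, y_probs, pos_label='malicious')
plt.plot(fpr, tpr, label='AUC = %0.3f' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()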
In [175]:
scores = clf_all.predict_proba(X_test)[:,1]
plt.hist(scores, bins=20)
plt.grid(True)
plt.show()
In [176]:
importances = zip(all_features, clf_all.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances):
    total += round(im[1], 5)
    print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total
Let's try a different classifier: Extra Trees Classifier (like RandomForest, but even more random).
http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
In [179]:
clf_er = sklearn.ensemble.ExtraTreesClassifier(n_estimators=50)
X_er = df.as_matrix(all_features)
y_er = np.array(df['label'].tolist())
labels = ['benign', 'malicious']
scores = sklearn.cross_validation.cross_val_score(clf_er, X_er, y_er, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [180]:
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X_er, y_er, test_size=my_tsize, random_state=my_seed)
clf_er.fit(X_train, y_train)
y_pred = clf_er.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
In [181]:
import sklearn.svm
import sklearn.preprocessing
clf_svc = sklearn.svm.SVC()
X_svc = df.as_matrix(all_features)
X_svc = sklearn.preprocessing.scale(X_svc)
y_svc = np.array(df['label'].tolist())
labels = ['benign', 'malicious']
scores = sklearn.cross_validation.cross_val_score(clf_svc, X_svc, y_svc, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [182]:
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X_svc, y_svc, test_size=my_tsize, random_state=my_seed)
clf_svc.fit(X_train, y_train)
y_pred = clf_svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
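Scaling matters for the RBF-kernel SVC. As a quick check (a sketch, reusing the frame and labels from above), the same cross-validation on the unscaled features typically scores noticeably worse:
In [ ]:
# Same SVC, but on unscaled features, to see what the scaling step buys us
scores_unscaled = sklearn.cross_validation.cross_val_score(
    sklearn.svm.SVC(), df.as_matrix(all_features), y_svc, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores_unscaled.mean(), scores_unscaled.std() * 2))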
In [183]:
# Re-assess the Random Forest with more folds (cv=20).
scores = sklearn.cross_validation.cross_val_score(clf_all, X, y, cv=20)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [184]:
# Re-assess the Extra Trees classifier with more folds (cv=20).
scores = sklearn.cross_validation.cross_val_score(clf_er, X_er, y_er, cv=20)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [185]:
clf_everything = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
'entropy', 'size', 'interface_count', 'major version', 'methods_count', 'minor version',
'method_name_digit_run_avg', 'method_name_digit_run_longest',
'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
'class_name_digit_run_avg', 'class_name_digit_run_longest',
'class_name_length', 'class_name_lowercase_run_avg',
'class_name_lowercase_run_longest', 'class_name_slash_count',
'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest']
X_all = df.as_matrix(all_features)
y_all = np.array(df['label'].tolist())
clf_everything.fit(X_all, y_all)
Out[185]:
In [190]:
java_big_pile_df = pd.read_hdf('data/java_clean_df.hd5', 'table')
In [191]:
clean = 0
gray = 0
bad = 0
for x in java_big_pile_df.as_matrix(all_features):
    try:
        score = clf_everything.predict_proba(x)[:, 1][0]
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    except:
        print "Sad"
        print x
        break
print java_big_pile_df.shape
print clean
print gray
print bad
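For reference, predict_proba accepts the whole feature matrix at once, so the same tally can be done without the Python loop. A vectorized sketch:
In [ ]:
# Score every row in one call, then count each score band
all_scores = clf_everything.predict_proba(java_big_pile_df.as_matrix(all_features))[:, 1]
print 'clean:', np.sum(all_scores < 0.5)
print 'gray: ', np.sum((all_scores >= 0.5) & (all_scores < 0.8))
print 'bad:  ', np.sum(all_scores >= 0.8)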
In [219]:
java_more_bad_df = pd.read_hdf('data/java_malicious_df.hd5', 'table')
In [220]:
java_big_pile_df.head()
Out[220]:
In [221]:
java_big_pile_df['class_name_length'].describe()
Out[221]:
Randomize the benign pile, then split off 2,000 samples for training and keep the rest for scoring.
In [239]:
java_random_df = java_big_pile_df.reindex(np.random.permutation(java_big_pile_df.index))
java_random_2k_df = java_random_df[0:2000]
java_random_the_rest_df = java_random_df[2000:]
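If the shuffle needs to be reproducible across runs, seed NumPy before permuting (a sketch, reusing the my_seed set earlier):
In [ ]:
# Reproducible shuffle: seed NumPy, then permute the index as above
np.random.seed(my_seed)
java_random_df = java_big_pile_df.reindex(np.random.permutation(java_big_pile_df.index))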
In [240]:
java_random_2k_df['label'] = 'benign'
In [241]:
java_more_bad_df['label'] = 'malicious'
In [242]:
java_4k_df = pd.concat([java_more_bad_df, java_random_2k_df], ignore_index=True)
java_4k_df.fillna(0, inplace=True)
In [243]:
clf_4k = sklearn.ensemble.RandomForestClassifier(n_estimators=75)
all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
'acc_public', 'acc_super', 'acc_synthetic', 'ap_count',
'class_name_digit_run_avg', 'class_name_digit_run_longest',
'class_name_length', 'class_name_lowercase_run_avg',
'class_name_lowercase_run_longest', 'class_name_slash_count',
'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest',
'constant_pool_count', 'entropy', 'interface_count', 'major version',
'method_name_digit_run_avg', 'method_name_digit_run_longest',
'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
'methods_count', 'minor version', 'size']
X = java_4k_df.as_matrix(all_features)
y = np.array(java_4k_df['label'].tolist())
scores = sklearn.cross_validation.cross_val_score(clf_4k, X, y, cv=10)
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
In [244]:
import sklearn.ensemble
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# 80/20 Split for predictive test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf_4k.fit(X_train, y_train)
y_pred = clf_4k.predict(X_test)
labels = ['benign', 'malicious']
cm = confusion_matrix(y_test, y_pred, labels)
plot_cm(cm, labels)
In [245]:
# Feature Selection
# Which features best differentiated the two classes?
# Here we grab the feature_importances from the classifier itself.
importances = zip(all_features, clf_4k.feature_importances_)
importances.sort(key=lambda k: k[1], reverse=True)
total = 0
for idx, im in enumerate(importances):
    total += round(im[1], 5)
    print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), total
In [246]:
clf_everything_4k = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
X_all = java_4k_df.as_matrix(all_features)
y_all = np.array(java_4k_df['label'].tolist())
clf_everything_4k.fit(X_all, y_all)
Out[246]:
In [247]:
clean = 0
gray = 0
bad = 0
X_rest = java_random_the_rest_df.as_matrix(all_features)
for x in X_rest:
    score = clf_everything_4k.predict_proba(x)[:, 1][0]
    if score < 0.5:
        clean += 1
    elif score < 0.8:
        gray += 1
    else:
        bad += 1
print java_random_the_rest_df.shape[0]
print clean
print gray
print bad
In [ ]: