In [1]:
# TODO: move all the merge code from model-selection-pe-coff.ipynb
import warnings
import numpy as np
import scipy as sp
import pandas as pd
In [ ]:
# NOTE: move this merging stuff to merge-feature-sets-pe-coff.ipynb
# it is making the notebook too long.
# Merge all the vs251 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
#
# NOTE: keep function count features and ASM/Binary images separate, train on different models.
# 8. TODO: function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all the PE/COFF features, then use chi2 tests to
# reduce to the 10% best feature set (sketched below, after the merge cell) ->
# all-combined-pe-features-10perc-vs251.csv
sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs251.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs251.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs251-fixed.csv')
sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs251.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs251.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs251.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs251.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs251.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs251.csv')
sorted_packer_id_features.head()
In [ ]:
sorted_asm_features_fixed.head()
In [ ]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()
In [ ]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()
In [ ]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()
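In [ ]:
# NOTE: a hedged aside, not part of the original pipeline. The positional
# iloc[:, [0, 2]] selections above keep the file_name key plus the scalar
# ID column, but they break silently if the column order in the CSVs ever
# changes. Selecting by name is more robust; the column names below are
# hypothetical, so check them against the actual CSV headers first.
#print(sorted_file_id_features.columns.tolist())
#fileidfeatures = sorted_file_id_features[['file_name', 'file_id']]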
In [ ]:
# DEPRECATED: the file name problem has been fixed now, so do not use
# get_training_data(). Kept for reference only.
def get_training_data(sorted_train_features_df, sorted_train_labels_df, train_labels_file_out):
    X = sorted_train_features_df.iloc[:, 1:]
    sorted_sample_names = sorted_train_features_df.loc[:, 'file_name']
    # Get the labels of the PE malware samples from the label set.
    counter = 0
    y = []
    for fname in sorted_sample_names:
        counter += 1
        # The file_names in the ASM feature sets still had '.pe' on the end,
        # so strip the extension before matching against the label set.
        idx = fname.find('.pe')
        if idx > 0:
            fname = fname[:idx]
        if counter % 100 == 1:
            print("Appending {:d} -> {:s}".format(counter, fname))
        for row, fname2 in enumerate(sorted_train_labels_df['file_name']):
            if fname2 == fname:
                y.append(sorted_train_labels_df.iloc[row, 4])  # Append the family class label.
                break
    # Write out the PE/COFF sample train labels for later use and validation.
    with open(train_labels_file_out, 'w') as fop:
        fop.writelines("\n".join(str(x) for x in y))
    return X, y
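In [ ]:
# Hedged sketch: the per-row '.pe' stripping inside get_training_data() can
# be done in one vectorized pass with pandas. The '-fixed' CSVs already have
# clean file names, so on them this is a no-op.
sorted_asm_features_fixed['file_name'] = \
    sorted_asm_features_fixed['file_name'].str.replace(r'\.pe$', '', regex=True)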
In [ ]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//
combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))
combined_train_features.head()
combined_train_features.to_csv('data/combined-pe-features-vs251.csv', index=False)
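In [ ]:
# Hedged sketch of the planned chi2 reduction to the 10% best features.
# Assumptions: the label column in sorted_train_labels is named
# 'family_label' (it is read positionally as column 5 elsewhere in this
# notebook), and only numeric columns are scored. chi2 requires non-negative
# inputs, so the features are shifted by their column minimums before scoring.
from sklearn.feature_selection import SelectPercentile, chi2

merged = combined_train_features.merge(
    sorted_train_labels[['file_name', 'family_label']], on='file_name', how='inner')
X = merged.drop(columns=['file_name', 'family_label']).select_dtypes(include=[np.number])
y = merged['family_label']
selector = SelectPercentile(chi2, percentile=10).fit(X - X.min(), y)
best_cols = X.columns[selector.get_support()]
reduced = pd.concat([merged[['file_name']], X[best_cols]], axis=1)
reduced.to_csv('data/all-combined-pe-features-10perc-vs251.csv', index=False)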
In [2]:
# Merge all the vs252 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
# 8. TODO: function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all the PE/COFF features, then use chi2 tests to
# reduce to the 10% best feature set ->
# all-combined-pe-features-10perc-vs252.csv
sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs252.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs252.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs252-fixed.csv')
sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs252.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs252.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs252.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs252.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs252.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs252.csv')
# Use separate model for function counts, otherwise the csv files are too big for github.
#sorted_function_count_features = pd.read_csv('data/sorted-pe-function-counts-10percent-vs252.csv')
sorted_asm_features_fixed.head()
Out[2]:
In [3]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()
Out[3]:
In [4]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()
Out[4]:
In [5]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()
Out[5]:
In [6]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//
combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))
combined_train_features.head()
Out[6]:
In [7]:
combined_train_features.to_csv('data/combined-pe-features-vs252.csv', index=False)
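In [ ]:
# Hedged sanity check, not in the original: inner joins drop rows silently,
# so compare the merged row count against each input and confirm the
# file_name key is still unique after all six merges.
for name, df in [('asm', sorted_asm_features_fixed), ('header', sorted_header_features),
                 ('call graph', sorted_call_graph_features), ('entropy', sorted_entropy_features),
                 ('file id', fileidfeatures), ('trid id', trididfeatures),
                 ('packer id', packeridfeatures)]:
    print("{:s}: {:d} rows in, {:d} rows out".format(name, df.shape[0], combined_train_features.shape[0]))
assert combined_train_features['file_name'].is_unique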
In [ ]:
# DEPRECATED: do not use get_training_data() now, the file name problem has been fixed.
X,y = get_training_data(sorted_asm_features_fixed, sorted_train_labels, 'data/sorted-pe-coff-train-labels-vs252.csv')
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [ ]:
X,y = get_training_data(combined_train_features, sorted_train_labels, 'data/sorted-pe-coff-train-labels-vs252.csv')
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [3]:
# vs263 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
# 8. function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all the PE/COFF features, then use chi2 tests to
# reduce to the 10% best feature set ->
# all-combined-pe-features-10perc-vs263.csv
sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs263.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs263.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs263-fixed.csv')
sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs263.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs263.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs263.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs263.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs263.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs263.csv')
# Adding function counts makes the csv file too big for github and the
# models too slow, so model them separately.
#sorted_function_count_features = pd.read_csv('data/sorted-pe-function-counts-10percent-vs263.csv')
sorted_asm_features_fixed.head()
Out[3]:
In [ ]:
X,y = get_training_data(sorted_asm_features_fixed, sorted_train_labels, 'data/sorted-pe-coff-train-labels-vs263.csv')
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))
In [4]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()
Out[4]:
In [5]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()
Out[5]:
In [6]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()
Out[6]:
In [8]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//
combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))
combined_train_features.head()
Out[8]:
In [9]:
combined_train_features.to_csv('data/combined-pe-features-vs263.csv', index=False)
In [4]:
# vs264 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
# 8. ASM function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all the PE/COFF features, then use chi2 tests to
# reduce to the 10% best feature set ->
# all-combined-pe-features-10perc-vs264.csv
sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs264.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs264.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs264-fixed.csv')
sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs264.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs264.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs264.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs264.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs264.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs264.csv')
sorted_function_count_features = pd.read_csv('data/sorted-pe-function-count-features-10percent-vs264.csv')
sorted_asm_features_fixed.head()
Out[4]:
In [5]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()
Out[5]:
In [6]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()
Out[6]:
In [7]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()
Out[7]:
In [8]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//
combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))
combined_train_features.head()
Out[8]:
In [9]:
combined_train_features.to_csv('data/combined-pe-features-vs264.csv', index=False)
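In [ ]:
# Hedged sketch: the function count features loaded above are kept out of
# the combined CSV (too big for github, and they slow the models), so write
# them out restricted to the combined sample set for the separate model.
# The output file name is hypothetical.
fc = sorted_function_count_features[
    sorted_function_count_features['file_name'].isin(combined_train_features['file_name'])]
fc.to_csv('data/combined-pe-function-counts-vs264.csv', index=False)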