In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import warnings
warnings.filterwarnings('ignore')
In [7]:
# VS251 feature set
# Merge the feature sets.
#
phtml = pd.read_csv('data/sorted-html-features-vs251.csv')
pentropy = pd.read_csv('data/sorted-entropy-features-vs251.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs251.csv')
ptridid = pd.read_csv('data/sorted-trid-id-features-vs251.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs251.csv')
phtml.head()
Out[7]:
In [3]:
phtml.shape
Out[3]:
In [4]:
pentropy.head()
Out[4]:
In [5]:
fileidfeatures = pd.DataFrame(pfileid.iloc[:,[0,2]])
fileidfeatures.head()
Out[5]:
In [8]:
trididfeatures = pd.DataFrame(ptridid.iloc[:,[0,2,3]])
trididfeatures.head()
Out[8]:
In [12]:
labelset = pd.DataFrame(plabels.iloc[:,[0,4]]) # Get the family label only.
labelset.head()
Out[12]:
In [13]:
labelset.shape
Out[13]:
In [9]:
# Combine all the feature sets with inner joins on file_name, dropping any rows that are not in the HTML sample set.
# NOTE: see model-selection-pe-coff.ipynb
combined_train_features = phtml.merge(pentropy, on='file_name', how='inner', suffixes=('_html','_entropy'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_html','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_html','_tid'))
combined_train_features.head()
Out[9]:
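In [ ]:
# A minimal sketch (not from the original analysis) of how the inner joins above
# behave: rows missing from either side are dropped, and the suffixes resolve any
# column-name collisions. The toy frames below are hypothetical.
left = pd.DataFrame({'file_name': ['a', 'b', 'c'], 'entropy': [0.1, 0.2, 0.3]})
right = pd.DataFrame({'file_name': ['b', 'c', 'd'], 'entropy': [0.5, 0.6, 0.7]})
left.merge(right, on='file_name', how='inner', suffixes=('_html', '_entropy'))
# -> rows 'b' and 'c' only, with collision columns renamed entropy_html and entropy_entropy.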
In [10]:
combined_train_features.shape
Out[10]:
In [11]:
combined_train_features.to_csv('data/combined-html-features-vs251.csv', index=False)
In [ ]:
# Write out the HTML training label set.
In [ ]:
# DEPRECATED: superseded by the merge-based feature combination above.
def get_training_data(sorted_train_features_df, sorted_train_labels_df, train_labels_file_out):
    X = sorted_train_features_df.iloc[:, 1:]
    # Labels already start at zero, so no offset is needed.
    sorted_sample_names = sorted_train_features_df.loc[:, 'file_name']
    # Now get the labels of the HTML malware samples from the label set.
    counter = 0
    y = []
    for fname in sorted_sample_names:
        counter += 1
        if counter % 100 == 1:
            print("Appending {:d} -> {:s}".format(counter, fname))
        for idx, fname2 in enumerate(sorted_train_labels_df['file_name']):
            if fname2 == fname:
                y.append(sorted_train_labels_df.iloc[idx, 4])  # Append the family class label.
                break
    # Write out the HTML sample train labels for later use and validation.
    with open(train_labels_file_out, 'w') as fop:
        fop.writelines("\n".join(str(x) for x in y))
    return X, y
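In [ ]:
# A vectorized sketch (not in the original notebook) that replaces the quadratic
# label lookup above with a single pandas merge. It assumes file_name is unique
# in the label set; the column index 4 mirrors the loop above.
def get_training_data_fast(features_df, labels_df, train_labels_file_out):
    label_col = labels_df.columns[4]  # family class label column, as in the loop above
    merged = features_df[['file_name']].merge(
        labels_df[['file_name', label_col]], on='file_name', how='left')
    y = merged[label_col].tolist()
    X = features_df.iloc[:, 1:]
    with open(train_labels_file_out, 'w') as fop:
        fop.write("\n".join(str(x) for x in y))
    return X, y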
In [4]:
all_features = pd.read_csv('data/combined-html-features-vs251.csv')
all_features.head()
Out[4]:
In [5]:
X_all = all_features.iloc[:,1:]
X_all.head()
Out[5]:
In [6]:
X_all.shape
Out[6]:
In [9]:
# Train feature stats
X_all_train_means = X_all.mean()
X_all_train_medians = X_all.median()
X_all_train_maxs = X_all.max()
X_all_train_mins = X_all.min()
X_all_train_std = X_all.std()
X_all_train_cor = X_all.corr()
X_all_train_cov = X_all.cov()
X_all_train_maxs.head()
Out[9]:
In [10]:
X_all_train_cor.to_csv('data/html-feature-corr-vs251.csv')
X_all_train_cov.to_csv('data/html-feature-cov-vs251.csv')
In [11]:
all_column_names = list(X_all.columns)
all_column_names[:10]
Out[11]:
In [12]:
all_train_stats = pd.DataFrame()
all_train_stats['feature_name'] = all_column_names
all_train_stats.head()
Out[12]:
In [13]:
all_train_stats.shape
Out[13]:
In [14]:
all_train_stats['mean'] = list(X_all_train_means)
all_train_stats['median'] = list(X_all_train_medians)
all_train_stats['standard_deviation'] = list(X_all_train_std)
all_train_stats['max'] = list(X_all_train_maxs)
all_train_stats['min'] = list(X_all_train_mins)
all_train_stats.head()
Out[14]:
In [15]:
all_train_stats.to_csv('data/html-train-stats-vs251.csv', index=False)
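In [ ]:
# Equivalent one-call sketch: DataFrame.describe() computes the same summary
# statistics (the 50% row is the median, plus quartiles) in a single pass;
# transposing gives one row per feature, matching the layout built above.
X_all.describe().T.head()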
In [16]:
plt.figure(figsize=(15,15))
x_graph = X_all['entropy']
num_bins = 100
# normed= was removed from matplotlib; density=True normalises the histogram.
n, bins, patches = plt.hist(x_graph, num_bins, density=True, facecolor='green', alpha=0.5)
# Overlay a normal curve fitted to the sample mean and standard deviation
# (mlab.normpdf was removed from matplotlib, so compute the pdf directly).
mu, sigma = x_graph.mean(), x_graph.std()
y = np.exp(-0.5 * ((bins - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
plt.plot(bins, y, 'r--')
plt.xlabel('HTML File Entropy')
plt.ylabel('Density')
plt.title('HTML File Entropy Histogram')
plt.show()
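In [ ]:
# An alternative sketch using seaborn (imported above); assumes seaborn >= 0.11
# for histplot. kde=True overlays a kernel density estimate rather than a
# fitted normal curve.
plt.figure(figsize=(15, 15))
sns.histplot(X_all['entropy'], bins=100, stat='density', kde=True)
plt.xlabel('HTML File Entropy')
plt.show()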
In [7]:
# VS252 feature set
phtml = pd.read_csv('data/sorted-html-features-vs252.csv')
pentropy = pd.read_csv('data/sorted-entropy-features-vs252.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs252.csv')
ptridid = pd.read_csv('data/sorted-trid-id-features-vs252.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs252.csv')
phtml.head()
Out[7]:
In [8]:
phtml.shape
Out[8]:
In [19]:
pentropy.head()
Out[19]:
In [9]:
fileidfeatures = pd.DataFrame(pfileid.iloc[:,[0,2]])
fileidfeatures.head()
Out[9]:
In [12]:
trididfeatures = pd.DataFrame(ptridid.iloc[:,[0,2,3]])
trididfeatures.head()
Out[12]:
In [13]:
combined_train_features = phtml.merge(pentropy, on='file_name', how='inner', suffixes=('_html','_entropy'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_html','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_html','_tid'))
combined_train_features.head()
Out[13]:
In [14]:
combined_train_features.shape
Out[14]:
In [15]:
combined_train_features.to_csv('data/combined-html-features-vs252.csv', index=False)
In [16]:
X_all = combined_train_features.iloc[:,1:]  # Use the VS252 combined features; all_features still holds the VS251 set.
X_all.head()
Out[16]:
In [17]:
# Train feature stats
X_all_train_means = X_all.mean()
X_all_train_medians = X_all.median()
X_all_train_maxs = X_all.max()
X_all_train_mins = X_all.min()
X_all_train_std = X_all.std()
X_all_train_cor = X_all.corr()
X_all_train_cov = X_all.cov()
X_all_train_maxs.head()
Out[17]:
In [18]:
X_all_train_cor.to_csv('data/html-feature-corr-vs252.csv')
X_all_train_cov.to_csv('data/html-feature-cov-vs252.csv')
In [19]:
all_column_names = list(X_all.columns)
all_column_names[:10]
Out[19]:
In [20]:
all_train_stats = pd.DataFrame()
all_train_stats['feature_name'] = all_column_names
all_train_stats.head()
Out[20]:
In [21]:
all_train_stats.shape
Out[21]:
In [22]:
all_train_stats['mean'] = list(X_all_train_means)
all_train_stats['median'] = list(X_all_train_medians)
all_train_stats['standard_deviation'] = list(X_all_train_std)
all_train_stats['max'] = list(X_all_train_maxs)
all_train_stats['min'] = list(X_all_train_mins)
all_train_stats.head()
Out[22]:
In [23]:
all_train_stats.to_csv('data/html-train-stats-vs252.csv', index=False)
In [2]:
# VS263 feature set
phtml = pd.read_csv('data/sorted-html-features-vs263.csv')
pentropy = pd.read_csv('data/sorted-entropy-features-vs263.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs263.csv')
ptridid = pd.read_csv('data/sorted-trid-id-features-vs263.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs263.csv')
phtml.head()
Out[2]:
In [3]:
phtml.shape
Out[3]:
In [4]:
pentropy.head()
Out[4]:
In [5]:
fileidfeatures = pd.DataFrame(pfileid.iloc[:,[0,2]])
fileidfeatures.head()
Out[5]:
In [6]:
trididfeatures = pd.DataFrame(ptridid.iloc[:,[0,2,3]])
trididfeatures.head()
Out[6]:
In [7]:
combined_train_features = phtml.merge(pentropy, on='file_name', how='inner', suffixes=('_html','_entropy'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_html','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_html','_tid'))
combined_train_features.head()
Out[7]:
In [8]:
combined_train_features.shape
Out[8]:
In [9]:
combined_train_features.to_csv('data/combined-html-features-vs263.csv', index=False)
In [11]:
X_all = combined_train_features.iloc[:,1:]
X_all.head()
Out[11]:
In [12]:
# Train feature stats
X_all_train_means = X_all.mean()
X_all_train_medians = X_all.median()
X_all_train_maxs = X_all.max()
X_all_train_mins = X_all.min()
X_all_train_std = X_all.std()
X_all_train_cor = X_all.corr()
X_all_train_cov = X_all.cov()
X_all_train_maxs.head()
Out[12]:
In [13]:
X_all_train_cor.to_csv('data/html-feature-corr-vs263.csv')
X_all_train_cov.to_csv('data/html-feature-cov-vs263.csv')
In [14]:
all_column_names = list(X_all.columns)
all_column_names[:10]
Out[14]:
In [16]:
all_train_stats = pd.DataFrame()
all_train_stats['feature_name'] = all_column_names
all_train_stats.head()
Out[16]:
In [17]:
all_train_stats.shape
Out[17]:
In [18]:
all_train_stats['mean'] = list(X_all_train_means)
all_train_stats['median'] = list(X_all_train_medians)
all_train_stats['standard_deviation'] = list(X_all_train_std)
all_train_stats['max'] = list(X_all_train_maxs)
all_train_stats['min'] = list(X_all_train_mins)
all_train_stats.head()
Out[18]:
In [20]:
all_train_stats.to_csv('data/html-train-stats-vs263.csv', index=False)
In [2]:
# VS264 feature set
phtml = pd.read_csv('data/sorted-html-features-vs264.csv')
pentropy = pd.read_csv('data/sorted-entropy-features-vs264.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs264.csv')
ptridid = pd.read_csv('data/sorted-trid-id-features-vs264.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs264.csv')
phtml.head()
Out[2]:
In [3]:
phtml.shape
Out[3]:
In [4]:
pentropy.head()
Out[4]:
In [5]:
fileidfeatures = pd.DataFrame(pfileid.iloc[:,[0,2]])
fileidfeatures.head()
Out[5]:
In [6]:
trididfeatures = pd.DataFrame(ptridid.iloc[:,[0,2,3]])
trididfeatures.head()
Out[6]:
In [7]:
combined_train_features = phtml.merge(pentropy, on='file_name', how='inner', suffixes=('_html','_entropy'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_html','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_html','_tid'))
combined_train_features.head()
Out[7]:
In [8]:
combined_train_features.shape
Out[8]:
In [9]:
combined_train_features.to_csv('data/combined-html-features-vs264.csv', index=False)
In [10]:
X_all = combined_train_features.iloc[:,1:]
X_all.head()
Out[10]:
In [11]:
# Train feature stats
X_all_train_means = X_all.mean()
X_all_train_medians = X_all.median()
X_all_train_maxs = X_all.max()
X_all_train_mins = X_all.min()
X_all_train_std = X_all.std()
X_all_train_cor = X_all.corr()
X_all_train_cov = X_all.cov()
X_all_train_maxs.head()
Out[11]:
In [12]:
X_all_train_cor.to_csv('data/html-feature-corr-vs264.csv')
X_all_train_cov.to_csv('data/html-feature-cov-vs264.csv')
In [13]:
all_column_names = list(X_all.columns)
all_column_names[:10]
Out[13]:
In [14]:
all_train_stats = pd.DataFrame()
all_train_stats['feature_name'] = all_column_names
all_train_stats.head()
Out[14]:
In [15]:
all_train_stats.shape
Out[15]:
In [16]:
all_train_stats['mean'] = list(X_all_train_means)
all_train_stats['median'] = list(X_all_train_medians)
all_train_stats['standard_deviation'] = list(X_all_train_std)
all_train_stats['max'] = list(X_all_train_maxs)
all_train_stats['min'] = list(X_all_train_mins)
all_train_stats.head()
Out[16]:
In [17]:
all_train_stats.to_csv('data/html-train-stats-vs264.csv', index=False)
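In [ ]:
# The VS251/VS252/VS263/VS264 sections above repeat the same steps. A sketch of
# a parameterized helper (assuming the file naming scheme used throughout this
# notebook) that would replace the four copies:
def build_combined_features(tag):
    phtml = pd.read_csv('data/sorted-html-features-{}.csv'.format(tag))
    pentropy = pd.read_csv('data/sorted-entropy-features-{}.csv'.format(tag))
    pfileid = pd.read_csv('data/sorted-file-id-features-{}.csv'.format(tag))
    ptridid = pd.read_csv('data/sorted-trid-id-features-{}.csv'.format(tag))
    fileidfeatures = pfileid.iloc[:, [0, 2]]
    trididfeatures = ptridid.iloc[:, [0, 2, 3]]
    combined = phtml.merge(pentropy, on='file_name', how='inner', suffixes=('_html', '_entropy'))
    combined = combined.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_html', '_fid'))
    combined = combined.merge(trididfeatures, on='file_name', how='inner', suffixes=('_html', '_tid'))
    combined.to_csv('data/combined-html-features-{}.csv'.format(tag), index=False)
    return combined

# for tag in ['vs251', 'vs252', 'vs263', 'vs264']:
#     build_combined_features(tag)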
In [2]:
pentropy = pd.read_csv('data/sorted-entropy-features-vs251.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs251.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs251.csv')
pentropy.head()
Out[2]:
In [3]:
ftypes = pfileid['file_type']
html_files = []
for idx, ftype in enumerate(ftypes):
    if 'HTML' in ftype:
        html_files.append(pfileid.iloc[idx, 0])
print("Found {:d} HTML files.".format(len(html_files)))
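In [ ]:
# A vectorized sketch (not in the original notebook) of the same filter using
# pandas string matching; na=False guards against missing file_type values. It
# assumes column 0 of pfileid is 'file_name', as the merges above suggest.
mask = pfileid['file_type'].str.contains('HTML', na=False)
html_files = pfileid.loc[mask, 'file_name'].tolist()
print("Found {:d} HTML files.".format(len(html_files)))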
In [6]:
html_files[:10]
Out[6]:
In [8]:
with open('data/html-file-list-vs251.txt', 'w') as fop:
    for fname in html_files:
        fop.write(fname + "\n")
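In [ ]:
# Equivalent one-liner sketch using pathlib from the standard library.
from pathlib import Path
Path('data/html-file-list-vs251.txt').write_text("\n".join(html_files) + "\n")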
In [7]:
plabels.head()
Out[7]:
In [4]:
pentropy = pd.read_csv('data/sorted-entropy-features-vs252.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs252.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs252.csv')
pentropy.head()
Out[4]:
In [5]:
ftypes = pfileid['file_type']
html_files = []
for idx, ftype in enumerate(ftypes):
    if 'HTML' in ftype:
        html_files.append(pfileid.iloc[idx, 0])
print("Found {:d} HTML files.".format(len(html_files)))
In [6]:
html_files[:10]
Out[6]:
In [7]:
with open('data/html-file-list-vs252.txt', 'w') as fop:
    for fname in html_files:
        fop.write(fname + "\n")
In [2]:
pentropy = pd.read_csv('data/sorted-entropy-features-vs263.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs263.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs263.csv')
pentropy.head()
Out[2]:
In [3]:
ftypes = pfileid['file_type']
html_files = []
for idx, ftype in enumerate(ftypes):
    if 'HTML' in ftype:
        html_files.append(pfileid.iloc[idx, 0])
print("Found {:d} HTML files.".format(len(html_files)))
In [4]:
html_files[:10]
Out[4]:
In [5]:
with open('data/html-file-list-vs263.txt', 'w') as fop:
    for fname in html_files:
        fop.write(fname + "\n")
In [2]:
pentropy = pd.read_csv('data/sorted-entropy-features-vs264.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs264.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs264.csv')
pentropy.head()
Out[2]:
In [3]:
ftypes = pfileid['file_type']
html_files = []
for idx, ftype in enumerate(ftypes):
    if 'HTML' in ftype:
        html_files.append(pfileid.iloc[idx, 0])
print("Found {:d} HTML files.".format(len(html_files)))
In [4]:
html_files[:10]
Out[4]:
In [5]:
with open('data/html-file-list-vs264.txt', 'w') as fop:
    for fname in html_files:
        fop.write(fname + "\n")