1. Malicious HTML Feature Analysis.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import matplotlib.mlab as mlab
import warnings
warnings.filterwarnings('ignore')

2. VirusShare 252 Feature Set Statistical Analysis.


In [7]:
# VS251 feature set
# Merge the feature sets.
# 
phtml = pd.read_csv('data/sorted-html-features-vs251.csv')
pentropy = pd.read_csv('data/sorted-entropy-features-vs251.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs251.csv')
ptridid = pd.read_csv('data/sorted-trid-id-features-vs251.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs251.csv')
phtml.head()


Out[7]:
file_name <!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body
0 00027c21667d9119a454df8cef2dc1c7 8 1 9 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 ...
1 0021a84397cfcb73ac5adf90bab51036 12 1 34 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 ...
2 004bfef04321c825c670985159ff7150 17 1 55 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 ...
3 005776d784e6a4e5034bb53ff8f3fd95 18 1 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
4 00eb23399012c56ee31d1e9266d527f6 13 1 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...

5 rows × 125 columns


In [3]:
phtml.shape


Out[3]:
(2225, 125)

In [4]:
pentropy.head()


Out[4]:
file_name entropy file_size
0 00027c21667d9119a454df8cef2dc1c7 0.666599 18390
1 0003887ab64b8ae19ffa988638decac2 0.903260 1134320
2 0004376a62e22f6ad359467eb742b8ff 0.803515 149720
3 000634f03457d088c71dbffb897b1315 0.957584 1725502
4 00072ed24314e91b63b425b3dc572f50 0.486112 328093

5 rows × 3 columns


In [5]:
fileidfeatures = pd.DataFrame(pfileid.iloc[:,[0,2]])
fileidfeatures.head()


Out[5]:
file_name file_id
0 00027c21667d9119a454df8cef2dc1c7 38
1 0003887ab64b8ae19ffa988638decac2 25
2 0004376a62e22f6ad359467eb742b8ff 1
3 000634f03457d088c71dbffb897b1315 1
4 00072ed24314e91b63b425b3dc572f50 1

5 rows × 2 columns


In [8]:
trididfeatures = pd.DataFrame(ptridid.iloc[:,[0,2,3]])
trididfeatures.head()


Out[8]:
file_name percentage trid_id
0 00027c21667d9119a454df8cef2dc1c7 0.0 0
1 0003887ab64b8ae19ffa988638decac2 3.8 52
2 0004376a62e22f6ad359467eb742b8ff 3.5 13
3 000634f03457d088c71dbffb897b1315 4.6 21
4 00072ed24314e91b63b425b3dc572f50 4.4 5

5 rows × 3 columns


In [12]:
labelset = pd.DataFrame(plabels.iloc[:,[0,4]]) # Get the family label only.
labelset.head()


Out[12]:
file_name family_label
0 00027c21667d9119a454df8cef2dc1c7 4
1 0003887ab64b8ae19ffa988638decac2 0
2 0004376a62e22f6ad359467eb742b8ff 6
3 000634f03457d088c71dbffb897b1315 9
4 00072ed24314e91b63b425b3dc572f50 10

5 rows × 2 columns


In [13]:
labelset.shape


Out[13]:
(65536, 2)

In [ ]:


In [ ]:


In [9]:
# Combine all the feature sets, ensure we drop all the rows that are not in the HTML sample set.
# NOTE: see model-selection-pe-coff.ipynb

combined_train_features = phtml.merge(pentropy, on='file_name', how='inner', suffixes=('_html','_entropy'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_html','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_html','_tid'))

combined_train_features.head()


Out[9]:
file_name <!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body
0 00027c21667d9119a454df8cef2dc1c7 8 1 9 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 ...
1 0021a84397cfcb73ac5adf90bab51036 12 1 34 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 ...
2 004bfef04321c825c670985159ff7150 17 1 55 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 ...
3 005776d784e6a4e5034bb53ff8f3fd95 18 1 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
4 00eb23399012c56ee31d1e9266d527f6 13 1 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...

5 rows × 130 columns


In [10]:
combined_train_features.shape


Out[10]:
(2225, 130)

In [11]:
combined_train_features.to_csv('data/combined-html-features-vs251.csv', index=False)

In [ ]:
# Write out the html training label set.

In [ ]:
# DEPRECATED
def get_training_data(sorted_train_features_df, sorted_train_labels_df, train_labels_file_out):
    
    X = sorted_train_features_df.iloc[:,1:]
    ylabels = sorted_train_labels_df.iloc[:,4]
    # not necessary, labels start at zero. ylabels = ylabels - 1 
    sorted_sample_names = sorted_train_features_df.loc[:,'file_name']
    
    # Now get the labels of the PE malware samples from the label set.
    counter = 0
    y = []
    #train_names = sorted_train_labels['family_label']
    for fname in sorted_sample_names:
        counter += 1
        
        if counter % 100 == 1:
            print("Appending {:d} -> {:s}".format(counter, fname))
        for idx, fname2 in enumerate(sorted_train_labels['file_name']):
            if (fname2 == fname):
                y.append(sorted_train_labels.iloc[idx, 4]) # Append the family class label.
                break
    
    ###############################
    # Write out the html sample train labels for later use and validation.
    fop = open(train_labels_file_out, 'w')
    fop.writelines("\n".join(str(x) for x in y))
    fop.close()
    ###############################
    
    return X,y

In [4]:
all_features = pd.read_csv('data/combined-html-features-vs251.csv')
all_features.head()


Out[4]:
file_name <!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body
0 00027c21667d9119a454df8cef2dc1c7 8 1 9 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 ...
1 0021a84397cfcb73ac5adf90bab51036 12 1 34 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 ...
2 004bfef04321c825c670985159ff7150 17 1 55 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 ...
3 005776d784e6a4e5034bb53ff8f3fd95 18 1 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
4 00eb23399012c56ee31d1e9266d527f6 13 1 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...

5 rows × 130 columns


In [5]:
X_all = all_features.iloc[:,1:]
X_all.head()


Out[5]:
<!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body <br
0 8 1 9 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 ...
1 12 1 34 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 ...
2 17 1 55 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 ...
3 18 1 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...
4 13 1 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...

5 rows × 129 columns


In [6]:
X_all.shape


Out[6]:
(2225, 129)

In [9]:
# Train feature stats
X_all_train_means = X_all.mean()
X_all_train_medians = X_all.median()
X_all_train_maxs = X_all.max()
X_all_train_mins = X_all.min()
X_all_train_std = X_all.std()
X_all_train_cor = X_all.corr()
X_all_train_cov = X_all.cov()
X_all_train_maxs.head()


Out[9]:
<!--         1886
<!DOCTYPE       7
<a           5489
<abbr           0
<acronym        0
dtype: float64

In [10]:
X_all_train_cor.to_csv('data/html-feature-corr-vs251.csv')
X_all_train_cov.to_csv('data/html-feature-cov-vs251.csv')

In [11]:
all_column_names = list(X_all.columns)
all_column_names[:10]


Out[11]:
['<!--',
 '<!DOCTYPE',
 '<a',
 '<abbr',
 '<acronym',
 '<address',
 '<applet',
 '<area',
 '<article',
 '<aside']

In [12]:
all_train_stats = pd.DataFrame()
all_train_stats['feature_name'] = all_column_names
all_train_stats.head()


Out[12]:
feature_name
0 <!--
1 <!DOCTYPE
2 <a
3 <abbr
4 <acronym

5 rows × 1 columns


In [13]:
all_train_stats.shape


Out[13]:
(127, 1)

In [14]:
all_train_stats['mean'] = list(X_all_train_means)
all_train_stats['median'] = list(X_all_train_medians)
all_train_stats['standard_deviation'] = list(X_all_train_std)
all_train_stats['max'] = list(X_all_train_maxs)
all_train_stats['min'] = list(X_all_train_mins)
all_train_stats.head()


Out[14]:
feature_name mean median standard_deviation max min
0 <!-- 14.506067 9 48.245411 1886 0
1 <!DOCTYPE 0.865618 1 0.467861 7 0
2 <a 145.529888 26 649.051082 5489 0
3 <abbr 0.000000 0 0.000000 0 0
4 <acronym 0.000000 0 0.000000 0 0

5 rows × 6 columns


In [15]:
all_train_stats.to_csv('data/html-train-stats-vs251.csv')

In [16]:
plt.figure(figsize=(15,15))
x_graph = X_all['entropy']
num_bins = 100
n, bins, patches = plt.hist(x_graph, num_bins, normed=1, facecolor='green', alpha=0.5)
# add a 'best fit' line
y = mlab.normpdf(bins, x_graph.mean(), x_graph.std())
plt.plot(bins, y, 'r--')
plt.xlabel('HTML File Entropy')
plt.ylabel('HTML Sample Count')
plt.title('HTML Histogram Plot')
plt.show()

3. VirusShare 252


In [7]:
# VS251 feature set
phtml = pd.read_csv('data/sorted-html-features-vs252.csv')
pentropy = pd.read_csv('data/sorted-entropy-features-vs252.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs252.csv')
ptridid = pd.read_csv('data/sorted-trid-id-features-vs252.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs252.csv')
phtml.head()


Out[7]:
file_name <!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body
0 0012a82ce8e0107d909959961d5862a1 26 1 9 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
1 001833c06efcaa39e7803c6a369e99dd 2 1 97 0 0 0 0 0 0 0 0 13 0 0 0 0 0 0 0 ...
2 001e6e1f510250f4de06f8c9c2784d45 1 1 7 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
3 00314e1bd4ecb9a50efe307ca2d001b7 12 1 87 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
4 00423f1656a26c53a787304f27aa60cd 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...

5 rows × 125 columns


In [8]:
phtml.shape


Out[8]:
(4292, 125)

In [19]:
pentropy.head()


Out[19]:
file_name entropy file_size
0 00002e640cafb741bea9a48eaee27d6f 0.992174 208860
1 000118d12cbf9ad6103e8b914a6e1ac3 0.834382 201600
2 0001776237ac37a69fcef93c1bac0988 0.966021 682192
3 000403e4e488356b7535cc613fbeb80b 0.773787 199168
4 0004c8b2a0f4680a5694d74199b40ea2 0.985592 1165440

5 rows × 3 columns


In [9]:
fileidfeatures = pd.DataFrame(pfileid.iloc[:,[0,2]])
fileidfeatures.head()


Out[9]:
file_name file_id
0 00002e640cafb741bea9a48eaee27d6f 133
1 000118d12cbf9ad6103e8b914a6e1ac3 1
2 0001776237ac37a69fcef93c1bac0988 1
3 000403e4e488356b7535cc613fbeb80b 1
4 0004c8b2a0f4680a5694d74199b40ea2 1

5 rows × 2 columns


In [12]:
ptridid = pd.read_csv('data/sorted-trid-id-features-vs252.csv')
trididfeatures = pd.DataFrame(ptridid.iloc[:,[0,2,3]])
trididfeatures.head()


Out[12]:
file_name percentage trid_id
0 00002e640cafb741bea9a48eaee27d6f 2.3 2
1 000118d12cbf9ad6103e8b914a6e1ac3 2.2 1
2 0001776237ac37a69fcef93c1bac0988 2.7 14
3 000403e4e488356b7535cc613fbeb80b 4.6 21
4 0004c8b2a0f4680a5694d74199b40ea2 2.2 1

5 rows × 3 columns


In [13]:
combined_train_features = phtml.merge(pentropy, on='file_name', how='inner', suffixes=('_html','_entropy'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_html','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_html','_tid'))

combined_train_features.head()


Out[13]:
file_name <!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body
0 0012a82ce8e0107d909959961d5862a1 26 1 9 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
1 001833c06efcaa39e7803c6a369e99dd 2 1 97 0 0 0 0 0 0 0 0 13 0 0 0 0 0 0 0 ...
2 001e6e1f510250f4de06f8c9c2784d45 1 1 7 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
3 00314e1bd4ecb9a50efe307ca2d001b7 12 1 87 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
4 00423f1656a26c53a787304f27aa60cd 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...

5 rows × 130 columns


In [14]:
combined_train_features.shape


Out[14]:
(4292, 130)

In [15]:
combined_train_features.to_csv('data/combined-html-features-vs252.csv', index=False)

In [16]:
X_all = all_features.iloc[:,1:]
X_all.head()


Out[16]:
<!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body <br
0 8 1 9 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 ...
1 12 1 34 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 ...
2 17 1 55 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 ...
3 18 1 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...
4 13 1 5 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...

5 rows × 129 columns


In [17]:
# Train feature stats
X_all_train_means = X_all.mean()
X_all_train_medians = X_all.median()
X_all_train_maxs = X_all.max()
X_all_train_mins = X_all.min()
X_all_train_std = X_all.std()
X_all_train_cor = X_all.corr()
X_all_train_cov = X_all.cov()
X_all_train_maxs.head()


Out[17]:
<!--         1886
<!DOCTYPE       7
<a           5489
<abbr           0
<acronym        0
dtype: float64

In [18]:
X_all_train_cor.to_csv('data/html-feature-corr-vs252.csv')
X_all_train_cov.to_csv('data/html-feature-cov-vs252.csv')

In [19]:
all_column_names = list(X_all.columns)
all_column_names[:10]


Out[19]:
['<!--',
 '<!DOCTYPE',
 '<a',
 '<abbr',
 '<acronym',
 '<address',
 '<applet',
 '<area',
 '<article',
 '<aside']

In [20]:
all_train_stats = pd.DataFrame()
all_train_stats['feature_name'] = all_column_names
all_train_stats.head()


Out[20]:
feature_name
0 <!--
1 <!DOCTYPE
2 <a
3 <abbr
4 <acronym

5 rows × 1 columns


In [21]:
all_train_stats.shape


Out[21]:
(129, 1)

In [22]:
all_train_stats['mean'] = list(X_all_train_means)
all_train_stats['median'] = list(X_all_train_medians)
all_train_stats['standard_deviation'] = list(X_all_train_std)
all_train_stats['max'] = list(X_all_train_maxs)
all_train_stats['min'] = list(X_all_train_mins)
all_train_stats.head()


Out[22]:
feature_name mean median standard_deviation max min
0 <!-- 14.506067 9 48.245411 1886 0
1 <!DOCTYPE 0.865618 1 0.467861 7 0
2 <a 145.529888 26 649.051082 5489 0
3 <abbr 0.000000 0 0.000000 0 0
4 <acronym 0.000000 0 0.000000 0 0

5 rows × 6 columns


In [23]:
all_train_stats.to_csv('data/html-train-stats-vs252.csv')

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

3. VirusShare 263 Data Analysis


In [ ]:


In [2]:
# VS251 feature set
phtml = pd.read_csv('data/sorted-html-features-vs263.csv')
pentropy = pd.read_csv('data/sorted-entropy-features-vs263.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs263.csv')
ptridid = pd.read_csv('data/sorted-trid-id-features-vs263.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs263.csv')
phtml.head()


Out[2]:
file_name <!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body
0 0004c49071481789f1c8c80656638497 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
1 000a79e98b2a1a3bff2bcb93042d3e78 11 1 30 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
2 000e05a41be370f9912b48bec3c0905e 28 1 26 0 0 0 0 0 0 0 0 9 0 0 0 0 0 0 0 ...
3 00193fa83d95f128c9ceaa1cbe4d84d8 25 1 65 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
4 001f9cd2f4f410a658b38f03b62d169d 451 1 320 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 ...

5 rows × 125 columns


In [3]:
phtml.shape


Out[3]:
(11583, 125)

In [4]:
pentropy.head()


Out[4]:
file_name entropy file_size
0 0002b2f621ea5786be03bf4153532dce 0.684706 81812
1 000401419eccde59975c713cfadc974c 0.800788 137131
2 00042f23bc15b89d9c6a7bde0e316f8b 0.989429 861184
3 0004824a60ff9fe1fb30d669a5baa627 0.802050 137630
4 0004c49071481789f1c8c80656638497 0.670559 22817

5 rows × 3 columns


In [5]:
fileidfeatures = pd.DataFrame(pfileid.iloc[:,[0,2]])
fileidfeatures.head()


Out[5]:
file_name file_id
0 0002b2f621ea5786be03bf4153532dce 5
1 000401419eccde59975c713cfadc974c 1
2 00042f23bc15b89d9c6a7bde0e316f8b 1
3 0004824a60ff9fe1fb30d669a5baa627 1
4 0004c49071481789f1c8c80656638497 13

5 rows × 2 columns


In [6]:
#ptridid = pd.read_csv('data/sorted-trid-id-features-vs252.csv')
trididfeatures = pd.DataFrame(ptridid.iloc[:,[0,2,3]])
trididfeatures.head()


Out[6]:
file_name percentage trid_id
0 0002b2f621ea5786be03bf4153532dce 7.4 1
1 000401419eccde59975c713cfadc974c 5.4 4
2 00042f23bc15b89d9c6a7bde0e316f8b 7.4 1
3 0004824a60ff9fe1fb30d669a5baa627 5.4 4
4 0004c49071481789f1c8c80656638497 0.0 0

5 rows × 3 columns


In [7]:
combined_train_features = phtml.merge(pentropy, on='file_name', how='inner', suffixes=('_html','_entropy'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_html','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_html','_tid'))

combined_train_features.head()


Out[7]:
file_name <!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body
0 0004c49071481789f1c8c80656638497 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
1 000a79e98b2a1a3bff2bcb93042d3e78 11 1 30 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
2 000e05a41be370f9912b48bec3c0905e 28 1 26 0 0 0 0 0 0 0 0 9 0 0 0 0 0 0 0 ...
3 00193fa83d95f128c9ceaa1cbe4d84d8 25 1 65 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 ...
4 001f9cd2f4f410a658b38f03b62d169d 451 1 320 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 ...

5 rows × 130 columns


In [8]:
combined_train_features.shape


Out[8]:
(11583, 130)

In [9]:
combined_train_features.to_csv('data/combined-html-features-vs263.csv', index=False)

In [11]:
X_all = combined_train_features.iloc[:,1:]
X_all.head()


Out[11]:
<!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body <br
0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...
1 11 1 30 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...
2 28 1 26 0 0 0 0 0 0 0 0 9 0 0 0 0 0 0 0 0 ...
3 25 1 65 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 ...
4 451 1 320 0 0 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 ...

5 rows × 129 columns


In [12]:
# Train feature stats
X_all_train_means = X_all.mean()
X_all_train_medians = X_all.median()
X_all_train_maxs = X_all.max()
X_all_train_mins = X_all.min()
X_all_train_std = X_all.std()
X_all_train_cor = X_all.corr()
X_all_train_cov = X_all.cov()
X_all_train_maxs.head()


Out[12]:
<!--         10489
<!DOCTYPE        6
<a            7484
<abbr            0
<acronym         0
dtype: float64

In [13]:
X_all_train_cor.to_csv('data/html-feature-corr-vs263.csv', index=False)
X_all_train_cov.to_csv('data/html-feature-cov-vs63.csv', index=False)

In [14]:
all_column_names = list(X_all.columns)
all_column_names[:10]


Out[14]:
['<!--',
 '<!DOCTYPE',
 '<a',
 '<abbr',
 '<acronym',
 '<address',
 '<applet',
 '<area',
 '<article',
 '<aside']

In [16]:
all_train_stats = pd.DataFrame()
all_train_stats['feature_name'] = all_column_names
all_train_stats.head()


Out[16]:
feature_name
0 <!--
1 <!DOCTYPE
2 <a
3 <abbr
4 <acronym

5 rows × 1 columns


In [17]:
all_train_stats.shape


Out[17]:
(129, 1)

In [18]:
all_train_stats['mean'] = list(X_all_train_means)
all_train_stats['median'] = list(X_all_train_medians)
all_train_stats['standard_deviation'] = list(X_all_train_std)
all_train_stats['max'] = list(X_all_train_maxs)
all_train_stats['min'] = list(X_all_train_mins)
all_train_stats.head()


Out[18]:
feature_name mean median standard_deviation max min
0 <!-- 19.556419 12 100.718239 10489 0
1 <!DOCTYPE 0.859277 1 0.439838 6 0
2 <a 171.499266 32 722.061047 7484 0
3 <abbr 0.000000 0 0.000000 0 0
4 <acronym 0.000000 0 0.000000 0 0

5 rows × 6 columns


In [20]:
all_train_stats.to_csv('data/html-train-stats-vs263.csv', index=False)

In [ ]:


In [ ]:

4. VirusShare 264 Data Analysis.


In [ ]:


In [ ]:


In [2]:
# VS251 feature set
phtml = pd.read_csv('data/sorted-html-features-vs264.csv')
pentropy = pd.read_csv('data/sorted-entropy-features-vs264.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs264.csv')
ptridid = pd.read_csv('data/sorted-trid-id-features-vs264.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs264.csv')
phtml.head()


Out[2]:
file_name <!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body
0 000070db76b6dc1ee3497a3f9319848c 94 1 73 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 ...
1 0003c05a1320e64fe72438ab48da7ecf 13 1 173 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 ...
2 0003e52a9267b657d9b08b2cbc0a2593 28 1 41 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 ...
3 0005743596135fe65f61da7a0eba0bb6 15 1 582 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 ...
4 000894ead589179a4a86dabe81661397 42 1 95 0 0 0 0 0 0 0 0 14 0 0 0 0 0 0 0 ...

5 rows × 125 columns


In [3]:
phtml.shape


Out[3]:
(41645, 125)

In [4]:
pentropy.head()


Out[4]:
file_name entropy file_size
0 000070db76b6dc1ee3497a3f9319848c 0.646594 81003
1 00009cbc0a90337e4c30950a51ae3d67 0.834079 700416
2 0003c05a1320e64fe72438ab48da7ecf 0.689533 121344
3 0003e52a9267b657d9b08b2cbc0a2593 0.662226 29585
4 0005743596135fe65f61da7a0eba0bb6 0.700392 178190

5 rows × 3 columns


In [5]:
fileidfeatures = pd.DataFrame(pfileid.iloc[:,[0,2]])
fileidfeatures.head()


Out[5]:
file_name file_id
0 000070db76b6dc1ee3497a3f9319848c 16
1 00009cbc0a90337e4c30950a51ae3d67 1
2 0003c05a1320e64fe72438ab48da7ecf 15
3 0003e52a9267b657d9b08b2cbc0a2593 16
4 0005743596135fe65f61da7a0eba0bb6 15

5 rows × 2 columns


In [6]:
#ptridid = pd.read_csv('data/sorted-trid-id-features-vs252.csv')
trididfeatures = pd.DataFrame(ptridid.iloc[:,[0,2,3]])
trididfeatures.head()


Out[6]:
file_name percentage trid_id
0 000070db76b6dc1ee3497a3f9319848c 0.0 0
1 00009cbc0a90337e4c30950a51ae3d67 5.5 4
2 0003c05a1320e64fe72438ab48da7ecf 0.0 0
3 0003e52a9267b657d9b08b2cbc0a2593 0.0 0
4 0005743596135fe65f61da7a0eba0bb6 0.6 8

5 rows × 3 columns


In [7]:
combined_train_features = phtml.merge(pentropy, on='file_name', how='inner', suffixes=('_html','_entropy'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_html','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_html','_tid'))

combined_train_features.head()


Out[7]:
file_name <!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body
0 000070db76b6dc1ee3497a3f9319848c 94 1 73 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 ...
1 0003c05a1320e64fe72438ab48da7ecf 13 1 173 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 ...
2 0003e52a9267b657d9b08b2cbc0a2593 28 1 41 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 ...
3 0005743596135fe65f61da7a0eba0bb6 15 1 582 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 ...
4 000894ead589179a4a86dabe81661397 42 1 95 0 0 0 0 0 0 0 0 14 0 0 0 0 0 0 0 ...

5 rows × 130 columns


In [8]:
combined_train_features.shape


Out[8]:
(41645, 130)

In [9]:
combined_train_features.to_csv('data/combined-html-features-vs264.csv', index=False)

In [10]:
X_all = combined_train_features.iloc[:,1:]
X_all.head()


Out[10]:
<!-- <!DOCTYPE <a <abbr <acronym <address <applet <area <article <aside <audio <b <base <basefont <bdi <bdo <big <blockquote <body <br
0 94 1 73 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 ...
1 13 1 173 0 0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 ...
2 28 1 41 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 ...
3 15 1 582 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 ...
4 42 1 95 0 0 0 0 0 0 0 0 14 0 0 0 0 0 0 0 0 ...

5 rows × 129 columns


In [11]:
# Train feature stats
X_all_train_means = X_all.mean()
X_all_train_medians = X_all.median()
X_all_train_maxs = X_all.max()
X_all_train_mins = X_all.min()
X_all_train_std = X_all.std()
X_all_train_cor = X_all.corr()
X_all_train_cov = X_all.cov()
X_all_train_maxs.head()


Out[11]:
<!--          5458
<!DOCTYPE       17
<a           29689
<abbr            0
<acronym         0
dtype: float64

In [12]:
X_all_train_cor.to_csv('data/html-feature-corr-vs264.csv', index=False)
X_all_train_cov.to_csv('data/html-feature-cov-vs264.csv', index=False)

In [13]:
all_column_names = list(X_all.columns)
all_column_names[:10]


Out[13]:
['<!--',
 '<!DOCTYPE',
 '<a',
 '<abbr',
 '<acronym',
 '<address',
 '<applet',
 '<area',
 '<article',
 '<aside']

In [14]:
all_train_stats = pd.DataFrame()
all_train_stats['feature_name'] = all_column_names
all_train_stats.head()


Out[14]:
feature_name
0 <!--
1 <!DOCTYPE
2 <a
3 <abbr
4 <acronym

5 rows × 1 columns


In [15]:
all_train_stats.shape


Out[15]:
(129, 1)

In [16]:
all_train_stats['mean'] = list(X_all_train_means)
all_train_stats['median'] = list(X_all_train_medians)
all_train_stats['standard_deviation'] = list(X_all_train_std)
all_train_stats['max'] = list(X_all_train_maxs)
all_train_stats['min'] = list(X_all_train_mins)
all_train_stats.head()


Out[16]:
feature_name mean median standard_deviation max min
0 <!-- 19.129451 11 49.939974 5458 0
1 <!DOCTYPE 0.849274 1 0.414964 17 0
2 <a 142.901909 33 617.134809 29689 0
3 <abbr 0.000000 0 0.000000 0 0
4 <acronym 0.000000 0 0.000000 0 0

5 rows × 6 columns


In [17]:
all_train_stats.to_csv('data/html-train-stats-vs264.csv', index=False)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

4. Test Code Only.


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [2]:
pentropy = pd.read_csv('data/sorted-entropy-features-vs251.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs251.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs251.csv')
pentropy.head()


Out[2]:
file_name entropy file_size
0 00027c21667d9119a454df8cef2dc1c7 0.666599 18390
1 0003887ab64b8ae19ffa988638decac2 0.903260 1134320
2 0004376a62e22f6ad359467eb742b8ff 0.803515 149720
3 000634f03457d088c71dbffb897b1315 0.957584 1725502
4 00072ed24314e91b63b425b3dc572f50 0.486112 328093

In [3]:
ftypes = pfileid['file_type']
html_files = []
for idx, ftype in enumerate(ftypes):
    if 'HTML' in ftype:
        html_files.append(pfileid.iloc[idx, 0])
        
print("Found {:d} HTML files.".format(len(html_files)))


Found 2225 HTML files.

In [6]:
html_files[:10]


Out[6]:
['00027c21667d9119a454df8cef2dc1c7',
 '0021a84397cfcb73ac5adf90bab51036',
 '004bfef04321c825c670985159ff7150',
 '005776d784e6a4e5034bb53ff8f3fd95',
 '00eb23399012c56ee31d1e9266d527f6',
 '00f903208e7a3f9fbc29160366a69a6f',
 '011a4d091ec6b86eae2ad66d0ca46850',
 '013395072345fd9554f6154cba0cfe86',
 '014867cf6d1ffac369a136a19a2d55c7',
 '017050643d669dbbcf819c2e9eab1bc8']

In [8]:
fop = open('data/html-file-list-vs251.txt','w')
for fname in html_files:
    fop.write(fname + "\n")
    
fop.close()


Out[8]:
<function close>

In [9]:
fop.close()

In [7]:
plabels.head()


Out[7]:
file_name malware_type_x sample_label family_name family_label
0 00027c21667d9119a454df8cef2dc1c7 Trojan:JS/Redirector.QE 4 JS.Trojan.Redirector 4
1 0003887ab64b8ae19ffa988638decac2 OK 0 unknown 0
2 0004376a62e22f6ad359467eb742b8ff Worm:Win32/Picsys.C 6 Win32.Worm.Picsys 6
3 000634f03457d088c71dbffb897b1315 Worm:Win32/Rebhip 9 Win32.Worm.Rebhip 9
4 00072ed24314e91b63b425b3dc572f50 VirTool:Win32/VBInject.UG 10 Win32.VirTool.VBInject 10

In [ ]:


In [4]:
pentropy = pd.read_csv('data/sorted-entropy-features-vs252.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs252.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs252.csv')
pentropy.head()


Out[4]:
file_name entropy file_size
0 00002e640cafb741bea9a48eaee27d6f 0.992174 208860
1 000118d12cbf9ad6103e8b914a6e1ac3 0.834382 201600
2 0001776237ac37a69fcef93c1bac0988 0.966021 682192
3 000403e4e488356b7535cc613fbeb80b 0.773787 199168
4 0004c8b2a0f4680a5694d74199b40ea2 0.985592 1165440

In [5]:
ftypes = pfileid['file_type']
html_files = []
for idx, ftype in enumerate(ftypes):
    if 'HTML' in ftype:
        html_files.append(pfileid.iloc[idx, 0])
        
print("Found {:d} HTML files.".format(len(html_files)))


Found 4292 HTML files.

In [6]:
html_files[:10]


Out[6]:
['0012a82ce8e0107d909959961d5862a1',
 '001833c06efcaa39e7803c6a369e99dd',
 '001e6e1f510250f4de06f8c9c2784d45',
 '00314e1bd4ecb9a50efe307ca2d001b7',
 '00423f1656a26c53a787304f27aa60cd',
 '00585f47c137ee073c125974ca07db0b',
 '0058892a141817fdaa8ad8484c01676e',
 '006fca77167b682331565fb28c8d19c1',
 '007bf075c3bed22ca90ca0dbe37c2ba1',
 '0084401543c3290a508920942baabc01']

In [7]:
fop = open('data/html-file-list-vs252.txt','w')
for fname in html_files:
    fop.write(fname + "\n")
    
fop.close()

In [ ]:


In [2]:
pentropy = pd.read_csv('data/sorted-entropy-features-vs263.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs263.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs263.csv')
pentropy.head()


Out[2]:
file_name entropy file_size
0 0002b2f621ea5786be03bf4153532dce 0.684706 81812
1 000401419eccde59975c713cfadc974c 0.800788 137131
2 00042f23bc15b89d9c6a7bde0e316f8b 0.989429 861184
3 0004824a60ff9fe1fb30d669a5baa627 0.802050 137630
4 0004c49071481789f1c8c80656638497 0.670559 22817

In [3]:
ftypes = pfileid['file_type']
html_files = []
for idx, ftype in enumerate(ftypes):
    if 'HTML' in ftype:
        html_files.append(pfileid.iloc[idx, 0])
        
print("Found {:d} HTML files.".format(len(html_files)))


Found 11583 HTML files.

In [4]:
html_files[:10]


Out[4]:
['0004c49071481789f1c8c80656638497',
 '000a79e98b2a1a3bff2bcb93042d3e78',
 '000e05a41be370f9912b48bec3c0905e',
 '00193fa83d95f128c9ceaa1cbe4d84d8',
 '001f9cd2f4f410a658b38f03b62d169d',
 '00213f2cb1fd551508ef94a9b4c12c54',
 '002a356ec9dd0ee546d6183440d85286',
 '003109e0dc470fd52275547578725555',
 '004d11a99ffd3548bc25bd19e448e954',
 '0053ea7eebd7b241fd957b4a2b1a8c80']

In [5]:
fop = open('data/html-file-list-vs263.txt','w')
for fname in html_files:
    fop.write(fname + "\n")
    
fop.close()

In [ ]:


In [2]:
pentropy = pd.read_csv('data/sorted-entropy-features-vs264.csv')
pfileid = pd.read_csv('data/sorted-file-id-features-vs264.csv')
plabels = pd.read_csv('data/sorted-train-labels-vs264.csv')
pentropy.head()


Out[2]:
file_name entropy file_size
0 000070db76b6dc1ee3497a3f9319848c 0.646594 81003.0
1 00009cbc0a90337e4c30950a51ae3d67 0.834079 700416.0
2 0003c05a1320e64fe72438ab48da7ecf 0.689533 121344.0
3 0003e52a9267b657d9b08b2cbc0a2593 0.662226 29585.0
4 0005743596135fe65f61da7a0eba0bb6 0.700392 178190.0

In [3]:
ftypes = pfileid['file_type']
html_files = []
for idx, ftype in enumerate(ftypes):
    if 'HTML' in ftype:
        html_files.append(pfileid.iloc[idx, 0])
        
print("Found {:d} HTML files.".format(len(html_files)))


Found 41645 HTML files.

In [4]:
html_files[:10]


Out[4]:
['000070db76b6dc1ee3497a3f9319848c',
 '0003c05a1320e64fe72438ab48da7ecf',
 '0003e52a9267b657d9b08b2cbc0a2593',
 '0005743596135fe65f61da7a0eba0bb6',
 '000894ead589179a4a86dabe81661397',
 '000947473242042bf3be73852ed4b6c7',
 '000a29f46a0c60404ebb8af94f85daca',
 '000b72af6b43db3f9a5f0b59c44c5ab8',
 '000e89e8ca55d41e61d79eb55edb4108',
 '000ea53f08083c4ab0314f04c12b1eeb']

In [5]:
fop = open('data/html-file-list-vs264.txt','w')
for fname in html_files:
    fop.write(fname + "\n")
    
fop.close()

In [ ]:


In [ ]:


In [ ]: