In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import matplotlib.mlab as mlab
import warnings
warnings.filterwarnings('ignore') # this is really annoying.
In [2]:
all_combined_train_features = pd.read_csv('data/combined-pe-features-apt.csv')
combined_train_features = pd.read_csv('data/combined-pe-features-apt-reduced.csv')
# BROKEN DO NOT USE: function_counts = pd.read_csv('data/call-graph-reduced-function_counts-apt.csv')
train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-apt.csv')
combined_train_features.head()
Out[2]:
In [14]:
all_combined_train_features.head()
Out[14]:
In [5]:
# BROKEN: do not use.
function_counts.head()
Out[5]:
In [2]:
train_labels.head()
Out[2]:
In [3]:
# Feature matrix: drop the first column (sample identifier); labels from CSV.
X_all = all_combined_train_features.iloc[:,1:]
y = train_labels['label']
X_all.head()
Out[3]:
In [ ]:
X_all.describe()
In [4]:
# Train feature stats
# Per-feature summary statistics plus pairwise correlation/covariance matrices.
X_all_train_means = X_all.mean()
X_all_train_medians = X_all.median()
X_all_train_maxs = X_all.max()
X_all_train_mins = X_all.min()
X_all_train_std = X_all.std()
X_all_train_cor = X_all.corr()
X_all_train_cov = X_all.cov()
X_all_train_maxs.head()
Out[4]:
In [3]:
X_all.shape
Out[3]:
In [12]:
X_all_train_cor.head()
Out[12]:
In [10]:
# Note: the default index is kept on purpose — the corr matrix rows are
# labelled by feature name.
X_all_train_cor.to_csv('data/pe-coff-feature-corr-apt.csv')
In [11]:
X_all_train_cov.head()
Out[11]:
In [13]:
X_all_train_cov.to_csv('data/pe-coff-feature-cov-apt.csv')
In [4]:
all_column_names = list(all_combined_train_features.columns)
In [ ]:
all_column_names[:20]
In [5]:
# Build a per-feature statistics table; skip column 0 (sample identifier).
all_train_stats = pd.DataFrame()
all_train_stats['feature_name'] = all_column_names[1:]
all_train_stats.head()
Out[5]:
In [6]:
# Assemble the summary statistics computed above into one table, row per feature.
all_train_stats['mean'] = list(X_all_train_means)
all_train_stats['median'] = list(X_all_train_medians)
all_train_stats['standard_deviation'] = list(X_all_train_std)
all_train_stats['max'] = list(X_all_train_maxs)
all_train_stats['min'] = list(X_all_train_mins)
all_train_stats.head()
Out[6]:
In [7]:
all_train_stats.to_csv('data/pe-coff-train-stats-apt.csv')
In [ ]:
plt.figure(figsize=(15,15))
# plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=plt.cm.coolwarm)
plt.scatter(X_all['es'], X_all['ds'], c=y, cmap='brg')
plt.xlabel('ES Register')
plt.ylabel('DS Register')
plt.title('PE/COFF ASM Register Scatter Plot')
plt.show()
In [5]:
plt.figure(figsize=(15,15))
x_graph = X_all['entropy']
num_bins = 100
n, bins, patches = plt.hist(x_graph, num_bins, normed=1, facecolor='green', alpha=0.5)
# add a 'best fit' line
y = mlab.normpdf(bins, x_graph.mean(), x_graph.std())
plt.plot(bins, y, 'r--')
plt.xlabel('PE/COFF File Entropy')
plt.ylabel('PE/COFF Sample Count')
plt.title('PE/COFF Histogram Plot')
plt.show()
In [6]:
plt.figure(figsize=(15,15))
# rectangular box plot
plt.boxplot(x_graph, showfliers=True, patch_artist=True)
plt.show()
In [7]:
plt.figure(figsize=(15,15))
x_graph = X_all['vertex_count']
num_bins = 100
n, bins, patches = plt.hist(x_graph, num_bins, normed=1, facecolor='green', alpha=0.5)
# add a 'best fit' line
y = mlab.normpdf(bins, x_graph.mean(), x_graph.std())
plt.plot(bins, y, 'r--')
plt.xlabel('PE/COFF Vertex Count')
plt.ylabel('PE/COFF Sample Count')
plt.title('PE/COFF Histogram Plot')
plt.show()
In [6]:
all_train_stats = pd.read_csv('data/pe-coff-train-stats-apt.csv', index_col=0)
all_train_stats.head()
Out[6]:
In [7]:
X = all_train_stats.iloc[:,1:]
y = train_labels['label']
In [14]:
X['mean'].mean()
Out[14]:
In [16]:
X['mean'].std()
Out[16]:
In [ ]:
plt.figure(figsize=(15,15))
x_graph = X['mean']
num_bins = 100
# the histogram of the data, we have to remove the PE header characteristic stats as
# they are not counts but contain memory base location values and other large values
# that produce huge outliers that ruin the graphs.
n, bins, patches = plt.hist(x_graph[0:119], num_bins, normed=1, facecolor='green', alpha=0.5)
# add a 'best fit' line
y = mlab.normpdf(bins, x_graph[0:119].mean(), x_graph[0:119].std())
plt.plot(bins, y, 'r--')
plt.xlabel('ASM Feature Means')
plt.ylabel('ASM Feature Probability')
plt.title('PE/COFF Histogram Plot')
plt.show()
In [ ]:
plt.figure(figsize=(15,15))
x_graph = X['median']
num_bins = 100
# the histogram of the data, we have to remove the PE header characteristic stats as
# they are not counts but contain memory base location values and other large values
# that produce huge outliers that ruin the graphs.
n, bins, patches = plt.hist(x_graph[0:119], num_bins, normed=1, facecolor='green', alpha=0.5)
# add a 'best fit' line
y = mlab.normpdf(bins, x_graph[0:119].mean(), x_graph[0:119].std())
plt.plot(bins, y, 'r--')
plt.xlabel('ASM Feature Counts')
plt.ylabel('ASM Feature Means')
plt.title('PE/COFF Histogram Plot')
plt.show()
In [ ]:
plt.figure(figsize=(15,15))
x_graph = all_combined_train_features['edx']
num_bins = 100
# the histogram of the data, we have to remove the PE header characteristic stats as
# they are not counts but contain memory base location values and other large values
# that produce huge outliers that ruin the graphs.
n, bins, patches = plt.hist(x_graph, num_bins, normed=1, facecolor='green', alpha=0.5)
# add a 'best fit' line
y = mlab.normpdf(bins, x_graph.mean(), x_graph.std())
plt.plot(bins, y, 'r--')
plt.xlabel('EDX Feature Counts')
plt.ylabel('EDX Feature Probability')
plt.title('PE/COFF Histogram Plot')
plt.show()
In [ ]:
plt.figure(figsize=(15,15))
# rectangular box plot
plt.boxplot(x_graph, showfliers=True, patch_artist=True)
plt.show()
In [ ]:
help(plt.boxplot)
In [ ]:
plt.figure(figsize=(15,15))
# rectangular box plot
plt.boxplot(x_graph[0:119], vert=True, patch_artist=True)
plt.show()
In [ ]:
# notch shape box plot
bplot2 = axes[1].boxplot(all_data,
notch=True, # notch shape
vert=True, # vertical box aligmnent
patch_artist=True) # fill with color
In [ ]:
plt.figure(figsize=(15,15))
# Plot also the training points
# plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=plt.cm.coolwarm)
plt.scatter(range(X.shape[0]), X.iloc[:, 0], c=range(X.shape[0]), cmap='brg')
plt.xlabel('ASM Feature Index')
plt.ylabel('ASM Feature Means')
plt.title('PE/COFF Scatter Plot')
plt.yscale('log')
plt.show()
In [ ]:
In [ ]:
help(pd.read_csv)
In [ ]:
In [2]:
column_names = list(combined_train_features.columns)
In [ ]:
column_names[:10]
In [3]:
X = combined_train_features.iloc[:,1:]
y = train_labels['label']
In [4]:
# Train feature stats
# X_train_stats = X.describe()
X_train_means = X.mean()
X_train_medians = X.median()
X_train_std = X.std()
X_train_cor = X.corr()
X_train_cov = X.cov()
X_train_means.head()
Out[4]:
In [6]:
train_stats = pd.DataFrame()
train_stats['feature_name'] = column_names[1:]
train_stats.head()
Out[6]:
In [8]:
#train_stats= pd.DataFrame.join(X_train_means, X_train_medians)
#train_stats = pd.concat([X_train_means, X_train_medians, X_train_std])
#train_stats = train_stats.concat(pd.DataFrame(X_train_medians))
#train_stats = train_stats.merge(pd.DataFrame(X_train_std))
train_stats['mean'] = list(X_train_means) #.join(X_train_medians.to_frame())
train_stats['median'] = list(X_train_medians)
train_stats['standard_deviation'] = list(X_train_std)
#train_stats.columns = ['mean','median','std']
train_stats.head()
Out[8]:
In [ ]:
plt.figure(figsize=(15,15))
# Plot also the training points
# plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=plt.cm.coolwarm)
plt.scatter(range(X.shape[0]), X.iloc[:, 0], c=y, cmap='brg')
plt.xlabel('Sample Index')
plt.ylabel('EDI')
plt.title('PE/COFF Scatter Plot')
plt.show()
In [ ]:
plt.figure(figsize=(15,15))
plt.scatter(range(X.shape[0]), X.iloc[:, 1], c=y, cmap='brg')
plt.xlabel('Sample Index')
plt.ylabel('ESI')
plt.title('PE/COFF Scatter Plot')
plt.show()
In [ ]:
plt.figure(figsize=(15,15))
plt.scatter(range(X_all.shape[0]), X_all.loc[:,'entropy'], c=y, cmap='brg')
plt.xlabel('Sample Index')
plt.ylabel('ESI')
plt.title('PE/COFF Scatter Plot')
plt.show()
In [ ]:
plt.figure(figsize=(15,15))
# Plot also the training points
# plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=plt.cm.coolwarm)
plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap='brg')
plt.xlabel('EDI')
plt.ylabel('ESI')
plt.title('PE/COFF Scatter Plot')
plt.show()
In [4]:
plt.figure(figsize=(15,15))
column_names = list(combined_train_features.columns)
# Plot also the training points
# plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=plt.cm.coolwarm)
plt.scatter(X.loc[:,'vertex_count'], X.loc[:,'edge_count'], c=X.loc[:,'delta_max'], cmap='brg')
plt.xlabel('vertex_count')
plt.ylabel('edge_count')
plt.title('PE/COFF Scatter Plot')
plt.show()
In [ ]:
column_names = list(combined_train_features.columns)
plt.figure(figsize=(20,400))
for idx in range(20):
plt.subplot(20, 1, idx + 1)
idx2 = idx + 1
plt.scatter(X.iloc[:,idx], X.iloc[:,idx2], c=y, cmap='brg')
plt.xlabel(column_names[idx])
plt.ylabel(column_names[idx2])
plt.title('PE/COFF Scatter Plot')
plt.show()
In [ ]:
help(plt.subplot)
In [ ]:
In [2]:
all_combined_features = pd.read_csv('data/combined-pe-features-vs251.csv')
y = pd.read_csv('data/sorted-pe-coff-train-labels-vs251.csv')
all_combined_features.head()
Out[2]:
In [5]:
y.head()
Out[5]:
In [5]:
column_names = all_combined_features.columns
column_names
Out[5]:
In [3]:
X_all = all_combined_features.iloc[:,1:]
#y_all = y['family_label']
y_all = list(y['label'])
len(y_all)
Out[3]:
In [4]:
X_all.describe()
Out[4]:
In [6]:
# Train feature stats
X_all_train_means = X_all.mean()
X_all_train_medians = X_all.median()
X_all_train_maxs = X_all.max()
X_all_train_mins = X_all.min()
X_all_train_std = X_all.std()
X_all_train_cor = X_all.corr()
X_all_train_cov = X_all.cov()
X_all_train_maxs.head()
Out[6]:
In [8]:
vs251_stats = pd.DataFrame()
vs251_stats['feature_name'] = list(column_names[1:])
vs251_stats.head()
Out[8]:
In [10]:
vs251_stats['mean'] = list(X_all_train_means)
vs251_stats['median'] = list(X_all_train_medians)
vs251_stats['standard_deviation'] = list(X_all_train_std)
vs251_stats['min'] = list(X_all_train_mins)
vs251_stats['max'] = list(X_all_train_maxs)
vs251_stats.head()
Out[10]:
In [11]:
vs251_stats.to_csv('data/pe-coff-statistics-vs251.csv', index=False)
In [12]:
X_all_train_cor.to_csv('data/pe-coff-corr-vs251.csv', index=False)
In [13]:
X_all_train_cov.to_csv('data/pe-coff-cov-vs251.csv', index=False)
In [9]:
type(X_all_train_means)
Out[9]:
In [10]:
plt.figure(figsize=(15,15))
#column_names = list(combined_train_features.columns)
# Plot also the training points
# plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=plt.cm.coolwarm)
plt.scatter(X_all.loc[:,'vertex_count'], X_all.loc[:,'edge_count'], c=y_all, cmap='brg')
plt.xlabel('vertex_count')
plt.ylabel('edge_count')
plt.title('PE/COFF Scatter Plot')
plt.show()
In [12]:
plt.figure(figsize=(15,15))
x_graph = X_all['entropy']
num_bins = 100
n, bins, patches = plt.hist(x_graph, num_bins, normed=1, facecolor='green', alpha=0.5)
# add a 'best fit' line
y = mlab.normpdf(bins, x_graph.mean(), x_graph.std())
plt.plot(bins, y, 'r--')
plt.xlabel('Shannons Entropy')
plt.ylabel('Feature Count')
plt.title('PE/COFF Histogram Plot')
plt.show()
In [13]:
plt.figure(figsize=(15,15))
# rectangular box plot
plt.boxplot(x_graph, vert=True, patch_artist=True)
plt.show()
In [5]:
function_count_features = pd.read_csv('data/sorted-pe-function-count-features-10percent-vs251.csv', na_filter=False)
sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs251.csv', na_filter=False)
y = pd.read_csv('data/sorted-pe-coff-train-labels-vs251.csv', na_filter=False)
function_count_features.head()
Out[5]:
In [4]:
sorted_train_labels.head()
Out[4]:
In [5]:
y.head()
Out[5]:
In [6]:
X_all = function_count_features.iloc[:,1:]
y_all = list(y['label'])
len(y_all)
Out[6]:
In [7]:
X_all.describe()
Out[7]:
In [8]:
function_count_features.shape
Out[8]:
In [8]:
temp_x_4_std = X_all['sub_4'].std() * 4
temp_x_4_std
Out[8]:
In [ ]:
In [9]:
# OK, We are good to go :)
# Lets have a look at some graphs first.
plt.figure(figsize=(15,15))
x_graph = X_all['sub_4'].clip(-temp_x_4_std, temp_x_4_std)
num_bins = 100
n, bins, patches = plt.hist(x_graph, num_bins, normed=1, facecolor='green', alpha=0.5)
# add a 'best fit' line
y = mlab.normpdf(bins, x_graph.mean(), x_graph.std())
plt.plot(bins, y, 'r--')
plt.xlabel('Function Count (sub_4)')
plt.ylabel('Function Count Percentage')
plt.title('PE/COFF Function Count Histogram Plot')
plt.show()
In [10]:
plt.figure(figsize=(15,15))
# rectangular box plot
plt.boxplot(x_graph, vert=True, patch_artist=True)
plt.show()
In [ ]:
In [ ]:
help(pd.DataFrame.clip)
In [ ]:
In [2]:
# VirusShare 252: load combined PE/COFF features and the matching labels.
all_combined_features = pd.read_csv('data/combined-pe-features-vs252.csv')
all_train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-vs252.csv')
all_combined_features.head()
Out[2]:
In [3]:
all_train_labels.head()
Out[3]:
In [3]:
# Drop the sample-identifier column; flatten labels to a plain list.
X_all = all_combined_features.iloc[:,1:]
y_all = list(all_train_labels['label'])
len(y_all)
Out[3]:
In [4]:
X_all.shape
Out[4]:
In [ ]:
feature_names = list(X_all.columns)
feature_names
In [10]:
# NOTE(review): this cell depends on `feature_names` from the unexecuted
# (In [ ]) cell above — it must be run first on a fresh kernel.
vs252_stats = pd.DataFrame()
vs252_stats['feature_name'] = feature_names
vs252_stats['mean'] = list(X_all.mean())
vs252_stats['median'] = list(X_all.median())
vs252_stats['standard_deviation'] = list(X_all.std())
vs252_stats['min'] = list(X_all.min())
vs252_stats['max'] = list(X_all.max())
vs252_stats.head()
Out[10]:
In [11]:
vs252_stats.to_csv('data/pe-coff-statistics-vs252.csv', index=False)
In [5]:
function_count_features = pd.read_csv('data/sorted-pe-function-count-features-10percent-vs252.csv', na_filter=False)
sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs252.csv', na_filter=False)
y = pd.read_csv('data/sorted-pe-coff-train-labels-vs252.csv', na_filter=False)
function_count_features.head()
Out[5]:
In [18]:
function_count_features.head(10)
Out[18]:
In [17]:
sorted_train_labels.head(10)
Out[17]:
In [13]:
y.head()
Out[13]:
In [6]:
X_all = function_count_features.iloc[:,1:]
y_all = list(y['label'])
len(y_all)
Out[6]:
In [7]:
X_all.describe()
Out[7]:
In [8]:
temp_x_4_std = X_all['sub_4'].std() * 4
temp_x_4_std
Out[8]:
In [9]:
# Histogram
plt.figure(figsize=(15,15))
x_graph = X_all['sub_4'].clip(-temp_x_4_std, temp_x_4_std)
num_bins = 100
n, bins, patches = plt.hist(x_graph, num_bins, normed=1, facecolor='green', alpha=0.5)
# add a 'best fit' line
y = mlab.normpdf(bins, x_graph.mean(), x_graph.std())
plt.plot(bins, y, 'r--')
plt.xlabel('Function Count (sub_4)')
plt.ylabel('Function Count Percentage')
plt.title('PE/COFF Function Count Histogram Plot')
plt.show()
In [10]:
# rectangular box plot
plt.figure(figsize=(15,15))
plt.boxplot(x_graph, vert=True, patch_artist=True)
plt.show()
In [13]:
#fig, axes = plt.subplots(nrows=2, ncols=10)
#df['A'].plot(ax=axes[0,0]); axes[0,0].set_title('A')
column_names = list(X_all.columns)
plt.figure(figsize=(20,400))
for idx in range(10):
plt.subplot(10, 1, idx + 1)
idx2 = idx + 1
plt.scatter(X_all.iloc[:,idx], X_all.iloc[:,idx2], c=y, cmap='brg')
plt.xlabel(column_names[idx])
plt.ylabel(column_names[idx2])
plt.title('PE/COFF Scatter Plot')
plt.show()
In [31]:
temp_x_3_std = X_all.iloc[:,0:10].std() * 3
temp_x_3_std
Out[31]:
In [32]:
temp_x_2_std = X_all.iloc[:,0:10].std() * 2
temp_x_2_std
Out[32]:
In [33]:
temp_x_1_std = X_all.iloc[:,0:10].std()
temp_x_1_std
Out[33]:
In [53]:
plt.figure(figsize=(20,10));
temp_x_3_std = X_all.iloc[:,0:10].std()
x_graph = X_all.iloc[:,0:10].clip(0, temp_x_3_std, axis=1) # apply the clip function column wise.
x_graph = X_all.iloc[:,0:10].clip(0, 10, axis=1)
bp = x_graph.boxplot()
plt.show()
In [59]:
X_all['__vba'].mean()
Out[59]:
In [60]:
X_all['__vba'].max()
Out[60]:
In [56]:
x_graph['__vba'].min()
Out[56]:
In [61]:
X_all['__vba'].std()
Out[61]:
In [16]:
plt.show()
In [ ]:
df = DataFrame
In [ ]:
help(pd.DataFrame.std)
In [ ]:
help(pd.DataFrame.clip)
In [2]:
# VirusShare 263: load combined PE/COFF features and the matching labels.
all_combined_features = pd.read_csv('data/combined-pe-features-vs263.csv')
all_train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-vs263.csv')
all_combined_features.head()
Out[2]:
In [3]:
# Drop the sample-identifier column.
X_all = all_combined_features.iloc[:,1:]
y_all = all_train_labels['label']
X_all.head()
Out[3]:
In [4]:
X_all.shape
Out[4]:
In [5]:
len(y_all)
Out[5]:
In [ ]:
feature_names = list(X_all.columns)
feature_names
In [8]:
# Per-feature summary statistics for the vs263 set.
# NOTE(review): depends on `feature_names` from the unexecuted cell above.
vs263_stats = pd.DataFrame()
vs263_stats['feature_name'] = feature_names
vs263_stats['mean'] = list(X_all.mean())
vs263_stats['median'] = list(X_all.median())
vs263_stats['standard_deviation'] = list(X_all.std())
vs263_stats['min'] = list(X_all.min())
vs263_stats['max'] = list(X_all.max())
vs263_stats.head()
Out[8]:
In [9]:
vs263_stats.to_csv('data/pe-coff-statistics-vs263.csv', index=False)
In [ ]:
In [2]:
# VirusShare 263: function-count features (10% sample) and labels.
function_count_features = pd.read_csv('data/sorted-pe-function-count-features-10percent-vs263.csv', na_filter=False)
sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs263.csv', na_filter=False)
y = pd.read_csv('data/sorted-pe-coff-train-labels-vs263.csv', na_filter=False)
function_count_features.head()
Out[2]:
In [3]:
X_all = function_count_features.iloc[:,1:]
y_all = list(y['label'])
len(y_all)
Out[3]:
In [4]:
sorted_train_labels.head()
Out[4]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [2]:
# VirusShare 264: load combined PE/COFF features and the matching labels.
all_combined_features = pd.read_csv('data/combined-pe-features-vs264.csv')
all_train_labels = pd.read_csv('data/sorted-pe-coff-train-labels-vs264.csv')
all_combined_features.head()
Out[2]:
In [3]:
all_train_labels.head()
Out[3]:
In [4]:
all_train_labels.shape
Out[4]:
In [5]:
all_combined_features.shape
Out[5]:
In [6]:
X_all = all_combined_features.iloc[:,1:]
y_all = list(all_train_labels['label'])
In [8]:
# Per-feature summary statistics for the vs264 set.
feature_names = list(X_all.columns)
vs264 = pd.DataFrame()
vs264['feature_name'] = feature_names
vs264['mean'] = list(X_all.mean())
vs264['median'] = list(X_all.median())
vs264['standard_deviation'] = list(X_all.std())
vs264['min'] = list(X_all.min())
vs264['max'] = list(X_all.max())
vs264.head()
Out[8]:
In [9]:
vs264.to_csv('data/pe-coff-statistics-vs264.csv', index=False)
In [ ]:
In [ ]:
# VirusShare 264: function-count features (10% sample) and labels.
function_count_features = pd.read_csv('data/sorted-pe-function-count-features-10percent-vs264.csv', na_filter=False)
sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs264.csv', na_filter=False)
y = pd.read_csv('data/sorted-pe-coff-train-labels-vs264.csv', na_filter=False)
function_count_features.head()
In [ ]:
In [ ]:
In [ ]: