In [2]:
import pefile
from pefile import PEFormatError
import os
from collections import Counter
import pandas as pd
from pandas import DataFrame as df
import prettyplotlib as ppl
import numpy as np
# prettyplotlib imports
import matplotlib.pyplot as plt
import matplotlib as mpl
from prettyplotlib import brewer2mpl
%matplotlib inline
#pd.options.display.mpl_style = 'default'
In [3]:
# Tallies for the benign sample set. Each name is bound to its own,
# initially empty Counter (the generator builds five distinct objects).
(num_Sections,
 section_Names,
 num_Imports,
 imported_DLLs,
 imported_Functions) = (Counter() for _ in range(5))
In [4]:
# Walk every file in the benign training directory and tally its PE
# structure (section count, section names, import count, imported DLLs and
# imported functions) into the benign counters.
for eachFile in os.listdir('train/benign'):
    try:
        pe = pefile.PE(os.path.join('train/benign', eachFile))
    except PEFormatError:
        # Not a parsable PE image: skip it instead of aborting the whole
        # scan part-way through with half-filled counters.
        continue
    num_Sections[len(pe.sections)] += 1
    for eachSection in pe.sections:
        # Drop NUL bytes from the raw section name and fold case so that
        # differently-cased spellings are counted together.
        refined_name = eachSection.Name.replace('\x00','').lower()
        section_Names[refined_name] += 1
    # The attribute is absent when the image has no import table; treat
    # that as "zero imports" rather than raising AttributeError.
    import_entries = getattr(pe, 'DIRECTORY_ENTRY_IMPORT', [])
    num_Imports[len(import_entries)] += 1
    for eachImport in import_entries:
        imported_DLLs[eachImport.dll.lower()] += 1
        for eachFunction in eachImport.imports:
            # NOTE(review): imports by ordinal presumably have name None and
            # are all counted under the None key -- confirm this is wanted.
            imported_Functions[eachFunction.name] += 1
In [5]:
# Histogram of "how many sections does a benign file have".
# Cover every section count up to the largest one observed so gaps show up
# as empty bars. Start at 1, or at 0 if some file had no sections at all
# (the original range silently dropped those files).
first_count = min(1, min(num_Sections))
all_sections = list(range(first_count, max(num_Sections) + 1))
all_section_counts = [num_Sections[x] for x in all_sections]
num_sections_df = df({'Sections':all_sections, 'Count':all_section_counts})
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))
    # Bars are positioned at 0..n-1, so label each bar with the section
    # count it represents; otherwise the x axis reads one lower than the
    # real number of sections.
    ppl.bar(ax, np.arange(len(num_sections_df)), num_sections_df['Count'],
            annotate=True, xticklabels=all_sections)
    ax.set_title('Number of PE sections in benign files', fontsize=20)
    plt.xlabel("Number of sections")
    plt.ylabel("Number of files with that number of sections")
    plt.show()
In [6]:
# Ten most common section names in benign files, as a horizontal bar chart.
# most_common() yields (name, count) pairs from most to least frequent.
# Horizontal bars are drawn bottom-up, so reverse the list to put the
# largest count on top. This replaces DataFrame.sort(), which no longer
# exists in current pandas, and leaves a clean 0..n-1 index.
section_names_df = df(section_Names.most_common(10)[::-1],
                      columns=['Name','Count'])
ypos = np.arange(len(section_names_df)) + .1
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))
    ppl.barh(ax, ypos, section_names_df['Count'], annotate=True)
    # Offset the tick positions so each label lines up with its bar.
    plt.yticks(ypos +.4, section_names_df['Name'])
    # Leave 10% headroom on the right for the value annotations.
    plt.xlim(0, max(section_names_df['Count'])*1.1)
    plt.title("10 most frequently seen PE section names (benign)", fontsize=20)
    plt.xlabel("Number of files with that section")
    plt.show()
In [7]:
# Ten most frequently imported DLLs in benign files, as a horizontal bar
# chart.
# most_common() is ordered from most to least frequent; reversing it gives
# the ascending order needed for bottom-up horizontal bars without calling
# DataFrame.sort(), which no longer exists in current pandas.
dll_df = df(imported_DLLs.most_common(10)[::-1], columns=['Name','Count'])
ypos = np.arange(len(dll_df)) + .1
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))
    ppl.barh(ax, ypos, dll_df['Count'], annotate=True)
    # Offset the tick positions so each label lines up with its bar.
    plt.yticks(ypos+.4, dll_df['Name'])
    # Leave 10% headroom on the right for the value annotations.
    plt.xlim(0, max(dll_df['Count'])*1.1)
    plt.title('10 most frequently imported DLLs (benign)', fontsize=20)
    plt.xlabel('Number of files importing that DLL')
    plt.show()
In [8]:
# Tallies for the malicious sample set. Each name is bound to its own,
# initially empty Counter (the generator builds five distinct objects).
(mal_num_Sections,
 mal_section_Names,
 mal_num_Imports,
 mal_imported_DLLs,
 mal_imported_Functions) = (Counter() for _ in range(5))
In [9]:
# Walk every file in the malicious training directory and tally its PE
# structure (section count, section names, import count, imported DLLs and
# imported functions) into the mal_* counters.
for eachFile in os.listdir('train/malicious'):
    try:
        pe = pefile.PE(os.path.join('train/malicious', eachFile))
    except PEFormatError:
        # Not a parsable PE image: skip it instead of aborting the whole
        # scan part-way through with half-filled counters.
        continue
    mal_num_Sections[len(pe.sections)] += 1
    for eachSection in pe.sections:
        # Drop NUL bytes from the raw section name and fold case so that
        # differently-cased spellings are counted together.
        refined_name = eachSection.Name.replace('\x00','').lower()
        mal_section_Names[refined_name] += 1
    # The attribute is absent when the image has no import table; treat
    # that as "zero imports" rather than raising AttributeError.
    import_entries = getattr(pe, 'DIRECTORY_ENTRY_IMPORT', [])
    mal_num_Imports[len(import_entries)] += 1
    for eachImport in import_entries:
        mal_imported_DLLs[eachImport.dll.lower()] += 1
        for eachFunction in eachImport.imports:
            # NOTE(review): imports by ordinal presumably have name None and
            # are all counted under the None key -- confirm this is wanted.
            mal_imported_Functions[eachFunction.name] += 1
In [10]:
# Histogram of "how many sections does a malicious file have".
# Cover every section count up to the largest one observed so gaps show up
# as empty bars. Start at 1, or at 0 if some file had no sections at all
# (the original range silently dropped those files).
mal_first_count = min(1, min(mal_num_Sections))
mal_all_sections = list(range(mal_first_count, max(mal_num_Sections) + 1))
mal_all_section_counts = [mal_num_Sections[x] for x in mal_all_sections]
mal_num_sections_df = df({'Sections':mal_all_sections, 'Count':mal_all_section_counts})
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))
    # Bars are positioned at 0..n-1, so label each bar with the section
    # count it represents; otherwise the x axis reads one lower than the
    # real number of sections.
    ppl.bar(ax, np.arange(len(mal_num_sections_df)), mal_num_sections_df['Count'],
            annotate=True, color="#756b94", xticklabels=mal_all_sections)
    ax.set_title('Number of PE sections in malicious files', fontsize=20)
    plt.xlabel("Number of sections")
    plt.ylabel("Number of files with that number of sections")
    plt.show()
In [11]:
# Ten most common section names in malicious files, as a horizontal bar
# chart.
# most_common() yields (name, count) pairs from most to least frequent.
# Horizontal bars are drawn bottom-up, so reverse the list to put the
# largest count on top. This replaces DataFrame.sort(), which no longer
# exists in current pandas, and leaves a clean 0..n-1 index.
mal_section_names_df = df(mal_section_Names.most_common(10)[::-1],
                          columns=['Name','Count'])
ypos = np.arange(len(mal_section_names_df)) + .1
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))
    ppl.barh(ax, ypos, mal_section_names_df['Count'], annotate=True, color="#756b94")
    # Offset the tick positions so each label lines up with its bar.
    plt.yticks(ypos +.4, mal_section_names_df['Name'])
    # Leave 10% headroom on the right for the value annotations.
    plt.xlim(0, max(mal_section_names_df['Count'])*1.1)
    plt.title("10 most frequently seen PE section names (malicious)", fontsize=20)
    plt.xlabel("Number of files with that section")
    plt.show()
In [12]:
# Ten most frequently imported DLLs in malicious files, as a horizontal bar
# chart.
# most_common() is ordered from most to least frequent; reversing it gives
# the ascending order needed for bottom-up horizontal bars without calling
# DataFrame.sort(), which no longer exists in current pandas.
mal_dll_df = df(mal_imported_DLLs.most_common(10)[::-1], columns=['Name','Count'])
ypos = np.arange(len(mal_dll_df)) + .1
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))
    ppl.barh(ax, ypos, mal_dll_df['Count'], annotate=True, color="#756b94")
    # Offset the tick positions so each label lines up with its bar.
    plt.yticks(ypos+.4, mal_dll_df['Name'])
    # Leave 10% headroom on the right for the value annotations.
    plt.xlim(0, max(mal_dll_df['Count'])*1.1)
    plt.title('10 most frequently imported DLLs (malicious)', fontsize=20)
    plt.xlabel('Number of files importing that DLL')
    plt.show()
In [13]:
# Side-by-side benign/malicious import counts for every DLL seen in either
# corpus, ordered by combined count (largest last).
# Set union works on both Python 2 lists and Python 3 dict views, unlike
# `keys() + keys()`. The names are ordered in plain Python before building
# the frame, so no DataFrame.sort() call (gone from current pandas) is
# needed; repr(name) breaks ties so the row order does not depend on set
# iteration order.
dll_names = sorted(
    set(imported_DLLs) | set(mal_imported_DLLs),
    key=lambda name: (imported_DLLs[name] + mal_imported_DLLs[name], repr(name)))
kevdog = df({'Name': dll_names})
# Looking up a missing key in a Counter yields 0, so DLLs seen in only one
# corpus get a zero in the other column.
kevdog['benign'] = [imported_DLLs[x] for x in kevdog['Name']]
kevdog['malicious'] = [mal_imported_DLLs[x] for x in kevdog['Name']]
kevdog['total'] = kevdog['benign'] + kevdog['malicious']
# Flag DLLs imported at least 20% more often by malicious files.
kevdog['possible'] = kevdog['malicious'] > kevdog['benign'] * 1.2
kevdog.tail(10)
Out[13]:
In [14]:
# Grouped horizontal bars: benign vs. malicious import counts for the ten
# DLLs with the highest combined count (the last ten rows of `kevdog`).
# To plot only the DLLs flagged as 'possible', filter `kevdog` on that
# column before slicing.
deez_data = kevdog.iloc[-10:]
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(16, 10))
    # Three-colour qualitative palette; entries 1 and 2 are used below.
    set2 = brewer2mpl.get_map('Set2', 'Qualitative', 3).mpl_colors
    height = 0.35
    # Centre of each malicious bar, and of the benign bar one bar-height
    # above it.
    ypos = np.arange(len(deez_data)) + height
    vpos = ypos + height
    malicious = ppl.barh(ypos, deez_data['malicious'], height,
                         color=set2[1], align='center')
    benign = ppl.barh(vpos, deez_data['benign'], height,
                      color=set2[2], align='center')
    # loc=7 is matplotlib's numeric code for 'center right'.
    ax.legend([malicious, benign], ['Malicious','Benign'], loc=7)
    # Put each label midway between the two bars of its pair.
    ax.set_yticks(ypos + height / 2)
    ax.set_yticklabels(list(deez_data['Name']))
    ax.set_xlabel("Number of Occurences", fontsize=12)
    ax.set_ylabel("DLL Name", fontsize=12)
    ax.set_title("DLL Imports for benign and malicious files", fontsize=20)
    plt.show()
So far there doesn't appear to be a striking difference between the DLL imports of malicious files and those of benign files, so there probably isn't a useful feature we can extract from this information.
In [15]:
# Side-by-side benign/malicious counts for every section name seen in
# either corpus, ordered by combined count (largest last).
# Set union works on both Python 2 lists and Python 3 dict views, unlike
# `keys() + keys()`. The names are ordered in plain Python before building
# the frame, so no DataFrame.sort() call (gone from current pandas) is
# needed; repr(name) breaks ties so the row order does not depend on set
# iteration order.
pe_section_names = sorted(
    set(section_Names) | set(mal_section_Names),
    key=lambda name: (section_Names[name] + mal_section_Names[name], repr(name)))
pe_sections = df({'name': pe_section_names})
# Looking up a missing key in a Counter yields 0, so names seen in only one
# corpus get a zero in the other column.
pe_sections['benign'] = [section_Names[x] for x in pe_sections['name']]
pe_sections['malicious'] = [mal_section_Names[x] for x in pe_sections['name']]
pe_sections['total'] = pe_sections['benign'] + pe_sections['malicious']
pe_sections.tail(10)
Out[15]:
In [16]:
# Grouped horizontal bars: benign vs. malicious counts for the ten section
# names with the highest combined count (the last ten rows of
# `pe_sections`). Reuses the `set2` palette created in the DLL chart cell.
deez_sections = pe_sections.iloc[-10:]
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(16, 10))
    height = 0.35
    # Centre of each malicious bar, and of the benign bar one bar-height
    # above it.
    ypos = np.arange(len(deez_sections)) + height
    vpos = ypos + height
    malicious = ppl.barh(ypos, deez_sections['malicious'], height,
                         color=set2[1], align='center')
    benign = ppl.barh(vpos, deez_sections['benign'], height,
                      color=set2[2], align='center')
    # loc=7 is matplotlib's numeric code for 'center right'.
    ax.legend([malicious, benign], ['Malicious','Benign'], loc=7)
    # Put each label midway between the two bars of its pair.
    ax.set_yticks(ypos + height / 2)
    ax.set_yticklabels(list(deez_sections['name']))
    ax.set_xlabel("Number of Occurences", fontsize=12)
    ax.set_ylabel("PE Section Name", fontsize=12)
    ax.set_title("PE Sections for benign and malicious files", fontsize=20)
    plt.show()
PE sections seem to give us something to work with. The top 7 PE section names are used by both malicious and benign files, but beyond the top seven you get into really obscure names (likely random) which are used only by malware and not by regular files. So one feature we could look at is whether or not a file has a PE section whose name isn't in the top 7.
In [35]:
# Benign vs. malicious import counts per function, ordered by how much more
# often malicious files import it (largest surplus last), then a grouped
# bar chart of the ten most malicious-skewed functions.
# Set union works on both Python 2 lists and Python 3 dict views, unlike
# `keys() + keys()`. The names are ordered in plain Python before building
# the frame, so no DataFrame.sort() call (gone from current pandas) is
# needed; repr(name) breaks ties deterministically and also copes with a
# None name.
function_names = sorted(
    set(mal_imported_Functions) | set(imported_Functions),
    key=lambda name: (mal_imported_Functions[name] - imported_Functions[name],
                      repr(name)))
functions_df = df({'name': function_names})
# Looking up a missing key in a Counter yields 0, so functions seen in only
# one corpus get a zero in the other column.
functions_df['benign'] = [imported_Functions[x] for x in functions_df['name']]
functions_df['malicious'] = [mal_imported_Functions[x] for x in functions_df['name']]
functions_df['total'] = functions_df['benign'] + functions_df['malicious']
# Flag functions imported at least twice as often by malicious files.
functions_df['possible'] = functions_df['malicious'] >= functions_df['benign'] * 2
functions_df['delta'] = functions_df['malicious'] - functions_df['benign']
plot_data = functions_df.tail(10)
with ppl.pretty:
    fig, ax = plt.subplots(1,figsize=(16,10))
    height = 0.35
    # Centre of each malicious bar, and of the benign bar one bar-height
    # above it.
    ypos = np.arange(len(plot_data['name'])) + height
    vpos = ypos + height
    # `set2` is the palette created in the DLL comparison chart cell.
    malicious = ppl.barh(ypos, plot_data['malicious'],height,color=set2[1],align='center')
    benign = ppl.barh(vpos, plot_data['benign'], height, color=set2[2], align='center')
    ax.legend([malicious, benign], ['Malicious','Benign'], loc=7)
    # Put each label midway between the two bars of its pair.
    plt.yticks(ypos+0.175, list(plot_data['name']))
    # The figure previously had no title or axis labels and could not be
    # read on its own.
    plt.xlabel("Number of imports", fontsize=12)
    plt.ylabel("Function name", fontsize=12)
    plt.title("Imported functions most skewed toward malicious files", fontsize=20)
    plt.show()
In [ ]: