Demographics information about benign and malicious files

This notebook goes through the files that were selected in the File Extraction notebook and gathers demographic information about them. It will not work unless the code from the File Extraction notebook has already been run.


In [2]:
import pefile
from pefile import PEFormatError 
import os
from collections import Counter
import pandas as pd
from pandas import DataFrame as df

import prettyplotlib as ppl
import numpy as np

# prettyplotlib imports
import matplotlib.pyplot as plt
import matplotlib as mpl
from prettyplotlib import brewer2mpl


%matplotlib inline
#pd.options.display.mpl_style = 'default'

In [3]:
# Tallies for the benign sample set, filled in by the scan loop that follows.
# Each one is an independent Counter:
#   num_Sections       - keyed by the number of sections in a file
#   section_Names      - keyed by section name
#   num_Imports        - keyed by the number of import entries in a file
#   imported_DLLs      - keyed by imported DLL name
#   imported_Functions - keyed by imported function name
(num_Sections,
 section_Names,
 num_Imports,
 imported_DLLs,
 imported_Functions) = (Counter() for _ in range(5))

In [4]:
# Tally section and import statistics for every benign sample.
for eachFile in os.listdir('train/benign'):
    try:
        pe = pefile.PE(os.path.join('train/benign', eachFile))
    except PEFormatError as err:
        # A file that cannot be parsed as a PE should not abort the whole
        # survey; report it so the omission is visible, then move on.
        print('Skipping %s: %s' % (eachFile, err))
        continue

    num_Sections[len(pe.sections)] += 1
    for eachSection in pe.sections:
        # The raw name may arrive as bytes (always on Python 3); decode it
        # before using str operations. latin-1 maps every byte to a
        # character, so decoding cannot fail on odd names.
        raw_name = eachSection.Name
        if isinstance(raw_name, bytes):
            raw_name = raw_name.decode('latin-1')
        refined_name = raw_name.replace('\x00', '').lower()
        section_Names[refined_name] += 1

    # pefile only sets DIRECTORY_ENTRY_IMPORT when the file actually has an
    # import directory; treat its absence as "zero imports" instead of
    # letting the AttributeError stop the loop.
    import_entries = getattr(pe, 'DIRECTORY_ENTRY_IMPORT', [])
    num_Imports[len(import_entries)] += 1
    for eachImport in import_entries:
        dll_name = eachImport.dll
        if isinstance(dll_name, bytes):
            dll_name = dll_name.decode('latin-1')
        imported_DLLs[dll_name.lower()] += 1
        for eachFunction in eachImport.imports:
            # NOTE(review): name is presumably None for imports by ordinal;
            # such entries are still tallied under the None key, as before.
            function_name = eachFunction.name
            if isinstance(function_name, bytes):
                function_name = function_name.decode('latin-1')
            imported_Functions[function_name] += 1

How many sections do most files have and what are they called?


In [5]:
# Bar chart: how many benign files have 1, 2, 3, ... sections.
# Every count from 1 up to the observed maximum is included so that unused
# counts show up as zero-height bars (files with zero sections are not plotted).
all_sections = list(range(1, max(num_Sections.keys()) + 1))
all_section_counts = [num_Sections[x] for x in all_sections]

num_sections_df = df({'Sections': all_sections, 'Count': all_section_counts})
xpos = np.arange(len(num_sections_df))
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

ppl.bar(ax, xpos, num_sections_df['Count'], annotate=True)
# The bars sit at 0-based positions, so label each one with the section count
# it represents. The +.4 offset is the same half-bar shift the horizontal
# charts in this notebook use (assumes edge-aligned bars of default width).
plt.xticks(xpos + .4, num_sections_df['Sections'])
ax.set_title('Number of PE sections in benign files', fontsize=20)
plt.xlabel("Number of sections")
plt.ylabel("Number of files with that number of sections")
plt.show()



In [6]:
# Horizontal bar chart of the ten most common section names in benign files.
section_names_df = (
    df(section_Names.most_common(10), columns=['Name', 'Count'])
    # Horizontal bars are drawn from the bottom up, so sort ascending to put
    # the most common name at the top. sort_values replaces DataFrame.sort,
    # which current pandas releases no longer provide.
    .sort_values('Count', ascending=True)
    .reset_index(drop=True)
)
ypos = np.arange(len(section_names_df)) + .1
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

ppl.barh(ax, ypos, section_names_df['Count'], annotate=True)
# Put each label at the vertical middle of its bar.
plt.yticks(ypos +.4, section_names_df['Name'])
# Leave 10% headroom on the right for the count annotations.
plt.xlim(0, max(section_names_df['Count'])*1.1)
plt.title("10 most frequently seen PE section names (benign)", fontsize=20)
plt.xlabel("Number of files with that section")
plt.show()



In [7]:
# Horizontal bar chart of the ten DLLs most often imported by benign files.
dll_df = (
    df(imported_DLLs.most_common(10), columns=['Name', 'Count'])
    # Ascending order puts the most imported DLL at the top of the chart.
    # sort_values replaces DataFrame.sort, which current pandas releases
    # no longer provide.
    .sort_values('Count', ascending=True)
    .reset_index(drop=True)
)
ypos = np.arange(len(dll_df)) + .1

with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

ppl.barh(ax, ypos, dll_df['Count'], annotate=True)
# Put each label at the vertical middle of its bar.
plt.yticks(ypos+.4, dll_df['Name'])
# Leave 10% headroom on the right for the count annotations.
plt.xlim(0, max(dll_df['Count'])*1.1)
plt.title('10 most frequently imported DLLs (benign)', fontsize=20)
plt.xlabel('Number of files importing that DLL')
plt.show()



In [8]:
# Tallies for the malicious sample set, mirroring the benign ones. Each is an
# independent Counter, filled in by the malicious scan loop that follows:
#   mal_num_Sections       - keyed by the number of sections in a file
#   mal_section_Names      - keyed by section name
#   mal_num_Imports        - keyed by the number of import entries in a file
#   mal_imported_DLLs      - keyed by imported DLL name
#   mal_imported_Functions - keyed by imported function name
(mal_num_Sections,
 mal_section_Names,
 mal_num_Imports,
 mal_imported_DLLs,
 mal_imported_Functions) = (Counter() for _ in range(5))

In [9]:
# Tally section and import statistics for every malicious sample.
for eachFile in os.listdir('train/malicious'):
    try:
        pe = pefile.PE(os.path.join('train/malicious', eachFile))
    except PEFormatError as err:
        # A file that cannot be parsed as a PE should not abort the whole
        # survey; report it so the omission is visible, then move on.
        print('Skipping %s: %s' % (eachFile, err))
        continue

    mal_num_Sections[len(pe.sections)] += 1
    for eachSection in pe.sections:
        # The raw name may arrive as bytes (always on Python 3); decode it
        # before using str operations. latin-1 maps every byte to a
        # character, so decoding cannot fail on odd names.
        raw_name = eachSection.Name
        if isinstance(raw_name, bytes):
            raw_name = raw_name.decode('latin-1')
        refined_name = raw_name.replace('\x00', '').lower()
        mal_section_Names[refined_name] += 1

    # pefile only sets DIRECTORY_ENTRY_IMPORT when the file actually has an
    # import directory; treat its absence as "zero imports" instead of
    # letting the AttributeError stop the loop.
    import_entries = getattr(pe, 'DIRECTORY_ENTRY_IMPORT', [])
    mal_num_Imports[len(import_entries)] += 1
    for eachImport in import_entries:
        dll_name = eachImport.dll
        if isinstance(dll_name, bytes):
            dll_name = dll_name.decode('latin-1')
        mal_imported_DLLs[dll_name.lower()] += 1
        for eachFunction in eachImport.imports:
            # NOTE(review): name is presumably None for imports by ordinal;
            # such entries are still tallied under the None key, as before.
            function_name = eachFunction.name
            if isinstance(function_name, bytes):
                function_name = function_name.decode('latin-1')
            mal_imported_Functions[function_name] += 1

In [10]:
# Bar chart: how many malicious files have 1, 2, 3, ... sections.
# Every count from 1 up to the observed maximum is included so that unused
# counts show up as zero-height bars (files with zero sections are not plotted).
mal_all_sections = list(range(1, max(mal_num_Sections.keys()) + 1))
mal_all_section_counts = [mal_num_Sections[x] for x in mal_all_sections]

mal_num_sections_df = df({'Sections': mal_all_sections, 'Count': mal_all_section_counts})
xpos = np.arange(len(mal_num_sections_df))
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

ppl.bar(ax, xpos, mal_num_sections_df['Count'], annotate=True, color="#756b94")
# The bars sit at 0-based positions, so label each one with the section count
# it represents. The +.4 offset is the same half-bar shift the horizontal
# charts in this notebook use (assumes edge-aligned bars of default width).
plt.xticks(xpos + .4, mal_num_sections_df['Sections'])
ax.set_title('Number of PE sections in malicious files', fontsize=20)
plt.xlabel("Number of sections")
plt.ylabel("Number of files with that number of sections")
plt.show()



In [11]:
# Horizontal bar chart of the ten most common section names in malicious files.
mal_section_names_df = (
    df(mal_section_Names.most_common(10), columns=['Name', 'Count'])
    # Horizontal bars are drawn from the bottom up, so sort ascending to put
    # the most common name at the top. sort_values replaces DataFrame.sort,
    # which current pandas releases no longer provide.
    .sort_values('Count', ascending=True)
    .reset_index(drop=True)
)
ypos = np.arange(len(mal_section_names_df)) + .1
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

ppl.barh(ax, ypos, mal_section_names_df['Count'], annotate=True, color="#756b94")
# Put each label at the vertical middle of its bar.
plt.yticks(ypos +.4, mal_section_names_df['Name'])
# Leave 10% headroom on the right for the count annotations.
plt.xlim(0, max(mal_section_names_df['Count'])*1.1)
plt.title("10 most frequently seen PE section names (malicious)", fontsize=20)
plt.xlabel("Number of files with that section")
plt.show()



In [12]:
# Horizontal bar chart of the ten DLLs most often imported by malicious files.
mal_dll_df = (
    df(mal_imported_DLLs.most_common(10), columns=['Name', 'Count'])
    # Ascending order puts the most imported DLL at the top of the chart.
    # sort_values replaces DataFrame.sort, which current pandas releases
    # no longer provide.
    .sort_values('Count', ascending=True)
    .reset_index(drop=True)
)
ypos = np.arange(len(mal_dll_df)) + .1

with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

ppl.barh(ax, ypos, mal_dll_df['Count'], annotate=True, color="#756b94")
# Put each label at the vertical middle of its bar.
plt.yticks(ypos+.4, mal_dll_df['Name'])
# Leave 10% headroom on the right for the count annotations.
plt.xlim(0, max(mal_dll_df['Count'])*1.1)
plt.title('10 most frequently imported DLLs (malicious)', fontsize=20)
plt.xlabel('Number of files importing that DLL')
plt.show()


Comparing malware and benign files


In [13]:
# One row per DLL imported by either sample set, with its benign and
# malicious import counts side by side.
# A set union is used because dict key views cannot be concatenated with "+"
# on Python 3.
kevdog = df({'Name': list(set(imported_DLLs) | set(mal_imported_DLLs))})
kevdog['benign'] = [imported_DLLs[x] for x in kevdog['Name']]
kevdog['malicious'] = [mal_imported_DLLs[x] for x in kevdog['Name']]
kevdog['total'] = kevdog['benign'] + kevdog['malicious']
# Flag DLLs whose malicious count exceeds 1.2 times their benign count.
kevdog['possible'] = kevdog['malicious'] > kevdog['benign'] * 1.2
# sort_values replaces DataFrame.sort, which current pandas releases no longer
# provide. Ascending order leaves the most imported DLLs in the tail.
kevdog = kevdog.sort_values('total', ascending=True).reset_index(drop=True)
kevdog.tail(10)


Out[13]:
Name benign malicious total possible
314 gdi32.dll 95 89 184 False
315 shlwapi.dll 146 40 186 False
316 ntdll.dll 261 0 261 False
317 oleaut32.dll 177 93 270 False
318 shell32.dll 182 128 310 False
319 ole32.dll 245 98 343 False
320 msvcrt.dll 392 5 397 False
321 advapi32.dll 323 178 501 False
322 user32.dll 312 375 687 True
323 kernel32.dll 485 615 1100 True

In [14]:
# Grouped horizontal bars comparing benign and malicious import counts for
# the last ten rows of kevdog.
# Alternative selection kept for reference:
#   deez_data = kevdog[kevdog['possible'] == True].tail(10)
deez_data = kevdog.tail(10)

# Colour palette; the later comparison charts reuse it.
set2 = brewer2mpl.get_map('Set2', 'Qualitative', 3).mpl_colors

with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(16, 10))

# Each DLL gets two bars of this thickness: malicious at ypos and benign
# directly above it at vpos.
height = 0.35
ypos = height + np.arange(len(deez_data['Name']))
vpos = height + ypos
malicious = ppl.barh(ypos, deez_data['malicious'], height, color=set2[1], align='center')
benign = ppl.barh(vpos, deez_data['benign'], height, color=set2[2], align='center')
ax.legend([malicious, benign], ['Malicious', 'Benign'], loc=7)

# Centre each label between its pair of bars.
plt.yticks(ypos + height / 2.0, deez_data['Name'].tolist())
plt.title("DLL Imports for benign and malicious files", fontsize=20)
plt.xlabel("Number of Occurences", fontsize=12)
plt.ylabel("DLL Name", fontsize=12)
plt.show()


So right now it looks like there isn't a striking difference between the DLL imports of malicious and benign files. There probably isn't a useful feature we can extract from this information.


In [15]:
# One row per section name seen in either sample set, with its benign and
# malicious occurrence counts side by side.
# A set union is used because dict key views cannot be concatenated with "+"
# on Python 3.
pe_sections = df({'name': list(set(section_Names) | set(mal_section_Names))})
pe_sections['benign'] = [section_Names[x] for x in pe_sections['name']]
pe_sections['malicious'] = [mal_section_Names[x] for x in pe_sections['name']]
pe_sections['total'] = pe_sections['benign'] + pe_sections['malicious']
# sort_values replaces DataFrame.sort, which current pandas releases no longer
# provide. Ascending order leaves the most common names in the tail.
pe_sections = pe_sections.sort_values('total', ascending=True).reset_index(drop=True)
pe_sections.tail(10)


Out[15]:
name benign malicious total
44 .dfslcnz 0 56 56
45 .gfkrgj2 0 66 66
46 .h38dbn1 0 66 66
47 .pdata 284 0 284
48 .idata 395 20 415
49 .rdata 127 372 499
50 .reloc 541 320 861
51 .data 551 367 918
52 .text 600 387 987
53 .rsrc 574 599 1173

In [16]:
# Grouped horizontal bars comparing benign and malicious occurrence counts
# for the last ten rows of pe_sections.
deez_sections = pe_sections.tail(10)

with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(16, 10))

# Each section name gets two bars of this thickness: malicious at ypos and
# benign directly above it at vpos.
height = 0.35
ypos = height + np.arange(len(deez_sections['name']))
vpos = height + ypos
malicious = ppl.barh(ypos, deez_sections['malicious'], height, color=set2[1], align='center')
benign = ppl.barh(vpos, deez_sections['benign'], height, color=set2[2], align='center')
ax.legend([malicious, benign], ['Malicious', 'Benign'], loc=7)

# Centre each label between its pair of bars.
plt.yticks(ypos + height / 2.0, deez_sections['name'].tolist())
plt.title("PE Sections for benign and malicious files", fontsize=20)
plt.xlabel("Number of Occurences", fontsize=12)
plt.ylabel("PE Section Name", fontsize=12)
plt.show()


PE sections seem to give us something to work with. The top 7 PE section names are used by both malware and benign files, but after the top seven you get into really obscure names (likely random) which are used only by malware and not by regular files. So one feature we could look at is whether or not the file has a PE section whose name isn't in the top 7.


In [35]:
# One row per function imported by either sample set, with its benign and
# malicious import counts side by side.
# A set union is used because dict key views cannot be concatenated with "+"
# on Python 3.
functions_df = df({'name': list(set(mal_imported_Functions) | set(imported_Functions))})
functions_df['benign'] = [imported_Functions[x] for x in functions_df['name']]
functions_df['malicious'] = [mal_imported_Functions[x] for x in functions_df['name']]
functions_df['total'] = functions_df['benign'] + functions_df['malicious']
# Flag functions imported at least twice as often by malicious files.
functions_df['possible'] = functions_df['malicious'] >= functions_df['benign'] * 2
functions_df['delta'] = functions_df['malicious'] - functions_df['benign']
# sort_values replaces DataFrame.sort, which current pandas releases no longer
# provide. Ascending order leaves the largest malicious-minus-benign margins
# in the tail.
functions_df = functions_df.sort_values('delta', ascending=True)
# Alternative selection kept for reference:
#   plot_data = functions_df[functions_df['possible']==True].tail(25)
plot_data = functions_df.tail(10)

with ppl.pretty:
    fig, ax = plt.subplots(1,figsize=(16,10))

# Each function gets two bars of this thickness: malicious at ypos and
# benign directly above it at vpos.
height = 0.35
ypos = np.arange(len(plot_data['name'])) + height
vpos = ypos +  height
malicious = ppl.barh(ypos, plot_data['malicious'],height,color=set2[1],align='center')
benign = ppl.barh(vpos, plot_data['benign'], height, color=set2[2], align='center')
ax.legend([malicious, benign], ['Malicious','Benign'], loc=7)
# Centre each label between its pair of bars.
plt.yticks(ypos+0.175, list(plot_data['name']))
# Title and axis labels so the figure can be read on its own.
plt.xlabel("Number of Occurrences", fontsize=12)
plt.ylabel("Imported Function Name", fontsize=12)
plt.title("Imported functions with the largest malicious-over-benign margin", fontsize=20)
plt.show()



In [ ]: