Demographics information about benign and malicious files

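This notebook goes through the files that were selected in the File Extraction notebook and gathers demographic information about them: how many PE sections they have, what those sections are called, and which DLLs and functions they import. It reads the samples from the `train/benign` and `train/malicious` directories, so it will not work unless the code from the File Extraction notebook has already been run.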



In [2]:

    
import pefile
from pefile import PEFormatError 
import os
from collections import Counter
import pandas as pd
from pandas import DataFrame as df

import prettyplotlib as ppl
import numpy as np

# prettyplotlib imports
import matplotlib.pyplot as plt
import matplotlib as mpl
from prettyplotlib import brewer2mpl


%matplotlib inline
#pd.options.display.mpl_style = 'default'



In [3]:

    
# Fresh, independent tallies for the benign sample set. Every Counter maps an
# observed value to the number of times it has been seen so far.
(
    num_Sections,
    section_Names,
    num_Imports,
    imported_DLLs,
    imported_Functions,
) = (Counter() for _unused in range(5))



In [4]:

    
# Tally PE structure statistics for every benign training sample into the
# Counters initialised in the previous cell.
benign_dir = 'train/benign'
for eachFile in os.listdir(benign_dir):
    try:
        pe = pefile.PE(os.path.join(benign_dir, eachFile))
    except PEFormatError as parse_error:
        # One unparseable file should not abort the whole survey; report it
        # and move on to the next sample.
        print('Skipping %s: %s' % (eachFile, parse_error))
        continue

    num_Sections[len(pe.sections)] += 1
    for eachSection in pe.sections:
        raw_name = eachSection.Name
        if not isinstance(raw_name, str):
            # Under Python 3 the name is a bytes object; decode it so the
            # text operations below work. latin-1 maps every byte, so it
            # cannot fail on odd section names.
            raw_name = raw_name.decode('latin-1')
        # Strip the NUL padding bytes and normalise case before counting.
        refined_name = raw_name.replace('\x00', '').lower()
        section_Names[refined_name] += 1

    # The DIRECTORY_ENTRY_IMPORT attribute is missing on files without an
    # import table; treat those as having zero imports instead of crashing.
    import_entries = getattr(pe, 'DIRECTORY_ENTRY_IMPORT', [])
    num_Imports[len(import_entries)] += 1
    for eachImport in import_entries:
        dll_name = eachImport.dll
        if not isinstance(dll_name, str):
            dll_name = dll_name.decode('latin-1')
        imported_DLLs[dll_name.lower()] += 1
        for eachFunction in eachImport.imports:
            function_name = eachFunction.name
            if function_name is None:
                # Imported by ordinal only: there is no name to count, and
                # lumping them all under None would create a bogus "function".
                continue
            if not isinstance(function_name, str):
                function_name = function_name.decode('latin-1')
            imported_Functions[function_name] += 1

How many sections do most files have and what are they called?



In [5]:

    
# Build a dense table with one row per possible section count, so that counts
# no file exhibits still show up as an (empty) bar.
# The axis starts at 1 as before, but drops to 0 if some file has no sections
# at all so that such files are not silently left out of the chart.
first_section_count = min(1, min(num_Sections.keys()))
all_sections = list(range(first_section_count, max(num_Sections.keys()) + 1))
# Counter returns 0 for section counts that were never observed.
all_section_counts = [num_Sections[x] for x in all_sections]

num_sections_df = df({'Sections': all_sections, 'Count': all_section_counts})
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

xpos = np.arange(len(num_sections_df))
ppl.bar(ax, xpos, num_sections_df['Count'], annotate=True)
# Label every bar with the section count it stands for; without this the axis
# shows the bar positions (0, 1, 2, ...) which are off by one from the data.
# The +.4 offset matches the tick placement used by the other bar charts.
plt.xticks(xpos + .4, num_sections_df['Sections'])
ax.set_title('Number of PE sections in benign files', fontsize=20)
plt.xlabel("Number of sections")
plt.ylabel("Number of files with that number of sections")
plt.show()



In [6]:

    
# Ten most common section names across the benign files.
section_names_df = df(section_Names.most_common(10))
section_names_df.columns = ['Name','Count']
# Horizontal bars are drawn from the bottom up, so sort ascending to get the
# most frequent name at the top of the chart.
# DataFrame.sort() no longer exists in pandas; sort_values() is its
# replacement, and returning a new frame keeps this cell safe to re-run.
section_names_df = (
    section_names_df
    .sort_values('Count', ascending=True)
    .reset_index(drop=True)
)
ypos = np.arange(len(section_names_df)) + .1
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

ppl.barh(ax, ypos, section_names_df['Count'], annotate=True)
# Put each label at the vertical middle of its bar.
plt.yticks(ypos +.4, section_names_df['Name'])
# Leave 10% headroom on the right for the count annotations.
plt.xlim(0, max(section_names_df['Count'])*1.1)
plt.title("10 most frequently seen PE section names (benign)", fontsize=20)
plt.xlabel("Number of files with that section")
plt.show()



In [7]:

    
# Ten most frequently imported DLLs across the benign files.
dll_df = df(imported_DLLs.most_common(10))
dll_df.columns = ['Name','Count']
# Ascending order puts the most-imported DLL at the top of the horizontal
# bar chart. DataFrame.sort() no longer exists in pandas; sort_values() is
# its replacement, and building a new frame keeps the cell safe to re-run.
dll_df = dll_df.sort_values('Count', ascending=True).reset_index(drop=True)
ypos = np.arange(len(dll_df)) + .1

with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

ppl.barh(ax, ypos, dll_df['Count'], annotate=True)
# Put each label at the vertical middle of its bar.
plt.yticks(ypos+.4, dll_df['Name'])
# Leave 10% headroom on the right for the count annotations.
plt.xlim(0, max(dll_df['Count'])*1.1)
plt.title('10 most frequently imported DLLs (benign)', fontsize=20)
plt.xlabel('Number of files importing that DLL')
plt.show()



In [8]:

    
# Fresh, independent tallies for the malicious sample set, mirroring the
# benign ones. Every Counter maps an observed value to its number of sightings.
(
    mal_num_Sections,
    mal_section_Names,
    mal_num_Imports,
    mal_imported_DLLs,
    mal_imported_Functions,
) = (Counter() for _unused in range(5))



In [9]:

    
# Tally PE structure statistics for every malicious training sample into the
# mal_* Counters initialised in the previous cell.
malicious_dir = 'train/malicious'
for eachFile in os.listdir(malicious_dir):
    try:
        pe = pefile.PE(os.path.join(malicious_dir, eachFile))
    except PEFormatError as parse_error:
        # One unparseable file should not abort the whole survey; report it
        # and move on to the next sample.
        print('Skipping %s: %s' % (eachFile, parse_error))
        continue

    mal_num_Sections[len(pe.sections)] += 1
    for eachSection in pe.sections:
        raw_name = eachSection.Name
        if not isinstance(raw_name, str):
            # Under Python 3 the name is a bytes object; decode it so the
            # text operations below work. latin-1 maps every byte, so it
            # cannot fail on odd section names.
            raw_name = raw_name.decode('latin-1')
        # Strip the NUL padding bytes and normalise case before counting.
        refined_name = raw_name.replace('\x00', '').lower()
        mal_section_Names[refined_name] += 1

    # The DIRECTORY_ENTRY_IMPORT attribute is missing on files without an
    # import table; treat those as having zero imports instead of crashing.
    import_entries = getattr(pe, 'DIRECTORY_ENTRY_IMPORT', [])
    mal_num_Imports[len(import_entries)] += 1
    for eachImport in import_entries:
        dll_name = eachImport.dll
        if not isinstance(dll_name, str):
            dll_name = dll_name.decode('latin-1')
        mal_imported_DLLs[dll_name.lower()] += 1
        for eachFunction in eachImport.imports:
            function_name = eachFunction.name
            if function_name is None:
                # Imported by ordinal only: there is no name to count, and
                # lumping them all under None would create a bogus "function".
                continue
            if not isinstance(function_name, str):
                function_name = function_name.decode('latin-1')
            mal_imported_Functions[function_name] += 1



In [10]:

    
# Build a dense table with one row per possible section count, so that counts
# no file exhibits still show up as an (empty) bar.
# The axis starts at 1 as before, but drops to 0 if some file has no sections
# at all so that such files are not silently left out of the chart.
mal_first_section_count = min(1, min(mal_num_Sections.keys()))
mal_all_sections = list(range(mal_first_section_count, max(mal_num_Sections.keys()) + 1))
# Counter returns 0 for section counts that were never observed.
mal_all_section_counts = [mal_num_Sections[x] for x in mal_all_sections]

mal_num_sections_df = df({'Sections': mal_all_sections, 'Count': mal_all_section_counts})
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

xpos = np.arange(len(mal_num_sections_df))
ppl.bar(ax, xpos, mal_num_sections_df['Count'], annotate=True, color="#756b94")
# Label every bar with the section count it stands for; without this the axis
# shows the bar positions (0, 1, 2, ...) which are off by one from the data.
# The +.4 offset matches the tick placement used by the other bar charts.
plt.xticks(xpos + .4, mal_num_sections_df['Sections'])
ax.set_title('Number of PE sections in malicious files', fontsize=20)
plt.xlabel("Number of sections")
plt.ylabel("Number of files with that number of sections")
plt.show()



In [11]:

    
# Ten most common section names across the malicious files.
mal_section_names_df = df(mal_section_Names.most_common(10))
mal_section_names_df.columns = ['Name','Count']
# Horizontal bars are drawn from the bottom up, so sort ascending to get the
# most frequent name at the top of the chart.
# DataFrame.sort() no longer exists in pandas; sort_values() is its
# replacement, and returning a new frame keeps this cell safe to re-run.
mal_section_names_df = (
    mal_section_names_df
    .sort_values('Count', ascending=True)
    .reset_index(drop=True)
)
ypos = np.arange(len(mal_section_names_df)) + .1
with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

ppl.barh(ax, ypos, mal_section_names_df['Count'], annotate=True, color="#756b94")
# Put each label at the vertical middle of its bar.
plt.yticks(ypos +.4, mal_section_names_df['Name'])
# Leave 10% headroom on the right for the count annotations.
plt.xlim(0, max(mal_section_names_df['Count'])*1.1)
plt.title("10 most frequently seen PE section names (malicious)", fontsize=20)
plt.xlabel("Number of files with that section")
plt.show()



In [12]:

    
# Ten most frequently imported DLLs across the malicious files.
mal_dll_df = df(mal_imported_DLLs.most_common(10))
mal_dll_df.columns = ['Name','Count']
# Ascending order puts the most-imported DLL at the top of the horizontal
# bar chart. DataFrame.sort() no longer exists in pandas; sort_values() is
# its replacement, and building a new frame keeps the cell safe to re-run.
mal_dll_df = mal_dll_df.sort_values('Count', ascending=True).reset_index(drop=True)
ypos = np.arange(len(mal_dll_df)) + .1

with ppl.pretty:
    fig, ax = plt.subplots(1, figsize=(10,6))

ppl.barh(ax, ypos, mal_dll_df['Count'], annotate=True, color="#756b94")
# Put each label at the vertical middle of its bar.
plt.yticks(ypos+.4, mal_dll_df['Name'])
# Leave 10% headroom on the right for the count annotations.
plt.xlim(0, max(mal_dll_df['Count'])*1.1)
plt.title('10 most frequently imported DLLs (malicious)', fontsize=20)
plt.xlabel('Number of files importing that DLL')
plt.show()

Comparing malware and benign files



In [13]:

    
# One row per DLL seen in either sample set, with its import count in each.
# A set union works on both Python 2 and 3; `keys() + keys()` only works when
# keys() returns a list.
kevdog = df({'Name': list(set(imported_DLLs) | set(mal_imported_DLLs))})
# Counter lookups return 0 for DLLs that one of the two sets never imports.
kevdog['benign'] = [imported_DLLs[x] for x in kevdog['Name']]
kevdog['malicious'] = [mal_imported_DLLs[x] for x in kevdog['Name']]
kevdog['total'] = kevdog['benign'] + kevdog['malicious']
# Flag DLLs imported at least 20% more often by malicious than by benign files.
kevdog['possible'] = kevdog['malicious'] > kevdog['benign'] * 1.2
# DataFrame.sort() no longer exists in pandas; sort_values() is its
# replacement. Ascending order leaves the most-imported DLLs at the tail.
kevdog = kevdog.sort_values('total', ascending=True).reset_index(drop=True)
kevdog.tail(10)









    Out[13]:






  
    
      
      Name
      benign
      malicious
      total
      possible
    
  
  
    
      314
          gdi32.dll
        95
        89
        184
       False
    
    
      315
        shlwapi.dll
       146
        40
        186
       False
    
    
      316
          ntdll.dll
       261
         0
        261
       False
    
    
      317
       oleaut32.dll
       177
        93
        270
       False
    
    
      318
        shell32.dll
       182
       128
        310
       False
    
    
      319
          ole32.dll
       245
        98
        343
       False
    
    
      320
         msvcrt.dll
       392
         5
        397
       False
    
    
      321
       advapi32.dll
       323
       178
        501
       False
    
    
      322
         user32.dll
       312
       375
        687
        True
    
    
      323
       kernel32.dll
       485
       615
       1100
        True



In [14]:

    
# Grouped horizontal bars comparing benign and malicious import counts for
# the ten DLLs with the highest combined count.
deez_data = kevdog.tail(10)
#deez_data = kevdog[kevdog['possible'] == True].tail(10)
with ppl.pretty:
    fig, ax = plt.subplots(1,figsize=(16,10))

# Qualitative palette; set2 is reused by the comparison plots further down.
set2 = brewer2mpl.get_map('Set2', 'Qualitative', 3).mpl_colors
height = 0.35
# Each DLL gets two bars: malicious centred on ypos, benign one bar-height above.
ypos = np.arange(len(deez_data['Name'])) + height
vpos = ypos +  height
# Draw on the axes created above explicitly instead of relying on pyplot's
# notion of the current axes.
malicious = ppl.barh(ax, ypos, deez_data['malicious'], height, color=set2[1], align='center')
benign = ppl.barh(ax, vpos, deez_data['benign'], height, color=set2[2], align='center')
ax.legend([malicious, benign], ['Malicious','Benign'], loc=7)
# Centre each label between its pair of bars (half a bar-height above ypos).
plt.yticks(ypos + height / 2, list(deez_data['Name']))
plt.xlabel("Number of Occurrences", fontsize=12)
plt.ylabel("DLL Name", fontsize=12)
plt.title("DLL Imports for benign and malicious files", fontsize=20)
plt.show()

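Right now it looks like there isn't a striking difference between the DLL imports of malicious and benign files: most of the top DLLs are imported by both groups. The notable exceptions in the table above are ntdll.dll and msvcrt.dll, which are imported almost exclusively by benign files (261 vs. 0 and 392 vs. 5). So while there probably isn't a strong general-purpose feature we can extract from this information, those two DLLs may be worth a closer look.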



In [15]:

    
# One row per section name seen in either sample set, with its count in each.
# A set union works on both Python 2 and 3; `keys() + keys()` only works when
# keys() returns a list.
pe_sections = df({'name': list(set(section_Names) | set(mal_section_Names))})
# Counter lookups return 0 for names that one of the two sets never uses.
pe_sections['benign'] = [section_Names[x] for x in pe_sections['name']]
pe_sections['malicious'] = [mal_section_Names[x] for x in pe_sections['name']]
pe_sections['total'] = pe_sections['benign'] + pe_sections['malicious']
# DataFrame.sort() no longer exists in pandas; sort_values() is its
# replacement. Ascending order leaves the most common names at the tail.
pe_sections = pe_sections.sort_values('total', ascending=True).reset_index(drop=True)
pe_sections.tail(10)



In [16]:

    
# Grouped horizontal bars comparing benign and malicious counts for the ten
# section names with the highest combined count.
deez_sections = pe_sections.tail(10)

with ppl.pretty:
    fig, ax = plt.subplots(1,figsize=(16,10))

height = 0.35
# Each name gets two bars: malicious centred on ypos, benign one bar-height above.
ypos = np.arange(len(deez_sections['name'])) + height
vpos = ypos +  height
# Draw on the axes created above explicitly instead of relying on pyplot's
# notion of the current axes. set2 is the palette defined with the DLL
# comparison plot earlier in the notebook.
malicious = ppl.barh(ax, ypos, deez_sections['malicious'], height, color=set2[1], align='center')
benign = ppl.barh(ax, vpos, deez_sections['benign'], height, color=set2[2], align='center')
ax.legend([malicious, benign], ['Malicious','Benign'], loc=7)
# Centre each label between its pair of bars (half a bar-height above ypos).
plt.yticks(ypos + height / 2, list(deez_sections['name']))
plt.xlabel("Number of Occurrences", fontsize=12)
plt.ylabel("PE Section Name", fontsize=12)
plt.title("PE Sections for benign and malicious files", fontsize=20)
plt.show()

PE sections seem to give us something to work with. The top 7 PE section names are standard names that are common in benign files and, for the most part, in malware as well. After the top seven, however, you get into really obscure names (likely random) which are used only by malware and not by regular files. So one feature we could look at is whether or not a file has a PE section whose name isn't in the top 7.



In [35]:

    
# One row per imported function seen in either sample set.
# A set union works on both Python 2 and 3; `keys() + keys()` only works when
# keys() returns a list.
functions_df = df({'name': list(set(mal_imported_Functions) | set(imported_Functions))})
# Counter lookups return 0 for functions that one of the two sets never imports.
functions_df['benign'] = [imported_Functions[x] for x in functions_df['name']]
functions_df['malicious'] = [mal_imported_Functions[x] for x in functions_df['name']]
functions_df['total'] = functions_df['benign'] + functions_df['malicious']
# Flag functions imported at least twice as often by malicious files.
functions_df['possible'] = functions_df['malicious'] >= functions_df['benign'] * 2
# Positive delta: imported more often by malicious files than by benign ones.
functions_df['delta'] = functions_df['malicious'] - functions_df['benign']
# DataFrame.sort() no longer exists in pandas; sort_values() is its
# replacement. Ascending order leaves the largest deltas at the tail.
functions_df = functions_df.sort_values('delta', ascending=True)
#plot_data = functions_df[functions_df['possible']==True].tail(25)
plot_data = functions_df.tail(10)

with ppl.pretty:
    fig, ax = plt.subplots(1,figsize=(16,10))

height = 0.35
# Each function gets two bars: malicious centred on ypos, benign one bar-height above.
ypos = np.arange(len(plot_data['name'])) + height
vpos = ypos +  height
# Draw on the axes created above explicitly instead of relying on pyplot's
# notion of the current axes. set2 is the palette defined with the DLL
# comparison plot earlier in the notebook.
malicious = ppl.barh(ax, ypos, plot_data['malicious'], height, color=set2[1], align='center')
benign = ppl.barh(ax, vpos, plot_data['benign'], height, color=set2[2], align='center')
ax.legend([malicious, benign], ['Malicious','Benign'], loc=7)
# Centre each label between its pair of bars (half a bar-height above ypos).
plt.yticks(ypos + height / 2, list(plot_data['name']))
# A figure should stand on its own when the notebook is skimmed.
plt.xlabel("Number of Occurrences", fontsize=12)
plt.ylabel("Imported Function", fontsize=12)
plt.title("Imported functions most skewed toward malicious files", fontsize=20)
plt.show()



In [ ]:

# Reference copies of the two comparison tables produced above (pasted
# output, kept as comments so that this cell does not fail when run).
#
# PE section names, 10 highest combined counts:
# 	name	benign	malicious	total
# 44	.dfslcnz	0	56	56
# 45	.gfkrgj2	0	66	66
# 46	.h38dbn1	0	66	66
# 47	.pdata	284	0	284
# 48	.idata	395	20	415
# 49	.rdata	127	372	499
# 50	.reloc	541	320	861
# 51	.data	551	367	918
# 52	.text	600	387	987
# 53	.rsrc	574	599	1173

# Imported DLLs, 10 highest combined counts:
# 	Name	benign	malicious	total	possible
# 314	gdi32.dll	95	89	184	False
# 315	shlwapi.dll	146	40	186	False
# 316	ntdll.dll	261	0	261	False
# 317	oleaut32.dll	177	93	270	False
# 318	shell32.dll	182	128	310	False
# 319	ole32.dll	245	98	343	False
# 320	msvcrt.dll	392	5	397	False
# 321	advapi32.dll	323	178	501	False
# 322	user32.dll	312	375	687	True
# 323	kernel32.dll	485	615	1100	True