In [1]:
# Using python 2.7 as Anaconda does not have winappdbg module official or community.
# Winappdbg is only run on Windows due to the ctypes module having Windows specific declarations.
from multiprocessing import Pool
import os
import peutils
import pefile
import sys
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import subprocess as sub
#from csv import writer
#import unpack
In [ ]:
# Paths to the two VirusShare sample directories used throughout this notebook.
# NOTE(review): hardcoded absolute paths — adjust to the local machine's layout.
ext_drive = '/opt/vs/train/'
ext_drive2 = '/opt/vs/train2/'
file_list = os.listdir(ext_drive)
# Last expression: the cell displays the number of samples found.
len(file_list)
In [13]:
def load_packer_id_map(csv_path='data/packer-id.csv'):
    """Load the packer-name -> scalar-ID map from the packer ID CSV.

    The signature db contains many duplicate packer names, so the first
    occurrence of a name wins and the resulting map is smaller than the
    number of rows in the CSV.

    :param csv_path: path to the packer ID CSV (header row is skipped).
        Defaults to the original hard-coded location for compatibility.
    :return: dict mapping packer name (str) -> packer id (int).
    """
    packer_id_map = {}
    counter = 0
    # Context manager guarantees the file handle is closed even on error
    # (the original left it open if int() raised).
    with open(csv_path, 'r') as fip:
        in_lines = fip.readlines()
    # Start at 1 to skip the 'packer_name,packer_id' header row.
    for idx in range(1, len(in_lines)):
        tokens = in_lines[idx].split(',')
        packer_name = tokens[0]
        if packer_name not in packer_id_map:
            packer_id_map[packer_name] = int(tokens[1])
            counter += 1
    print('Completed {:d} packer IDs.'.format(counter))
    return packer_id_map
def sort_and_save_packer_id_feature_file():
    """Sort data/packer-id-features.csv by file_name and persist the result
    to data/sorted-packer-id-features.csv (no index column)."""
    packers = pd.read_csv('data/packer-id-features.csv')
    # DataFrame.sort() was removed in modern pandas. Prefer sort_values()
    # and fall back to the legacy API for the old pandas on this box.
    try:
        sorted_packers = packers.sort_values('file_name')
    except AttributeError:
        sorted_packers = packers.sort('file_name')
    sorted_packers.to_csv('data/sorted-packer-id-features.csv', index=False)
    return
def combine_packer_id_files():
    """Concatenate the per-worker packer ID feature files into one CSV.

    Steps:
      1. List the data directory.
      2. For each file matching <pid>-sorted-packer-id-features.csv
         (the worker files carry no header row) append its rows verbatim.
      3. Write the combined file with a single header row, then sort it
         via sort_and_save_packer_id_feature_file().
    """
    # Escape the dots so the pattern cannot match unrelated file names
    # (the original '.' matched any character).
    p1 = re.compile(r'\d{3,5}-sorted-packer-id-features\.csv')
    counter = 0
    # Context managers guarantee both handles are closed on any error.
    with open('data/packer-id-features.csv', 'w') as fop:
        fop.write('file_name,packer_name,packer_id,valid_pe,is_packed\n')
        for file_name in os.listdir('data/'):
            if p1.match(file_name):
                with open('data/' + file_name, 'r') as fip:
                    in_lines = fip.readlines()
                # Worker files have no header row, so write them verbatim.
                fop.writelines(in_lines)
                counter += len(in_lines)
    print('Completed combine of {:d} packer ID features.'.format(counter))
    sort_and_save_packer_id_feature_file()
    return
#NOTE: move to feature-extraction-packer-id.py
def generate_sample_packer_id(file_list):
    # Generate scalar packer IDs for each sample.
    # Writes one CSV row per sample (file_name,packer_name,packer_id,valid_pe,is_packed)
    # to data/<pid>-sorted-packer-id-features.csv; the os.getpid() prefix lets
    # the four Pool workers write concurrently without clobbering each other.
    # NOTE(review): reads the module-level `ext_drive` — set it before calling.
    pid = os.getpid()
    file_name = "data/" + str(pid) + "-sorted-packer-id-features.csv"
    fop = open(file_name,'w')
    #fop.write('file_name,packer_type,label,is_valid,is_packed\n') put column headers in during the combine stage.
    out_lines = []
    packer_id_map = load_packer_id_map()
    signatures = peutils.SignatureDatabase('data/userdb-sans.txt')
    non_pe_counter = 0
    pe_file_counter = 0
    exception_counter = 0  # NOTE(review): never incremented anywhere in this block.
    signat = 'unknown'
    error_str = 'none'
    for idx, file_name in enumerate(file_list):
        tokens = file_name.split('_')
        truncated_file_name = tokens[1] # remove the VirusShare_ prefix from the filename.
        matches = None
        packer_id = 0
        is_valid = 0
        is_packed = 0
        try:
            pe = pefile.PE(ext_drive + file_name, fast_load=True)
            pe_file_counter += 1
            #matches = signatures.match_all(pe, ep_only = True)
            is_valid = 1
            try:
                if peutils.is_probably_packed(pe): # NOTE: peutils.is_valid() has not been implemented yet.
                    #is_valid = 1
                    is_packed = 1
                    matches = signatures.match(pe, ep_only = True)
                    signat = matches[0]
                    if (signat in packer_id_map.keys()):
                        packer_id = packer_id_map[signat]
                    else:
                        packer_id = 0
                    #signat = signat.replace(',','') # remove commas or they will cause an error when loading dataframes.
                    # NOTE: If the signature database has commas in the packer name then remove them or they will
                    # cause problems later on when loading the dataframes.
                # NOTE(review): indentation reconstructed — `row` is assumed to sit at
                # this level so unpacked-but-valid PEs also emit a row; beware that
                # `signat` can then be stale from a previous iteration. Confirm.
                row = truncated_file_name + "," + signat + "," + str(packer_id) + "," + str(is_valid) + "," + str(is_packed) + "\n"
            except:
                # Bare except: any signature-match failure degrades to an 'unknown' row.
                signat = ",unknown,0," + str(is_valid) + "," + str(is_packed) + "\n"
                row = truncated_file_name + signat
            pe.close()
        except Exception as e:
            # pefile could not parse the sample: record the error text as the
            # packer name, with commas stripped so the CSV stays loadable.
            error_str = str(e)
            non_pe_counter += 1
            error_str = error_str.replace(',','') # remove commas or they will cause an error when loading dataframes.
            signat = "," + error_str + ",0,0,0\n"
            row = truncated_file_name + signat
        out_lines.append(row)
        if (idx % 1000) == 0: # print progress
            fop.writelines(out_lines)
            out_lines = []
            print('{:s} - {:s} - {:d} - {:s}'.format(str(pid),truncated_file_name,idx,signat))
    if len(out_lines) > 0:
        fop.writelines(out_lines)
        out_lines = []
    fop.close()
    print('{:s} - Completed {:d} non PE files and {:d} PE files.'.format(str(pid), non_pe_counter, pe_file_counter))
    return
In [ ]:
# Smoke test: run the packer ID extraction on the first 50 samples only.
ext_drive = '/opt/vs/train/'
tfiles = os.listdir(ext_drive)
generate_sample_packer_id(tfiles[0:50])
In [14]:
#NOTE: move to feature-extraction-packer-id.py
# Split the sample list into four quarters and extract packer IDs in parallel.
ext_drive = '/opt/vs/train/'
tfiles = os.listdir(ext_drive)
# Floor division so the slice bounds are ints under both Python 2 and 3.
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
# BUG FIX: the original print call was missing its closing parenthesis,
# which made this cell a SyntaxError.
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(generate_sample_packer_id, trains)
print('Completed processing {:d} files.'.format(len(tfiles)))
combine_packer_id_files()
In [19]:
# Save the malware packer id features sets from VirusShare 251 and 252.
packer_id_features = pd.read_csv('data/sorted-packer-id-features.csv')
# DataFrame.sort() was removed in modern pandas; prefer sort_values() and
# fall back to the legacy API for the old pandas on this box.
try:
    sorted_packer_id_features = packer_id_features.sort_values('file_name')
except AttributeError:
    sorted_packer_id_features = packer_id_features.sort('file_name')
sorted_packer_id_features.to_csv('data/sorted-packer-id-features-vs251-252.csv', index=False)
sorted_packer_id_features.head()
Out[19]:
In [ ]:
# Same parallel extraction for the second sample set.
ext_drive = '/opt/vs/train2/'
tfiles = os.listdir(ext_drive)
# Floor division so the slice bounds are ints under both Python 2 and 3.
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
# BUG FIX: the original print call was missing its closing parenthesis,
# which made this cell a SyntaxError.
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(generate_sample_packer_id, trains)
print('Completed processing {:d} files.'.format(len(tfiles)))
combine_packer_id_files()
In [17]:
# Now sort the packer id features by filename
#NOTE: move to feature-extraction-packer-id.py
# Load the combined (still unsorted) feature file for inspection; later
# cells mutate and sort this `packers` frame.
packers = pd.read_csv('data/packer-id-features.csv')
packers.head(20)
Out[17]:
In [ ]:
# trim the file_names and sort
#NOTE: move to feature-extraction-packer-id.py
# Strip the "VirusShare_" prefix from every file name in one vectorized
# pass instead of a Python loop writing cell-by-cell through .iloc
# (the loop was O(n) iloc assignments and very slow for ~100k rows).
# NOTE(review): like the original (which wrote column 0), this assumes the
# prefix is separated by the first '_' — confirm against the raw file names.
packers['file_name'] = packers['file_name'].str.split('_').str[1]
packers.head(20)
In [25]:
# DataFrame.sort() was removed in modern pandas; prefer sort_values() and
# fall back to the legacy API for the old pandas on this box.
try:
    sorted_packers = packers.sort_values('file_name')
except AttributeError:
    sorted_packers = packers.sort('file_name')
sorted_packers.head(20)
Out[25]:
In [26]:
#NOTE: move to feature-extraction-packer-id.py
# Persist the sorted feature set; index=False keeps the CSV schema stable.
sorted_packers.to_csv('data/sorted-packer-id-features.csv', index=False)
In [27]:
# Sanity check: (rows, columns) of the sorted feature set.
sorted_packers.shape
Out[27]:
In [ ]:
# This is one of the packer ID databases used by PEid.exe
# Count the signature db lines to gauge its size.
# NOTE(review): the handle is left open here; the next cell re-opens and closes it.
fip = open('data/userdb-sans.txt', 'r')
in_lines = fip.readlines()
# Last expression: the cell displays the line count.
len(in_lines)
In [ ]:
# This is one of the packer ID databases used by PEid.exe
# Build data/packer-id.csv from the PEiD userdb: one row per '[name]'
# signature header, assigning a monotonically increasing scalar ID
# (0 is reserved for 'unknown').
p1 = re.compile(r'\[(.*)\]')
counter = 0
out_lines = []
row = ' '
# Context managers guarantee both handles are closed even on error.
with open('data/userdb-sans.txt', 'r') as fip:
    in_lines = fip.readlines()
with open('data/packer-id.csv', 'w') as fop:
    fop.write('packer_name,packer_id\n')
    fop.write('unknown,0\n')
    for idx, line in enumerate(in_lines):
        if line.startswith('['):
            counter += 1
            m = p1.match(line)
            # 'm is not None' instead of 'm != None' (PEP 8); the
            # continue-ladder of the original is gone — non-matching
            # lines simply fall through.
            if m is not None:
                row = m.group(1) + ',' + str(counter) + '\n'
                out_lines.append(row)
        if (idx % 100) == 0: # flush buffered rows and print progress
            fop.writelines(out_lines)
            out_lines = []
            print('Filename: {:s} - {:d}'.format(row.rstrip(), idx))
    if len(out_lines) > 0:
        fop.writelines(out_lines)
print('Completed {:d} packer IDs.'.format(counter))
In [10]:
# Load the malware packer id features sets from VirusShare 251 and 252.
# Provenance: written earlier by the save cell above from the combined worker files.
packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs251-252.csv')
packer_id_features.head(20)
Out[10]:
In [12]:
# Lets have a look at all the valid PE binaries that are packed or suspected of being packed.
# is_packed was set by peutils.is_probably_packed() during extraction, so this
# is a heuristic superset of truly packed samples.
packed_files = packer_id_features[packer_id_features['is_packed'] == 1]
packed_files.head(20)
Out[12]:
In [14]:
# 16321 packed files to run through unpack.py
# Some of these are likely to have been wrongly classified as packed, but
# the unpacker should be able to handle them correctly.
import sys
import traceback
import winappdbg
import time
import struct
import ctypes
# Log file which we log info to
# NOTE(review): assigned by code outside this chunk (presumably the driver
# that also defines the log() helper used throughout) — confirm.
logfile = None
class MyEventHandler(winappdbg.EventHandler):
###
# A. Declaring variables
###
# NOTE(review): these are *class* attributes of MyEventHandler — the mutable
# dict/list values ({} and []) are shared across all instances of the class.
# A.1 used to keep track of allocated executable memory
allocedmem = {}
# A.2 used to indicate that we've found the entry point
entrypt = 0x00000000
#
# variables used to find and disassemble unpacking loop
#
# A.3 used to indicate that we're single stepping
tracing = -1
# A.4 remember the last two eip values
lasteip = [0x00000000,0x00000000]
# A.5 lowest eip address we see
lowesteip = 0xffffffff
# A.6 highest eip address we see
highesteip = 0x00000000
# A.7 list of addresses which we've disassembled
disasmd = []
# A.8 keeps track of addresses and instructions
# that write to the allocated memory block(s)
writeaddrs = {}
#
# variables used to keep track of created processes
#
# A.9 keeps track of created processes to map
# hProcess from WriteProcessMemory() back to
# process name
createdprocesses = {}
# A.10 keeps track of processes that were created
# with the CREATE_SUSPENDED flag set
createsuspended = {}
#
# variables used for logging
#
# A.11 used to keep a log of events
eventlog = []
###
# B. Class methods (functions)
###
### B.1
# get_funcargs(event)
#
# Helper used by every API hook below to pull the hooked call's
# return address and parameter list out of a winappdbg event.
###
def get_funcargs(self,event):
    """Return (return_address, (arg0, arg1, ...)) for the hooked API call."""
    hook = event.hook
    thread = event.get_thread()
    return (thread.get_pc(), hook.get_params(event.get_tid()))
### B.2
# guarded_read(d,t,addr,size)
# read memory after checking for, and if necessary,
# disabling memory breakpoints
#
# returns a string of data
###
def guarded_read(self,d,t,addr,size):
    # d: winappdbg Debug instance; t: Thread; addr/size: region to read.
    # Returns "" when size <= 0.
    # keep track of breakpoints that we disabled
    # so that we can enable them again after we've
    # finished
    reenablebps = []
    # initialise the variable to hold the read
    # memory data
    data = ""
    # check that the requested size is sane
    if (size > 0):
        p = t.get_process()
        # check to see if the requested address falls within
        # any of the existing memory breakpoints by checking
        # if either the requested start address or end address
        # is covered by any breakpoint
        mem_bps = d.get_all_page_breakpoints()
        for (pid,pgbp) in mem_bps:
            (startaddr,endaddr) = pgbp.get_span()
            if (pid == p.get_pid()) and (pgbp.is_here(addr) or pgbp.is_here(addr + size - 1)):
                log("[D] Memory read in guarded memory. Disabling breakpoint: {:s}".format(pgbp))
                pgbp.disable(p,t)
                reenablebps.append(pgbp)
        # read the memory
        data = p.read(addr,size)
        # enable all of the breakpoints that we disabled
        if (len(reenablebps) > 0):
            for pgbp in reenablebps:
                log("[D] Re-enabling breakpoint: {:s}".format(pgbp))
                pgbp.enable(p,t)
    # return the read memory as a string
    return data
###
# C. API Hooks
###
### C.1
# apiHooks: winappdbg defined hash of API calls to hook
#
# Each entry is indexed by library name and is an array of
# tuples consisting of API call name and number of args.
# WinAppDbg dispatches these to the pre_<name>/post_<name>
# methods defined below.
###
apiHooks = {
    "kernel32.dll":[
        ("VirtualAlloc",4),
        ("VirtualAllocEx",5),
        ("IsDebuggerPresent",0),
        ("CreateProcessA",10),
        ("CreateProcessW",10),
        ("WriteProcessMemory",5)
    ],
    "advapi32.dll":[
        ("CryptDecrypt",6)
    ],
    "wininet.dll":[
        ("InternetOpenA",5),
        ("InternetOpenW",5)
    ],
    "ntdll.dll":[
        ("RtlDecompressBuffer",6)
    ],
    "secur32.dll":[
        ("EncryptMessage",4),
        ("DecryptMessage",4)
    ]
}
###
# API hook callback functions
#
# These are defined by winappdbg and consist of functions
# named pre_<apifuncname> and post_<apifuncname> which are
# called on entry to, and on exit from, the given API
# function (<apifuncname>), respectively.
###
# C.2
# VirtualAlloc() hook(s)
#
def post_VirtualAllocEx(self,event,retval):
    # Post-hook: log VirtualAllocEx() calls and, for EXECUTE-able allocations
    # in the debuggee's own process, set a page guard so we catch the
    # unpacker writing to (and later jumping into) that memory.
    try:
        # C.2.1 Get the return address and arguments
        (ra,(hProcess,lpAddress,dwSize,flAllocationType,flProtect)) = self.get_funcargs(event)
        # Get an instance to the debugger which triggered the event
        # and also the process id and thread id of the process to which
        # the event pertains
        d = event.debug
        pid = event.get_pid()
        tid = event.get_tid()
        # Log the fact that we've seen a VirtualAllocEx() call
        log("[*] <%d:%d> 0x%x: VirtualAllocEx(0x%x,0x%x,0x%x (%d),0x%x,0x%03x) = 0x%x" % (pid,tid,ra,hProcess,lpAddress,dwSize,dwSize,flAllocationType,flProtect,retval))
        # C.2.2 All the memory protection bits which include EXECUTE
        # permission use bits 4 - 7, which is nicely matched
        # by masking (ANDing) it with 0xf0 and checking for a
        # non-zero result
        if (flProtect & 0x0f0):
            log("[-] Request for EXECUTEable memory")
            # We can only set page guards on our own process
            # otherwise page guard exception will occur in
            # system code when this process attempts to write
            # to the allocated memory.
            # This causes ZwWriteVirtualMemory() to fail
            # We can, however, set a page guard on it when
            # this process creates the remote thread, as it
            # will have presumably stopped writing to the
            # other process' memory at that point.
            # C.2.2.1 Check that this VirtualAllocEx() call is for
            # the current process (hProcess == -1), and if
            # so, ask the winappdbg debugger instance to
            # create a page guard on the memory region.
            # Also add information about the allocated region
            # to our allocedmem hash, indexed by pid and
            # base address.
            if (hProcess == 0xffffffff):
                d.watch_buffer(pid,retval,dwSize - 1,self.guard_page_exemem)
                self.allocedmem[(pid,retval)] = dwSize
        # C.2.3 Create a JSON event log entry
        self.eventlog.append({
            "time": time.time(),
            "name": "VirtualAllocEx",
            "type": "Win32 API",
            "pid": pid,
            "tid": tid,
            "addr": ra,
            "args": {
                "hProcess": hProcess,
                "lpAddress": lpAddress,
                "dwSize": dwSize,
                "flAllocationType": flAllocationType,
                "flProtect": flProtect
            },
            "ret": retval
        })
    except:
        traceback.print_exc()
        raise
    return
def post_VirtualAlloc(self,event,retval):
    # Post-hook: log VirtualAlloc() calls. Page-guarding is left to the
    # VirtualAllocEx() hook, which VirtualAlloc() calls internally.
    try:
        # C.2.4 Get the return address and arguments
        (ra,(lpAddress,dwSize,flAllocationType,flProtect)) = self.get_funcargs(event)
        # Get an instance to the debugger which triggered the event
        # and also the process id and thread id of the process to which
        # the event pertains
        d = event.debug
        pid = event.get_pid()
        tid = event.get_tid()
        # Log the fact that we've seen a VirtualAlloc() call
        # This is so that we get the address in the debuggee code from which it was called
        # where as if we just let the VirtualAllocEx() hook log it, the address from
        # which it was called is inside the VirtualAlloc() code in kernel32.dll
        log("[*] <%d:%d> 0x%x: VirtualAlloc(0x%x,0x%x (%d),0x%x,0x%03x) = 0x%x" % (pid, tid, ra, lpAddress, dwSize, dwSize, flAllocationType, flProtect, retval))
        # C.2.5 Create a JSON event log entry
        self.eventlog.append({
            "time": time.time(),
            "name": "VirtualAlloc",
            "type": "Win32 API",
            "pid": pid,
            "tid": tid,
            "addr": ra,
            "args": {
                "lpAddress": lpAddress,
                "dwSize": dwSize,
                "flAllocationType": flAllocationType,
                "flProtect": flProtect
            },
            "ret": retval
        })
    except:
        traceback.print_exc()
        raise
    return
# C.3
# CryptDecrypt() hook(s)
#
def pre_CryptDecrypt(self,event,*args):
    # Pre-hook: dump the ciphertext buffer to disk *before* decryption;
    # the matching post-hook dumps the plaintext after.
    # C.3.1 Get the return address and arguments
    # args[0] is the return address, args[1:] are the API parameters.
    (ra,hKey,hHash,Final,dwFlags,pbData,pdwDataLen) = (args[0],args[1],args[2],args[3],args[4],args[5],args[6])
    # C.3.2 Get a Process object and dereference the pdwDataLen argument to read the buffer size
    p = event.get_process()
    buffsize = p.read_uint(pdwDataLen)
    # C.3.3 Save a copy of the encrypted data
    # sys.argv[1] is the sample path, used as a prefix for all dump files.
    filename = "%s.memblk0x%x.enc" % (sys.argv[1],pbData)
    log("[-] Dumping %d bytes of encrypted memory at 0x%x to %s" % (buffsize,pbData,filename))
    databuff = open(filename,"wb")
    databuff.write(p.read(pbData,buffsize));
    databuff.close()
    return
def post_CryptDecrypt(self,event,retval):
    """Post-hook for CryptDecrypt(): dump the decrypted buffer to disk and
    record a JSON event log entry.

    The ciphertext was already dumped by pre_CryptDecrypt(); filename_enc
    reconstructs that dump's name so both appear in the log entry.
    """
    # C.3.4 Get the return address and arguments
    (ra,(hKey,hHash,Final,dwFlags,pbData,pdwDataLen)) = self.get_funcargs(event)
    # Get a Process object, and dereference the pdwDataLen argument
    p = event.get_process()
    buffsize = p.read_uint(pdwDataLen)
    pid = event.get_pid()
    tid = event.get_tid()
    log("[*] <%d:%d> 0x%x: CryptDecrypt(0x%x,0x%x,0x%x,0x%x,0x%x,0x%x (%d)) = %d" % (pid, tid, ra, hKey, hHash, Final, dwFlags, pbData, buffsize, buffsize, retval))
    # C.3.5 Save a copy of the decrypted data
    filename_enc = "%s.memblk0x%x.enc" % (sys.argv[1],pbData)
    filename = "%s.memblk0x%x.dec" % (sys.argv[1],pbData)
    log("[-] Dumping %d bytes of decrypted memory at 0x%x to %s" % (buffsize,pbData,filename))
    databuff = open(filename,"wb")
    databuff.write(p.read(pbData,buffsize))
    databuff.close()
    # C.3.6 Create a JSON event log entry
    # (redundant second pid/tid lookup removed — values fetched above)
    self.eventlog.append({
        "time": time.time(),
        "name": "CryptDecrypt",
        "type": "Win32 API",
        "pid": pid,
        "tid": tid,
        "addr": ra,
        "args": {
            "hKey": hKey,
            "hHash": hHash,
            "Final": Final,
            "dwFlags": dwFlags,
            # BUG FIX: the original logged pdwDataLen's value under the
            # "pbData" key; log both pointers under their own keys.
            "pbData": pbData,
            "pdwDataLen": pdwDataLen
        },
        "ret": retval,
        "info": {
            "filename_enc": filename_enc,
            "filename_dec": filename
        }
    })
    return
# C.4
# RtlDecompressBuffer() hook(s)
#
def pre_RtlDecompressBuffer(self,event,*args):
    # Pre-hook: dump the compressed buffer to disk *before* decompression;
    # the matching post-hook dumps the decompressed result after.
    try:
        # C.4.1 Get the return address and arguments
        # args[0] is the return address, args[1:] are the API parameters.
        (ra,CompressionFormat,UncompressedBuffer,UncompressedBufferSize,CompressedBuffer,CompressedBufferSize,FinalUncompressedSize) = (args[0],args[1],args[2],args[3],args[4],args[5],args[6])
        p = event.get_process()
        # C.4.2 Save a copy of the compressed data
        # sys.argv[1] is the sample path, used as a prefix for all dump files.
        filename = "%s.memblk0x%x.comp" % (sys.argv[1],CompressedBuffer)
        log("[-] Dumping %d bytes of compressed memory at 0x%x to %s" % (CompressedBufferSize,CompressedBuffer,filename))
        databuff = open(filename,"wb")
        databuff.write(p.read(CompressedBuffer,CompressedBufferSize));
        databuff.close()
    except:
        traceback.print_exc()
        raise
    return
def post_RtlDecompressBuffer(self,event,retval):
    # Post-hook: dump the decompressed buffer to disk and record a JSON
    # event log entry (paired with pre_RtlDecompressBuffer's .comp dump).
    try:
        # C.4.3 Get the return address and arguments
        (ra,(CompressionFormat,UncompressedBuffer,UncompressedBufferSize,CompressedBuffer,CompressedBufferSize,FinalUncompressedSize)) = self.get_funcargs(event)
        pid = event.get_pid()
        tid = event.get_tid()
        log("[*] <%d:%d> 0x%x: RtlDecompressBuffer(0x%x,0x%x,0x%x,0x%x,0x%x,0x%x): %d" % (pid,tid,ra,CompressionFormat,UncompressedBuffer,UncompressedBufferSize,CompressedBuffer,CompressedBufferSize,FinalUncompressedSize,retval))
        # Get a Process object, and dereference the FinalUncompressedSize argument
        p = event.get_process()
        buffsize = p.read_uint(FinalUncompressedSize)
        # C.4.4 save a copy of the decompressed data
        filename_comp = "%s.memblk0x%x.comp" % (sys.argv[1],CompressedBuffer)
        filename = "%s.memblk0x%x.decomp" % (sys.argv[1],UncompressedBuffer)
        log("[-] Dumping %d bytes of decompressed memory at 0x%x to %s" % (buffsize,UncompressedBuffer,filename))
        databuff = open(filename,"wb")
        databuff.write(p.read(UncompressedBuffer,buffsize))
        databuff.close()
        # C.4.5 Create a JSON event log entry
        self.eventlog.append({
            "time": time.time(),
            "name": "RtlDecompressBuffer",
            "type": "Win32 API",
            "pid": pid,
            "tid": tid,
            "addr": ra,
            "args": {
                "CompressionFormat": CompressionFormat,
                "UncompressedBuffer": UncompressedBuffer,
                "UncompressedBufferSize": UncompressedBufferSize,
                "CompressedBuffer": CompressedBuffer,
                "CompressedBufferSize": CompressedBufferSize,
                "FinalUncompressedSize": FinalUncompressedSize
            },
            "ret": retval,
            "info": {
                "filename_comp": filename_comp,
                "filename_decomp": filename
            }
        })
    except:
        traceback.print_exc()
        raise
    return
# C.5
# CreateProcess() hook(s)
#
def post_CreateProcess(self,event,retval,fUnicode):
    """Common post-hook for CreateProcessA/W.

    Logs the call, tracks the new process (for later WriteProcessMemory()
    correlation), and hooks ResumeThread() when the child was created
    CREATE_SUSPENDED. fUnicode selects ASCII vs WCHAR dereferencing.
    """
    try:
        # C.5.1 Get the return address and arguments
        (ra,(lpApplicationName,lpCommandLine,lpProcessAttributes,lpThreadAttributes,bInheritHandles,dwCreationFlags,lpEnvironment,lpCurrentDirectory,lpStartupInfo,lpProcessInformation)) = self.get_funcargs(event)
        p = event.get_process()
        t = event.get_thread()
        pid = event.get_pid()
        tid = event.get_tid()
        # C.5.2 Dereference arguments
        # Use the Process object to dereference the lpApplicationName and lpCommandLine arguments
        # as either ASCII or WCHAR depending on the fUnicode argument
        # (and hence whether we were called from post_CreateProcessA() or post_CreateProcessW() respectively
        szApplicationName = p.peek_string(lpApplicationName,fUnicode)
        szCommandLine = p.peek_string(lpCommandLine,fUnicode)
        # BUG FIX: these four were previously only assigned inside the
        # "if lpProcessInformation" branch, so a null pointer caused a
        # NameError at the log() call below. Default them to 0.
        hProcess = 0
        hThread = 0
        dwProcessId = 0
        dwThreadId = 0
        # If the lpProcessInformation argument is a valid pointer...
        if (lpProcessInformation):
            # ... dereference it to get the ProcessInformation structure
            d = event.debug
            ProcessInformation = self.guarded_read(d,t,lpProcessInformation,16)
            # Extract the various fields from the ProcessInformation structure
            hProcess = struct.unpack("<L",ProcessInformation[0:4])[0]
            hThread = struct.unpack("<L",ProcessInformation[4:8])[0]
            dwProcessId = struct.unpack("<L",ProcessInformation[8:12])[0]
            dwThreadId = struct.unpack("<L",ProcessInformation[12:16])[0]
        else:
            log("[E] lpProcessInformation is null")
        log("[*] <%d:%d> 0x%x: CreateProcess(\"%s\",\"%s\",0x%x): %d (0x%x, 0x%x, <%d:%d>)" % (pid,tid,ra,szApplicationName,szCommandLine,dwCreationFlags,retval,hProcess,hThread,dwProcessId,dwThreadId))
        # C.5.3 Check if the process is being created in a suspended state (CREATE_SUSPENDED flag)...
        if (dwCreationFlags & 0x4):
            # ... hook the ResumeThread() API call
            # so that we are notified when it is resumed
            d = event.debug
            stat = d.hook_function(pid,"ResumeThread",preCB = self.hook_createsuspendedresume,paramCount = 1)
            self.createsuspended[(pid,hThread)] = dwProcessId
            log("[-] CREATE_SUSPENDED. Hooking ResumeThread() (%d)" % stat)
        # C.5.4 Keep track of processes that were created, so we know which
        # process any WriteProcessMemory() calls are writing to
        self.createdprocesses[hProcess] = {
            "time": time.time(),
            "ppid": pid,
            "ptid": tid,
            "paddr": ra,
            "ApplicationName":szApplicationName,
            "CommandLine": szCommandLine,
            "CreationFlags": dwCreationFlags,
            "hProcess": hProcess,
            "hThread": hThread,
            "ProcessId": dwProcessId,
            "ThreadId": dwThreadId
        }
        # C.5.5 Create a JSON event log entry
        self.eventlog.append({
            "time": time.time(),
            "name": "CreateProcess",
            "type": "Win32 API",
            "pid": pid,
            "tid": tid,
            "addr": ra,
            "args": {
                "ApplicationName":szApplicationName,
                "CommandLine": szCommandLine,
                "CreationFlags": dwCreationFlags,
                "hProcess": hProcess,
                "hThread": hThread,
                "ProcessId": dwProcessId,
                "ThreadId": dwThreadId
            },
            "info": {
                "fUnicode":fUnicode
            },
            "ret": retval
        })
    except:
        traceback.print_exc()
        raise
    return
# C.5.6 post_CreateProcessA() and post_CreateProcessW()
# Actual hook call-back function called by WinAppDbg
# To save duplicating code between this and post_CreateProcessW()
# both of them call post_CreateProcess() with a parameter, fUnicode,
# which specifies whether the strings are ASCII (CreateProcessA())
# or WCHAR (CreateProcessW())
def post_CreateProcessA(self,event,retval):
    # ASCII variant: fUnicode=False.
    self.post_CreateProcess(event,retval,False)
    return
def post_CreateProcessW(self,event,retval):
    # Wide-char variant: fUnicode=True.
    self.post_CreateProcess(event,retval,True)
    return
# hook_createsuspendedresume() is a call-back function called when
# ResumeThread() is call by a process which has created a suspended
# process
def hook_createsuspendedresume(self,event,*args):
    # Pre-hook on ResumeThread(), installed by post_CreateProcess() only
    # when a child was created with CREATE_SUSPENDED.
    # C.5.7 Get the return address and arguments
    (ra,(hThread,)) = self.get_funcargs(event)
    pid = event.get_pid()
    tid = event.get_tid()
    log("[*] <%d:%d> 0x%x: ResumeThread(0x%x)" % (pid,tid,ra,hThread))
    # C.5.8 Find the process id of the resumed process
    # (keyed by (parent pid, thread handle) in createsuspended).
    if ((pid,hThread) in self.createsuspended):
        pidresumed = self.createsuspended[(pid,hThread)]
        log("[-] New suspended process (pid %d) resumed" % pidresumed)
    return
# C.6
# WriteProcessMemory() hook(s)
#
def post_WriteProcessMemory(self,event,retval):
    """Post-hook for WriteProcessMemory(): dump the written buffer to disk,
    correlate the target handle with a process tracked by the
    CreateProcess() hook, and record a JSON event log entry."""
    # C.6.1 Get the return address and arguments
    try:
        (ra,(hProcess,lpBaseAddress,lpBuffer,nSize,lpNumberOfBytesWritten)) = self.get_funcargs(event)
        pid = event.get_pid()
        tid = event.get_tid()
        log("[*] <%d:%d> 0x%x: WriteProcessMemory(0x%x,0x%x,0x%x,0x%x,0x%x): %d" % (pid,tid,ra,hProcess,lpBaseAddress,lpBuffer,nSize,lpNumberOfBytesWritten,retval))
        d = event.debug
        t = event.get_thread()
        # C.6.2 Dereference lpNumberOfBytesWritten to get the number of bytes written to the target process'
        # address space
        if (lpNumberOfBytesWritten):
            NumberOfBytesWritten = struct.unpack("<L",self.guarded_read(d,t,lpNumberOfBytesWritten,4))[0]
        else:
            NumberOfBytesWritten = None
        # C.6.3 Get process information that was saved by CreateProcess() hook
        if (hProcess in self.createdprocesses):
            ProcessId = self.createdprocesses[hProcess]["ProcessId"]
            ApplicationName = self.createdprocesses[hProcess]["ApplicationName"]
            CommandLine = self.createdprocesses[hProcess]["CommandLine"]
        else:
            log("[W] hProcess not in createdprocesses[]")
            ProcessId = None
            ApplicationName = None
            CommandLine = None
        # C.6.4 Save a copy of the written memory
        # BUG FIX: ProcessId is None when hProcess was not tracked, which
        # previously crashed the "%d" formatting below; display -1 instead.
        # (The eventlog entry still records the untouched None.)
        target_pid = ProcessId if ProcessId is not None else -1
        filename = "%s.memblk0x%x-%d.wpm" % (sys.argv[1],lpBaseAddress,target_pid)
        log("[-] Dumping %d bytes of memory at %d:0x%x written to %d:0x%x to %s" % (nSize,pid,lpBuffer,target_pid,lpBaseAddress,filename))
        databuff = open(filename,"wb")
        databuff.write(self.guarded_read(d,t,lpBuffer,nSize))
        databuff.close()
        # C.6.5 Create a JSON event log entry
        # (redundant second d/t/pid/tid lookups removed — fetched above)
        self.eventlog.append({
            "time": time.time(),
            "name": "WriteProcessMemory",
            "type": "Win32 API",
            "pid": pid,
            "tid": tid,
            "addr": ra,
            "args": {
                "hProcess": hProcess,
                "lpBaseAddress": lpBaseAddress,
                "lpBuffer": lpBuffer,
                "nSize": nSize,
                "lpNumberOfBytesWritten": lpNumberOfBytesWritten,
                "NumberOfBytesWritten": NumberOfBytesWritten
            },
            "ret": retval,
            "info": {
                "filename": filename,
                "targetprocesspid": ProcessId,
                "targetprocessname": ApplicationName,
                "targetprocesscmdline": CommandLine
            }
        })
    except:
        traceback.print_exc()
        raise
    return
# C.7
# IsDebuggerPresent() hook(s)
# (mainly added so that AutoIt compiled scripts would run, but also useful
# as an anti-anti-malware technique)
#
def post_IsDebuggerPresent(self,event,retval):
    # Post-hook: lie to the debuggee by forcing the return value to 0.
    # C.7.1 Get the return address and arguments
    (ra,noargs) = self.get_funcargs(event)
    pid = event.get_pid()
    tid = event.get_tid()
    log("[*] <%d:%d> 0x%x: IsDebuggerPresent(): 0x%x" % (pid,tid,ra,retval))
    log("[-] Returning 0")
    # C.7.2 Changed the 'eax' register (return value) to '0' (no debugger present)
    # just before we continue running the calling thread
    # (x86-only: the Win32 return value lives in Eax).
    t = event.get_thread()
    t.set_register("Eax",0x0)
    # C.7.3 Create a JSON event log entry
    self.eventlog.append({
        "time": time.time(),
        "name": "IsDebuggerPresent",
        "type": "Win32 API",
        "pid": pid,
        "tid": tid,
        "addr": ra,
        "args": {},
        "ret": retval,
        "info": {}
    })
    return
# C.8
# InternetOpen() hook(s)
#
def post_InternetOpen(self,event,retval,fUnicode):
    """Common post-hook for InternetOpenA/W: log the call and record a JSON
    event log entry. fUnicode selects ASCII vs WCHAR dereferencing."""
    # C.8.1 Get the return address and arguments
    (ra,(lpszAgent,dwAccessType,lpszProxyName,lpszProxyBypass,dwFlags)) = self.get_funcargs(event)
    pid = event.get_pid()
    tid = event.get_tid()
    # C.8.2 Dereference arguments
    p = event.get_process()
    szAgent = p.peek_string(lpszAgent,fUnicode)
    szProxyName = p.peek_string(lpszProxyName,fUnicode)
    szProxyBypass = p.peek_string(lpszProxyBypass,fUnicode)
    log("[*] <%d:%d> 0x%x: InternetOpen(\"%s\",0x%x,\"%s\",\"%s\",0x%x) = 0x%x" % (pid,tid,ra,szAgent,dwAccessType,szProxyName,szProxyBypass,dwFlags,retval))
    # C.8.3 Create a JSON event log entry
    # CONSISTENCY FIX: the original left "args" empty here although every
    # other hook records its arguments; populate the dereferenced values.
    self.eventlog.append({
        "time": time.time(),
        "name": "InternetOpen",
        "type": "Win32 API",
        "pid": pid,
        "tid": tid,
        "addr": ra,
        "args": {
            "szAgent": szAgent,
            "dwAccessType": dwAccessType,
            "szProxyName": szProxyName,
            "szProxyBypass": szProxyBypass,
            "dwFlags": dwFlags
        },
        "ret": retval,
        "info": {}
    })
    return
def post_InternetOpenA(self,event,retval):
    # ASCII variant: fUnicode=False.
    self.post_InternetOpen(event,retval,False)
    return
def post_InternetOpenW(self,event,retval):
    # Wide-char variant: fUnicode=True.
    self.post_InternetOpen(event,retval,True)
    return
def pre_EncryptMessage(self,event,*args):
    # Pre-hook: walk the SecBufferDesc passed to EncryptMessage() and append
    # every SECBUFFER_DATA buffer (plaintext, pre-encryption) to a dump file.
    # C.?.1 Get the return address and arguments
    try:
        (ra,phContext,fQOP,pMessage,MessageSeqNo) = (args[0],args[1],args[2],args[3],args[4])
        pid = event.get_pid()
        tid = event.get_tid()
        # Right -- this is going to get annoying
        # pMessage is a pointer to a SecBufferDesc structure
        # which describes an array of SecBuffer structures
        p = event.get_process()
        l = p.get_label_at_address(ra)
        # really ought to use a ctypes struct for this!
        # SecBufferDesc layout (32-bit): ulVersion, cBuffers, pBuffers.
        ulVersion = p.peek_uint(pMessage)
        cBuffers = p.peek_uint(pMessage + 4)
        pBuffers = p.peek_uint(pMessage + 8)
        log("[*] <%d:%d> %s 0x%x: EncryptMessage(...)" % (pid,tid,l,ra))
        log("[D] ulVersion: %d" % ulVersion)
        log("[D] cBuffers: %d" % cBuffers)
        log("[D] pBuffers: 0x%x" % pBuffers)
        # dump buffer list
        # Each 32-bit SecBuffer is 12 bytes: cbBuffer, BufferType, pvBuffer.
        for i in range(0,cBuffers):
            cbBuffer = p.peek_uint(pBuffers + (i * 12) + 0)
            BufferType = p.peek_uint(pBuffers + (i * 12) + 4)
            pvBuffer = p.peek_uint(pBuffers + (i * 12) + 8)
            if (BufferType == 1): # SECBUFFER_DATA
                # we have data to save; append so successive calls accumulate.
                filename = sys.argv[1] + ".encmsg0x%08x-%d" % (pvBuffer,pid)
                f = open(filename,"ab")
                f.write(p.peek(pvBuffer,cbBuffer))
                f.close()
            log("[D]")
            log("[D] cbBuffer: 0x%x (%d)" % (cbBuffer,cbBuffer))
            log("[D] BufferType: 0x%x" % BufferType)
            log("[D] pvBuffer: 0x%x" % pvBuffer)
    except:
        traceback.print_exc()
        raise
    return
def post_DecryptMessage(self,event,retval):
    # Post-hook: walk the SecBufferDesc passed to DecryptMessage() and append
    # every SECBUFFER_DATA buffer (plaintext, post-decryption) to a dump file.
    # C.?.1 Get the return address and arguments
    try:
        (ra,(phContext,pMessage,MessageSeqNo,pfQOP)) = self.get_funcargs(event)
        pid = event.get_pid()
        tid = event.get_tid()
        # Right -- this is going to get annoying
        # pMessage is a pointer to a SecBufferDesc structure
        # which describes an array of SecBuffer structures
        p = event.get_process()
        # really ought to use a ctypes struct for this!
        # SecBufferDesc layout (32-bit): ulVersion, cBuffers, pBuffers.
        ulVersion = p.peek_uint(pMessage)
        cBuffers = p.peek_uint(pMessage + 4)
        pBuffers = p.peek_uint(pMessage + 8)
        log("[*] <%d:%d> 0x%x: DecryptMessage(...)" % (pid,tid,ra))
        log("[D] ulVersion: %d" % ulVersion)
        log("[D] cBuffers: %d" % cBuffers)
        log("[D] pBuffers: 0x%x" % pBuffers)
        # dump buffer list
        # Each 32-bit SecBuffer is 12 bytes: cbBuffer, BufferType, pvBuffer.
        for i in range(0,cBuffers):
            cbBuffer = p.peek_uint(pBuffers + (i * 12) + 0)
            BufferType = p.peek_uint(pBuffers + (i * 12) + 4)
            pvBuffer = p.peek_uint(pBuffers + (i * 12) + 8)
            if (BufferType == 1): # SECBUFFER_DATA
                # we have data to save; append so successive calls accumulate.
                filename = sys.argv[1] + ".decmsg0x%08x-%d" % (pvBuffer,pid)
                f = open(filename,"ab")
                f.write(p.peek(pvBuffer,cbBuffer))
                f.close()
            log("[D]")
            log("[D] cbBuffer: 0x%x (%d)" % (cbBuffer,cbBuffer))
            log("[D] BufferType: 0x%x" % BufferType)
            log("[D] pvBuffer: 0x%x" % pvBuffer)
    except:
        traceback.print_exc()
        raise
    return
###
# D. winappdbg debug event handlers
###
### D.1
# create_process
#
# winappdbg defined callback function to handle process creation events
###
def create_process(self,event):
    # Log the new process' image name and command line, then record a
    # JSON event log entry.
    p = event.get_process()
    pid = event.get_pid()
    tid = event.get_tid()
    log("[*] <%d:%d> Create process event for pid %d (%s)" % (pid,tid,p.get_pid(),p.get_image_name()))
    log("[-] command line: %s" % p.get_command_line())
    #log("[D] Create process event for pid %d (%d)" % (pid,tid))
    self.eventlog.append({
        "time": time.time(),
        "name": event.get_event_name(),
        "type": "WinAppDbg Event",
        "pid": pid,
        "tid": tid,
        "info": {
            "pid": p.get_pid(),
            "module_base": event.get_module_base(),
            "filename": event.get_filename(),
            "cmdline": p.get_command_line()
        },
    })
    return
### D.2
# exit_process
#
# winappdbg defined callback function to handle process exit events
###
def exit_process(self,event):
    # Log the exiting process' filename and exit code, then record a
    # JSON event log entry.
    pid = event.get_pid()
    tid = event.get_tid()
    log("[*] <%d:%d> Exit process event for %s: 0x%x" % (pid,tid,event.get_filename(),event.get_exit_code()))
    self.eventlog.append({
        "time": time.time(),
        "name": event.get_event_name(),
        "type": "WinAppDbg Event",
        "pid": pid,
        "tid": tid,
        "info": {
            "module_base": event.get_module_base(),
            "filename": event.get_filename(),
            "exitcode": event.get_exit_code()
        },
    })
    return
### D.3
# create_thread
#
# winappdbg defined callback function to handle thread creation events
###
def create_thread(self,event):
    """winappdbg callback: log a thread-creation event (name and start
    address) and append it to self.eventlog.
    """
    event_pid = event.get_pid()
    event_tid = event.get_tid()
    thread_name = event.get_thread().get_name()
    log("[*] <%d:%d> Create thread event \"%s\" @ 0x%x" % (event_pid,event_tid,thread_name,event.get_start_address()))
    entry = {
        "time": time.time(),
        "name": event.get_event_name(),
        "type": "WinAppDbg Event",
        "pid": event_pid,
        "tid": event_tid,
        "info": {
            "startaddress": event.get_start_address(),
            "threadname": thread_name
        },
    }
    self.eventlog.append(entry)
    return
### D.4
# exit_thread
#
# winappdbg defined callback function to handle thread exit events
###
def exit_thread(self,event):
    """winappdbg callback: log a thread-exit event and append it to
    self.eventlog.
    """
    event_pid = event.get_pid()
    event_tid = event.get_tid()
    thread_name = event.get_thread().get_name()
    log("[*] <%d:%d> Exit thread event \"%s\"" % (event_pid,event_tid,thread_name))
    entry = {
        "time": time.time(),
        "name": event.get_event_name(),
        "type": "WinAppDbg Event",
        "pid": event_pid,
        "tid": event_tid,
        "info": {
            "threadname": thread_name
        },
    }
    self.eventlog.append(entry)
    return
### D.5
# load_dll
#
# winappdbg defined callback function to handle DLL load events
###
def load_dll(self,event):
    """winappdbg callback: log a DLL-load event (module path and base
    address) and append it to self.eventlog.
    """
    event_pid = event.get_pid()
    event_tid = event.get_tid()
    log("[*] <%d:%d> Load DLL event: %s" % (event_pid,event_tid,event.get_filename()))
    entry = {
        "time": time.time(),
        "name": event.get_event_name(),
        "type": "WinAppDbg Event",
        "pid": event_pid,
        "tid": event_tid,
        "info": {
            "module_base": event.get_module_base(),
            "filename": event.get_filename(),
        },
    }
    self.eventlog.append(entry)
    return
### D.6
# event
#
# winappdbg defined callback function to handle any remaining events
###
def event(self,event):
    """winappdbg catch-all callback: log any debug event that has no
    dedicated handler. Not appended to self.eventlog.
    """
    event_pid = event.get_pid()
    event_tid = event.get_tid()
    log("[*] <%d:%d> Unhandled event: %s" % (event_pid,event_tid,event.get_event_name()))
    return
###
# E. winappdbg debug exception handlers
###
### E.1
# guard_page
#
# winappdbg defined callback function to handle guard page exceptions
###
def guard_page_exemem(self,exception):
    """winappdbg callback for guard-page exceptions on watched memory.

    Write faults start single-step tracing (the unpacking loop is writing
    its output); the first execute fault inside a watched allocation is
    taken as the unpacked entry point, at which point the memory block is
    dumped to disk and all tracing state is reset for the next stage.

    FIX: the reset at the end previously assigned self.highest instead of
    self.highesteip, leaving the loop's upper bound stale for the next
    unpacking stage (cf. the bounds logged above and used in single_step).
    """
    try:
        f_type = exception.get_fault_type()
        e_addr = exception.get_exception_address()
        f_addr = exception.get_fault_address()
        # get the process and thread ids
        pid = exception.get_pid()
        tid = exception.get_tid()
        # Per-access logging is deliberately disabled: it is interesting but
        # generates a lot of output and slows the whole process down.
        #log("[!] <%d:%d> 0x%x: GUARD_PAGE(%d) exception for address 0x%x" % (pid,tid,e_addr,f_type,f_addr))
        # E.1.2 Was it a memory write operation?
        if (f_type == winappdbg.win32.EXCEPTION_WRITE_FAULT):
            # E.1.2.1 Only log the first write from each instruction address:
            # unpacking is generally done in a loop and we don't want to log
            # the same instructions for each iteration.
            if not e_addr in self.writeaddrs:
                p = exception.get_process()
                t = exception.get_thread()
                label = p.get_label_at_address(e_addr)
                instr = t.disassemble_instruction(e_addr)[2].lower()
                log("[*] VirtualAlloc()d memory address 0x%x written from 0x%x (%s): %s" % (f_addr,e_addr,label,instr))
                self.writeaddrs[e_addr] = instr
            # E.1.2.2 Start single-step tracing if we haven't already
            # (self.tracing == -1 means "not tracing").
            if (self.tracing == -1):
                self.tracing = 0
                d = exception.debug
                log("[-] Enabling tracing")
                d.start_tracing(exception.get_tid())
        # E.1.3 Was it an instruction fetch (execute) fault, and are we still
        # looking for the unpacked entry point?
        if (f_type == winappdbg.win32.EXCEPTION_EXECUTE_FAULT) and (self.entrypt == 0):
            self.entrypt = e_addr
            t = exception.get_thread()
            # Disassemble the instruction that jumped into the unpacked code.
            jmpinstr = t.disassemble_instruction(self.lasteip[0])[2].lower()
            # E.1.3.1 Log what we've found
            log("[*] Found unpacked entry point at 0x%x called from 0x%x (%s) (after executing %d instructions)" % (self.entrypt,self.lasteip[0],jmpinstr,self.tracing))
            log("[-] Unpacking loop at 0x%x - 0x%x" % (self.lowesteip,self.highesteip))
            pid = exception.get_pid()
            tid = exception.get_tid()
            elog = ({
                "time": time.time(),
                "name": "unpacking loop found",
                "type": "unpack event",
                "pid": pid,
                "tid": tid,
                "info": {
                    "unpacked_entry_point": self.entrypt,
                    "callingaddr": self.lasteip[0],
                    "callinginstr": jmpinstr
                },
            })
            # E.1.3.2 Find the watched allocation (keyed by (pid, base addr))
            # that contains the entry point.
            for (mem_pid,memblk) in self.allocedmem:
                if (mem_pid == pid):
                    size = self.allocedmem[(mem_pid,memblk)]
                    endaddr = memblk + size - 1
                    if (e_addr >= memblk) and (e_addr <= endaddr):
                        # E.1.3.3 Log what we're doing and delete the memory breakpoint
                        log("[-] Dumping %d bytes of memory range 0x%x - 0x%x" % (size,memblk,endaddr))
                        d = exception.debug
                        d.dont_watch_buffer(exception.get_pid(),memblk,size - 1)
                        # E.1.3.4 Disable single-step debugging
                        self.tracing = -1
                        d.stop_tracing(exception.get_tid())
                        # E.1.3.5 Reset unpacking loop variables for the next stage.
                        self.entrypt = 0x00000000
                        self.lasteip = [0x00000000,0x00000000]
                        self.lowesteip = 0xffffffff
                        self.highesteip = 0x00000000  # BUG FIX: was self.highest
                        # E.1.3.6 Dump the memory block to a file; 'with'
                        # guarantees the handle is closed even if read() fails.
                        p = exception.get_process()
                        filename = sys.argv[1] + ".memblk0x%08x" % memblk
                        with open(filename,"wb") as dumpfile:
                            dumpfile.write(p.read(memblk,size))
                        elog["info"]["filename"] = filename
                        self.eventlog.append(elog)
    except Exception:
        traceback.print_exc()
        raise
    return
### E.2
# single_step
#
# winappdbg defined callback function to handle single step exceptions
###
def single_step(self,exception):
    """winappdbg callback for single-step exceptions while tracing.

    Tracks eip to detect the unpacking loop (a backward jump fixes its
    bounds), disassembles each in-loop instruction once, and aborts tracing
    after 250,000 instructions by planting a breakpoint handled by
    bp_stoptracing (tracing cannot be disabled from inside this callback).
    """
    try:
        # E.2.1 Current instruction address.
        e_addr = exception.get_exception_address()
        # E.2.2 A backward jump marks the loop: latch the bounds exactly once.
        if e_addr < self.lasteip[1]:
            if self.lowesteip == 0xffffffff:
                self.lowesteip = e_addr          # lowest loop address
            if self.highesteip == 0x00000000:
                self.highesteip = self.lasteip[1]  # address we jumped from
        # E.2.3 Disassemble each address inside the loop bounds only once.
        if (self.lowesteip <= e_addr <= self.highesteip) and e_addr not in self.disasmd:
            tracer_thread = exception.get_thread()
            mnemonic = tracer_thread.disassemble_instruction(e_addr)[2].lower()
            log("    0x%x: %s" % (e_addr,mnemonic))
            self.disasmd.append(e_addr)
        # E.2.4 Keep the last two eip values: we need two so the instruction
        # that jumps to the unpacked entry point can be disassembled later.
        self.lasteip[0] = self.lasteip[1]
        self.lasteip[1] = e_addr
        # E.2.5 Give up after 250,000 traced instructions: assume there is
        # no unpacking loop and stop tracing to speed up execution.
        self.tracing += 1
        if self.tracing >= 250000:
            log("[E] Reached tracing limit of 250000 instructions")
            dbg = exception.debug
            target_pid = exception.get_pid()
            dbg.break_at(target_pid,e_addr,self.bp_stoptracing)
            self.tracing = -1
    except Exception:
        traceback.print_exc()
        raise
    return
# E.2.6 bp_stoptracing()
# Set as a breakpoint handler when we want to stop tracing, as we can't
# disable single-step tracing from within the single-step call-back function.
def bp_stoptracing(self,exception):
    """One-shot breakpoint handler used to switch off single-step tracing:
    tracing cannot be disabled from within the single-step callback itself,
    so single_step plants this breakpoint instead.
    """
    log("[D] Single-step instruction limit reached -- stopping tracing")
    dbg = exception.debug
    dbg.stop_tracing(exception.get_tid())
    # Remove this breakpoint: it has served its purpose.
    dbg.dont_break_at(exception.get_pid(),exception.get_exception_address())
    return
### E.3
# exception
#
# winappdbg defined callback function to handle remaining exceptions
###
def exception(self,exception):
    """winappdbg catch-all callback for exceptions with no dedicated handler;
    logs the address and exception name only.
    """
    log("[*] Unhandled exception at 0x%x: %s" % (exception.get_exception_address(),exception.get_exception_name()))
    return
#
#### end of MyEventHandler class
#
###
# F. Miscellaneous functions
###
### F.1
# log(msg):
###
def log(msg):
    """Print msg and append it to the global log file, opening the file
    lazily on first use (named after the target binary in sys.argv[1]).

    NOTE(review): assumes the module initialises `logfile` (e.g. to None)
    before the first call -- confirm against the script preamble.
    """
    global logfile
    print(msg)
    if not logfile:
        logfile = open(sys.argv[1] + ".log","w")
    if logfile:
        logfile.write("%s\n" % msg)
        logfile.flush()  # flush so a crash doesn't lose trailing output
    return
### F.2
# simple_debugger(argv):
###
def simple_debugger(filename):
    """Run `filename` under the winappdbg debugger with MyEventHandler
    attached, then dump the collected event log.

    FIX: the original swallowed a MyEventHandler() construction failure
    (printing the traceback but falling through), which then crashed with a
    misleading NameError on the undefined `handler`; now re-raises.
    """
    global logfile
    try:
        handler = MyEventHandler()
        #logfile = winappdbg.textio.Logger(filename + ".log",verbose = True)
    except:
        traceback.print_exc()
        raise  # don't continue with an undefined handler
    # bKillOnExit: terminate the debuggee when the debugger exits.
    with winappdbg.Debug(handler, bKillOnExit = True, bHostileCode = False) as debug:
        log("[*] Starting {:s}".format(filename))
        debug.execl(filename, bFollow = False)
        log("[*] Starting debug loop")
        debug.loop()
        log("[*] Terminating")
    log("[D] Number of created processes: {:d}".format(len(handler.createdprocesses)))
    for i in range(0, len(handler.eventlog)):
        log("{:s}".format(handler.eventlog[i]))
    return
###
# G. Start of script execution
###
#log("[*] Started at %s" % time.strftime("%Y-%m-%d %H:%M:%S"))
#simple_debugger(sys.argv[1])
#log("[*] Completed at %s" % time.strftime("%Y-%m-%d %H:%M:%S"))
# End of original unpack.py code
# Now write out the file list so we can run the list through unpack.py
# on our Windows XP sandbox VM.
def write_packed_file_list(packer_id_feature_file, packed_list_file_name):
    """Write the "VirusShare_"-prefixed names of all packed samples to a
    text file, one per line.

    packer_id_feature_file -- CSV with at least 'file_name' and 'is_packed'.
    packed_list_file_name  -- output path for the file list.
    """
    packer_id_features = pd.read_csv(packer_id_feature_file)
    packed_files = packer_id_features[packer_id_features['is_packed'] == 1]
    counter = 0
    # 'with' guarantees the list file is closed/flushed on all paths.
    with open(packed_list_file_name, 'w') as fop:
        for file_name in packed_files['file_name']:
            fop.write("VirusShare_" + file_name + "\n")
            counter += 1
    print("Write {:d} filenames.".format(counter))
    return
def unpack_pe_binaries(file_list):
    """Count which names in file_list exist as samples under the global
    ext_drive directory (the actual unpack.py invocation is still a stub).

    Prints progress every 1000 files and a final summary; returns None.
    """
    counter = 0
    unpacked = 0
    error_count = 0
    for raw_name in file_list:
        sample = raw_name.rstrip()  # strip trailing newlines from list files
        sample_path = ext_drive + sample
        if os.path.isfile(sample_path):
            # call unpack.py
            unpacked += 1
        else:
            #print("Error: file does not exist - {:s}".format(sample_path))
            error_count += 1
        counter += 1
        if counter % 1000 == 0: # print progress
            print('Disassembled: {:d} - {:s}'.format(counter,sample))
    print("Disassembled {:d} binaries with {:d} file path errors.".format(unpacked, error_count))
    return
In [ ]:
In [ ]:
# For this we will use our old friend objdump.
# Shell escape: confirm objdump is on the PATH (prints its usage banner).
!objdump
In [2]:
# Load the malware packer id features sets from VirusShare 251 and 252.
packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs251-252.csv')
# Keep only samples not flagged as packed...
unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
# ...and that parsed as valid PE binaries.
unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
# Preview the first 20 rows (cell output).
unpacked_pe_files.head(20)
Out[2]:
In [4]:
def write_unpacked_file_list(packer_id_feature_file, unpacked_list_file_name):
    """Write the "VirusShare_"-prefixed names of all unpacked, valid-PE
    samples to a text file, one per line.

    packer_id_feature_file  -- CSV with 'file_name', 'is_packed', 'valid_pe'.
    unpacked_list_file_name -- output path for the file list.

    FIX: the original never closed the output handle (no fop.close(), unlike
    the sibling write_packed_file_list); 'with' closes and flushes it.
    """
    packer_id_features = pd.read_csv(packer_id_feature_file)
    unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
    unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
    counter = 0
    with open(unpacked_list_file_name, 'w') as fop:
        for file_name in unpacked_pe_files['file_name']:
            fop.write("VirusShare_" + file_name + "\n")
            counter += 1
    print("Write {:d} filenames.".format(counter))
In [8]:
# Row/column counts of the unpacked valid-PE subset (cell output).
unpacked_pe_files.shape
Out[8]:
In [9]:
# Looks like we might have some false negatives on the "is_packed" feature, so lets have a look at the groups.
# The majority seem to be ok, but there are a few thousand that may be misclassified as having been packed
# but the entropy values indicate otherwise. So we will go ahead and disassemble the whole lot anyway and see what
# the result is.
# Frequency table of packer names among the "unpacked" samples (cell output).
packer_counts = unpacked_pe_files['packer_name'].value_counts()
packer_counts
In [2]:
def disassemble_pe_binaries(file_list):
    """Disassemble each PE sample found under ext_drive1 or ext_drive2.

    For each file, runs IDA Pro Free in batch mode under wine (produces the
    .asm output) and dumps section headers plus import tables with
    "objdump -g -x" into /opt/vs/asm/<name>.txt.

    We have to use objdump/IDA Free although the output is not optimal for
    machine learning objectives (call operand target addresses should be
    translated to function names), because the alternatives (rasm2,
    vivisect, IDA Pro Demo) do not work in batch mode or at all.
    NOTE: IDA Pro Free normally shows a popup on startup that blocks batch
    mode; a call instruction at 0x00471d16 in idag.exe invokes that popup.

    FIX: removed unused locals (command_line, asm_file_name, file_path3/4 --
    the ext_drive3/ext_drive4 paths were computed but never consulted) and
    factored the duplicated per-file work into _disassemble_one().
    """
    counter = 0
    disassed = 0
    error_count = 0
    for file_name in file_list:
        file_name = file_name.rstrip() # remove the newlines or else !!!
        file_path1 = ext_drive1 + file_name
        file_path2 = ext_drive2 + file_name
        hdr_file_name = "/opt/vs/asm/" + file_name + ".txt"
        if os.path.isfile(file_path1):
            _disassemble_one(file_path1, hdr_file_name)
            disassed += 1
        elif os.path.isfile(file_path2):
            _disassemble_one(file_path2, hdr_file_name)
            disassed += 1
        else:
            #print("Error: file does not exist - {:s}".format(file_name))
            error_count += 1
        counter += 1
        if (counter % 1) == 0: # print progress
            print('Disassembled: {:d} - {:s}'.format(counter, file_name))
    print("Disassembled {:d} binaries with {:d} file path errors.".format(disassed, error_count))
    return

def _disassemble_one(file_path, hdr_file_name):
    """Run IDA Free (batch mode) on file_path, then dump its section headers
    and import tables to hdr_file_name with objdump."""
    sub.call(["wine", "/opt/vs/ida/idag.exe", "-B", file_path])
    with open(hdr_file_name, "w") as fop:
        sub.call(["objdump", "-g", "-x", file_path], stdout=fop)
    # now delete the binary, we do not need it anymore.
    #sub.call(["rm", file_path])
In [6]:
# Configure the candidate sample directories searched by disassemble_pe_binaries().
#ext_drive1 = '/opt/vs/train1/'
ext_drive1 = '/opt/vs/train/'
ext_drive2 = '/opt/vs/train2/'
ext_drive3 = '/opt/vs/train3/'
ext_drive4 = '/opt/vs/train4/'
#write_unpacked_file_list('data/sorted-packer-id-features-vs251-252.csv')
#fip = open("data/unpacked_file_list.txt")
#file_list = fip.readlines()
# Smoke test: disassemble only the first 10 samples from the train directory.
file_list = os.listdir('/opt/vs/train')
disassemble_pe_binaries(file_list[:10])
In [24]:
# IPython magic: list the working sample tree.
ls /opt/vs/
In [ ]:
help(os)
In [ ]:
# Load the PEiD-style packer signature database used for signature matching.
signatures = peutils.SignatureDatabase('data/userdb-sans.txt')
In [ ]:
# Count of samples in the train directory (cell output).
ext_drive = '/opt/vs/train/'
ext_drive2 = '/opt/vs/train2/'
file_list = os.listdir(ext_drive)
len(file_list)
In [ ]:
# Scan every sample with pefile + the packer signature DB and build CSV rows.
# NOTE(review): out_lines is reset every 1000 files without being written
# anywhere, so the accumulated rows are discarded -- this cell appears to be
# a progress-only dry run; verify the real output is produced elsewhere.
out_lines = []
for idx, file_name in enumerate(file_list):
try:
# fast_load skips parsing sections we do not need for signature matching.
pe = pefile.PE(ext_drive + file_name, fast_load=True)
#pe_file_counter += 1
#matches = signatures.match_all(pe, ep_only = True)
matches = signatures.match(pe, ep_only = True)
# NOTE(review): PEP 8 prefers "matches is None" over "== None".
if matches == None:
row = file_name + ",unknown,0\n"
else:
# NOTE(review): there is no comma between file_name and the first match --
# the ",".join only separates multiple matches; verify downstream parsing.
row = file_name + ",".join(str(e) for e in matches) + "\n"
out_lines.append(row)
pe.close()
if (idx % 1000) == 0: # print progress
out_lines = []
print('Filename: {:s} - {:d}'.format(row,idx))
except:
# Bare except: any pefile parse failure is recorded as "not a PE file".
#non_pe_counter += 1
out_lines.append(file_name + ",nonpe,0\n")
In [ ]:
help(peutils)
In [2]:
# Reload the combined, sorted packer-id feature file and preview it.
testdf = pd.read_csv('data/sorted-packer-id-features.csv')
testdf.head(20)
Out[2]:
In [3]:
# Frequency of each detected packer name (cell output).
packer_counts = testdf['packer_name'].value_counts()
packer_counts
Out[3]:
In [4]:
# Horizontal bar chart of the ten most common packers.
packer_counts[:10].plot(kind='barh', rot=0)
plt.show()
In [5]:
plt.show()
In [ ]:
In [12]:
# How many samples parsed as valid PE files vs not (cell output).
pe_counts = testdf['valid_pe'].value_counts()
pe_counts
Out[12]:
In [13]:
# How many samples were flagged as packed vs not (cell output).
is_packed = testdf['is_packed'].value_counts()
is_packed
Out[13]:
In [16]:
# Inspect the samples matched to one specific packer signature.
pecrows = testdf[testdf['packer_name'] == 'PECompact v2.0']
pecrows
Out[16]:
In [17]:
# Samples with no packer signature match.
unkrows = testdf[testdf['packer_name'] == 'unknown']
unkrows
Out[17]:
In [19]:
# Samples that pefile could not parse as PE binaries.
nonpe = testdf[testdf['valid_pe'] == 0]
nonpe
Out[19]:
In [2]:
# Re-run IDA (batch mode, under wine) on the samples that previously failed,
# as listed in error-list.txt.
fip = open("/opt/vs/error-list.txt","r")
file_list = fip.readlines()
fip.close()
for file_name in file_list:
file_name = file_name.rstrip() # remove the newlines or else !!!
file_path1 = "/opt/vs/train1/" + file_name
if (os.path.isfile(file_path1)):
sub.call(["wine", '/opt/vs/ida/idag.exe', "-B", file_path1])
print("Disassembling PE file: {:s}".format(file_path1))
In [ ]:
# NOTE(review): the next three cells are near-identical copy-paste probes;
# a parameterized helper function would be preferable.
ext_drive = "/media/derek/TOSHIBA EXT/train3"
file_list = os.listdir(ext_drive)
for file_name in file_list[:10]:
file_name = file_name.rstrip() # remove the newlines or else !!!
# NOTE(review): the listing comes from the external drive but the lookup is
# under train1 -- confirm this cross-check is intentional.
file_path1 = "/opt/vs/train1/" + file_name
if (os.path.isfile(file_path1)):
sub.call(["wine", '/opt/vs/ida/idag.exe', "-B", file_path1])
print("Disassembling PE file: {:s}".format(file_path1))
In [ ]:
ext_drive = "/media/derek/TOSHIBA EXT/train3/"
file_list = os.listdir(ext_drive)
for file_name in file_list[:10]:
file_name = file_name.rstrip() # remove the newlines or else !!!
file_path1 = ext_drive + file_name
if (os.path.isfile(file_path1)):
# Identify each sample's type with file(1).
sub.call(["file", file_path1])
print("File: {:s}".format(file_path1))
In [2]:
ext_drive = "/media/derek/TOSHIBA EXT/train3/"
file_list = os.listdir(ext_drive)
for file_name in file_list[:10]:
file_name = file_name.rstrip() # remove the newlines or else !!!
file_path1 = ext_drive + file_name
if (os.path.isfile(file_path1)):
sub.call(["file", file_path1])
print("File: {:s}".format(file_path1))
In [33]:
# file type/magic signature generation tests
def process_files(file_list):
    """Run file(1) on every sample in file_list and stream the magic strings
    to a per-process CSV (data/<pid>-file-id.csv).

    Intended as a multiprocessing.Pool worker: the output file is named
    after the worker's pid so parallel workers never collide. Output is
    flushed every 10 files. Reads the global ext_drive for the sample dir.

    FIX: removed unused file_counter; the output path was stored in
    'file_name', which the loop variable then shadowed -- renamed to
    out_path; output handle now closed via 'with'.
    """
    out_lines = []
    pid = os.getpid()
    out_path = "data/" + str(pid) + "-file-id.csv"
    with open(out_path,'w') as fop:
        for idx, file_name in enumerate(file_list):
            file_name = file_name.rstrip() # remove the newlines or else !!!
            file_path = ext_drive + file_name
            if (os.path.isfile(file_path)):
                #print("File: {:s}".format(file_path))
                signat = sub.check_output(["file","-b", file_path]) # Use the brief option, we do not need the file name.
                out_lines.append(signat)
                if (idx % 10) == 0: # print progress
                    fop.writelines(out_lines)
                    out_lines = []
                    print('{:s} - {:s} - {:d} - {:s}'.format(str(pid), file_name, idx, signat))
        if len(out_lines) > 0:
            fop.writelines(out_lines)
            out_lines = []
    return
def process_trids(file_list):
    """Identify every sample in file_list with TrID and stream the
    top-scoring signature line to a per-process CSV (data/<pid>-trid-id.csv).

    Intended as a multiprocessing.Pool worker (pid-named output avoids
    collisions). Reads the global ext_drive for the sample directory.
    """
    pending = []
    best_line = ""
    worker_pid = os.getpid()
    report_path = "data/" + str(worker_pid) + "-trid-id.csv"
    fop = open(report_path,'w')
    for idx, file_name in enumerate(file_list):
        file_name = file_name.rstrip() # remove the newlines or else !!!
        sample_path = ext_drive + file_name
        if os.path.isfile(sample_path):
            #print("File: {:s}".format(sample_path))
            report = sub.check_output(["/opt/vs/trid", sample_path])
            report_lines = report.split('\n')
            for offset, line in enumerate(report_lines):
                # The line after "Collecting data..." holds the highest
                # probability file type.
                if line.startswith("Collect"):
                    best_line = report_lines[offset + 1]
                    pending.append(best_line)
                    break
            if (idx % 10) == 0: # flush and print progress
                fop.writelines(pending)
                pending = []
                print('{:s} - {:s} - {:d} - {:s}'.format(str(worker_pid), file_name, idx, best_line))
    if len(pending) > 0:
        fop.writelines(pending)
        pending = []
    fop.close()
    return
def combine_magic_reports():
    """Concatenate the per-worker magic/TrID report files
    (data/<pid>-*-id.csv) into the single file data/magic-reports.csv.

    Prints the number of report lines combined; returns None.

    FIX: regex is now a raw string (\\d / \\w are invalid escapes in py3);
    both file handles are closed via 'with' on all paths.
    """
    worker_report = re.compile(r'\d{3,5}-\w+-id.csv') # per-worker report name pattern (pid prefix)
    counter = 0
    with open('data/magic-reports.csv','w') as fop:
        for file_name in os.listdir('data/'):
            if worker_report.match(file_name):
                with open('data/' + file_name, 'r') as fip:
                    in_lines = fip.readlines()
                fop.writelines(in_lines)
                counter += len(in_lines)
        print('Completed combine of {:d} magic reports.'.format(counter))
    return
In [ ]:
In [ ]:
# Entropy feature exploration for the VirusShare 263 sample set.
daf = pd.read_csv('data/sorted-entropy-features-vs263.csv')
daf.head()
In [ ]:
# Any samples larger than ~50 MB?
fsizes = daf[daf['file_size'] > 50000000]
fsizes.head()
In [26]:
fsizes.shape
Out[26]:
In [ ]:
# Raw (unsorted) per-worker entropy output; supply column names explicitly.
daf1 = pd.read_csv('data/5299-entropy-features-bin.csv', names=['file_name','entropy','file_size'])
daf1.head()
In [17]:
daf1.shape
Out[17]:
In [ ]:
fsizes1 = daf1[daf1['file_size'] > 50000000]
fsizes1.head()
In [22]:
fsizes1.shape
Out[22]:
In [ ]:
fsizes1.head(17)
In [ ]:
In [ ]:
# Generate file(1) magic signatures for the APT sample set.
ext_drive = "/opt/vs/apt/"
tfiles = os.listdir(ext_drive)
process_files(tfiles)
In [ ]:
# Generate TrID signatures for the APT sample set.
ext_drive = "/opt/vs/apt/"
tfiles = os.listdir(ext_drive)
process_trids(tfiles)
In [31]:
def get_unpacked_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
    """Return "VirusShare_"-prefixed names (newline-terminated) of unpacked,
    valid-PE samples that are not .NET CIL binaries.

    IDA Pro cannot disassemble .NET files, so any sample whose TrID or
    file(1) magic string contains ".net" is excluded (and printed).

    NOTE(review): trid/file rows are matched to the packer rows positionally
    (iloc by enumeration index) -- this assumes all three CSVs are sorted
    identically; verify the upstream sort step.
    """
    packer_df = pd.read_csv(packer_id_feature_file)
    fid_df = pd.read_csv(file_id_feature_file)
    trid_df = pd.read_csv(trid_id_feature_file)
    # Unpacked AND valid-PE rows only.
    unpacked = packer_df[packer_df['is_packed'] == 0]
    unpacked_pe = unpacked[unpacked['valid_pe'] == 1]
    # Restrict the trid/magic rows to those same samples.
    trid_rows = trid_df[trid_df['file_name'].isin(unpacked_pe['file_name'])]
    fid_rows = fid_df[fid_df['file_name'].isin(unpacked_pe['file_name'])]
    not_dot_net = []
    counter = 0
    dot_net_counter = 0
    # Drop any sample either identifier recognises as .NET.
    for idx, file_name in enumerate(unpacked_pe['file_name']):
        trid_name = trid_rows.iloc[idx, 1].lower()
        fid_name = fid_rows.iloc[idx, 1].lower()
        if trid_name.find('.net') > -1 or fid_name.find('.net') > -1:
            print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            dot_net_counter += 1
            continue
        #print('Found: {:s} - {:s}'.format(trid_name, fid_name))
        not_dot_net.append(file_name)
        counter += 1
    # Prepend the full VirusShare_ prefix for the download list.
    file_list = []
    counter = 0
    for file_name in not_dot_net:
        file_list.append("VirusShare_" + file_name + "\n")
        counter += 1
    print("Got {:d} unpacked PE filenames and {:d} .NET filenames.".format(counter, dot_net_counter))
    return file_list
In [32]:
# Build the unpacked-PE list for VirusShare 251 (cell output: its length).
packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'
unflist = get_unpacked_file_list(packer_id_file, file_id_file, trid_id_file)
len(unflist)
Out[32]:
In [ ]:
In [15]:
# Inline replay of get_unpacked_file_list() internals on the APT set so the
# intermediate trid/file-id frames can be inspected cell by cell.
packer_id_file = 'data/sorted-packer-id-features-apt.csv'
file_id_file = 'data/sorted-file-id-features-apt.csv'
trid_id_file = 'data/sorted-trid-id-features-apt.csv'
packer_id_features = pd.read_csv(packer_id_file)
file_id_features = pd.read_csv(file_id_file)
trid_id_features = pd.read_csv(trid_id_file)
# Get a list of unpacked PE files that are not .NET CIL format.
# IDA Pro cannot disassemble .NET files.
unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
not_dot_net = []
counter = 0
# Get the trid and file rows that are for unpacked PE files.
trids = trid_id_features[trid_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
fids = file_id_features[file_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
trids.head()
Out[15]:
In [18]:
fids.head()
Out[18]:
In [10]:
def get_elf_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
    """Return (and write to data/elf-file-list.txt) the "VirusShare_"-prefixed
    names of samples whose TrID or file(1) magic string contains 'ELF'.

    packer_id_feature_file is kept for interface compatibility but is not
    consulted (the original loaded it and never used it).

    NOTE(review): trid rows are matched to file-id rows positionally (iloc by
    enumeration index) -- assumes both CSVs are sorted identically.
    """
    file_id_features = pd.read_csv(file_id_feature_file)
    trid_id_features = pd.read_csv(trid_id_feature_file)
    counter = 0
    file_list = []
    for idx, file_name in enumerate(file_id_features['file_name']):
        trid_name = trid_id_features.iloc[idx, 1]
        fid_name = file_id_features.iloc[idx, 1]
        if trid_name.find('ELF') > -1 or fid_name.find('ELF') > -1:
            print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            counter += 1
            file_list.append("VirusShare_" + file_name + "\n")
    # 'with' guarantees the list file is closed/flushed on all paths.
    with open('data/elf-file-list.txt','w') as fop:
        fop.writelines(file_list)
    print("Got {:d} ELF filenames.".format(counter))
    return file_list
In [11]:
# Extract the ELF sample list for each VirusShare archive in turn; each cell
# overwrites data/elf-file-list.txt and displays the returned list.
packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'
unflist = get_elf_file_list(packer_id_file, file_id_file, trid_id_file)
unflist
Out[11]:
In [12]:
packer_id_file = 'data/sorted-packer-id-features-vs252.csv'
file_id_file = 'data/sorted-file-id-features-vs252.csv'
trid_id_file = 'data/sorted-trid-id-features-vs252.csv'
unflist = get_elf_file_list(packer_id_file, file_id_file, trid_id_file)
unflist
Out[12]:
In [13]:
packer_id_file = 'data/sorted-packer-id-features-vs263.csv'
file_id_file = 'data/sorted-file-id-features-vs263.csv'
trid_id_file = 'data/sorted-trid-id-features-vs263.csv'
unflist = get_elf_file_list(packer_id_file, file_id_file, trid_id_file)
unflist
Out[13]:
In [14]:
packer_id_file = 'data/sorted-packer-id-features-vs264.csv'
file_id_file = 'data/sorted-file-id-features-vs264.csv'
trid_id_file = 'data/sorted-trid-id-features-vs264.csv'
unflist = get_elf_file_list(packer_id_file, file_id_file, trid_id_file)
unflist
Out[14]: