# In [1]:
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import re
import subprocess as sub
import graph as gra
# In [2]:
# Mnemonics that construct_call_graph() treats as an outgoing edge of the
# current subroutine.
call_opcodes = [
    'call',
    'int',
]
# Label prefixes/names that construct_call_graph() treats as the start of a
# new subroutine (i.e. a new vertex).
call_blocks = [
    'sub_',
    'main',
    'start',
]
def construct_call_graph(lines, log_file):
    """Build a call graph from the lines of one disassembly listing.

    Every line is stripped of its comment and of a fixed set of operand
    qualifiers, then tokenized on whitespace.  A line whose first token starts
    with one of ``call_blocks`` (or that contains the token ``proc``) opens a
    new vertex; a line containing one of ``call_opcodes`` adds an edge from
    the current vertex to the token that follows the opcode.

    Args:
        lines: iterable of text lines from the assembly listing.
        log_file: writable text file object; matched call lines and new
            vertices are logged to it.

    Returns:
        The populated ``gra.Graph`` instance (from the project-local
        ``graph`` module).
    """
    # Root node: the program's original entry point, not C main().
    current = '.program_entry_point'
    n_vertices = 1
    graph = gra.Graph()
    graph.add_vertex(current)

    # Substrings removed (or blanked) before tokenizing.  They are applied
    # sequentially, in this order.
    noise = (
        ('short', ''), ('ds:', ' '),
        ('dword', ''), ('near', ''),
        ('ptr', ''), (',', ' '),
        ('@', ''), ('?', ''),
    )

    for raw in lines:
        text = raw.rstrip('\r\n')
        if text.startswith(';'):
            # Whole-line comment.
            continue
        if ';' in text:
            # Drop the trailing comment.
            text = text.split(';')[0]

        # Trace candidate edge / vertex lines before any token clean-up.
        if 'call' in text or ' int ' in text:
            log_file.write("-> {:s}".format(text) + "\n")
        if text.startswith("sub_"):
            log_file.write("Vertex: {:s}".format(text) + "\n")

        for old, new in noise:
            text = text.replace(old, new)
        tokens = text.split()

        # Fewer than two tokens carries no usable information; 'endp' marks a
        # subroutine end label, which is ignored.
        if len(tokens) < 2 or 'endp' in tokens:
            continue

        # Subroutine / block label: the first token becomes the new current
        # vertex.
        head = tokens[0]
        is_proc = 'proc' in tokens
        if any(is_proc or head.startswith(name) for name in call_blocks):
            current = head
            graph.add_vertex(current)
            n_vertices += 1
            log_file.write("Vertex: {:d} {:s}".format(n_vertices, current) + "\n")

        # Edge: only the first matching opcode (in call_opcodes order) counts.
        for mnemonic in call_opcodes:
            if mnemonic not in tokens:
                continue
            pos = tokens.index(mnemonic)
            # A few listings have no operand after the opcode; use a
            # placeholder destination in that case.
            target = tokens[pos + 1] if pos + 1 < len(tokens) else "none"
            graph.add_edge(current, target)
            break

    return graph
def extract_call_graphs(multi_params):
    """Extract a call graph and its summary features from each assembly file.

    For every file name in ``multi_params.file_list`` (read from the directory
    prefix ``multi_params.ext_drive``) a call graph is built with
    ``construct_call_graph``.  Three per-process output files are written
    under ``data/``, each prefixed with the current process ID:

    * ``<pid>-<feature_file>``: CSV rows of
      ``name, n_vertices(), n_edges(), delta_max(), density()``; no header row
      is written here (``combine_feature_files`` adds it).
    * ``<pid>-<feature_file>.log``: trace written by ``construct_call_graph``.
    * ``<pid>-<graph_file>``: the ``to_str('graphviz')`` rendering of each
      graph.

    Output is flushed to the files every 10 inputs.

    Args:
        multi_params: object exposing ``file_list``, ``ext_drive``,
            ``feature_file`` and ``graph_file`` (see ``Multi_Params``).
    """
    asm_files = multi_params.file_list
    ftot = len(asm_files)
    ext_drive = multi_params.ext_drive
    pid = os.getpid()

    prefix = 'data/' + str(pid) + "-"
    feature_file = prefix + multi_params.feature_file
    log_file_name = feature_file + ".log"
    graph_file_name = prefix + multi_params.graph_file

    graph_lines = []
    graph_features = []

    # All three outputs are context-managed so they are closed even when a
    # single input file fails to open or parse.
    with open(log_file_name, 'w') as log_file, \
            open(graph_file_name, 'w') as graph_file, \
            open(feature_file, 'w') as f:
        print('Process ID: {:d} Graph Feature file: {:s}'.format(pid, feature_file))
        fw = writer(f)

        for idx, fname in enumerate(asm_files):
            with open(ext_drive + fname, 'r') as fasm:
                lines = fasm.readlines()

            call_graph = construct_call_graph(lines, log_file)
            cgvc = call_graph.n_vertices()
            cgec = call_graph.n_edges()
            cgdm = call_graph.delta_max()
            cgde = call_graph.density()

            # Truncate the file name to the part between the first '_' and
            # '.pe.asm'.  split() (rather than slicing at find()) keeps the
            # whole token when the '.pe.asm' marker is absent instead of
            # silently dropping its last character.
            trunc_name = fname.split('_')[1].split('.pe.asm')[0]

            graph_features.append([trunc_name, cgvc, cgec, cgdm, cgde])
            call_graph.set_graph_name(trunc_name)
            graph_lines.append(call_graph.to_str('graphviz'))
            # NOTE(review): the original author observed graphs accumulating
            # across iterations unless the object is deleted; kept as-is.
            del call_graph

            # Report progress and flush the current batch every 10 files.
            if (idx + 1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(graph_features)
                graph_file.writelines(graph_lines)
                graph_features = []
                graph_lines = []

        # Write whatever is left over from the last partial batch.
        if graph_lines:
            fw.writerows(graph_features)
            graph_file.writelines(graph_lines)

    print('Process ID: {:d} finished.'.format(pid))
def combine_feature_files(out_file):
    """Merge the per-process call graph feature files into one sorted CSV.

    Steps:
      1. List the ``data/`` directory.
      2. Select the files named exactly ``<digits>-<out_file>`` (the numeric
         prefix is the PID of the process that wrote the file).
      3. Concatenate them, below a header row, into ``data/<out_file>``.
      4. Sort by ``file_name`` and write ``data/sorted-<out_file>``.

    Args:
        out_file: base name of the feature file, without the PID prefix.
    """
    colnames = 'file_name,vertex_count,edge_count,delta_max,density'
    print("Column names: {:s}".format(colnames))

    # The pattern is escaped and anchored at the end so that siblings such as
    # '<pid>-<out_file>.log' are not merged into the CSV, and any PID width
    # is accepted.
    p1 = re.compile(r'\d+-' + re.escape(out_file) + r'\Z')

    combined_path = 'data/' + out_file
    sorted_path = 'data/sorted-' + out_file

    counter = 0
    with open(combined_path, 'w') as fop:
        # The header needs its own line, otherwise it fuses with the first
        # data row.
        fop.write(colnames + '\n')
        # Sorted listing keeps the unsorted intermediate file deterministic.
        for file_name in sorted(os.listdir('data/')):
            if not p1.match(file_name):
                continue
            with open('data/' + file_name, 'r') as fip:
                in_lines = fip.readlines()
            fop.writelines(in_lines)
            counter += len(in_lines)
    print('Completed combine of {:d} call graph features.'.format(counter))

    # Read back the file that was just written (not a stale one), then sort.
    cgs = pd.read_csv(combined_path)
    if hasattr(cgs, 'sort_values'):
        sorted_cgs = cgs.sort_values('file_name')
    else:
        # Very old pandas releases only provide DataFrame.sort().
        sorted_cgs = cgs.sort('file_name')
    sorted_cgs.to_csv(sorted_path, index=False)
    return
def combine_graph_files(out_file):
    """Concatenate the per-process call graph files into ``data/<out_file>``.

    Steps:
      1. List the ``data/`` directory.
      2. Select the files named exactly ``<digits>-<out_file>`` (the numeric
         prefix is the PID of the process that wrote the file).
      3. Append their lines, in sorted file-name order, to ``data/<out_file>``.

    Args:
        out_file: base name of the graph file, without the PID prefix.
    """
    # Escaped and end-anchored so that only exact '<pid>-<out_file>' names
    # match, whatever the PID width.
    p1 = re.compile(r'\d+-' + re.escape(out_file) + r'\Z')

    counter = 0
    with open('data/' + out_file, 'w') as fop:
        # Sorted listing makes the combined output deterministic.
        for file_name in sorted(os.listdir('data/')):
            if not p1.match(file_name):
                continue
            with open('data/' + file_name, 'r') as fip:
                in_lines = fip.readlines()
            fop.writelines(in_lines)
            counter += len(in_lines)
    print('Completed combine of {:d} call graph lines.'.format(counter))
    return
class Multi_Params(object):
    """Parameter bundle handed to ``extract_call_graphs``.

    Attributes:
        feature_file: base name of the CSV feature output file.
        graph_file: base name of the graph output file.
        ext_drive: directory prefix prepended to every input file name.
        file_list: list of input file names to process.
    """

    def __init__(self, featurefile="", graphfile="", extdrive="", filelist=None):
        self.feature_file = featurefile
        self.graph_file = graphfile
        self.ext_drive = extdrive
        # A fresh list per instance: a literal [] default would be shared by
        # every instance created without an explicit file list.
        self.file_list = [] if filelist is None else filelist
# In [3]:
# Driver cell: extract call graphs for every '.pe.asm' file under ext_drive.
feature_file = 'sorted-pe-call-graph-features-apt.csv'
graph_file = 'pe-call-graphs-apt.gv'
ext_drive = '/opt/vs/aptasm/'
file_ext = '-apt'
file_list = os.listdir(ext_drive)
tfiles = [i for i in file_list if '.pe.asm' in i]

# Split the work into four slices for a multi-process run.  Floor division
# keeps the slice bounds integers on Python 3 as well as Python 2.
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
# Multi-process parameter sets (currently disabled):
#mp1 = Multi_Params(feature_file, graph_file, ext_drive, train1)
#mp2 = Multi_Params(feature_file, graph_file, ext_drive, train2)
#mp3 = Multi_Params(feature_file, graph_file, ext_drive, train3)
#mp4 = Multi_Params(feature_file, graph_file, ext_drive, train4)
# Single process test.
mp1 = Multi_Params(feature_file, graph_file, ext_drive, tfiles)
extract_call_graphs(mp1)
# In [ ]:
# In [ ]:
# In [ ]: