1. Test Construction of Call Graphs From PE ASM Files.


In [1]:
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import re
import subprocess as sub
import graph as gra

In [2]:
# Mnemonics that produce a directed edge in the call graph. construct_call_graph()
# compares them against the whole whitespace-separated tokens of a cleaned line, and
# the first entry of this list found on a line decides the edge.
call_opcodes = ['call', 'int']
# Label prefixes that start a new vertex. construct_call_graph() tests whether the
# first token of a line begins with one of these.
call_blocks = ['sub_', 'main', 'start']

def construct_call_graph(lines, log_file):
    """Build a call graph from the text lines of one disassembled ASM file.

    The graph starts with a single root vertex, '.program_entry_point'. While
    scanning, the most recently seen subroutine/block label is the "current"
    vertex; every line holding one of ``call_opcodes`` as a token adds an edge
    from the current vertex to the operand that follows the opcode.

    Parameters:
        lines: iterable of strings, one per line of the ASM listing.
        log_file: object with a ``write(str)`` method; receives a trace of the
            call/int lines and vertex labels that were seen.

    Returns:
        The ``gra.Graph`` instance that was populated through ``add_vertex``
        and ``add_edge``.
    """
    # Substrings stripped (or blanked) before tokenizing; applied in this exact order.
    cleanup = (
        ('short', ''), ('ds:', ' '),
        ('dword', ''), ('near', ''),
        ('ptr', ''), (',', ' '),
        ('@', ''), ('?', ''),
    )

    # Root node: the program's original entry point, not C main().
    current = '.program_entry_point'
    n_vertices = 1
    graph = gra.Graph()
    graph.add_vertex(current)

    for raw in lines:
        text = raw.rstrip('\r\n')
        if text.startswith(';'):
            # Whole-line comment.
            continue
        if ';' in text:
            # Drop the trailing comment, keep the code in front of it.
            text = text.split(';')[0]

        # Trace output is written from the line as it looks before cleanup.
        if 'call' in text or ' int ' in text:
            log_file.write("->  {:s}".format(text) + "\n")
        if text.startswith("sub_"):
            log_file.write("Vertex:  {:s}".format(text) + "\n")

        for needle, filler in cleanup:
            text = text.replace(needle, filler)
        tokens = text.split()

        # Fewer than two tokens cannot describe a label or a call; 'endp' lines
        # only close a subroutine.
        if len(tokens) < 2 or 'endp' in tokens:
            continue

        # A new vertex starts when the first token begins with a known block
        # prefix, or when the line declares a 'proc'.
        head = tokens[0]
        declares_proc = 'proc' in tokens
        if any(declares_proc or head.find(prefix) == 0 for prefix in call_blocks):
            current = head
            graph.add_vertex(current)
            n_vertices += 1
            log_file.write("Vertex: {:d}  {:s}".format(n_vertices, current) + "\n")

        # The first opcode of call_opcodes present on the line yields one edge.
        found = next((op for op in call_opcodes if op in tokens), None)
        if found is not None:
            target_pos = tokens.index(found) + 1
            # A few ASM files have no operand after the opcode (disassembly error?).
            target = tokens[target_pos] if target_pos < len(tokens) else "none"
            graph.add_edge(current, target)

    return graph


def extract_call_graphs(multi_params):
    """Extract a call graph and its summary features from each listed ASM file.

    Three per-process output files are written under 'data/', each prefixed
    with the current process ID:
      * '<pid>-<feature_file>'      CSV rows: name, vertex count, edge count,
                                    delta_max, density (no header row; the
                                    header is added by combine_feature_files).
      * '<pid>-<graph_file>'        the graphs rendered with to_str('graphviz').
      * '<pid>-<feature_file>.log'  trace written by construct_call_graph().

    Parameters:
        multi_params: object exposing ``file_list`` (ASM file names),
            ``ext_drive`` (directory prefix, concatenated directly with each
            file name, so it must end with a path separator),
            ``feature_file`` and ``graph_file`` (output file names).

    Returns:
        None.
    """
    asm_files = multi_params.file_list
    ftot = len(asm_files)
    ext_drive = multi_params.ext_drive

    pid = os.getpid()
    feature_file = 'data/' + str(pid) + "-" + multi_params.feature_file
    log_file_name = 'data/' + str(pid) + "-" + multi_params.feature_file + ".log"
    graph_file_name = 'data/' + str(pid) + "-" + multi_params.graph_file

    print('Process ID: {:d} Graph Feature file: {:s}'.format(pid, feature_file))

    graph_lines = []
    graph_features = []

    # All three outputs are managed by one `with` so they are closed even when
    # a file in the list fails to open or parse.
    with open(log_file_name, 'w') as log_file, \
            open(graph_file_name, 'w') as graph_file, \
            open(feature_file, 'w') as f:
        # Column names are not written here; combine_feature_files adds them.
        fw = writer(f)

        # Now iterate through the file list and extract the call graph from each file.
        for idx, fname in enumerate(asm_files):
            with open(ext_drive + fname, 'r') as fasm:
                lines = fasm.readlines()

            call_graph = construct_call_graph(lines, log_file)
            cgvc = call_graph.n_vertices()
            cgec = call_graph.n_edges()
            cgdm = call_graph.delta_max()
            cgde = call_graph.density()

            # call_graph.diameter() is deliberately not computed: the original
            # author found it constantly problematic.

            # Truncate the file name to the hash value: the part after the first
            # underscore, up to '.pe.asm'. Names without an underscore are used
            # whole instead of raising IndexError.
            fname_parts = fname.split('_')
            trunc_name = fname_parts[1] if len(fname_parts) > 1 else fname_parts[0]
            ext_pos = trunc_name.find('.pe.asm')
            if ext_pos != -1:
                # Only cut when the extension is present; slicing with find()'s
                # -1 would silently drop the last character of the name.
                trunc_name = trunc_name[:ext_pos]

            graph_features.append([trunc_name] + [cgvc, cgec, cgdm, cgde])
            call_graph.set_graph_name(trunc_name)
            graph_lines.append(call_graph.to_str('graphviz'))

            # Original note: new graphs get appended to the previous graphs if
            # the old one is not deleted. NOTE(review): the cause is not visible
            # here (possibly shared state inside gra.Graph) - confirm.
            del(call_graph)

            # Flush the buffered results and report progress every 10 files.
            if (idx + 1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(graph_features)
                graph_file.writelines(graph_lines)
                graph_features = []
                graph_lines = []

        # Write whatever is left over from the last, partial batch.
        if len(graph_lines) > 0:
            fw.writerows(graph_features)
            graph_file.writelines(graph_lines)
            graph_features = []
            graph_lines = []

    print('Process ID: {:d} finished.'.format(pid))

    return



    
def combine_feature_files(out_file):
    """Merge the per-process call graph feature files into one sorted CSV.

    Steps:
      1. List the 'data/' directory.
      2. Select the files named '<pid>-<out_file>' (decimal PID prefix).
      3. Concatenate them, below a header row, into 'data/sorted-<out_file>'.
      4. Read that file back, sort it by 'file_name' and rewrite it in place.

    Parameters:
        out_file: base name of the feature files, e.g. the ``feature_file``
            value given to the extraction processes.

    Returns:
        None. The result is the file 'data/sorted-<out_file>'.
    """
    combined_path = 'data/sorted-' + out_file
    colnames = 'file_name,vertex_count,edge_count,delta_max,density'

    print("Column names: {:s}".format(colnames))

    # PID prefix of each per-process file. The name is escaped and anchored at
    # the end so that sibling files such as '<pid>-<out_file>.log' are not
    # merged into the CSV, and '\d+' accepts PIDs of any width.
    p1 = re.compile(r'\d+-' + re.escape(out_file) + r'$')
    counter = 0

    with open(combined_path, 'w') as fop:
        # The header needs its own line, otherwise it fuses with the first row.
        fop.write(colnames + '\n')

        # sorted() only makes the concatenation order deterministic; the rows
        # are sorted by file_name further down.
        for file_name in sorted(os.listdir('data/')):
            if p1.match(file_name):
                with open('data/' + file_name, 'r') as fip:
                    in_lines = fip.readlines()
                fop.writelines(in_lines)
                counter += len(in_lines)

    print('Completed combine of {:d} call graph features.'.format(counter))

    # Read back the file that was just written (not 'data/<out_file>', which
    # this function never creates).
    cgs = pd.read_csv(combined_path)
    # Old pandas releases only provide DataFrame.sort(); newer ones only
    # provide sort_values(). Use whichever is available.
    if hasattr(cgs, 'sort_values'):
        sorted_cgs = cgs.sort_values('file_name')
    else:
        sorted_cgs = cgs.sort('file_name')
    sorted_cgs.to_csv(combined_path, index=False)

    return



def combine_graph_files(out_file):
    """Concatenate the per-process call graph files into 'data/<out_file>'.

    Steps:
      1. List the 'data/' directory.
      2. Select the files named '<pid>-<out_file>' (decimal PID prefix).
      3. Append their lines, in sorted file-name order, to 'data/<out_file>'.

    Parameters:
        out_file: base name of the graph files, e.g. the ``graph_file`` value
            given to the extraction processes.

    Returns:
        None. The result is the file 'data/<out_file>'.
    """
    # PID prefix of each per-process file. The name is escaped and anchored at
    # the end so that only exact '<pid>-<out_file>' names match, and '\d+'
    # accepts PIDs of any width.
    p1 = re.compile(r'\d+-' + re.escape(out_file) + r'$')
    counter = 0

    with open('data/' + out_file, 'w') as fop:
        # os.listdir() order is arbitrary; sort it so the output is reproducible.
        for file_name in sorted(os.listdir('data/')):
            if p1.match(file_name):
                with open('data/' + file_name, 'r') as fip:
                    in_lines = fip.readlines()
                fop.writelines(in_lines)
                counter += len(in_lines)

    print('Completed combine of {:d} call graph lines.'.format(counter))

    return



class Multi_Params(object):
    """Bundle of arguments handed to one call-graph extraction process.

    Attributes:
        feature_file: name of the CSV feature output file.
        graph_file: name of the graph output file.
        ext_drive: directory prefix of the ASM input files.
        file_list: list of ASM file names to process.
    """
    def __init__(self, featurefile="", graphfile="", extdrive="", filelist=None):
        self.feature_file = featurefile
        self.graph_file = graphfile
        self.ext_drive = extdrive
        # A fresh list per instance: a literal [] default would be one shared
        # object, so appending through one instance would leak into all others.
        self.file_list = [] if filelist is None else filelist

In [3]:
# Output file names; extract_call_graphs() prefixes each with 'data/<pid>-'.
feature_file = 'sorted-pe-call-graph-features-apt.csv'
graph_file = 'pe-call-graphs-apt.gv'
# Input directory; must end with '/' because file names are appended directly.
ext_drive = '/opt/vs/aptasm/'
file_ext = '-apt'

file_list = os.listdir(ext_drive)
tfiles = [i for i in file_list if '.pe.asm' in i]

# Split the file list into four chunks for the (currently disabled) four-process
# run. Floor division keeps `quart` an int, so the slices also work on Python 3.
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]

#mp1 = Multi_Params(feature_file, graph_file, ext_drive, train1)
#mp2 = Multi_Params(feature_file, graph_file, ext_drive, train2)
#mp3 = Multi_Params(feature_file, graph_file, ext_drive, train3)
#mp4 = Multi_Params(feature_file, graph_file, ext_drive, train4)

# Single process test.
mp1 = Multi_Params(feature_file, graph_file, ext_drive, tfiles)
extract_call_graphs(mp1)


Process ID: 3646 Graph Feature file: data/3646-sorted-pe-call-graph-features-apt.csv
(3646, 10, 'of', 271, 'files processed.')
(3646, 20, 'of', 271, 'files processed.')
(3646, 30, 'of', 271, 'files processed.')
(3646, 40, 'of', 271, 'files processed.')
(3646, 50, 'of', 271, 'files processed.')
(3646, 60, 'of', 271, 'files processed.')
(3646, 70, 'of', 271, 'files processed.')
(3646, 80, 'of', 271, 'files processed.')
(3646, 90, 'of', 271, 'files processed.')
(3646, 100, 'of', 271, 'files processed.')
(3646, 110, 'of', 271, 'files processed.')
(3646, 120, 'of', 271, 'files processed.')
(3646, 130, 'of', 271, 'files processed.')
(3646, 140, 'of', 271, 'files processed.')
(3646, 150, 'of', 271, 'files processed.')
(3646, 160, 'of', 271, 'files processed.')
(3646, 170, 'of', 271, 'files processed.')
(3646, 180, 'of', 271, 'files processed.')
(3646, 190, 'of', 271, 'files processed.')
(3646, 200, 'of', 271, 'files processed.')
(3646, 210, 'of', 271, 'files processed.')
(3646, 220, 'of', 271, 'files processed.')
(3646, 230, 'of', 271, 'files processed.')
(3646, 240, 'of', 271, 'files processed.')
(3646, 250, 'of', 271, 'files processed.')
(3646, 260, 'of', 271, 'files processed.')
(3646, 270, 'of', 271, 'files processed.')
Process ID: 3646 finished.

In [ ]:


In [ ]:


In [ ]: