1. Generate Function Counts from the Call Graph Files.

  Construct a dictionary of function names and counts for each ASM file's call graph, then
  write the function counts out to a CSV feature file. The result is a sparse matrix whose
  feature columns are (filename, function names in sorted order, ...), as illustrated below.

  - generate_function_counts.py
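
  For illustration only, with hypothetical sample names and counts (function
  names truncated as in the code below):

      filename,GetProcA,LoadLibr,loc_4,sub_4
      sample_001,3,1,14,27
      sample_002,0,2,0,9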

In [1]:
import numpy as np
import pandas as pd
import graph as gra # http://www.python-course.eu/graphs_python.php
import os
from csv import writer
from multiprocessing import Pool

In [12]:
# Generate column names for the function count feature set
#call_graph_files = ['../3815-malware-call-graphs.gv', '../3816-malware-call-graphs.gv', '../3817-malware-call-graphs.gv', '../3818-malware-call-graphs.gv']
#call_graph_files = ['data/2278-malware-call-graphs.gv']

def generate_column_names(call_graph_file):
    counter = 0
    column_names = ['filename']
    graph_names = []
    graph_name = "none"
    graph_functions = {}

    #fapi = open("data/APIs.txt")
    #defined_apis = fapi.readlines()
    #defined_apis = defined_apis[0].split(',')
    #fapi.close()
    
    pid = os.getpid()
    print('Process id:', pid)
    column_names_file = 'data/' + str(pid) + '-reduced-column-names.csv'  
    print('Column names file: {:s}'.format(column_names_file))
    graph_names_file = 'data/' + str(pid) + '-graph-names.csv'  
    print('Graph names file: {:s}'.format(graph_names_file))    

    with open(call_graph_file, 'r') as cfg:
        print("Starting graph file: {:s}".format(call_graph_file))
        for line in cfg:
            
            if line.startswith('digraph'):
                tokens = line.split()
                graph_name = tokens[1]
                graph_names.append(graph_name)
                continue
                
            line = line.rstrip('\r\n')  # strip the trailing newline.
            # strip the graph punctuation so only node names remain.
            line = line.replace(';',' ').replace('{',' ').replace('}',' ').replace('->',' ')
            parts = line.split() # tokenize the call graph line
            
            
            #graph_name = parts[0] # this is for single line call graphs.
            #parts = parts[1:]
            #graph_names.append(graph_name)
            #graph_functions = {}
            
            for func in parts:
                #if func not in defined_apis: # ignore these API functions, they have already been counted.
                if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
                    func = func[:5] # truncate to collapse the vast number of distinct function names.
                elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
                    func = func[:5]
                elif func.startswith('edi+') or func.startswith('esi+'):
                    func = func[:5]
                elif func.startswith('byte_') or func.startswith('word_'): # or func.startswith('nullsub')
                    func = func[:6]
                else: # truncate the rest as well so the feature set fits in memory on a modest PC.
                    func = func[:8]
                if func not in column_names: # NOTE: linear scan; defined APIs have already been counted elsewhere.
                    column_names.append(func)

 
            counter += 1
            # Print progress every 1000 lines.
            if (counter % 1000) == 0:
                print("Processed number {:d} Graph_name {:s} Total column names {:d}".format(counter, graph_name, len(column_names)))

                
    with open(column_names_file, 'w') as cols:
        fw = writer(cols)
        fw.writerow(column_names)
    
    print("Completed writing {:d} column names.".format(len(column_names)))

    with open(graph_names_file, 'w') as gras:
        fw = writer(gras)
        fw.writerow(graph_names)
    
    print("Completed writing {:d} graph names.".format(len(graph_names)))
    
    return
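
  The per-process output file names suggest each call graph file was handled by
  its own worker process. A minimal driver sketch, assuming the four split
  call-graph files from the commented-out list above:

In [ ]:
# Sketch: run generate_column_names over the split call-graph files in parallel,
# one worker per file. The file names are assumptions taken from the
# commented-out example list above.
call_graph_files = ['../3815-malware-call-graphs.gv', '../3816-malware-call-graphs.gv',
                    '../3817-malware-call-graphs.gv', '../3818-malware-call-graphs.gv']
with Pool(len(call_graph_files)) as pool:
    pool.map(generate_column_names, call_graph_files)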

In [ ]:
def merge_column_names_single_line():
    # Merge the per-process column name files into a single CSV line.
    counter = 0
    column_names = []
    column_name_files = ['data/3346-reduced-column-names.csv', 'data/3347-reduced-column-names.csv', 'data/3348-reduced-column-names.csv', 'data/3349-reduced-column-names.csv']
    for cnamefile in column_name_files:
        with open(cnamefile, 'r') as cras:
            print("Starting file: {:s}".format(cnamefile))
            colstr = cras.readline()
            colnames = colstr.split(',')
            for cname in colnames:
                if cname not in column_names:
                    column_names.append(cname)

                counter += 1
                # Print progress
                if (counter % 1000) == 0:
                    print("Processed column names {:d}".format(counter))

    with open('data/all-reduced-function-column-names.csv', 'w') as cols:
        fw = writer(cols)
        fw.writerow(column_names)

    print("Completed writing column names total = {:d}".format(len(column_names)))
    
    return

In [ ]:
def merge_column_names_multi_line():
    # Merge the per-process column name files, writing one name per line.
    counter = 0
    column_names = []
    column_name_files = ['data/3346-reduced-column-names.csv', 'data/3347-reduced-column-names.csv', 'data/3348-reduced-column-names.csv', 'data/3349-reduced-column-names.csv']
    for cnamefile in column_name_files:
        with open(cnamefile, 'r') as cras:
            print("Starting file: {:s}".format(cnamefile))
            colstr = cras.readline()
            colnames = colstr.split(',')
            for cname in colnames:
                if cname not in column_names:    
                    column_names.append(cname)

                counter += 1
                # Print progress
                if (counter % 1000) == 0:
                    print("Processed column names {:d}".format(counter))

    with open('data/all-reduced-function-column-names-multiline.csv', 'w') as cols:
        for cname in column_names:
            outline = cname + "\n"
            cols.write(outline)

    print("Completed writing column names total = {:d}".format(len(column_names)))
    
    return
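
  Both merge variants dedupe with a linear list scan, which is quadratic over
  tens of thousands of names. A companion set keeps membership tests O(1) while
  the list preserves first-seen order; a sketch of an equivalent, faster merge:

In [ ]:
# Sketch: order-preserving merge with O(1) membership tests via a companion set.
# Produces the same single-line output as merge_column_names_single_line.
def merge_column_names_fast(column_name_files, out_file):
    column_names = []
    seen = set()
    for cnamefile in column_name_files:
        with open(cnamefile, 'r') as cras:
            for cname in cras.readline().split(','):
                if cname not in seen:
                    seen.add(cname)
                    column_names.append(cname)
    with open(out_file, 'w') as cols:
        writer(cols).writerow(column_names)
    print("Merged {:d} unique column names.".format(len(column_names)))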

In [10]:
# call_graph_files = ['../3815-malware-call-graphs.gv', '../3816-malware-call-graphs.gv', '../3817-malware-call-graphs.gv', '../3818-malware-call-graphs.gv']

def generate_function_counts(call_graph_file):
    # Generate function counts from graph files of the ASM malware samples.
    
    counter = 0
    error_count = 0
    
    #fapi = open("data/APIs.txt")
    #defined_apis = fapi.readlines()
    #defined_apis = defined_apis[0].split(',')
    #fapi.close()
    
    colf = open('data/all-reduced-function-column-names.csv', 'r')
    all_column_names = []
    column_lines = colf.readlines()
    for line in column_lines:
        all_column_names += line.rstrip('\r\n').split(',')  # strip the newline or the last name won't match
    col_names_len = len(all_column_names)
    colf.close()
    print("Column Names: {:d}".format(col_names_len))
    
    pid = os.getpid()
    print('Process id:', pid)
    feature_file_name = 'data/' + str(pid) + '-call-graph-reduced-function_counts.csv'  
    print('Call graph function counts file: {:s}'.format(feature_file_name))
    feature_file = open(feature_file_name, 'w')
    fw = writer(feature_file)
    
    call_graph_function_features = []
    
    with open(call_graph_file, 'r', errors='ignore') as cfg:
        for line in cfg:
            line = line.rstrip('\r\n')  # strip the trailing newline.
            # strip the list punctuation so only the graph name and node names remain.
            line = line.replace(',',' ').replace('[',' ').replace(']',' ').replace('->',' ').replace("\'", ' ')
            parts = line.split() # tokenize the graph line
            
            if not parts:  # skip blank lines
                continue
            graph_name = parts[0]
            parts = parts[1:]
            function_dict = {}
            
            # now generate the function counts for this call graph
            
            for func in parts:
                #if func not in defined_apis: # ignore these API functions, they have already been counted.
                if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
                    func = func[:5] # truncate to collapse the vast number of distinct function names.
                elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
                    func = func[:5]
                elif func.startswith('edi+') or func.startswith('esi+'):
                    func = func[:5]
                elif func.startswith('byte_') or func.startswith('word_'): # or func.startswith('nullsub')
                    func = func[:6]
                else: # truncate the rest as well so the feature set fits in memory on a modest PC.
                    func = func[:8]

                function_dict[func] = function_dict.get(func, 0) + 1
            
            # now generate the output row for this call graph

            function_counts = [0] * col_names_len # zero everything because this is a sparse matrix
            for func in function_dict:
                for idx, cname in enumerate(all_column_names):
                    if func == cname:
                        function_counts[idx] = function_dict[func]
                        break
                
            call_graph_function_features.append([graph_name] + function_counts)
            
            # Print progress and write out rows
            counter += 1
            if (counter % 100) == 0:
                print("{:d} Graph: {:s} Count: {:d}".format(pid, graph_name, counter))
                fw.writerows(call_graph_function_features)
                call_graph_function_features = []
                
        # Write any remaining rows.
        if len(call_graph_function_features) > 0:
            fw.writerows(call_graph_function_features)
            call_graph_function_features = []  
    
    feature_file.close()
    
    print("Completed processing {:d} graphs.".format(counter))
    
    return
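
  The inner loop above scans all 71,000+ column names for every function in
  every graph. Precomputing a name-to-index dictionary makes each lookup O(1);
  keeping the first occurrence of any duplicate name matches the break in the
  linear scan. A sketch, using the same variable names as generate_function_counts:

In [ ]:
# Sketch: O(1) column lookups via a precomputed name -> index map, as a drop-in
# replacement for the nested loop in generate_function_counts.
col_index = {}
for idx, cname in enumerate(all_column_names):
    if cname not in col_index:   # keep the first occurrence, matching the break above
        col_index[cname] = idx

function_counts = [0] * col_names_len
for func, count in function_dict.items():
    idx = col_index.get(func)
    if idx is not None:          # ignore functions not in the merged column set
        function_counts[idx] = count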

In [ ]:
# Test on the APT call graph files.
call_graph_file = 'data/3646-pe-call-graphs-apt.gv'
function_count_file = 'data/function-counts-apt.gv'
function_column_names_single_file = 'data/function-column-names-single-apt.txt'
function_column_names_multi_file = 'data/function-column-names-multi-apt.txt'
generate_column_names(call_graph_file)
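
  Pieced together from the cells above, the pipeline appears to run in three
  steps; a single-process sketch (merge_column_names_single_line reads its
  hardcoded input list, so edit that first):

In [ ]:
# Sketch of the overall pipeline implied by the cells above (single process).
generate_column_names('data/3646-pe-call-graphs-apt.gv')
merge_column_names_single_line()  # writes data/all-reduced-function-column-names.csv
generate_function_counts('data/3646-pe-call-graphs-apt.gv')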

In [ ]:
# Ok, so we still have 71000+ features even after severely reducing the function name lengths.
# This is a problem. Having to process such a huge sparse matrix requires a lot of memory.
# Solution 1: rent an AWS server with plenty-o-ram.
# Solution 2: buy more RAM for my linux box.
# Solution 3: break the sparse matrix into smaller chunks and process individually.
# Solution 4: try the pandas sparse matrix data structure.
# Goto: feature-reduction-call-graphs.ipynb
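
  Solution 4 can be tried with pandas' sparse dtype, which stores only the
  non-zero entries; a sketch (the file name below is hypothetical, since the
  counts files are written with a process id prefix):

In [ ]:
# Sketch of Solution 4: load a function-counts file and convert to pandas'
# sparse dtype. The file name is an assumption; substitute the pid-prefixed
# file produced by generate_function_counts.
counts = pd.read_csv('data/12345-call-graph-reduced-function_counts.csv', header=None)
sparse_counts = counts.iloc[:, 1:].astype(pd.SparseDtype("int64", 0))
print("Density: {:f}".format(sparse_counts.sparse.density))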

2. Reduce Call Graph Feature Sets

   The original feature set of function calls extracted from the call graphs had
   over 130,000 features. Reducing the feature set by truncating the function names
   produces a feature set of 71,319 function names. Using chi-squared tests to select
   the best 10% of features produces a set of 1561 call graph features.

   - feature_reduction_function_names.py

In [ ]:
def get_function_column_names():
    # Preliminary column name setup.
    colf = open('data/all-reduced-function-column-names.csv', 'r')
    all_column_names = []
    column_lines = colf.readlines()
    for line in column_lines:
        all_column_names += line.rstrip('\r\n').split(',')  # strip the newline or the last name won't match

    col_names_len = len(all_column_names)
    colf.close()
    print("Column Names: {:d}".format(col_names_len))
    
    return all_column_names

In [ ]:
# Solution 3: slice the matrix into small chunks for processing.
# The pandas sparse matrix still takes too long, so break the feature set into
# ten chunks and select the best features within each chunk.
# Assumes sorted_call_graph_function_train_1 and sorted_train_labels were loaded
# in an earlier cell.
from sklearn.feature_selection import SelectPercentile, chi2
onetenth = int(sorted_call_graph_function_train_1.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth

# Build the label vector once; it is identical for every chunk.
y = []
for fname in sorted_call_graph_function_train_1['filename']:
    for idx2, fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx2,1])
            break

for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_1.iloc[:,startidx:endidx]

    # Select the top 10 percent of features by chi-squared score.
    print(X.shape)
    print(len(y))
    fsp = SelectPercentile(chi2, percentile=10)
    X_new_10 = fsp.fit_transform(X, y)
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + startidx  # chunk indices are relative to startidx, not column 1
    data_trimmed = sorted_call_graph_function_train_1.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_1['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write to file as we do not have enough memory.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
    
    
# Finish off the remaining columns.
print("Processing final column set {:d} -> {:d}".format(startidx, sorted_call_graph_function_train_1.shape[1]))
X = sorted_call_graph_function_train_1.iloc[:,startidx:]
# y was built once above and is reused here.

# Select the top 10 percent of features by chi-squared score.
fsp = SelectPercentile(chi2, percentile=10)
X_new_10 = fsp.fit_transform(X, y)
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + startidx  # chunk indices are relative to startidx, not column 1
data_trimmed = sorted_call_graph_function_train_1.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_1['filename'])
data_reduced = data_fnames.join(data_trimmed)
# Write to file as we do not have enough memory.
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)
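
  The ten per-chunk selections still need to be recombined into one reduced
  training set; a sketch, assuming row order is identical across the chunk files:

In [ ]:
# Sketch: recombine the ten per-chunk 10-percent files into one reduced feature
# set, keeping the filename column from the first chunk only. File names match
# those written above.
chunk_files = ["data/sorted-function-counts-{:d}-10perc.csv".format(i) for i in range(1, 11)]
chunks = [pd.read_csv(f) for f in chunk_files]
combined = chunks[0]
for chunk in chunks[1:]:
    combined = combined.join(chunk.drop('filename', axis=1))
combined.to_csv("data/sorted-function-counts-all-10perc.csv", index=False)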
