Construct a dictionary of function names and counts for every ASM file call graph, then
write the function counts out to a CSV feature file. The result is a sparse matrix whose
feature columns are (filename, function names in sorted order, ...).
- generate_function_counts.py
In [1]:
import numpy as np
import pandas as pd
import graph as gra # http://www.python-course.eu/graphs_python.php
import os
from csv import writer
from multiprocessing import Pool
In [12]:
# Generate column names for the function count feature set.
#call_graph_files = ['../3815-malware-call-graphs.gv', '../3816-malware-call-graphs.gv', '../3817-malware-call-graphs.gv', '../3818-malware-call-graphs.gv']
#call_graph_files = ['data/2278-malware-call-graphs.gv']
def generate_column_names(call_graph_file):
    counter = 0
    column_names = ['filename']
    graph_names = []
    graph_name = "none"
    #fapi = open("data/APIs.txt")
    #defined_apis = fapi.readlines()
    #defined_apis = defined_apis[0].split(',')
    #fapi.close()
    pid = os.getpid()
    print('Process id:', pid)
    column_names_file = 'data/' + str(pid) + '-reduced-column-names.csv'
    print('Column names file: {:s}'.format(column_names_file))
    graph_names_file = 'data/' + str(pid) + '-graph-names.csv'
    print('Graph names file: {:s}'.format(graph_names_file))
    with open(call_graph_file, 'r') as cfg:
        print("Starting graph file: {:s}".format(call_graph_file))
        for line in cfg:
            if line.startswith('digraph'):
                tokens = line.split()
                graph_name = tokens[1]
                graph_names.append(graph_name)
                continue
            line = line.rstrip('\r\n')  # get rid of newlines, they are annoying
            # strip out the graph punctuation, it is just as annoying
            line = line.replace(';', ' ').replace('{', ' ').replace('}', ' ').replace('->', ' ')
            parts = line.split()  # tokenize the call graph line
            #graph_name = parts[0]  # this is for single line call graphs
            #parts = parts[1:]
            #graph_names.append(graph_name)
            for func in parts:
                #if func not in defined_apis:  # ignore these API functions, they have already been counted
                # Truncate names to collapse the vast number of generated functions.
                if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
                    func = func[:5]
                elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
                    func = func[:5]
                elif func.startswith('edi+') or func.startswith('esi+'):
                    func = func[:5]
                elif func.startswith('byte_') or func.startswith('word_'):  # or func.startswith('nullsub')
                    func = func[:6]
                else:  # reduce the feature set some more so my pissy pants PC can handle it
                    func = func[:8]
                if func not in column_names:  # NOTE: or in defined APIs, these have already been counted
                    column_names.append(func)
                    counter += 1
                    # Print progress
                    if ((counter + 1) % 1000) == 0:
                        print("Processed number {:d} Graph name {:s} Total column names {:d}".format(counter, graph_name, len(column_names)))
    with open(column_names_file, 'w') as cols:
        fw = writer(cols)
        fw.writerow(column_names)
    print("Completed writing {:d} column names.".format(len(column_names)))
    with open(graph_names_file, 'w') as gras:
        fw = writer(gras)
        fw.writerow(graph_names)
    print("Completed writing {:d} graph names.".format(len(graph_names)))
    return
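Pool is imported at the top of the notebook but never used in this excerpt. A minimal sketch of how the column-name generation could be fanned out across the commented-out list of call-graph files, one worker per file; the file list and worker count are assumptions for illustration. Each worker writes its own pid-named column-names file, which is why the merge functions below are needed.
In [ ]:
# Sketch: run generate_column_names over several call graph files in parallel.
# Assumes a fork-based start method (Linux); file list and worker count are
# illustrative, not values from the original run.
call_graph_files = ['../3815-malware-call-graphs.gv', '../3816-malware-call-graphs.gv',
                    '../3817-malware-call-graphs.gv', '../3818-malware-call-graphs.gv']
with Pool(processes=4) as pool:
    pool.map(generate_column_names, call_graph_files)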
In [ ]:
def merge_column_names_single_line():
    # Generate the merged column names file as a single CSV line.
    counter = 0
    column_names = []
    column_name_files = ['data/3346-reduced-column-names.csv', 'data/3347-reduced-column-names.csv', 'data/3348-reduced-column-names.csv', 'data/3349-reduced-column-names.csv']
    for cnamefile in column_name_files:
        with open(cnamefile, 'r') as cras:
            print("Starting file: {:s}".format(cnamefile))
            colstr = cras.readline()
            colnames = colstr.split(',')
            for cname in colnames:
                if cname not in column_names:
                    column_names.append(cname)
                    counter += 1
                    # Print progress
                    if ((counter + 1) % 1000) == 0:
                        print("Processed column names {:d}".format(counter))
    with open('data/all-reduced-function-column-names.csv', 'w') as cols:
        fw = writer(cols)
        fw.writerow(column_names)
    print("Completed writing column names total = {:d}".format(len(column_names)))
    return
In [ ]:
def merge_column_names_multi_line():
    # Generate the merged column names file, one name per line.
    counter = 0
    column_names = []
    column_name_files = ['data/3346-reduced-column-names.csv', 'data/3347-reduced-column-names.csv', 'data/3348-reduced-column-names.csv', 'data/3349-reduced-column-names.csv']
    for cnamefile in column_name_files:
        with open(cnamefile, 'r') as cras:
            print("Starting file: {:s}".format(cnamefile))
            colstr = cras.readline()
            colnames = colstr.split(',')
            for cname in colnames:
                if cname not in column_names:
                    column_names.append(cname)
                    counter += 1
                    # Print progress
                    if ((counter + 1) % 1000) == 0:
                        print("Processed column names {:d}".format(counter))
    with open('data/all-reduced-function-column-names-multiline.csv', 'w') as cols:
        for cname in column_names:
            cols.write(cname + "\n")
    print("Completed writing column names total = {:d}".format(len(column_names)))
    return
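Both merge functions test `cname not in column_names` against a growing list, which is an O(n) scan per lookup and gets slow with tens of thousands of names. A minimal alternative sketch that keeps a set for membership while the list preserves first-seen order; the function name and parameters are hypothetical, and the output is the same as the single-line variant above.
In [ ]:
# Sketch: faster merge loop using a set for O(1) membership tests while the
# list preserves first-seen order. Same output as merge_column_names_single_line.
def merge_column_names_fast(column_name_files, out_file):
    column_names = []
    seen = set()
    for cnamefile in column_name_files:
        with open(cnamefile, 'r') as cras:
            for cname in cras.readline().split(','):
                if cname not in seen:
                    seen.add(cname)
                    column_names.append(cname)
    with open(out_file, 'w') as cols:
        writer(cols).writerow(column_names)
    print("Completed writing column names total = {:d}".format(len(column_names)))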
In [10]:
# call_graph_files = ['../3815-malware-call-graphs.gv', '../3816-malware-call-graphs.gv', '../3817-malware-call-graphs.gv', '../3818-malware-call-graphs.gv']
def generate_function_counts(call_graph_file):
    # Generate function counts from the call graph files of the ASM malware samples.
    counter = 0
    error_count = 0
    #fapi = open("data/APIs.txt")
    #defined_apis = fapi.readlines()
    #defined_apis = defined_apis[0].split(',')
    #fapi.close()
    colf = open('data/all-reduced-function-column-names.csv', 'r')
    all_column_names = []
    column_lines = colf.readlines()
    for line in column_lines:
        all_column_names += line.split(',')
    col_names_len = len(all_column_names)
    colf.close()
    print("Column Names: {:d}".format(col_names_len))
    pid = os.getpid()
    print('Process id:', pid)
    feature_file_name = 'data/' + str(pid) + '-call-graph-reduced-function_counts.csv'
    print('Call graph function counts file: {:s}'.format(feature_file_name))
    feature_file = open(feature_file_name, 'w')
    fw = writer(feature_file)
    call_graph_function_features = []
    with open(call_graph_file, 'r', errors='ignore') as cfg:
        for line in cfg:
            line = line.rstrip('\r\n')  # get rid of newlines, they are annoying
            # strip out the graph punctuation, it is just as annoying
            line = line.replace(',', ' ').replace('[', ' ').replace(']', ' ').replace('->', ' ').replace("\'", ' ')
            parts = line.split()  # tokenize the graph line
            if not parts:  # skip blank lines
                continue
            graph_name = parts[0]  # single-line call graphs: first token is the graph name
            parts = parts[1:]
            function_dict = {}
            # Now generate the function counts for this call graph.
            for func in parts:
                #if func not in defined_apis:  # ignore these API functions, they have already been counted
                # Truncate names to collapse the vast number of generated functions.
                if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
                    func = func[:5]
                elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
                    func = func[:5]
                elif func.startswith('edi+') or func.startswith('esi+'):
                    func = func[:5]
                elif func.startswith('byte_') or func.startswith('word_'):  # or func.startswith('nullsub')
                    func = func[:6]
                else:  # reduce the feature set some more so my pissy pants PC can handle it
                    func = func[:8]
                if func in function_dict:
                    function_dict[func] += 1
                else:
                    function_dict[func] = 1
            # Now generate the output row for this call graph.
            function_counts = [0] * col_names_len  # zero everything because this is a sparse matrix
            for func in function_dict:
                for idx, cname in enumerate(all_column_names):
                    if func == cname:
                        function_counts[idx] = function_dict[func]
                        break
            call_graph_function_features.append([graph_name] + function_counts)
            # Print progress and write out rows.
            counter += 1
            if ((counter + 1) % 100) == 0:
                print("{:d} Graph: {:s} Count: {:d}".format(pid, graph_name, counter))
                fw.writerows(call_graph_function_features)
                call_graph_function_features = []
    # Write the remaining rows.
    if len(call_graph_function_features) > 0:
        fw.writerows(call_graph_function_features)
        call_graph_function_features = []
    feature_file.close()
    print("Completed processing {:d} graphs.".format(counter))
    return
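The inner loop above scans all_column_names once per function, which is O(columns) per lookup across 71,000+ columns. A small sketch of the same row construction using a name-to-index dict built once up front; counts_row is a hypothetical helper, assuming all_column_names is loaded as in the function above.
In [ ]:
# Sketch: replace the linear scan over all_column_names with a dict lookup.
# Building the index once makes each row O(functions) instead of
# O(functions * columns). counts_row is a hypothetical helper for illustration.
column_index = {cname: idx for idx, cname in enumerate(all_column_names)}

def counts_row(graph_name, function_dict):
    function_counts = [0] * len(all_column_names)
    for func, count in function_dict.items():
        idx = column_index.get(func)
        if idx is not None:
            function_counts[idx] = count
    return [graph_name] + function_counts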
In [ ]:
# Test on the APT call graph files.
call_graph_file = 'data/3646-pe-call-graphs-apt.gv'
function_count_file = 'data/function-counts-apt.gv'
function_column_names_single_file = 'data/function-column-names-single-apt.txt'
function_column_names_multi_file = 'data/function-column-names-multi-apt.txt'
generate_column_names(call_graph_file)
In [ ]:
# Ok, so we still have 71000+ features even after severely reducing the function name lengths.
# This is a problem. Having to process such a huge sparse matrix requires a lot of memory.
# Solution 1: rent an AWS server with plenty-o-ram.
# Solution 2: buy more RAM for my linux box.
# Solution 3: break the sparse matrix into smaller chunks and process individually.
# Solution 4: try the pandas sparse matrix data structure.
# Goto: feature-reduction-call-graphs.ipynb
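Solution 4 is worth a quick illustration before moving on. A minimal sketch, assuming the function-count CSVs produced above: read a counts file in chunks and stack them into a SciPy CSR matrix, so only the non-zero counts are held in memory. The file name and chunk size are assumptions, not values from the original run.
In [ ]:
# Sketch of Solution 4: hold the function counts as a SciPy sparse matrix
# instead of a dense frame. File name and chunk size below are illustrative.
from scipy import sparse

def load_sparse_counts(feature_file_name, chunksize=1000):
    blocks = []
    filenames = []
    # The feature files are written without a header row, hence header=None.
    for chunk in pd.read_csv(feature_file_name, header=None, chunksize=chunksize):
        filenames += list(chunk.iloc[:, 0])  # first column is the graph name
        blocks.append(sparse.csr_matrix(chunk.iloc[:, 1:].values))  # counts only
    return filenames, sparse.vstack(blocks)

# fnames, X = load_sparse_counts('data/3815-call-graph-reduced-function_counts.csv')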
The original feature set of function calls extracted from the call graphs had
over 130,000 features. Reducing the feature set by truncating the function names
produces a feature set of 71,319 function names. Using chi-squared tests to select
the best 10% of features produces a set of 1,561 call graph features.
- feature_reduction_function_names.py
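Before the full pipeline below, a toy sketch of the chi-squared selection itself on made-up count data: SelectPercentile scores each count column against the class labels and keeps only the top-scoring percentile.
In [ ]:
# Toy sketch of chi-squared feature selection; the data here is made up
# purely to show the shape of the SelectPercentile API used below.
import numpy as np
from sklearn.feature_selection import SelectPercentile, chi2

X_toy = np.array([[3, 0, 1, 0],
                  [2, 0, 4, 0],
                  [0, 5, 0, 1],
                  [0, 7, 1, 2]])   # rows = samples, columns = function counts
y_toy = [1, 1, 2, 2]               # malware class labels

fsp = SelectPercentile(chi2, percentile=50)  # keep the top half, for the toy example
X_red = fsp.fit_transform(X_toy, y_toy)
print(fsp.get_support(indices=True))  # indices of the selected columns
print(X_red)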
In [ ]:
def get_function_column_names():
    # Preliminary column name setup.
    colf = open('data/all-reduced-function-column-names.csv', 'r')
    all_column_names = []
    column_lines = colf.readlines()
    for line in column_lines:
        all_column_names += line.split(',')
    col_names_len = len(all_column_names)
    colf.close()
    print("Column Names: {:d}".format(col_names_len))
    return all_column_names
In [ ]:
# Solution 3: slice the matrix into small chunks for processing.
# The pandas sparse matrix still takes too long, so break it up into 10 feature sets and try again.
from sklearn.feature_selection import SelectPercentile, chi2

# Assumes sorted_call_graph_function_train_1 and sorted_train_labels have
# already been loaded from the feature and label files.
onetenth = int(sorted_call_graph_function_train_1.shape[1] / 10)
startidx = 1  # skip the filename column
endidx = onetenth
for idx1 in range(1, 10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_1.iloc[:, startidx:endidx]
    y = []
    for fname in sorted_call_graph_function_train_1['filename']:
        # print("Appending {:s}".format(fname))
        for idx2, fname2 in enumerate(sorted_train_labels['Id']):
            if fname2 == fname:
                y.append(sorted_train_labels.iloc[idx2, 1])
                break
    # Find the top 10 percent of features by chi-squared score.
    print(X.shape)
    print(len(y))
    fsp = SelectPercentile(chi2, percentile=10)
    X_new_10 = fsp.fit_transform(X, y)
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + startidx  # map back to full-frame column positions
    data_trimmed = sorted_call_graph_function_train_1.iloc[:, selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_1['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write to file as we do not have enough memory.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
# Finish off the remaining columns.
print("Processing final column set {:d} -> {:d}".format(startidx, endidx))
X = sorted_call_graph_function_train_1.iloc[:, startidx:]
y = []
for fname in sorted_call_graph_function_train_1['filename']:
    for idx1, fname2 in enumerate(sorted_train_labels['Id']):
        if fname2 == fname:
            y.append(sorted_train_labels.iloc[idx1, 1])
            break
# Find the top 10 percent of features by chi-squared score.
fsp = SelectPercentile(chi2, percentile=10)
X_new_10 = fsp.fit_transform(X, y)
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + startidx  # map back to full-frame column positions
data_trimmed = sorted_call_graph_function_train_1.iloc[:, selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_1['filename'])
data_reduced = data_fnames.join(data_trimmed)
# Write to file as we do not have enough memory.
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)
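The ten "-10perc.csv" chunk files then need to be stitched back into one reduced feature set. A minimal sketch, assuming the files written above and joining on the shared filename column; the original presumably does this recombination in feature-reduction-call-graphs.ipynb.
In [ ]:
# Sketch: recombine the ten chunked 10-percent feature files into one frame,
# merging on the shared filename column. Assumes the files written above.
combined = pd.read_csv("data/sorted-function-counts-1-10perc.csv")
for idx in range(2, 11):
    chunk = pd.read_csv("data/sorted-function-counts-" + str(idx) + "-10perc.csv")
    combined = combined.merge(chunk, on='filename')
combined.to_csv("data/sorted-function-counts-all-10perc.csv", index=False)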