In [1]:
import numpy as np
import pandas as pd
import graph as gra # http://www.python-course.eu/graphs_python.php
import os
from csv import writer
from multiprocessing import Pool
In [2]:
opcodes = ['call','int','ja','jb','jc','je','jg','jge','jl','jle','jmp','jna','jnb','jnl','jno','jnp','jns','jnz','jo','jp','jz']
blocks = ['sub_', 'loc_', 'locret_']
#blocks = ['loc_','locret_']
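These jump/branch mnemonics are the instructions that create edges in the flow-control graph, and `sub_`, `loc_`, `locret_` are IDA Pro's autogenerated label prefixes for subroutines and basic blocks. A quick illustrative check on a hypothetical tokenized line (not taken from the dataset):
In [ ]:
# Illustrative only: a hypothetical tokenized disassembly line.
parts = ['.text', '00401044', 'jz', 'loc_40105A']
print(any(op in parts for op in opcodes))        # True -> this line contributes an edge
print(parts[3].startswith(tuple(blocks)))        # True -> the jump target is a block label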
In [ ]:
# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/kaggle/train/'
tfiles = os.listdir(ext_drive)
quart = int(len(tfiles)/4) # integer quarter so the slice indices below are valid in Python 3
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_flow_control_graphs, trains)
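An equivalent split that avoids the manual slicing arithmetic (numpy is already imported); this is just a sketch, the cells above and below do the same thing by hand:
In [ ]:
# Hedged alternative: np.array_split handles the uneven remainder automatically.
chunks = [list(chunk) for chunk in np.array_split(tfiles, 4)]
p = Pool(4)
p.map(extract_flow_control_graphs, chunks)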
In [ ]:
# TESTING
# Now divide the test files into four groups for multiprocessing
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = int(len(tfiles)/4) # integer quarter so the slice indices below are valid in Python 3
test1 = tfiles[:quart]
test2 = tfiles[quart:(2*quart)]
test3 = tfiles[(2*quart):(3*quart)]
test4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(test1)+len(test2)+len(test3)+len(test4)))
tests = [test1, test2, test3, test4]
p = Pool(4)
p.map(extract_flow_control_graphs, tests)
In [16]:
def construct_flow_control_graph(lines):
vertex = '.program_entry_point' # this is the root node, corresponds to the program entry point not C main().
vertex_count = 1
edge_count = 0
cfgraph = gra.Graph()
cfgraph.add_vertex(vertex)
for row in lines:
row = row.rstrip('\r\n') # get rid of newlines they are annoying.
if ';' in row:
row = row.split(';')[0] # get rid of comments they are annoying.
#print(row)
# get rid of all these things they are annoying.
row = row.replace('short',' ')
row = row.replace('ds:',' ')
row = row.replace('dword',' ')
row = row.replace('ptr',' ').replace(':',' ').replace(',',' ') #.replace('??',' ')
row = row.replace('@','').replace('?','')
parts = row.split() # tokenize code line
parts_len = len(parts)
if (parts_len < 3): # this is just a comment line, do NOT change this!!!
continue
if (parts_len > 3 ):
if (parts[3] == 'endp'): # skip procedure end labels
continue
if (parts[3] == 'proc'): # check for procedures not labelled sub_????
vertex = parts[2]
cfgraph.add_vertex(vertex)
vertex_count += 1
continue
# check for subroutines and block labels
# block and subroutine labels are always after the .text HHHHHHHH relative address
for block in blocks:
token = parts[2]
idx = token.find(block)
if (idx == 0): # add new vertex to the graph, we are now in a new subroutine or code block
vertex = token
cfgraph.add_vertex(vertex)
# print("Vertex: " + vertex)
vertex_count += 1
break
# now check for edge opcode
for opcode in opcodes: # check the line for a new edge
if opcode in parts:
# Extract the destination address/function name/interrupt number as the directed edge.
idx = parts.index(opcode)
if ((idx + 1) < parts_len):
next_vertex = parts[idx + 1]
else:
next_vertex = "none"
cfgraph.add_edge(vertex, next_vertex)
# print("Edge: " + vertex + " " + parts[idx] + " " + edge)
break
print("Vertex Count: {:d}".format(vertex_count))
return cfgraph
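The parser leans on IDA's fixed line layout: once comments and punctuation are stripped, `parts[0]` is the segment, `parts[1]` the relative address, `parts[2]` the label, and `parts[3]` tells `proc` apart from `endp`. A minimal sketch on a hypothetical line (not taken from the dataset):
In [ ]:
# Illustrative tokenization of an assumed IDA-style line ".text:ADDRESS label proc near".
sample = '.text:00401000 sub_401000 proc near ; entry stub'
sample = sample.split(';')[0]                      # drop the comment
sample = sample.replace('ptr', ' ').replace(':', ' ').replace(',', ' ')
parts = sample.split()
print(parts)       # ['.text', '00401000', 'sub_401000', 'proc', 'near']
print(parts[2])    # 'sub_401000' -> becomes the current vertex
print(parts[3])    # 'proc' -> marks the start of a new subroutine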
In [ ]:
help(str.find)
In [ ]:
help(str.split)
In [17]:
def extract_flow_control_graphs(tfiles):
asm_files = [i for i in tfiles if '.asm' in i]
ftot = len(asm_files)
pid = os.getpid()
print('Process id:', pid)
feature_file = 'data/' + str(pid) + '-malware-flow-control-graph.csv'
print('Flow Control Graph file:', feature_file)
graph_lines = []
with open(feature_file, 'w') as f:
# write the column names for the csv file
fw = writer(f)
# colnames = ['filename','entropy','filesize']
# fw.writerow(colnames)
# Now iterate through the file list and extract the graph from each file.
for idx, fname in enumerate(asm_files):
fasm = open(ext_drive + fname, 'r', errors='ignore')
#filesize = os.path.getsize(ext_drive + fname)
lines = fasm.readlines()
flow_control_graph = construct_flow_control_graph(lines)
graph_lines.append([fname[:fname.find('.asm')]] + [str(flow_control_graph.to_str_multi_line_sorted())])
del(flow_control_graph)
# Print progress
if (idx+1) % 10 == 0:
print(pid, idx + 1, 'of', ftot, 'files processed.')
fw.writerows(graph_lines)
graph_lines = [] # reset the batch, otherwise the same rows get written again on the next flush
# Write remaining files
if len(graph_lines) > 0:
fw.writerows(graph_lines)
graph_lines = []
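Each worker writes its own `<pid>-malware-flow-control-graph.csv`; a hedged sketch (output file name assumed) for concatenating the four partial files afterwards:
In [ ]:
# Merge the per-process CSVs into one file; the glob pattern mirrors feature_file above.
import glob
with open('data/all-malware-flow-control-graphs.csv', 'w') as fout:
    for part in sorted(glob.glob('data/*-malware-flow-control-graph.csv')):
        with open(part, 'r') as fin:
            fout.write(fin.read())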
In [ ]:
In [ ]:
In [2]:
call_opcodes = ['call','int']
call_blocks = ['sub_']
In [ ]:
In [ ]:
In [3]:
def construct_call_graph(lines):
vertex = '.program_entry_point' # this is the root node, corresponds to the program entry point not C main().
vertex_count = 1
edge_count = 0
cfgraph = gra.Graph()
cfgraph.add_vertex(vertex)
for row in lines:
row = row.rstrip('\r\n') # get rid of newlines they are annoying.
if ';' in row:
row = row.split(';')[0] # get rid of comments they are annoying.
#print(row)
# get rid of all these things they are annoying.
row = row.replace('short','').replace('ds:',' ')
row = row.replace('dword','').replace('near','')
row = row.replace('ptr','').replace(':',' ').replace(',',' ') #.replace('??',' ')
row = row.replace('@','').replace('?','')
parts = row.split() # tokenize code line
if (len(parts) < 4): # this is just a comment line
continue
if (parts[3] == 'endp'): # ignore subroutine end labels
continue
# check for subroutines and block labels
# block and subroutine labels are always after the .text HHHHHHHH relative address
for block in call_blocks:
token = parts[2]
idx = token.find(block)
if ((idx == 0) or (parts[3] == 'proc')):
# add new vertex to the graph, we are now in a new subroutine
vertex = token
cfgraph.add_vertex(vertex)
# print("Vertex: " + vertex)
vertex_count += 1
break
# now check for edge opcode
for opcode in call_opcodes: # check the line for a new edge
if opcode in parts:
# Extract the destination address/function name/interrupt number as the directed edge.
idx = parts.index(opcode)
edge_count += 1
if ((idx + 1) < len(parts)): # in a few ASM files there is no operand, disassembly error?
next_vertex = parts[idx + 1]
else:
next_vertex = "none"
cfgraph.add_edge(vertex, next_vertex)
# print("Edge: " + vertex + " " + parts[idx] + " " + edge)
break
# print("Vertex Count: {:d}".format(vertex_count))
return cfgraph
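Compared with the flow-control graph, only `call`/`int` instructions add edges and only `sub_` labels (or `proc` lines) open a new vertex, so the result is a plain function-call graph. A small sketch of the edge extraction on a hypothetical line:
In [ ]:
# Illustrative only: a hypothetical call instruction inside sub_401000.
parts = ['.text', '00401020', 'call', 'sub_401530']
idx = parts.index('call')
next_vertex = parts[idx + 1] if (idx + 1) < len(parts) else 'none'
print(next_vertex)   # 'sub_401530' -> directed edge sub_401000 -> sub_401530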
In [4]:
def extract_call_graphs(tfiles):
asm_files = [i for i in tfiles if '.asm' in i]
ftot = len(asm_files)
pid = os.getpid()
print('Process id:', pid)
feature_file = 'data/' + str(pid) + '-malware-call-graph-features.csv'
print('Graph Feature file:', feature_file)
graph_lines = []
graph_features = []
graph_file = open('data/' + str(pid) + '-malware-call-graphs.gv', 'w') # write as a graphviz DOT format file
with open(feature_file, 'w') as f:
# write the column names for the csv file
fw = writer(f)
#colnames = ['filename','vertex_count','edge_count','delta_max','density','diameter']
colnames = ['filename','vertex_count','edge_count','delta_max','density']
fw.writerow(colnames)
# Now iterate through the file list and extract the call graph from each file.
for idx, fname in enumerate(asm_files):
fasm = open(ext_drive + fname, 'r', errors='ignore')
lines = fasm.readlines()
call_graph = construct_call_graph(lines)
cgvc = call_graph.n_vertices()
cgec = call_graph.n_edges()
cgdm = call_graph.delta_max()
cgde = call_graph.density()
# cdia = call_graph.diameter() this is constantly problematic !!!
graph_features.append([fname[:fname.find('.asm')]] + [cgvc, cgec, cgdm, cgde])
call_graph.set_graph_name(fname[:fname.find('.asm')])
# graph_lines.append(call_graph.to_str('multinoleaf'))
graph_lines.append(call_graph.to_str('singlenoleaf'))
del(call_graph) # for some reason new graphs get appended to the previous graphs if not deleted???
# Print progress
if (idx + 1) % 10 == 0:
print(pid, idx + 1, 'of', ftot, 'files processed.')
fw.writerows(graph_features)
graph_file.writelines(graph_lines)
graph_features = []
graph_lines = []
# Write remaining files
if len(graph_lines) > 0:
fw.writerows(graph_features)
graph_file.writelines(graph_lines)
graph_features = []
graph_lines = []
graph_file.close()
print('Process id: {:d} finished.'.format(pid))
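Each worker emits a `<pid>-malware-call-graph-features.csv` with the per-sample graph metrics; a hedged pandas sketch (file names assumed) for stacking them back into a single frame:
In [ ]:
# Combine the per-process feature CSVs; pandas is already imported as pd.
import glob
frames = [pd.read_csv(f) for f in sorted(glob.glob('data/*-malware-call-graph-features.csv'))]
call_graph_features = pd.concat(frames, ignore_index=True).sort_values('filename')
print(call_graph_features.shape)
call_graph_features.head()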
In [4]:
# Generate column names for the function count feature set
#call_graph_files = ['../3815-malware-call-graphs.gv', '../3816-malware-call-graphs.gv', '../3817-malware-call-graphs.gv', '../3818-malware-call-graphs.gv']
#call_graph_files = ['data/2278-malware-call-graphs.gv']
def generate_column_names(call_graph_file):
counter = 0
column_names = ['filename']
graph_names = []
graph_name = "none"
graph_functions = {}
fapi = open("data/APIs.txt")
defined_apis = fapi.readlines()
defined_apis = defined_apis[0].split(',')
fapi.close()
pid = os.getpid()
print('Process id:', pid)
column_names_file = 'data/' + str(pid) + '-reduced-column-names.csv'
print('Column names file: {:s}'.format(column_names_file))
graph_names_file = 'data/' + str(pid) + '-graph-names.csv'
print('Graph names file: {:s}'.format(graph_names_file))
with open(call_graph_file, 'r', errors='ignore') as cfg:
print("Starting graph file: {:s}".format(call_graph_file))
for line in cfg:
line = line.rstrip('\r\n') # get rid of newlines they are annoying.
# get rid of all these things they are annoying.
line = line.replace(',',' ').replace('[',' ').replace(']',' ').replace('->',' ').replace("\'", ' ')
parts = line.split() # tokenize call graph line
graph_name = parts[0]
parts = parts[1:]
graph_names.append(graph_name)
graph_functions = {}
for func in parts:
if func not in defined_apis: # ignore these API functions, they have already been counted.
if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
func = func[:5] # let's try to reduce the vast number of functions.
elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
func = func[:5]
elif func.startswith('edi+') or func.startswith('esi+'):
func = func[:5]
elif func.startswith('byte_') or func.startswith('word_'): # or func.startswith('nullsub')
func = func[:6]
else: # reduce the feature set some more so my pissy pants PC can handle it.
func = func[:8]
if func not in column_names: # NOTE: or in Defined APIs, these have already been counted.
column_names.append(func)
counter += 1
# Print progress
if ((counter + 1) % 1000) == 0:
print("Processed number {:d} Graph_name {:s} Total column names {:d}".format(counter,graph_name,len(column_names)))
with open(column_names_file, 'w') as cols:
fw = writer(cols)
fw.writerow(column_names)
print("Completed writing {:d} column names.".format(len(column_names)))
with open(graph_names_file, 'w') as gras:
fw = writer(gras)
fw.writerow(graph_names)
print("Completed writing {:d} graph names.".format(len(graph_names)))
In [ ]:
# Generate the merged column names file single line.
counter = 0
column_names = []
column_name_files = ['data/3346-reduced-column-names.csv', 'data/3347-reduced-column-names.csv', 'data/3348-reduced-column-names.csv', 'data/3349-reduced-column-names.csv']
for cnamefile in column_name_files:
with open(cnamefile, 'r') as cras:
print("Starting file: {:s}".format(cnamefile))
colstr = cras.readline()
colnames = colstr.split(',')
for cname in colnames:
if cname not in column_names:
column_names.append(cname)
counter += 1
# Print progress
if ((counter + 1) % 1000) == 0:
print("Processed column names {:d}".format(counter))
with open('data/all-reduced-function-column-names.csv', 'w') as cols:
fw = writer(cols)
fw.writerow(column_names)
print("Completed writing column names total = {:d}".format(len(column_names)))
In [ ]:
#Generate the merged column names file multiline.
counter = 0
column_names = []
column_name_files = ['data/3346-reduced-column-names.csv', 'data/3347-reduced-column-names.csv', 'data/3348-reduced-column-names.csv', 'data/3349-reduced-column-names.csv']
for cnamefile in column_name_files:
with open(cnamefile, 'r') as cras:
print("Starting file: {:s}".format(cnamefile))
colstr = cras.readline()
colnames = colstr.split(',')
for cname in colnames:
if cname not in column_names:
column_names.append(cname)
counter += 1
# Print progress
if ((counter + 1) % 1000) == 0:
print("Processed column names {:d}".format(counter))
with open('data/all-reduced-function-column-names-multiline.csv', 'w') as cols:
for cname in column_names:
outline = cname + "\n"
cols.write(outline)
print("Completed writing column names total = {:d}".format(len(column_names)))
In [ ]:
help(cols)
In [13]:
print("Completed writing column names total = {:d}".format(len(column_names)))
In [ ]:
call_graph_files = ['/opt/kaggle/3662-malware-call-graphs-sline.gv', '/opt/kaggle/3663-malware-call-graphs-sline.gv', '/opt/kaggle/3664-malware-call-graphs-sline.gv', '/opt/kaggle/3665-malware-call-graphs-sline.gv']
p = Pool(4)
p.map(generate_column_names, call_graph_files)
In [10]:
print(len(defined_apis))
In [12]:
# Generate function counts from graph files of the ASM malware samples.
# call_graph_files = ['../3815-malware-call-graphs.gv', '../3816-malware-call-graphs.gv', '../3817-malware-call-graphs.gv', '../3818-malware-call-graphs.gv']
def generate_function_counts(call_graph_file):
counter = 0
error_count = 0
fapi = open("data/APIs.txt")
defined_apis = fapi.readlines()
defined_apis = defined_apis[0].split(',')
fapi.close()
colf = open('data/all-reduced-function-column-names.csv', 'r')
all_column_names = []
column_lines = colf.readlines()
for line in column_lines:
all_column_names += line.split(',')
col_names_len = len(all_column_names)
colf.close()
print("Column Names: {:d}".format(col_names_len))
pid = os.getpid()
print('Process id:', pid)
feature_file_name = 'data/' + str(pid) + '-call-graph-reduced-function_counts.csv'
print('Call graph function counts file: {:s}'.format(feature_file_name))
feature_file = open(feature_file_name, 'w')
fw = writer(feature_file)
call_graph_function_features = []
with open(call_graph_file, 'r', errors='ignore') as cfg:
for line in cfg:
line = line.rstrip('\r\n') # get rid of newlines they are annoying.
# get rid of all these things they are annoying.
line = line.replace(',',' ').replace('[',' ').replace(']',' ').replace('->',' ').replace("\'", ' ')
parts = line.split() # tokenize graph line
graph_name = parts[0]
parts = parts[1:]
function_dict = {}
# now generate the function counts for this call graph
for func in parts:
if func not in defined_apis: # ignore these API functions, they have already been counted.
if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
func = func[:5] # let's try to reduce the vast number of functions.
elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
func = func[:5]
elif func.startswith('edi+') or func.startswith('esi+'):
func = func[:5]
elif func.startswith('byte_') or func.startswith('word_'): # or func.startswith('nullsub')
func = func[:6]
else: # reduce the feature set some more so my pissy pants PC can handle it.
func = func[:8]
if (func in function_dict):
function_dict[func] += 1
else:
function_dict[func] = 1
# now generate the output row for this call graph
function_counts = [0] * col_names_len # zero everything because this is a sparse matrix
for func in function_dict:
for idx, cname in enumerate(all_column_names):
if func == cname:
function_counts[idx] = function_dict[func]
break
call_graph_function_features.append([graph_name] + function_counts)
# Print progress and write out rows
counter += 1
if ((counter + 1) % 100) == 0:
print("{:d} Graph: {:s} Count: {:d}".format(pid, graph_name, counter))
fw.writerows(call_graph_function_features)
call_graph_function_features = []
# Write remaining files
if len(call_graph_function_features) > 0:
fw.writerows(call_graph_function_features)
call_graph_function_features = []
feature_file.close()
print("Completed processing {:d} graphs.".format(counter))
In [ ]:
call_graph_files = ['/opt/kaggle/3662-malware-call-graphs-sline.gv', '/opt/kaggle/3663-malware-call-graphs-sline.gv', '/opt/kaggle/3664-malware-call-graphs-sline.gv', '/opt/kaggle/3665-malware-call-graphs-sline.gv']
p = Pool(4)
p.map(generate_function_counts, call_graph_files)
In [ ]:
# Ok, so we still have 71000+ features even after severely reducing the function name lengths.
# This is a problem. Having to process such a huge sparse matrix requires a lot of memory.
# Solution 1: rent an AWS server with plenty-o-ram.
# Solution 2: buy more RAM for my linux box.
# Solution 3: break the sparse matrix into smaller chunks and process individually.
# Solution 4: try the pandas sparse matrix data structure.
# Goto: feature-reduction-call-graphs.ipynb
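A hedged sketch of Solutions 3 and 4 combined: read the big count file in chunks with pandas and keep each chunk as a SciPy CSR matrix so the zeros cost almost nothing. The file name and chunk size below are assumptions:
In [ ]:
from scipy.sparse import csr_matrix, vstack
count_file = 'data/1234-call-graph-reduced-function_counts.csv'   # hypothetical pid
chunks = []
names = []
for chunk in pd.read_csv(count_file, header=None, chunksize=1000):
    names.extend(chunk.iloc[:, 0].tolist())          # first column is the sample name
    chunks.append(csr_matrix(chunk.iloc[:, 1:].to_numpy()))
X = vstack(chunks)
print(X.shape, X.nnz)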
In [ ]:
# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/kaggle/train/'
tfiles = os.listdir(ext_drive)
quart = int(len(tfiles)/4)
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_call_graphs, trains)
In [ ]:
# TESTING
# Now divide the test files into four groups for multiprocessing
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = int(len(tfiles)/4)
test1 = tfiles[:quart]
test2 = tfiles[quart:(2*quart)]
test3 = tfiles[(2*quart):(3*quart)]
test4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(test1)+len(test2)+len(test3)+len(test4)))
tests = [test1, test2, test3, test4]
p = Pool(4)
p.map(extract_call_graphs, tests)
In [5]:
# Test graph generation
ext_drive = '/opt/kaggle/train/'
tfiles = ['0A32eTdBKayjCWhZqDOQ.asm', '1aAwe4J9VHrsq8uEoZhf.asm']
extract_graph(tfiles)
In [7]:
# Test graph generation
ext_drive = '/opt/kaggle/train/'
tfiles = ['0A32eTdBKayjCWhZqDOQ.asm', '0ACDbR5M3ZhBJajygTuf.asm']
extract_graph(tfiles)
In [18]:
# Test graph generation
ext_drive = '/opt/kaggle/train/'
tfiles = ['0ACDbR5M3ZhBJajygTuf.asm']
extract_flow_control_graphs(tfiles)
In [5]:
# Test call graph generation
ext_drive = '/opt/kaggle/train/'
tfiles = ['0A32eTdBKayjCWhZqDOQ.asm', '0ACDbR5M3ZhBJajygTuf.asm']
extract_call_graphs(tfiles)
In [5]:
# Test call graph generation
ext_drive = '/opt/kaggle/train/'
tfiles = ['0A32eTdBKayjCWhZqDOQ.asm', '0ACDbR5M3ZhBJajygTuf.asm']
extract_call_graphs(tfiles)
In [5]:
g = { "a" : ["c"],
"b" : ["c","e","f"],
"c" : ["a","b","d","e"],
"d" : ["c"],
"e" : ["b","c","f"],
"f" : ["b","e"]
}
graph = gra.Graph(g)
diameter = graph.diameter()
print(diameter)
In [15]:
# Test call graph generation
ext_drive = '/opt/kaggle/train/'
tfiles = ['0A32eTdBKayjCWhZqDOQ.asm', '0ACDbR5M3ZhBJajygTuf.asm']
extract_call_graphs(tfiles)
In [25]:
# Test call graph generation
ext_drive = '/opt/kaggle/train/'
tfiles = ['0A32eTdBKayjCWhZqDOQ.asm', '0ACDbR5M3ZhBJajygTuf.asm']
extract_call_graphs(tfiles)
In [ ]:
print(cfgraph.vertices())
In [ ]:
fasm = open('/opt/kaggle/train/1aAwe4J9VHrsq8uEoZhf.asm', 'r', errors='ignore')
lines = fasm.readlines()
parse_asm_code(lines)
In [ ]:
fasm = open('/opt/kaggle/train/0A32eTdBKayjCWhZqDOQ.asm', 'r', errors='ignore')
lines = fasm.readlines()
cfgraph = parse_asm_code(lines)
In [ ]:
help(open)
In [8]:
from collections import OrderedDict
a = {'foo': 1, 'bar': 2}
a
Out[8]:
{'foo': 1, 'bar': 2}
In [9]:
b = OrderedDict(sorted(a.items()))
b
Out[9]:
In [10]:
myDic={10: 'b', 3:'a', 5:'c'}
sorted_list=sorted(myDic.keys())
sorted_list
Out[10]:
In [ ]:
# Deprecated, too slowwwww.
def construct_function_dict(call_graph):
vertex_dict = call_graph.get_vertex_counts()
file_name = call_graph.get_graph_name()
function_dict[file_name] = vertex_dict
return
def generate_column_names():
for file_name in function_dict:
function_counts = function_dict[file_name]
for func_name in function_counts:
if func_name not in function_column_names:
function_column_names.append(func_name)
function_column_names.sort()
return
def write_function_counts():
# open function count feature file
pid = os.getpid()
feature_file = 'data/' + str(pid) + '-malware-function-count-features.csv'
print('Function Count Feature file:', feature_file)
f = open(feature_file, 'w')
fw = writer(f)
colnames = ['filename'] + function_column_names
fw.writerow(colnames)
counter = 0
function_features = []
function_count_list = [0] * len(function_column_names)
for filename in function_dict:
function_counts = function_dict[filename]
for function in function_counts:
idx = function_column_names.index(function)
function_count_list[idx] = function_counts[function]
print(filename)
print(function_count_list)
function_features.append([filename] + function_count_list)
counter += 1
# Print progress
if (counter + 1) % 10 == 0:
print(pid, counter + 1, ' files processed.')
fw.writerows(function_features)
function_features = []
# Write remaining files
if (len(function_features) > 0):
fw.writerows(function_features)
function_features = []
f.close()