In [11]:
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import math
import scipy.misc
import array
import time as tm
In [ ]:
keywords = ['Virtual','Offset','loc','Import','Imports','var','Forwarder','UINT','LONG','BOOL','WORD','BYTES','large','short','dd','db','dw','XREF','ptr','DATA','FUNCTION','extrn','byte','word','dword','char','DWORD','stdcall','arg','locret','asc','align','WinMain','unk','cookie','off','nullsub','DllEntryPoint','System32','dll','CHUNK','BASS','HMENU','DLL','LPWSTR','void','HRESULT','HDC','LRESULT','HANDLE','HWND','LPSTR','int','HLOCAL','FARPROC','ATOM','HMODULE','WPARAM','HGLOBAL','entry','rva','COLLAPSED','config','exe','Software','CurrentVersion','__imp_','INT_PTR','UINT_PTR','---Seperator','PCCTL_CONTEXT','__IMPORT_','INTERNET_STATUS_CALLBACK','.rdata:','.data:','.text:','case','installdir','market','microsoft','policies','proc','scrollwindow','search','trap','visualc','___security_cookie','assume','callvirtualalloc','exportedentry','hardware','hkey_current_user','hkey_local_machine','sp-analysisfailed','unableto']
known_sections = ['.text', '.data', '.bss', '.rdata', '.edata', '.idata', '.rsrc', '.tls', '.reloc']
registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
'eax','ebp','ebx','ecx','edi','esp']
opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'
,'dd','dec','dw','endp','ends','faddp','fchs','fdiv','fdivp','fdivr','fild'
,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
,'wait','xchg','xor']
In [ ]:
paths = ['train','test']

def count_1gram(byte_code):
    OneByte = [0]*16**2
    for row in byte_code:
        row = row.rstrip('\r\n')
        codes = row.split()[1:]
        # Convert code to 1byte
        OneByteCode = []
        for i in codes:
            if i != '??':
                OneByteCode += [int(i,16)]
        # Calculate the frequency of 1byte
        for i in OneByteCode:
            OneByte[i] += 1
    return OneByte

def count_2gram(byte_code):
    twoByte = [0]*16**4
    for row in byte_code:
        codes = row[:-2].split()[1:]
        codes_2g = codes[:-1]
        for i in range(len(codes_2g)):
            codes_2g[i] += codes[i+1]
        twoByteCode = []
        for i in codes_2g:
            if '??' not in i:
                twoByteCode += [int(i,16)]
        for i in twoByteCode:
            twoByte[i] += 1
    return twoByte

def count_4gram(byte_code): #TODO: NOT DONE YET
    twoByte = [0]*16**8
    for row in byte_code:
        codes = row[:-2].split()[1:]
        codes_2g = codes[:-1]
        for i in range(len(codes_2g)):
            codes_2g[i] += codes[i+1]
        twoByteCode = []
        for i in codes_2g:
            if '??' not in i:
                twoByteCode += [int(i,16)]
        for i in twoByteCode:
            twoByte[i] += 1
    return twoByte

def count_asm_symbols(asm_code):
    symbols = [0]*7
    for row in asm_code:
        if '*' in row:
            symbols[0] += 1
        if '-' in row:
            symbols[1] += 1
        if '+' in row:
            symbols[2] += 1
        if '[' in row:
            symbols[3] += 1
        if ']' in row:
            symbols[4] += 1
        if '@' in row:
            symbols[5] += 1
        if '?' in row:
            symbols[6] += 1
    return symbols

def count_asm_registers(asm_code):
    registers_values = [0]*len(registers)
    for row in asm_code:
        parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
                   .replace('-',' ').split()
        for register in registers:
            registers_values[registers.index(register)] += parts.count(register)
    return registers_values

def count_asm_opcodes(asm_code):
    opcodes_values = [0]*len(opcodes)
    for row in asm_code:
        parts = row.split()
        for opcode in opcodes:
            if opcode in parts:
                opcodes_values[opcodes.index(opcode)] += 1
                break
    return opcodes_values

def count_asm_APIs(asm_code, apis):
    apis_values = [0]*len(apis)
    for row in asm_code:
        for i in range(len(apis)):
            if apis[i] in row:
                apis_values[i] += 1
                break
    return apis_values

def count_asm_misc(asm_code):
    keywords_values = [0]*len(keywords)
    for row in asm_code:
        for i in range(len(keywords)):
            if keywords[i] in row:
                keywords_values[i] += 1
                break
    return keywords_values
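In [ ]:
# Quick sanity check of the asm counters on a couple of hand-written IDA-style lines.
# The sample lines below are hypothetical, not taken from the dataset.
sample_asm = ['.text:00401000 push ebp',
              '.text:00401001 mov ebp, esp',
              '.text:00401003 call sub_401020 ; CreateFileA']
print(count_asm_symbols(sample_asm))    # per-line counts of * - + [ ] @ ?
print(count_asm_registers(sample_asm))  # e.g. ebp appears twice, esp once
print(count_asm_opcodes(sample_asm))    # push, mov and call each counted once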
In [ ]:
# Extract features from test/training asm files, file list is passed in as a parameter
def extract_asm_features(tfiles):
    pid = os.getpid()
    print('Process id:', pid)
    feature_file = 'data/' + str(pid) + '-malware-features-asm.csv' # Windows API, symbols, registers, opcodes, etc...
    print('feature file:', feature_file)
    fapi = open("data/APIs.txt")
    defined_apis = fapi.readlines()
    defined_apis = defined_apis[0].split(',')
    fapi.close()
    asm_files = [i for i in tfiles if '.asm' in i]
    ftot = len(asm_files)
    feature_counts = []
    with open(feature_file, 'w', newline='') as f:
        # write the csv header
        fw = writer(f)
        colnames = ['filename'] + registers + opcodes + defined_apis + keywords
        fw.writerow(colnames)
        for idx, fname in enumerate(asm_files):
            fasm = open(ext_drive + fname, 'r')
            content = fasm.readlines()
            fasm.close()
            reg_vals = count_asm_registers(content)
            opc_vals = count_asm_opcodes(content)
            api_vals = count_asm_APIs(content, defined_apis)
            #sec_vals = count_asm_sections(content)
            mis_vals = count_asm_misc(content)
            count_vals = reg_vals + opc_vals + api_vals + mis_vals
            feature_counts.append([fname[:fname.find('.asm')]] + count_vals)
            # Write rows after every 10 files processed
            if (idx+1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(feature_counts)
                feature_counts = []
        # Write any remaining rows
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []
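In [ ]:
# Each worker writes its own per-PID csv shard, so the shards have to be merged before modelling.
# A minimal sketch of one way to do that, assuming pandas is available and the
# 'data/<pid>-malware-features-asm.csv' naming used above (the combined filename is made up here).
import glob
import pandas as pd
shards = glob.glob('data/*-malware-features-asm.csv')
combined = pd.concat((pd.read_csv(s) for s in shards), ignore_index=True)
combined.to_csv('data/all-malware-features-asm.csv', index=False)
print(combined.shape)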
In [ ]:
# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/kaggle/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)//4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_asm_features, trains)
In [ ]:
# TESTING
# Now divide the test files into four groups for multiprocessing
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)//4
test1 = tfiles[:quart]
test2 = tfiles[quart:(2*quart)]
test3 = tfiles[(2*quart):(3*quart)]
test4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(test1)+len(test2)+len(test3)+len(test4)))
tests = [test1, test2, test3, test4]
p = Pool(4)
p.map(extract_asm_features, tests)
In [ ]:
In [2]:
# Calculate Shannon's Entropy, https://en.wikipedia.org/wiki/Entropy_(information_theory)
# Using a logarithm base of 256 normalises the entropy to the range [0, 1] for byte data.
def calculate_entropy(byte_counts, total):
    entropy = 0.0
    for count in byte_counts:
        # If no bytes of this value were seen in the file, it doesn't affect
        # the entropy of the file.
        if count == 0:
            continue
        # p is the probability of seeing this byte in the file, as a floating-point number
        p = 1.0 * count / total
        entropy -= p * math.log(p, 256)
    return entropy
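In [ ]:
# Quick check of calculate_entropy with two synthetic count vectors (made-up counts, not dataset files):
# a perfectly uniform byte distribution should give entropy 1.0 and a single repeated byte
# value should give 0.0 with the base-256 logarithm used above.
uniform_counts = [10] * 256          # every byte value seen 10 times
constant_counts = [0] * 256
constant_counts[65] = 2560           # only byte 0x41 ever appears
print(calculate_entropy(uniform_counts, 2560))   # -> 1.0
print(calculate_entropy(constant_counts, 2560))  # -> 0.0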
In [ ]:
In [3]:
def entropy_counter(byte_code):
    byte_counts = [0] * 256
    total = 0
    for row in byte_code:
        nrow = row.rstrip('\r\n')
        bytes = nrow.split(' ')
        # skip the first token as it is the relative memory address
        for i in range(1, len(bytes)):
            if bytes[i] != '??':
                byte_counts[int(bytes[i], 16)] += 1
            else:
                # unknown ('??') bytes are counted as 0x00
                byte_counts[0] += 1
            total += 1
    entropy = calculate_entropy(byte_counts, total)
    return entropy
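In [ ]:
# Tiny usage sketch for entropy_counter on a couple of hand-written lines in the .bytes
# format (address token followed by hex byte tokens); these lines are made up.
sample_bytes = ['00401000 4D 5A 90 00 03 00 00 00',
                '00401010 FF FF 00 00 B8 00 00 ??']
print(entropy_counter(sample_bytes))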
In [ ]:
In [ ]:
In [6]:
# feature extraction for the .bytes files
def extract_byte_features(tfiles):
    byte_files = [i for i in tfiles if '.bytes' in i]
    ftot = len(byte_files)
    pid = os.getpid()
    print('Process id:', pid)
    feature_file = 'data/' + str(pid) + '-malware-features-byte.csv' # entropy, file size, ngrams...
    print('feature file:', feature_file)
    feature_counts = []
    with open(feature_file, 'w', newline='') as f:
        # write the column names for the csv file
        fw = writer(f)
        colnames = ['filename'] + ['entropy'] + ['filesize']
        fw.writerow(colnames)
        # Now iterate through the file list and extract the features from each file.
        for idx, fname in enumerate(byte_files):
            fbyte = open(ext_drive + fname, 'r')
            filesize = os.path.getsize(ext_drive + fname)
            lines = fbyte.readlines()
            fbyte.close()
            # TODO: Do ngram extraction
            # First do entropy calculations and filesize
            entropy = entropy_counter(lines)
            #print(fname + ' : entropy = ' + str(entropy) + ' file size = ' + str(filesize))
            count_vals = [entropy, filesize]
            feature_counts.append([fname[:fname.find('.bytes')]] + count_vals)
            # Print progress
            if (idx+1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(feature_counts)
                feature_counts = []
        # Write remaining files
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []
In [ ]:
# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/kaggle/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)//4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_byte_features, trains)
In [ ]:
# TESTING
# Now divide the test files into four groups for multiprocessing
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)//4
test1 = tfiles[:quart]
test2 = tfiles[quart:(2*quart)]
test3 = tfiles[(2*quart):(3*quart)]
test4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(test1)+len(test2)+len(test3)+len(test4)))
tests = [test1, test2, test3, test4]
p = Pool(4)
p.map(extract_byte_features, tests)
In [ ]:
In [ ]:
# From Say_No_to_Overfitting
# entropy/information gain for a binary split; assumes p and n are both non-zero
def entropy(p,n):
    p_ratio = float(p)/(p+n)
    n_ratio = float(n)/(p+n)
    return -p_ratio*math.log(p_ratio) - n_ratio*math.log(n_ratio)

def info_gain(p0,n0,p1,n1,p,n):
    return entropy(p,n) - float(p0+n0)/(p+n)*entropy(p0,n0) - float(p1+n1)/(p+n)*entropy(p1,n1)
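In [ ]:
# Worked example for info_gain with made-up counts: a parent node with 10 positive and
# 10 negative samples split into an (8,2) child and a (2,8) child should yield a clearly
# positive information gain, while a (5,5)/(5,5) split should yield roughly 0.
print(info_gain(8, 2, 2, 8, 10, 10))   # informative split -> positive gain
print(info_gain(5, 5, 5, 5, 10, 10))   # uninformative split -> approximately 0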
In [13]:
def read_image(filename):
    f = open(filename,'rb')
    ln = os.path.getsize(filename) # length of file in bytes
    width = 256
    rem = ln % width
    a = array.array("B") # uint8 array
    a.fromfile(f, ln-rem)
    f.close()
    g = np.reshape(a, (len(a)//width, width))   # one row per 256-byte chunk
    g = np.uint8(g)
    g.resize((1000,))   # keep only the first 1000 byte values as the feature vector
    return list(g)
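In [ ]:
# Usage sketch for read_image (the path below is hypothetical): the file is read as raw
# bytes, reshaped into a 256-pixel-wide greyscale "image", and the first 1000 pixel values
# are kept as a fixed-length feature vector (zero-padded if the file is shorter).
img_vec = read_image('/opt/kaggle/train/example.asm')   # made-up filename
print(len(img_vec))   # -> 1000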
In [15]:
# Do asm image extraction
def extract_asm_image_features(tfiles):
    asm_files = [i for i in tfiles if '.asm' in i]
    ftot = len(asm_files)
    pid = os.getpid()
    print('Process id:', pid)
    # NB: the 'test' tag in this filename is hard-coded and applies regardless of which split is processed
    feature_file = 'data/' + str(pid) + '-test-image-features-asm.csv'
    print('feature file:', feature_file)
    outrows = []
    with open(feature_file, 'w', newline='') as f:
        fw = writer(f)
        column_names = ['filename'] + [("ASM_{:s}".format(str(x))) for x in range(1000)]
        fw.writerow(column_names)
        for idx, fname in enumerate(asm_files):
            file_id = fname.split('.')[0]
            image_data = read_image(ext_drive + fname)
            outrows.append([file_id] + image_data)
            # Print progress
            if (idx+1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(outrows)
                outrows = []
        # Write remaining files
        if len(outrows) > 0:
            fw.writerows(outrows)
            outrows = []
In [21]:
# Do byte image extraction
def extract_byte_image_features(tfiles):
    byte_files = [i for i in tfiles if '.bytes' in i]
    ftot = len(byte_files)
    pid = os.getpid()
    print('Process id:', pid)
    # NB: the 'train' tag in this filename is hard-coded and applies regardless of which split is processed
    feature_file = 'data/' + str(pid) + '-train-image-features-byte.csv'
    print('feature file:', feature_file)
    outrows = []
    with open(feature_file, 'w', newline='') as f:
        fw = writer(f)
        column_names = ['filename'] + [("BYTE_{:s}".format(str(x))) for x in range(1000)]
        fw.writerow(column_names)
        for idx, fname in enumerate(byte_files):
            file_id = fname.split('.')[0]
            image_data = read_image(ext_drive + fname)
            outrows.append([file_id] + image_data)
            # Print progress
            if (idx+1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(outrows)
                outrows = []
        # Write remaining files
        if len(outrows) > 0:
            fw.writerows(outrows)
            outrows = []
In [ ]:
# TRAIN FILES ASM
# Now divide the train files into four groups for multiprocessing
start_time = tm.time()
ext_drive = '/opt/kaggle/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)//4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_asm_image_features, trains)
print("Elapsed time: {:.2f} hours.".format((tm.time() - start_time)/3600.0))
In [ ]:
# TRAIN FILES BYTE
# Now divide the train files into four groups for multiprocessing
start_time = tm.time()
ext_drive = '/opt/kaggle/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)//4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_byte_image_features, trains)
print("Elapsed time: {:.2f} hours.".format((tm.time() - start_time)/3600.0))
In [ ]:
# TEST FILES ASM
# Now divide the test files into four groups for multiprocessing
start_time = tm.time()
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)//4
test1 = tfiles[:quart]
test2 = tfiles[quart:(2*quart)]
test3 = tfiles[(2*quart):(3*quart)]
test4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(test1)+len(test2)+len(test3)+len(test4)))
tests = [test1, test2, test3, test4]
p = Pool(4)
p.map(extract_asm_image_features, tests)
print("Elapsed time: {:.2f} hours.".format((tm.time() - start_time)/3600.0))
In [ ]:
# TEST FILES BYTE
# Now divide the test files into four groups for multiprocessing
start_time = tm.time()
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)//4
test1 = tfiles[:quart]
test2 = tfiles[quart:(2*quart)]
test3 = tfiles[(2*quart):(3*quart)]
test4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(test1)+len(test2)+len(test3)+len(test4)))
tests = [test1, test2, test3, test4]
p = Pool(4)
p.map(extract_byte_image_features, tests)
print("Elapsed time: {:.2f} hours.".format((tm.time() - start_time)/3600.0))
In [10]:
def extract_entropy(tfiles):
    byte_files = [i for i in tfiles if '.bytes' in i]
    for fname in byte_files:
        f = open(ext_drive + fname, 'r')
        filesize = os.path.getsize(ext_drive + fname)
        lines = f.readlines()
        f.close()
        entropy = entropy_counter(lines)
        print(fname + ' : entropy = ' + str(entropy) + ' file size = ' + str(filesize))
In [36]:
# this is just a 1gram counter that calls calculate_entropy when it is finished counting bytes
# ONLY USE FOR XXD format files
def entropy_counter_xxd(byte_code):
    byte_counts = [0] * 256
    total = 0
    for row in byte_code:
        bytes = row.split(' ')
        # skip the first token (the offset) and the trailing ASCII column;
        # xxd's default output has eight 4-hex-digit groups per line, so take tokens 1-8
        for i in range(1, 9):
            hexword = bytes[i]
            highbyte = hexword[0:2]
            lowbyte = hexword[2:]
            binary_val = int(highbyte, 16)
            byte_counts[binary_val] += 1
            binary_val = int(lowbyte, 16)
            byte_counts[binary_val] += 1
            total += 2
    entropy = calculate_entropy(byte_counts, total)
    return entropy
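In [ ]:
# Tiny sketch of entropy_counter_xxd on one hand-written line in xxd's default format
# (offset, eight 4-hex-digit groups, then the ASCII column); this line is made up.
xxd_lines = ['00000000: 4d5a 9000 0300 0000 0400 0000 ffff 0000  MZ..............']
print(entropy_counter_xxd(xxd_lines))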
In [54]:
num1 = 2
num2 = 3
print(num1, " is not equal to ", num2)
In [51]:
tfiles = os.listdir('/temp/')
byte_files = [i for i in tfiles if '.bytes' in i]
for fname in byte_files:
    f = open('/temp/' + fname, 'r')
    filesize = os.path.getsize('/temp/' + fname)
    lines = f.readlines()
    entropy = entropy_counter(lines)
    print(fname + ' : entropy = ' + str(entropy) + ' file size = ' + str(filesize))
In [14]:
help(str.split)
In [ ]:
pid = os.getpid()
print('Process id:', pid)
feature_file = 'data/' + str(pid) + '-malware-features-asm.csv' # Windows API, symbols, registers, opcodes, defines etc...
print('feature file:', feature_file)
In [ ]:
#alternative separation method
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)//4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
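In [ ]:
# A shorter way to do the same four-way split, as a sketch: np.array_split handles the
# remainder automatically instead of slicing with quart (assumes numpy, imported above).
chunks = [list(c) for c in np.array_split(tfiles, 4)]
print(len(tfiles), [len(c) for c in chunks])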
In [ ]:
ext_drive = '/opt/kaggle/malware/'
train_paths = ['train1', 'train2', 'train3', 'train4']
feature_file = ext_drive + 'data/' + train_paths[2] + '-malware-features-asm.csv'
print(feature_file)
In [ ]:
column_names = ['filename'] + [ ("ASM_{:s}".format(str(x))) for x in range(1000)]
print(column_names)