In [11]:
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import math
import scipy.misc
import array
import time as tm

1. ASM Feature Extraction

Run time for ASM feature extraction: 21.5 hours

Machine:

  • AMD 6 Core CPU
  • 8GB RAM
  • 120GB SSD
  • 2TB HDD

In [ ]:
keywords = ['Virtual','Offset','loc','Import','Imports','var','Forwarder','UINT','LONG','BOOL','WORD','BYTES','large','short','dd','db','dw','XREF','ptr','DATA','FUNCTION','extrn','byte','word','dword','char','DWORD','stdcall','arg','locret','asc','align','WinMain','unk','cookie','off','nullsub','DllEntryPoint','System32','dll','CHUNK','BASS','HMENU','DLL','LPWSTR','void','HRESULT','HDC','LRESULT','HANDLE','HWND','LPSTR','int','HLOCAL','FARPROC','ATOM','HMODULE','WPARAM','HGLOBAL','entry','rva','COLLAPSED','config','exe','Software','CurrentVersion','__imp_','INT_PTR','UINT_PTR','---Seperator','PCCTL_CONTEXT','__IMPORT_','INTERNET_STATUS_CALLBACK','.rdata:','.data:','.text:','case','installdir','market','microsoft','policies','proc','scrollwindow','search','trap','visualc','___security_cookie','assume','callvirtualalloc','exportedentry','hardware','hkey_current_user','hkey_local_machine','sp-analysisfailed','unableto']
known_sections = ['.text', '.data', '.bss', '.rdata', '.edata', '.idata', '.rsrc', '.tls', '.reloc']
registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
                 'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
                 'eax','ebp','ebx','ecx','edi','esp']

opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'
                ,'dd','dec','dw','endp','ends','faddp','fchs','fdiv','fdivp','fdivr','fild'
                ,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
                ,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
                ,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
                ,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
                ,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
                ,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
                ,'wait','xchg','xor']

In [ ]:
paths = ['train','test']

def count_1gram(byte_code):
    OneByte = [0]*16**2
    for row in byte_code:
        row = row.rstrip('\r\n')
        # skip the first token: it is the relative memory address
        codes = row.split()[1:]
        # Convert tokens to 1-byte values, skipping unknown '??' bytes
        OneByteCode = []
        for i in codes:
            if i != '??':
                OneByteCode += [int(i, 16)]

        # Accumulate the frequency of each 1-byte value
        for i in OneByteCode:
            OneByte[i] += 1
    return OneByte
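
# For orientation: each .bytes line is a relative address followed by sixteen
# hex byte tokens, with '??' marking bytes the dumper could not read.
# A tiny hypothetical example (not a real dataset line):
sample_line = ['00401000 4D 5A 90 00 03 00 00 00 04 00 00 00 FF FF 00 00\n']
counts = count_1gram(sample_line)
print(counts[0x00], counts[0xFF])  # 9 zero bytes and 2 bytes of 0xFF on this line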


def count_2gram(byte_code):
    twoByte = [0]*16**4
    for row in byte_code:
        # strip the trailing newline and skip the address token
        codes = row.rstrip().split()[1:]
        # join each byte with its successor to form 2-byte tokens
        codes_2g = codes[:-1]
        for i in range(len(codes_2g)):
            codes_2g[i] += codes[i+1]

        twoByteCode = []
        for i in codes_2g:
            if '??' not in i:
                twoByteCode += [int(i, 16)]

        for i in twoByteCode:
            twoByte[i] += 1
    return twoByte

def count_4gram(byte_code): # TODO: NOT DONE YET
    # WARNING: a dense list of 16**8 counters will not fit in 8GB of RAM,
    # and the body below is still a copy of count_2gram (it only builds 2-grams).
    fourByte = [0]*16**8
    for row in byte_code:
        codes = row.rstrip().split()[1:]
        codes_2g = codes[:-1]
        for i in range(len(codes_2g)):
            codes_2g[i] += codes[i+1]

        fourByteCode = []
        for i in codes_2g:
            if '??' not in i:
                fourByteCode += [int(i, 16)]

        for i in fourByteCode:
            fourByte[i] += 1
    return fourByte
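
# A dense list of 16**8 (~4.3 billion) counters cannot fit in 8GB of RAM, so a
# sparse counter is the practical route if 4-grams are ever needed.
# Hypothetical sketch; like the counters above, it assumes an address token
# followed by hex byte tokens and does not join 4-grams across line boundaries.
from collections import Counter

def count_4gram_sparse(byte_code):
    counts = Counter()
    for row in byte_code:
        codes = row.rstrip().split()[1:]
        for i in range(len(codes) - 3):
            window = codes[i:i+4]
            if '??' in window:  # drop windows containing unknown bytes
                continue
            counts[int(''.join(window), 16)] += 1
    return counts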

def count_asm_symbols(asm_code):
    # counts the number of lines containing each symbol, not total occurrences
    symbols = [0]*7
    for row in asm_code:
        if '*' in row:
            symbols[0] += 1
        if '-' in row:
            symbols[1] += 1
        if '+' in row:
            symbols[2] += 1
        if '[' in row:
            symbols[3] += 1
        if ']' in row:
            symbols[4] += 1
        if '@' in row:
            symbols[5] += 1
        if '?' in row:
            symbols[6] += 1

    return symbols


def count_asm_registers(asm_code):
    registers_values = [0]*len(registers)
    for row in asm_code:
        parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
                    .replace('-',' ').split()
        for i, register in enumerate(registers):
            registers_values[i] += parts.count(register)
    return registers_values


def count_asm_opcodes(asm_code):
    opcodes_values = [0]*len(opcodes)
    for row in asm_code:
        parts = row.split()

        # count at most one opcode per line; the break keeps the first match
        # in list order, not necessarily the first opcode on the line
        for j, opcode in enumerate(opcodes):
            if opcode in parts:
                opcodes_values[j] += 1
                break
    return opcodes_values


def count_asm_APIs(asm_code, apis):
    apis_values = [0]*len(apis)
    for row in asm_code:
        # substring match against the whole line; at most one API hit per line
        for i in range(len(apis)):
            if apis[i] in row:
                apis_values[i] += 1
                break
    return apis_values


def count_asm_misc(asm_code):
    keywords_values = [0]*len(keywords)
    for row in asm_code:
        # substring match; at most one keyword hit per line
        for i in range(len(keywords)):
            if keywords[i] in row:
                keywords_values[i] += 1
                break
    return keywords_values
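
# A quick check of these counters on hypothetical IDA-style lines (not dataset data):
sample_asm = ['.text:00401000    mov     eax, [ebp+var_4]\n',
              '.text:00401003    push    ebx\n']
print(sum(count_asm_registers(sample_asm)),  # 3: eax, ebp, ebx
      sum(count_asm_opcodes(sample_asm)))    # 2: mov, push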

In [ ]:
# Extract features from test/training asm files, file list is passed in as a parameter

def extract_asm_features(tfiles):

    pid = os.getpid()
    print('Process id:', pid)
    feature_file = 'data/' + str(pid) + '-malware-features-asm.csv' # Windows API, symbols, registers, opcodes, etc...
    print('feature file:', feature_file)

    # the defined API names live on a single comma-separated line
    with open("data/APIs.txt") as fapi:
        defined_apis = fapi.readlines()[0].split(',')

    asm_files = [i for i in tfiles if '.asm' in i]
    ftot = len(asm_files)

    feature_counts = []
    with open(feature_file, 'w') as f:
        # write the csv header
        fw = writer(f)
        colnames = ['filename'] + registers + opcodes + defined_apis + keywords
        fw.writerow(colnames)

        for idx, fname in enumerate(asm_files):
            # ext_drive is a module-level global set before Pool.map is called;
            # the worker processes inherit it via fork
            with open(ext_drive + fname, 'r') as fasm:
                content = fasm.readlines()

            reg_vals = count_asm_registers(content)
            opc_vals = count_asm_opcodes(content)
            api_vals = count_asm_APIs(content, defined_apis)
            #sec_vals = count_asm_sections(content)
            mis_vals = count_asm_misc(content)
            count_vals = reg_vals + opc_vals + api_vals + mis_vals

            feature_counts.append([fname[:fname.find('.asm')]] + count_vals)

            # Print progress and flush rows after every 10 files processed
            if (idx + 1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(feature_counts)
                feature_counts = []

        # Write remaining files
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)

In [ ]:
# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/kaggle/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4  # integer division so the slice bounds are ints
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_asm_features, trains)

In [ ]:
# TESTING
# Now divide the test files into four groups for multiprocessing
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4
test1 = tfiles[:quart]
test2 = tfiles[quart:(2*quart)]
test3 = tfiles[(2*quart):(3*quart)]
test4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(test1)+len(test2)+len(test3)+len(test4)))
tests = [test1, test2, test3, test4]
p = Pool(4)
p.map(extract_asm_features, tests)
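
The four-way split above is repeated verbatim for every extraction pass; a small helper (hypothetical, not part of the original pipeline) could replace the copied slicing:

In [ ]:
def split_into_chunks(files, n_chunks=4):
    # contiguous slices for Pool.map; the last chunk absorbs the remainder
    quart = len(files) // n_chunks
    bounds = [i * quart for i in range(n_chunks)] + [len(files)]
    return [files[bounds[i]:bounds[i+1]] for i in range(n_chunks)]

# usage: p.map(extract_asm_features, split_into_chunks(os.listdir(ext_drive)))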

2. Byte Feature Extraction

Run time for byte feature extraction: 35 minutes

Machine:

  • AMD 6 Core CPU
  • 8GB RAM
  • 120GB SSD
  • 2TB HDD

In [2]:
# Calculate Shannon's Entropy, https://en.wikipedia.org/wiki/Entropy_(information_theory)

def calculate_entropy(byte_counts, total):
  entropy = 0.0

  for count in byte_counts:
    # If no bytes of this value were seen in the file, they don't affect
    # the entropy of the file.
    if count == 0:
      continue
    # p is the probability of seeing this byte in the file
    p = 1.0 * count / total
    # log base 256 normalizes the entropy to [0, 1] for byte-valued data
    entropy -= p * math.log(p, 256)

  return entropy
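
A quick sanity check with made-up counts: with log base 256, a uniform byte distribution gives the maximum entropy of 1.0 and a file containing a single repeated byte value gives 0.0.

In [ ]:
print(calculate_entropy([4] * 256, 1024))           # 1.0: uniform distribution
print(calculate_entropy([1024] + [0] * 255, 1024))  # 0.0: one byte value only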

In [3]:
def entropy_counter(byte_code):
  byte_counts = [0] * 256
  total = 0
  for row in byte_code:
    tokens = row.rstrip('\r\n').split()
    # skip the first token: it is the relative memory address
    for i in range(1, len(tokens)):
      if tokens[i] != '??':
        byte_counts[int(tokens[i], 16)] += 1
      else:
        # unknown '??' bytes are counted as 0x00 (count_1gram skips them instead)
        byte_counts[0] += 1
      total += 1

  entropy = calculate_entropy(byte_counts, total)

  return entropy

In [6]:
# feature extraction for the .byte files

def extract_byte_features(tfiles):
    byte_files = [i for i in tfiles if '.bytes' in i]
    ftot = len(byte_files)

    pid = os.getpid()
    print('Process id:', pid)
    feature_file = 'data/' + str(pid) + '-malware-features-byte.csv' # entropy, file size, ngrams...
    print('feature file:', feature_file)

    feature_counts = []
    with open(feature_file, 'w') as f:
        # write the column names for the csv file
        fw = writer(f)
        colnames = ['filename', 'entropy', 'filesize']
        fw.writerow(colnames)

        # Now iterate through the file list and extract the features from each file.
        for idx, fname in enumerate(byte_files):
            filesize = os.path.getsize(ext_drive + fname)
            with open(ext_drive + fname, 'r') as fbyte:
                lines = fbyte.readlines()

            # TODO: Do ngram extraction
            # First do entropy calculations and filesize
            entropy = entropy_counter(lines)
            count_vals = [entropy, filesize]

            feature_counts.append([fname[:fname.find('.bytes')]] + count_vals)

            # Print progress and flush rows after every 10 files processed
            if (idx + 1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(feature_counts)
                feature_counts = []

        # Write remaining files
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)

In [ ]:
# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/kaggle/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_byte_features, trains)

In [ ]:
# TESTING
# Now divide the test files into four groups for multiprocessing
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4
test1 = tfiles[:quart]
test2 = tfiles[quart:(2*quart)]
test3 = tfiles[(2*quart):(3*quart)]
test4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(test1)+len(test2)+len(test3)+len(test4)))
tests = [test1, test2, test3, test4]
p = Pool(4)
p.map(extract_byte_features, tests)

3. Byte and ASM Image Extraction

Run time for byte and ASM image extraction: 2.4 hours

Machine:

  • AMD 6 Core CPU
  • 8GB RAM
  • 120GB SSD
  • 2TB HDD

In [ ]:
# From Say_No_to_Overfitting
# Note: both helpers assume p > 0 and n > 0; a pure split would hit math.log(0).
def entropy(p, n):
    p_ratio = float(p)/(p+n)
    n_ratio = float(n)/(p+n)
    return -p_ratio*math.log(p_ratio) - n_ratio*math.log(n_ratio)

def info_gain(p0, n0, p1, n1, p, n):
    return entropy(p,n) - float(p0+n0)/(p+n)*entropy(p0,n0) - float(p1+n1)/(p+n)*entropy(p1,n1)
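
A quick check with made-up counts: a 50/50 class mix has entropy ln 2 ≈ 0.693 (these helpers use the natural log), and a 90/10 split recovers roughly half of that as information gain.

In [ ]:
print(entropy(10, 10))                # 0.6931... = ln 2
print(info_gain(9, 1, 1, 9, 10, 10))  # ~0.368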

In [13]:
def read_image(filename):
    width = 256
    ln = os.path.getsize(filename) # length of file in bytes
    rem = ln % width
    a = array.array("B") # uint8 array
    with open(filename, 'rb') as f:
        a.fromfile(f, ln - rem)  # drop the ragged final row
    g = np.reshape(a, (len(a) // width, width))  # one 256-pixel row per 256 bytes
    g = np.uint8(g)
    g.resize((1000,))  # keep only the first 1000 bytes (zero-padded if shorter)
    return list(g)
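
read_image gives the malware-as-grayscale-image representation popularized by Nataraj et al.: each file becomes a 256-pixel-wide image, and the first 1000 pixels serve as the feature vector. To eyeball one of these images, a hypothetical helper (assumes an older SciPy, < 1.2, which still provides scipy.misc.imsave):

In [ ]:
def save_malware_image(filename, out_png):
    ln = os.path.getsize(filename)
    width = 256
    a = array.array("B")  # uint8 array
    with open(filename, 'rb') as f:
        a.fromfile(f, ln - ln % width)
    g = np.uint8(np.reshape(a, (len(a) // width, width)))
    scipy.misc.imsave(out_png, g)  # one grayscale pixel per byte

# save_malware_image(ext_drive + '0A32eTdBKayjCWhZqDOQ.bytes', 'sample.png')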

In [15]:
# Do asm image extraction
def extract_asm_image_features(tfiles):
    asm_files = [i for i in tfiles if '.asm' in i]
    ftot = len(asm_files)

    pid = os.getpid()
    print('Process id:', pid)
    # one output file per worker; the train and test runs share this function
    feature_file = 'data/' + str(pid) + '-image-features-asm.csv'
    print('feature file:', feature_file)

    outrows = []
    with open(feature_file, 'w') as f:
        fw = writer(f)
        column_names = ['filename'] + ["ASM_{:d}".format(x) for x in range(1000)]
        fw.writerow(column_names)
        for idx, fname in enumerate(asm_files):
            file_id = fname.split('.')[0]
            image_data = read_image(ext_drive + fname)
            outrows.append([file_id] + image_data)

            # Print progress and flush rows after every 10 files processed
            if (idx + 1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(outrows)
                outrows = []

        # Write remaining files
        if len(outrows) > 0:
            fw.writerows(outrows)

In [21]:
# Do byte image extraction
def extract_byte_image_features(tfiles):
    byte_files = [i for i in tfiles if '.bytes' in i]
    ftot = len(byte_files)

    pid = os.getpid()
    print('Process id:', pid)
    # one output file per worker; the train and test runs share this function
    feature_file = 'data/' + str(pid) + '-image-features-byte.csv'
    print('feature file:', feature_file)

    outrows = []
    with open(feature_file, 'w') as f:
        fw = writer(f)
        column_names = ['filename'] + ["BYTE_{:d}".format(x) for x in range(1000)]
        fw.writerow(column_names)
        for idx, fname in enumerate(byte_files):
            file_id = fname.split('.')[0]
            image_data = read_image(ext_drive + fname)
            outrows.append([file_id] + image_data)

            # Print progress and flush rows after every 10 files processed
            if (idx + 1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(outrows)
                outrows = []

        # Write remaining files
        if len(outrows) > 0:
            fw.writerows(outrows)

In [ ]:
# TRAIN FILES ASM
# Now divide the train files into four groups for multiprocessing
start_time = tm.time()
ext_drive = '/opt/kaggle/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_asm_image_features, trains)
print("Elapsed time: {:.2f} hours.".format((tm.time() - start_time)/3600.0))

In [ ]:
# TRAIN FILES BYTE
# Now divide the train files into four groups for multiprocessing
start_time = tm.time()
ext_drive = '/opt/kaggle/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_byte_image_features, trains)
print("Elapsed time: {:.2f} hours.".format((tm.time() - start_time)/3600.0))

In [ ]:
# TEST FILES ASM
# Now divide the test files into four groups for multiprocessing
start_time = tm.time()
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4
test1 = tfiles[:quart]
test2 = tfiles[quart:(2*quart)]
test3 = tfiles[(2*quart):(3*quart)]
test4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(test1)+len(test2)+len(test3)+len(test4)))
tests = [test1, test2, test3, test4]
p = Pool(4)
p.map(extract_asm_image_features, tests)
print("Elapsed time: {:.2f} hours.".format((tm.time() - start_time)/3600.0))

In [ ]:
# TEST FILES BYTE
# Now divide the test files into four groups for multiprocessing
start_time = tm.time()
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4
test1 = tfiles[:quart]
test2 = tfiles[quart:(2*quart)]
test3 = tfiles[(2*quart):(3*quart)]
test4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(test1)+len(test2)+len(test3)+len(test4)))
tests = [test1, test2, test3, test4]
p = Pool(4)
p.map(extract_byte_image_features, tests)
print("Elapsed time: {:.2f} hours.".format((tm.time() - start_time)/3600.0))

CODE TESTING BELOW ONLY:


In [10]:
def extract_entropy(tfiles):
  byte_files = [i for i in tfiles if '.bytes' in i]
  for fname in byte_files:
    filesize = os.path.getsize(ext_drive + fname)
    with open(ext_drive + fname, 'r') as f:
      lines = f.readlines()
    entropy = entropy_counter(lines)
    print(fname + ' : entropy = ' + str(entropy) + ' file size = ' + str(filesize))

In [36]:
# this is just a 1gram counter that calls calculate_entropy when it is finished counting bytes
# ONLY USE FOR XXD format files
def entropy_counter_xxd(byte_code):
  byte_counts = [0] * 256
  total = 0
  for row in byte_code:
    tokens = row.split(' ')
    # tokens[0] is the offset and the last token is the ASCII column;
    # default xxd output puts 8 four-hex-digit words in between (tokens 1-8)
    for i in range(1, 9):
      hexword = tokens[i]
      highbyte = hexword[0:2]
      lowbyte = hexword[2:]
      byte_counts[int(highbyte, 16)] += 1
      byte_counts[int(lowbyte, 16)] += 1
      total += 2

  entropy = calculate_entropy(byte_counts, total)

  return entropy
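
For reference, the token indices above assume default xxd output: an offset token, eight 4-hex-digit words, then the ASCII column. A hypothetical line:

In [ ]:
xxd_line = ['00000000: 4d5a 9000 0300 0000 0400 0000 ffff 0000  MZ..............\n']
print(entropy_counter_xxd(xxd_line))  # entropy of these 16 bytes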

In [54]:
num1 = 2
num2 = 3
print(num1, " is not equal to ", num2)


2  is not equal to  3

In [51]:
tfiles = os.listdir('/temp/')
byte_files = [i for i in tfiles if '.bytes' in i]
for fname in byte_files:
    filesize = os.path.getsize('/temp/' + fname)
    with open('/temp/' + fname, 'r') as f:
        lines = f.readlines()
    entropy = entropy_counter(lines)
    print(fname + ' : entropy = ' + str(entropy) + ' file size = ' + str(filesize))


0A32eTdBKayjCWhZqDOQ.bytes : entropy = 0.7799095070146072 file size = 4356052
0ACDbR5M3ZhBJajygTuf.bytes : entropy = 0.4677855747448026 file size = 5731328

In [41]:
tfiles = os.listdir('/temp/')
byte_files = [i for i in tfiles if '.bytes' in i]
for fname in byte_files:
    filesize = os.path.getsize('/temp/' + fname)
    with open('/temp/' + fname, 'r') as f:
        lines = f.readlines()
    entropy = entropy_counter(lines)
    print(fname + ' : entropy = ' + str(entropy) + ' file size = ' + str(filesize))


downloader.bytes : entropy = 0.36326625172661214 file size = 4288
fu.bytes : entropy = 0.7363563437480696 file size = 411648
agobot.bytes : entropy = 0.9887759997811357 file size = 857600

In [14]:
help(str.split)


Help on method_descriptor:

split(...)
    S.split(sep=None, maxsplit=-1) -> list of strings
    
    Return a list of the words in S, using sep as the
    delimiter string.  If maxsplit is given, at most maxsplit
    splits are done. If sep is not specified or is None, any
    whitespace string is a separator and empty strings are
    removed from the result.


In [ ]:
pid = os.getpid()
print('Process id:', pid)
feature_file = 'data/' + str(pid) + '-malware-features-asm.csv' # Windows API, symbols, registers, opcodes, defines etc...
print('feature file:', feature_file)

In [ ]:
#alternative separation method
ext_drive = '/opt/kaggle/test/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4)))

In [ ]:
ext_drive = '/opt/kaggle/malware/'
train_paths = ['train1', 'train2', 'train3', 'train4']
feature_file = ext_drive + 'data/' + train_paths[2] + '-malware-features-asm.csv'
print(feature_file)

In [ ]:
column_names = ['filename'] + ["ASM_{:d}".format(x) for x in range(1000)]
print(column_names)