1. Data Preparation and Extraction of Features

Features sets will consist of:
- Entropy and file size from packed binaries.
- Entropy and file size from unpacked binaries.
- ASM features from disassembled unpacked binaries.
- Executable header features.
- Call Graph Features.
- Sample Statistics.
- PE packer type.
- Behavioural features from Cuckoo Sandbox reports.
- Memory features from Volatility reports.

Training labels will be generated from ClamAV, Windows Defender and VirusTotal.com reports.



In [1]:

    
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import re
import subprocess as sub



In [2]:

    
ext_drive = '/opt/vs/'
tfiles = os.listdir(ext_drive + "train")



In [4]:

    
len(tfiles)









    Out[4]:





65536

2. Generate Entropy and File Size of Packed Binaries and Non-Binary Files

Script: feature_extraction_entropy.py



In [12]:

    
# Calculate Shannon's Entropy, https://en.wikipedia.org/wiki/Entropy_(information_theory)

def calculate_entropy(byte_counts, total):
    
    entropy = 0.0

    for count in byte_counts:
        # If no bytes of this value were seen in the value, it doesn't affect
        # the entropy of the file.
        if count == 0:
            continue
        # p is the probability of seeing this byte in the file, as a floating-point number
        p = 1.0 * count / total
        entropy -= p * math.log(p, 256)
    

    return entropy


def entropy_counter(byte_code):
    
    byte_counts = [0] * 256
    code_length = len(byte_code)
    
    for i in range(len(byte_code)):
        byte_counts[int(byte_code[i])] += 1
        
    entropy = calculate_entropy(byte_counts, code_length)

    return entropy


def sort_and_save_entropy_feature_file():
    entropys = pd.read_csv('data/entropy-features.csv')
    # DataFrame.sort() is deprecated, but this is an old version of pandas, does not have sort_values().
    sorted_entropys = entropys.sort('file_name')
    sorted_entropys.to_csv('data/sorted-entropy-features.csv', index=False)
    sorted_entropys.head(20)
    
    return


def combine_entropy_files():
    # Function to combine the newly generated entropy files into one file:
    # 1. list data directory
    # 2. For each file in file list that matches (\d\d\d\d-entropy-features.csv)
    # 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
    # 4. Concatenate the unsorted packer id feature files.
    # 5. Sort and write to data/sorted-packer-id-features.csv
    fop = open('data/entropy-features.csv','w')
    fop.write('file_name,entropy,file_size\n')
    p1 = re.compile('\d{3,5}-entropy-features-bin.csv') # This is the PID prefix for each file.
    file_list = os.listdir('data/')
    counter = 0
    for file_name in file_list:
        if p1.match(file_name):
            fip = open('data/' + file_name, 'r')
            in_lines = fip.readlines()
            fop.writelines(in_lines)
            counter += len(in_lines)
            fip.close()
            
    print('Completed combine of {:d} entropy features.'.format(counter))  
    
    fop.close()
    
    sort_and_save_entropy_feature_file()
    
    return


# feature extraction for the binary files

def extract_binary_features(tfiles):
    #byte_files = [i for i in tfiles if '.bytes' in i]
    ftot = len(tfiles)
    
    pid = os.getpid()
    print('Process id:', pid)
    feature_file = 'data/' + str(pid) + '-entropy-features-bin.csv' # entropy, file size, ngrams...   
    print('feature file:', feature_file)
    
    feature_counts = []
    with open(feature_file, 'w') as f:
        # Write the column names for the csv file
        fw = writer(f)
        # Do this when combining the files.
        #colnames = ['file_name'] + ['entropy'] + ['file_size'] 
        #fw.writerow(colnames)
        
        # Now iterate through the file list and extract the features from each file.
        for idx, fname in enumerate(tfiles):
            fasm = open(ext_drive + fname, 'rb')
            filesize = os.path.getsize(ext_drive + fname)
            in_bytes = fasm.read()
            
            # TODO: Do ngram extraction
            # First do entropy calculations and filesize
            # Convert the input array into a byte array to prevent type errors
            # in entropy counter function.
            in_bytes = bytearray(in_bytes)
            #print("Type = {:s}").format(type(in_bytes))
            entropy = entropy_counter(in_bytes)
            
            count_vals = [entropy, filesize]
            
            feature_counts.append([fname[fname.find('_')+1:]] + count_vals)   
            
            fasm.close()
            
            # Print progress
            if (idx+1) % 1000 == 0:
                print("{:d} - {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
                fw.writerows(feature_counts)
                feature_counts = []
                
        # Write remaining files
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []

        print("Completed processing {:d} rows for feature file {:s}".format(ftot,feature_file))



In [ ]:

    
# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/vs/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)/4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_binary_features, trains)
combine_entropy_files()



In [ ]:

    
# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/vs/train2/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)/4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_binary_features, trains)
combine_entropy_files()



In [15]:

    
entropys = pd.read_csv('data/sorted-entropy-features.csv')
sorted_entropys = entropys.sort('file_name')
sorted_entropys.to_csv('data/sorted-entropy-features-vs251-252.csv', index=False)
sorted_entropys.head(20)









    Out[15]:






  
    
      
      file_name
      entropy
      file_size
    
  
  
    
      0    
       00002e640cafb741bea9a48eaee27d6f
       0.992174
        208860
    
    
      1    
       000118d12cbf9ad6103e8b914a6e1ac3
       0.834382
        201600
    
    
      2    
       0001776237ac37a69fcef93c1bac0988
       0.966021
        682192
    
    
      65536
       00027c21667d9119a454df8cef2dc1c7
       0.666599
         18390
    
    
      65537
       0003887ab64b8ae19ffa988638decac2
       0.903260
       1134320
    
    
      3    
       000403e4e488356b7535cc613fbeb80b
       0.773787
        199168
    
    
      65538
       0004376a62e22f6ad359467eb742b8ff
       0.803515
        149720
    
    
      4    
       0004c8b2a0f4680a5694d74199b40ea2
       0.985592
       1165440
    
    
      5    
       000595d8b586915c12053104cf845097
       0.841920
        264240
    
    
      65539
       000634f03457d088c71dbffb897b1315
       0.957584
       1725502
    
    
      65540
       00072ed24314e91b63b425b3dc572f50
       0.486112
        328093
    
    
      65541
       00092d369958b67557da8661cc9093bc
       0.845657
        522936
    
    
      6    
       00093d5fa5cb7ce77f6eaf39962daa12
       0.803481
        742064
    
    
      7    
       00099926d51b44c6f8c93a48c2567891
       0.997032
        725288
    
    
      65542
       0009a64f786fa29bfa6423278cc74f02
       0.996663
        671280
    
    
      8    
       000a2db4762dc06628a086c9e117f884
       0.535436
         61551
    
    
      65543
       000ac11fa7587b2316470b154254a219
       0.997824
       1874471
    
    
      9    
       000ae2c63ba69fc93dfc395b40bfe03a
       0.899481
        487386
    
    
      65544
       000ae90736a51c47543dcc6d8a735362
       0.863887
        260144
    
    
      65545
       000b41258d624ef2d6e430822d0c0c8f
       0.992772
        590824
    
  

20 rows × 3 columns

3. Generate Entropy and File Size of Unpacked Binaries



In [ ]:

    
# TODO: everything.



In [ ]:

4. Generate PE ASM and Header Features

- PE Header Features from objdump header summaries.
- ASM Features from IDA Pro assembly files.

- Script: feature_extraction_pe_asm.py



In [ ]:

    
keywords = ['Virtual','Offset','loc','Import','Imports','var','Forwarder','UINT','LONG'
            ,'BOOL','WORD','BYTES','large','short','dd','db','dw','XREF','ptr','DATA'
            ,'FUNCTION','extrn','byte','word','dword','char','DWORD','stdcall','arg'
            ,'locret','asc','align','WinMain','unk','cookie','off','nullsub','DllEntryPoint'
            ,'System32','dll','CHUNK','BASS','HMENU','DLL','LPWSTR','void','HRESULT','HDC'
            ,'LRESULT','HANDLE','HWND','LPSTR','int','HLOCAL','FARPROC','ATOM','HMODULE'
            ,'WPARAM','HGLOBAL','entry','rva','COLLAPSED','config','exe','Software'
            ,'CurrentVersion','__imp_','INT_PTR','UINT_PTR','---Seperator','PCCTL_CONTEXT'
            ,'__IMPORT_','INTERNET_STATUS_CALLBACK','.rdata:','.data:','.text:','case'
            ,'installdir','market','microsoft','policies','proc','scrollwindow','search'
            ,'trap','visualc','___security_cookie','assume','callvirtualalloc','exportedentry'
            ,'hardware','hkey_current_user','hkey_local_machine','sp-analysisfailed','unableto']

known_sections = ['.text', '.data', '.bss', '.rdata', '.edata', '.idata', '.rsrc', '.tls', '.reloc']

registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
                 'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
                 'eax','ebp','ebx','ecx','edi','esp']

opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'
                ,'dd','dec','dw','endp','ends','faddp','fchs','fdiv','fdivp','fdivr','fild'
                ,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
                ,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
                ,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
                ,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
                ,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
                ,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
                ,'wait','xchg','xor']


def count_asm_symbols(asm_code):
    symbols = [0]*7
    for row in asm_code:
        if '*' in row:
            symbols[0] += 1
        if '-' in row:
            symbols[1] += 1
        if '+' in row:
            symbols[2] += 1
        if '[' in row:
            symbols[3] += 1
        if ']' in row:
            symbols[4] += 1
        if '@' in row:
            symbols[5] += 1
        if '?' in row:
            symbols[6] += 1

    return symbols


def count_asm_registers(asm_code):
    registers_values = [0]*len(registers)
    for row in asm_code:
        parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
                    .replace('-',' ').split()
        for register in registers:
            registers_values[registers.index(register)] += parts.count(register)
    return registers_values


def count_asm_opcodes(asm_code):
    opcodes_values = [0]*len(opcodes)
    for row in asm_code:
        parts = row.split()

        for opcode in opcodes:
            if opcode in parts:
                opcodes_values[opcodes.index(opcode)] += 1
                break
    return opcodes_values


def count_asm_APIs(asm_code, apis):
    apis_values = [0]*len(apis)
    for row in asm_code:
        for i in range(len(apis)):
            if apis[i] in row:
                apis_values[i] += 1
                break
    return apis_values


def count_asm_misc(asm_code):
    keywords_values = [0]*len(keywords)
    for row in asm_code:
        for i in range(len(keywords)):
            if keywords[i] in row:
                keywords_values[i] += 1
                break
    return keywords_values


# Extract features from test/training asm files, file list is passed in as a parameter

def extract_asm_features(tfiles):
    
    pid = os.getpid()
    print('Process id:', pid)
    feature_file = 'data/' + str(pid) + '-malware-features-asm.csv' # Windows API, symbols, registers, opcodes, etc...   
    print('feature file:', feature_file)

    fapi = open("data/APIs.txt")
    defined_apis = fapi.readlines()
    defined_apis = defined_apis[0].split(',')

    asm_files = [i for i in tfiles if '.asm' in i]
    ftot = len(asm_files)
    
    feature_counts = []
    with open(feature_file, 'w') as f:
        # write the csv header
        fw = writer(f)
        colnames = ['file_name'] + registers + opcodes + defined_apis + keywords
        fw.writerow(colnames)
        
        for idx, fname in enumerate(asm_files):
            fasm = open(ext_drive + fname, 'r')
            content = fasm.readlines()
            
            reg_vals = count_asm_registers(content)
            opc_vals = count_asm_opcodes(content)
            api_vals = count_asm_APIs(content, defined_apis)
            #sec_vals = count_asm_sections(content)
            mis_vals = count_asm_misc(content)
            count_vals = reg_vals + opc_vals + api_vals + mis_vals
            
            feature_counts.append([fname[:fname.find('.asm')]] + count_vals)   
            
            # Writing rows after every 10 files processed
            if (idx+1) % 10 == 0:
              print(pid, idx + 1, 'of', ftot, 'files processed.')
              fw.writerows(feature_counts)
              feature_counts = []
                
        # Writing remaining files
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []

    return



In [ ]:



In [ ]:

8. Clean and Sort Function Names

- script: function_name_clean.py



In [ ]:

    
# Need to clean up and sort these function names for ASM feature extraction.
fip = open('data/all-function-column-names-multiline.csv')
function_names = fip.readlines()
fip.close()

function_names.sort()
function_names[:50]

fop = open('data/sorted-function-names-multiline.txt','w')
fop.writelines(function_names)
fop.close()



In [22]:

    
fip = open('data/sorted-function-names-multiline.txt','r')
sorted_function_names = fip.readlines()
fip.close()
fip = open('data/APIs.txt','r')
api_names_str = fip.readline()
fip.close()
api_names_str = api_names_str.rstrip()
api_names = api_names_str.split(',')
api_names.sort()
len(api_names)









    Out[22]:





792



In [23]:

    
for idx in range(len(sorted_function_names)):
    sorted_function_names[idx] = sorted_function_names[idx].rstrip()
    
for aname in api_names:
    if aname not in sorted_function_names:
        sorted_function_names.append(aname)
        
sorted_function_names.sort()
len(sorted_function_names)









    Out[23]:





155548



In [ ]:

    
sorted_function_names[:50]



In [25]:

    
function_count = len(sorted_function_names)
total_chars = 0
for func_name in sorted_function_names:
    total_chars += len(func_name)
    
avg_name_len = int(total_chars / function_count)
avg_name_len









    Out[25]:





32



In [ ]:

    
# truncate function names to reduce the size of the huge sparse matrix.
function_column_names = []
for func in sorted_function_names:
    if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
        func = func[:5] # lets try to reduce the vast number of functions.
    elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
        func = func[:3]
    elif func.startswith('edi+') or func.startswith('esi+'):
        func = func[:3]
    elif func.startswith('byte_') or func.startswith('word_') or func.startswith('off_'):
        func = func[:4]
    elif func.startswith('_') or func.startswith('$'):
        func = func[1:]
    elif func.startswith('__') or func.startswith('$$'):
        func = func[2:]
    #else: need a regex here to match a bunch of random crap 
    #    func = func[:33]
        
    if len(func) > 32: # Reduce the the function name length to max of average function length.
        func = func[:32]
        
    if func not in function_column_names:    
        function_column_names.append(func)
        
function_column_names[:50]



In [27]:

    
fop = open('data/sorted-reduced-function-names.txt','w')

for fname in function_column_names:
    fop.write(fname + "\n")
        
fop.close()

len(function_column_names)









    Out[27]:





143048



In [ ]:

    
# Use a regex to remove function names that are just hexadecimal addresses.
p1 = re.compile('\d\w+h')
reduced_function_names = []
fip = open('data/sorted-reduced-function-names.txt','r')
function_column_names = fip.readlines()
fip.close()

fop = open('data/sorted-reduced-function-names-hexless.txt','w')
for fname in function_column_names:
    fname = fname.rstrip()
    m = p1.match(fname)
    if m == None:
        fop.write(fname + "\n")
        reduced_function_names.append(fname)
        
fop.close()
reduced_function_names[:50]



In [29]:

    
len(reduced_function_names)









    Out[29]:





135436



In [ ]:

10. Test Code Only



In [4]:

    
signat = sub.check_output(["file",'-b', '/opt/vs/agobot.exe'])



In [5]:

    
signat









    Out[5]:





'PE32 executable (GUI) Intel 80386, for MS Windows, UPX compressed\n'

Generate C Library API Tokens for ASM and Header Feature Extraction.



In [13]:

    
# Generate libc function calls for ELF API feature extraction.
# Put in feature_extraction_elf_asm.py

def generate_libc_api():
    # Extract libc function and variable names from GNU Libc Documentation.
    # Put in feature_extraction.py
    
    fipfunc = open('data/libc-function-index.txt', 'r')
    funclines = fipfunc.readlines()
    fipfunc.close()
    
    counter = 0
    func_list = []
    
    for idx, fline in enumerate(funclines):
        fline = fline.replace('\t','').replace('\n','')
        if fline.startswith('|'):
            tokens = fline.split('|')   # The function names are |funcname|
            funcname = tokens[1]
            func_list.append(funcname)
            counter += 1
            
            
    print("Found {:d} function definitions for libc api.".format(counter))
    
    fop = open('data/elf-libc-api.txt', 'w')
    
    for func_name in func_list:
        fop.write(func_name + "\n")
        
    fop.close()
    
    return func_list


def generate_libc_var():
    # Extract libc function and variable names from GNU Libc Documentation.
    # Put in feature_extraction.py

    fipvar = open('data/libc-variable-index.txt', 'r')
    varlines = fipvar.readlines()
    fipvar.close()
    
    counter = 0
    var_list = []
    
    for idx, vline in enumerate(varlines):
        vline = vline.replace('\t','').replace('\n','')
        if vline.startswith('|'):
            tokens = vline.split('|')   # The function names are |funcname|
            varname = tokens[1]
            var_list.append(varname)
            counter += 1
            
            
    print("Found {:d} variable definitions for libc api.".format(counter))
    
    fop = open('data/elf-libc-var.txt', 'w')
    
    for var_name in var_list:
        fop.write(var_name + "\n")
        
    fop.close()
    
    return var_list



In [ ]:

    
func_list = generate_libc_api()
func_list

Validate Disassembly Results.



In [18]:

    
# Check interrupted disassembly results for train1 feature set.
# def validate_disassembly():
# put in disassemble_pe.py

t1asm = os.listdir('/opt/vs/train1asm/')
t1hdr = os.listdir('/opt/vs/train1hdr/')
asm_files = []
hdr_files = []

for fname in t1asm:
    if fname.endswith('.asm'):
        asm_files.append(fname)
        
for fname in t1hdr:
    if fname.endswith('.txt'):
        hdr_files.append(fname)
        
        
print("asm dir: {:d} asm files {:d} hdr dir {:d} hdr files {:d}".format(len(t1asm),len(asm_files),len(t1hdr),len(hdr_files)))









    



asm dir: 21314 asm files 21314 hdr dir 21409 hdr files 21409



In [19]:

    
len(t1hdr) - len(t1asm)









    Out[19]:





95



In [14]:

    
os.path.getsize('/opt/vs/train1asm/VirusShare_5ac1817d757a27edb90cdf887ba66870.asm')









    Out[14]:





77814888



In [20]:

    
counter = 0
missing_hdr_list = []

for fname in asm_files:
    hdr_name = fname.replace('.asm', '.txt')
    if hdr_name not in hdr_files:
        print("{:s} not in header file list.".format(hdr_name))
        counter += 1
        missing_asm_list.append(fname)
        
print("{:d} missing header files.".format(counter))









    



0 missing header files.



In [ ]:

    
counter = 0
missing_asm_list = []

for fname in hdr_files:
    asm_name = fname.replace('.txt','.asm')
    if asm_name not in asm_files:
        print("{:s} not in asm file list.".format(asm_name))
        counter += 1
        missing_asm_list.append(fname)
        
print("{:d} missing assembly files.".format(counter))



In [23]:

    
counter = 0
fop = open('data/disass-train1-missing-asm-files.txt', 'w')
for fname in missing_asm_list:
    fop.write(fname + "\n")
    counter += 1
        
fop.close()
print("Wrote {:d} missing asm file names.".format(counter))









    



Wrote 95 missing asm file names.



In [ ]:



In [25]:

    
counter = 0
bad_hdr_list = []

for fname in hdr_files:
    fsize = os.path.getsize('/opt/vs/train1hdr/' + fname)
    if fsize < 1000:
        print("{:s} bad output, filesize = {:d}.".format(fname, fsize))
        counter += 1
        bad_hdr_list.append(fname)
        
print("{:d} bad header files.".format(counter))









    



VirusShare_d5eff38b212286c46db007aa7159ffd8.txt bad output, filesize = 0.
VirusShare_592d7ac775519110d58e9ce1975c1b5b.txt bad output, filesize = 0.
VirusShare_4a0c79f6ad27b0a674b08005d102e16d.txt bad output, filesize = 0.
VirusShare_c80d9b2dbf9b7953a3b6e9b51a39a0c2.txt bad output, filesize = 0.
4 bad header files.



In [ ]:

    
counter = 0
bad_asm_list = []

for fname in asm_files:
    fsize = os.path.getsize('/opt/vs/train1asm/' + fname)
    if fsize < 1000:
        print("{:s} bad output, filesize = {:d}.".format(fname, fsize))
        counter += 1
        bad_asm_list.append(fname)
        
print("{:d} bad asm files.".format(counter))



In [ ]:



In [3]:

    
apt_df = pd.read_csv('data/sorted-entropy-features-apt.csv')
apt_file_list = apt_df['file_name']
apt_file_list.shape









    Out[3]:





(293,)



In [5]:

    
apt_file_list.head()









    Out[5]:





0    001dd76872d80801692ff942308c64e6
1    002325a0a67fded0381b5648d7fe9b8e
2    00dbb9e1c09dbdafb360f3163ba5a3de
3    0149b7bd7218aab4e257d28469fddb0d
4    01e0dc079d4e33d8edd050c4900818da
Name: file_name, dtype: object



In [7]:

    
f_list = os.listdir('/home/derek/project/temp/train/')
counter = 0
file_list = []
for fname in f_list:
    if fname.startswith('Virus'):
        tname = fname[fname.find('_') + 1:]
        file_list.append(tname)
        counter += 1
        
print("Got {:d} files in training directory.".format(counter))

apt_list = np.array(apt_file_list)
for fname in file_list:
    if fname not in apt_list:
        print("Extra file: {:s}".format(fname))









    



Got 294 files in training directory.
Extra file: 00248ef21706d78c1f0e1eca3cab72c3



In [ ]:

Rename Header and ASM Files Generated by IDA Pro.



In [5]:

    
def rename_header_files(ext_dir):
    # Rename all the PE headers files so it is easier to process them.
    
    file_list = os.listdir(ext_dir)
    counter = 0
    
    for fname in file_list:
        if fname.startswith('Virus'):
            file_path = ext_dir + fname
            trunc_name = fname[0:fname.find('.txt')]
            new_path = ext_dir + trunc_name + '.pe.txt'
            result = sub.check_call(['mv', file_path, new_path])
            counter += 1

        if (counter % 1000) == 0:
            print('Renamed {:d} header files.'.format(counter))

    print('Completed move of {:d} header files.'.format(counter))
    
    return



In [ ]:

    
ext_dir = '/home/derek/project/temp/'
rename_header_files(ext_dir)



In [2]:

    
def rename_asm_files(ext_dir, new_dir):
    # Rename all the PE ASM files and move to a new directory
    # so it is easier to process them.
    
    file_list = os.listdir(ext_dir)
    counter = 0
    
    print("Got {:d} files in directory {:s}".format(len(file_list), ext_dir))
    
    for fname in file_list:
        if fname.endswith('.asm'):
            file_path = ext_dir + fname
            trunc_name = fname[0:fname.find('.asm')]
            new_path = new_dir + trunc_name + '.pe.asm'
            result = sub.check_call(['mv', file_path, new_path])
            counter += 1

            if (counter % 1000) == 0:
                print('Renamed {:d} ASM files.'.format(counter))

    print('Completed rename of {:d} ASM files.'.format(counter))
    
    return



In [ ]:

    
rename_asm_files('/opt/vs/train3/','opt/vs/train3asm/')



In [4]:

    
def rename_asm_files_fix(ext_dir):
    # Rename all the PE headers files so it is easier to process them.
    
    file_list = os.listdir(ext_dir)
    pe_counter = 0
    unpe_counter = 0
    
    print("Got total files: {:d}".format(len(file_list)))
    
    for fname in file_list:
        if fname.endswith('.pe.asm'):
            pe_counter += 1
        elif fname.endswith('.asm'):
            file_path = ext_dir + fname
            trunc_name = fname[0:fname.find('.asm')]
            new_path = ext_dir + trunc_name + '.pe.asm'
            result = sub.check_call(['mv', file_path, new_path])
            unpe_counter += 1

            if (unpe_counter % 1000) == 0:
                print('Renamed {:d} ASM files.'.format(unpe_counter))

    print('Completed move of {:d} ASM files with {:d} files already renamed.'.format(unpe_counter, pe_counter))
    
    return



In [ ]:

    
rename_asm_files_fix('/opt/vs/train3/')



In [ ]:

Validate Disassembly Results Part 2

- disassemble_pe.py



In [4]:

    
# Moved to feature-extraction-validation.ipynb



In [ ]:



In [ ]:

Find ELF Disassembly Files.



In [6]:

    
def find_elf_train1asm(ext_dir):
    # Rename all the PE headers files so it is easier to process them.
    
    file_list = os.listdir(ext_dir)
    elf_counter = 0
    
    print("Got total files: {:d}".format(len(file_list)))
    
    for fname in file_list:
        if 'elf' in fname:
            elf_counter += 1
            file_path = ext_dir + fname
            #trunc_name = fname[0:fname.find('.elf')]
            new_path = '/opt/vs/' + fname
            #result = sub.check_call(['mv', file_path, new_path])

            #if (elf_counter % 1000) == 0:
            print('ELF ASM file {:s}.'.format(file_path))

    print('Completed move of {:d} ELF ASM files.'.format(elf_counter))
    
    return



In [7]:

    
find_elf_train1asm('/opt/vs/train1asm/')









    



Got total files: 54911
Completed move of 0 ELF ASM files.



In [8]:

    
find_elf_train1asm('/opt/vs/train2asm/')









    



Got total files: 46166
Completed move of 0 ELF ASM files.



In [ ]:

    
find_elf_train1asm('/opt/vs/train3asm/')



In [ ]:

    
find_elf_train1asm('/opt/vs/train4asm/')



In [ ]:

    
find_elf_train1asm('/opt/vs/aptasm/')

Test Generation of PE/COFF Header Tokens.

- generate_pe_header_tokens.py



In [27]:

    
def save_token_counts(token_counter_map, out_file):
    # Output the malware sample classification counts.
    fop = open(out_file, 'w')
    csv_wouter = writer(fop)
    cols = ['token_name','count'] # write out the column names.
    csv_wouter.writerow(cols)
    outlines = []
    sorted_keys = token_counter_map.keys()
    sorted_keys.sort()
    counter = 0
    for key in sorted_keys:
        outlines.append([key, token_counter_map[key]])
        counter += 1
        if (counter % 100) == 0: # write out some lines
            csv_wouter.writerows(outlines)
            outlines = []
            print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))

    # Finish off.
    if (len(outlines) > 0):
        csv_wouter.writerows(outlines)
        outlines = []

    print("Completed writing {:d} tokens.".format(len(sorted_keys)))    
    fop.close()

    return


def get_token_count_map(token_df):
    # Read in the token count file and create a dict.
    token_dict = {}
    type_y = np.array(token_df['token_name'])
    
    for idx in range(token_df.shape[0]): # First fill the dict with the token counts
        token_dict[token_df.iloc[idx,0]] = token_df.iloc[idx,1]
        

    return token_dict


def combine_token_files():
    # TODO: everything
    
    return


def generate_pe_tokens(file_list, out_token_file, out_count_file):

    psections = re.compile('\s+\d{1,2}\s+(\.\w+|\w+)\s+\d+')            # Pattern for section names.
    pdlls = re.compile('\s+DLL Name: (\w+)')                  # Pattern for import DLL names.
    pfunctions = re.compile('\s+\w+\s+\d{1,4}\s+(.+)')        # Pattern for import function names.
    preloc = re.compile('\s+reloc')                           # Pattern for relocation entries.
    pexports = re.compile('\s+\[\s*\d+\]\s+(\w+)')            # Pattern for exported function names.
    
    token_counter_map = {}
    counter = 0
    pid = os.getpid()
    
    for idx, fname in enumerate(file_list):

        fip = open(fname, 'r')
        in_lines = fip.readlines()
        
        counter += 1
        
        for line in in_lines:

            line = line.rstrip() # get rid of newlines they are annoying.
            token_val = ""
            
            m = preloc.match(line)
            if m != None:
                #token_val = m.group(2)
                continue

            m = psections.match(line)
            if m != None:
                token_val = m.group(1)
                print("Section: {:s}".format(token_val))
            else:
                m = pdlls.match(line)
                if m != None:
                    token_val = m.group(1)
                else:
                    m = pfunctions.match(line)
                    if m != None:
                        token_val = m.group(1)
                    else:                 
                        m = pexports.match(line)
                        if m != None:
                            token_val = m.group(1)
                            print("Export: {:s}".format(token_val))
                        else:   
                            continue
                        
            # Count the token type.
            if token_val in token_counter_map.keys():
                token_counter_map[token_val] += 1
            else:
                token_counter_map[token_val] = 1


        if (counter % 100) == 0:
            print("{:d} Processed {:d} header files.".format(pid, counter))

        fip.close()
        
        
    save_token_counts(token_counter_map, out_count_file)
    
    return



In [ ]:



In [ ]:

    
ext_drive = '/opt/vs/hdr/'
file_list = os.listdir(ext_drive)
file_paths = []

for fname in file_list:
    file_paths.append(ext_drive + fname)
    
generate_pe_tokens(file_paths,'data/pe-header-tokens-apt.txt','data/pe-coff-header-token-counts-apt.csv')



In [ ]:



In [6]:

    
# Testing PE header token generation.

def save_token_counts(token_counter_map, out_file_name):
    # Output the PE Header token counts.
    pid = os.getpid()
    out_file = "data/" + str(pid) + "-" + out_file_name
    fop = open(out_file, 'w')
    csv_wouter = writer(fop)

    outlines = []
    sorted_keys = token_counter_map.keys()
    sorted_keys.sort()
    counter = 0
    
    for key in sorted_keys:
        outlines.append([key, token_counter_map[key]])
        counter += 1
        if (counter % 100) == 0: # write out some lines
            csv_wouter.writerows(outlines)
            outlines = []
            print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))

    # Finish off.
    if (len(outlines) > 0):
        csv_wouter.writerows(outlines)
        outlines = []

    print("Completed writing {:d} tokens.".format(len(sorted_keys)))    
    fop.close()

    return


def get_token_count_map(token_df):
    # Read in the token count file and create a dict.
    token_dict = {}
    type_y = np.array(token_df['token_name'])
    
    for idx in range(token_df.shape[0]): # First fill the dict with the token counts
        token_dict[token_df.iloc[idx,0]] = token_df.iloc[idx,1]
        

    return token_dict

    
def generate_pe_tokens(mp_params):
    # Parse a bunch of PE/COFF headers dumped by objdump and extract
    # section names, import DLLs, import functions and exported functions.
    file_list = mp_params.file_list
    out_count_file = mp_params.count_file
    
    psections = re.compile('\s+\d{1,2}\s+(\.\w+|\w+)\s+\d+')  # Pattern for section names.
    pdlls = re.compile('\s+DLL Name: (\w+)')                  # Pattern for import DLL names.
    pfunctions = re.compile('\s+\w+\s+\d{1,4}\s+(.+)')        # Pattern for import function names.
    preloc = re.compile('\s+reloc')                           # Pattern for relocation entries.
    pexports = re.compile('\s+\[\s*\d+\]\s+(\w+)')            # Pattern for exported function names.
    
    token_counter_map = {}
    counter = 0
    pid = os.getpid()
    
    for idx, fname in enumerate(file_list):

        fip = open(fname, 'r')
        in_lines = fip.readlines()
        
        counter += 1
        
        for line in in_lines:

            line = line.rstrip() # get rid of newlines they are annoying.
            token_val = ""
            
            m = preloc.match(line)
            if m != None:
                #token_val = m.group(2)
                continue

            m = psections.match(line)
            if m != None:
                token_val = m.group(1)
                #print("Section: {:s}".format(token_val))
            else:
                m = pdlls.match(line)
                if m != None:
                    token_val = m.group(1)
                else:
                    m = pfunctions.match(line)
                    if m != None:
                        token_val = m.group(1)
                    else:                 
                        m = pexports.match(line)
                        if m != None:
                            token_val = m.group(1)
                            #print("Export: {:s}".format(token_val))
                        else:   
                            continue
                        
            # Count the token type.
            if token_val in token_counter_map.keys():
                token_counter_map[token_val] += 1
            else:
                token_counter_map[token_val] = 1


        if (counter % 100) == 0:
            print("{:d} Processed {:d} header files.".format(pid, counter))

        fip.close()
        
        
    save_token_counts(token_counter_map, out_count_file)
    
    return


def save_combine(token_counter_map, out_file_name):
    # Save the combined token counts.
    
    out_file = "data/" + out_file_name
    fop = open(out_file, 'w')
    csv_wouter = writer(fop)
    cols = ['token_name','count'] 
    csv_wouter.writerow(cols)
    
    outlines = []
    sorted_keys = token_counter_map.keys()
    sorted_keys.sort()
    counter = 0
    
    for key in sorted_keys:
        outlines.append([key, token_counter_map[key]])
        counter += 1
        if (counter % 100) == 0: # write out some lines
            csv_wouter.writerows(outlines)
            outlines = []
            print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))

    # Finish off.
    if (len(outlines) > 0):
        csv_wouter.writerows(outlines)
        outlines = []

    fop.close()
    
    print("Completed writing {:d} tokens.".format(len(sorted_keys)))  
    
    return


def combine_token_files(token_file, count_file):
    # Function to combine the newly generated token files into one file:
    # 1. list data directory
    # 2. For each file in file list that matches (\d\d\d\d-pe-header-tokens.csv)
    # 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
    # 4. Concatenate the unsorted token feature files.
    # 5. Sort and write to data/sorted-token-features.csv

    
    p1 = re.compile('\d{3,5}-' + count_file) # This is the PID prefix for each file.
    file_list = os.listdir('data/')
    counter = 0
    token_map = {}
    
    for file_name in file_list:
        if p1.match(file_name):
            fip = open('data/' + file_name, 'r')
            in_lines = fip.readlines()
            for line in in_lines:
                tokens = line.split(',')
                if tokens[0] not in token_map.keys():
                    token_map[tokens[0]] = int(tokens[1])
                else:
                    token_map[tokens[0]] += int(tokens[1])
                    
            counter += len(in_lines)
            fip.close()
            
 

    save_combine(token_map, token_file)
    
    print('Completed combine of {:d} PE/COFF header tokens.'.format(counter)) 
    
    return


class Multi_Params(object):
    def __init__(self, tokenfile="", countfile="", filelist=[]):
        self.token_file = tokenfile
        self.count_file = countfile
        self.file_list = filelist



In [3]:

    
token_file = 'pe-header-tokens-apt.txt'
count_file = 'pe-header-token-counts-apt.csv'
ext_drive = '/opt/vs/apthdr/'
file_list = os.listdir(ext_drive)
tfiles = []

for fname in file_list:
    tfiles.append(ext_drive + fname)



In [ ]:

    
mp1 = Multi_Params(token_file, count_file, tfiles)

generate_pe_tokens(mp1)



In [7]:

    
combine_token_files(token_file, count_file)









    



Processed token AdjustTokenPrivileges -> 63.
Processed token CryptGetUserKey -> 3.
Processed token GetAdaptersInfo -> 2.
Processed token GetSystemTimeAsFileTime -> 29.
Processed token IsValidLocale -> 7.
Processed token OutputDebugStringA -> 32.
Processed token SHCreateDirectoryExA -> 7.
Processed token UnhandledExceptionFilter -> 64.
Processed token _configthreadlocale -> 2.
Processed token free -> 120.
Processed token wsprintfA -> 20.
Completed writing 1103 tokens.
Completed combine of 1103 PE/COFF header tokens.

Test PE Header Feature Extraction

- feature_extraction_pe_header.py



In [6]:

    
# Testing PE header feature extraction.

field_list = [ "Characteristics","Time/Date","Magic","MajorLinkerVersion","MinorLinkerVersion",
"SizeOfCode","SizeOfInitializedData","SizeOfUninitializedData","AddressOfEntryPoint",
"BaseOfCode","BaseOfData","ImageBase","SectionAlignment","FileAlignment",
"MajorOSystemVersion","MinorOSystemVersion","MajorImageVersion","MinorImageVersion",
"MajorSubsystemVersion","MinorSubsystemVersion","Win32Version",
"SizeOfImage","SizeOfHeaders","CheckSum","Subsystem","DllCharacteristics","SizeOfStackReserve",
"SizeOfStackCommit","SizeOfHeapReserve","SizeOfHeapCommit","LoaderFlags","NumberOfRvaAndSizes" ]

field_list_len = len(field_list)

ptime = re.compile("Time/Date\s+(.+)") # Time/Date pattern for PE Header field.



def get_field_values(header_lines):

    field_vals = [0] * field_list_len
    
    for idx1 in range(0,44): # The PE header fields are the first 44 lines of the file.
        
        line = header_lines[idx1].rstrip()
        tokens = line.split()
        
        for idx2, field_name in enumerate(field_list):
            
            if field_name in tokens:
                if field_name.startswith("Time"):
                    time_match = ptime.match(field_name)
                    if time_match != None:   
                        time_str = time_match.group(1) 
                        time_s = tm.strptime(time_str, "%a %b %d %H:%M:%S %Y") # Convert time string to epoch int.
                        time_epoch = tm.mktime(time_s)
                    else:
                        time_epoch = 0
                        
                    field_vals[idx2] = time_epoch

                elif len(tokens) > 1:
                    field_vals[idx2] = int(tokens[1], 16) # Convert the hex value of the field to int.
                
    return field_vals
                
                
def count_header_keywords(asm_code, keywords, klen):
    
    keywords_values = [0] * klen
    
    for row in asm_code:
        for i in range(klen):
            if keywords[i] in row:
                keywords_values[i] += 1
                break
                
    return keywords_values


def extract_header_features(multi_parameters):
    # 1. Get the feature file and token/keyword file names
    # 2. Create an array of token/keyword values.
    # 3. Iterate throught the PE header file list and counter the occurrence of the keywords in each file.

    pid = os.getpid()
    feature_file = 'data/' + str(pid) + "-" + multi_parameters.out_file  
    token_file = 'data/' + multi_parameters.token_file
    
    print('Process id: {:d} - Feature file: {:s} - Keyword file: {:s}'.format(pid, feature_file, token_file))

    hdr_pd = pd.read_csv(token_file)
    tokens = list(hdr_pd['token_name'])
    tlen = len(tokens)

    for idx, token in enumerate(tokens): # Clamp the token name length and demangle C++ names, they are annoying.
        token = token.replace('@','').replace('$','').replace('?','')
        if len(token) > 32:
            tokens[idx] = token[:32]
        else:
            tokens[idx] = token
            
    asm_files = [i for i in tfiles if '.pe.txt' in i]
    ftot = len(asm_files)
    
    feature_counts = []
    with open(feature_file, 'w') as f:

        fw = writer(f)
        
        for idx, fname in enumerate(asm_files):
            
            fasm = open(ext_drive + fname, 'r')
            content = fasm.readlines()
            fasm.close()
            
            fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.
            
            field_vals = get_field_values(content)
            keyword_vals = count_header_keywords(content, tokens, tlen)
            
            feature_counts.append([fname[0:fname.find('.pe.txt')]] + field_vals + keyword_vals)   
            
            # Writing rows after every 10 files processed
            if (idx+1) % 1000 == 0:
                print("{:d} - {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
                fw.writerows(feature_counts)
                feature_counts = []
                
        # Writing remaining features
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []

    print("{:d} Completed processing {:d} PE header files.".format(pid, ftot))
                      
    return


def combine_feature_files(feature_file_name, token_file):
    # Function to combine the newly generated PE header feature files into one file:
    # 1. list data directory
    # 2. For each file in file list that matches (\d\d\d\d-pe-header-features.csv)
    # 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
    # 4. Concatenate the unsorted pe header feature files.
    # 5. Sort and write to data/sorted-pe-header-features.csv
    
    hdr_pd = pd.read_csv('data/' + token_file)
    tokens = list(hdr_pd['token_name'])
    for idx, token in enumerate(tokens): # Clamp the token name length and demangle C++ names, they are annoying.
        token = token.replace('@','').replace('$','').replace('?','')
        if len(token) > 32:
            tokens[idx] = token[:32]
        else:
            tokens[idx] = token
        


    fop = open('data/' + feature_file_name,'w')
    colnames = "file_name," + ",".join(field_list) + "," + ",".join(tokens) + "\n"
    print("Column names: {:s}".format(colnames))
    fop.write(colnames)                    

    p1 = re.compile('\d{3,5}-' + feature_file_name) # This is the PID prefix for each file.
    file_list = os.listdir('data/')
    counter = 0
    
    for file_name in file_list:
        if p1.match(file_name):
            fip = open('data/' + file_name, 'r')
            in_lines = fip.readlines()
            fop.writelines(in_lines)
            counter += len(in_lines)
            fip.close()
            
    
    fop.close()
    
    features = pd.read_csv('data/' + feature_file_name)
    # DataFrame.sort() is deprecated, but this is an old version of pandas, does not have sort_values().
    sorted_features = features.sort('file_name')
    sorted_features.to_csv('data/sorted-' + feature_file_name, index=False)
    
    print('Completed combine of {:d} PE header file features.'.format(counter))  
    
    return




class Multi_Params(object):
    def __init__(self, outfile="", tokenfile="", fieldnames=[], filelist=[]):
        self.out_file = outfile
        self.token_file = tokenfile
        self.field_names = fieldnames
        self.file_list = filelist



In [ ]:

    
header_field_names = 'pe-coff-header-field-names.txt'
out_file = 'pe-header-features-apt.csv'
token_file = 'pe-header-tokens-apt.csv'
ext_drive = '/opt/vs/apthdr/'
tfiles = os.listdir(ext_drive)

mp1 = Multi_Params(out_file, token_file, header_field_names, tfiles)

extract_header_features(mp1)

combine_feature_files(out_file, token_file)



In [7]:

    
combine_feature_files(out_file, token_file)









    



Column names: file_name,Characteristics,Time/Date,Magic,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOSystemVersion,MinorOSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,Win32Version,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,.CRT,.bss,.data,.idata,.pdata,.rdata,.reloc,.rsrc,.text,.upx,<none>,0basic_fstreamDUchar_traitsDstds,0basic_fstreamDUchar_traitsDstds,0basic_iostreamDUchar_traitsDstd,0basic_streambufDUchar_traitsDst,0basic_stringDUchar_traitsDstdVa,0basic_stringDUchar_traitsDstdVa,0Initios_basestdQAEXZ,0_LockitstdQAEXZ,0_WinitstdQAEXZ,0exceptionQAEABQBDZ,0exceptionQAEABV0Z,0ios_basestdIAEXZ,0localestdQAEXZ,1basic_filebufDUchar_traitsDstds,1basic_fstreamDUchar_traitsDstds,1basic_iosDUchar_traitsDstdstdUA,1basic_iostreamDUchar_traitsDstd,1basic_streambufDUchar_traitsDst,1basic_stringDUchar_traitsDstdVa,1Initios_basestdQAEXZ,1_LockitstdQAEXZ,1_WinitstdQAEXZ,1exceptionUAEXZ,1ios_basestdUAEXZ,1localestdQAEXZ,1type_infoUAEXZ,2YAPAXIZ,3YAXPAXZ,6basic_ostreamDUchar_traitsDstds,6stdYAAAVbasic_ostreamDUchar_tra,7ios_basestdQBE_NXZ,8stdYA_NABVbasic_stringDUchar_tr,8stdYA_NABVbasic_stringDUchar_tr,9stdYA_NABVbasic_stringDUchar_tr,HstdYAAVbasic_stringDUchar_trait,HstdYAAVbasic_stringDUchar_trait,MstdYA_NABVbasic_stringDUchar_tr,_7basic_filebufDUchar_traitsDstd,_7basic_fstreamDUchar_traitsDstd,_7basic_iosDUchar_traitsDstdstd6,_8basic_fstreamDUchar_traitsDstd,_8basic_fstreamDUchar_traitsDstd,_Dbasic_fstreamDUchar_traitsDstd,_UYAPAXIZ,_VYAXPAXZ,_C1_Nullstrbasic_stringDUchar_tr,_Copybasic_stringDUchar_traitsDs,_Eosbasic_stringDUchar_traitsDst,_Fpzstd3_JB,_Freezebasic_stringDUchar_traits,_Growbasic_stringDUchar_traitsDs,_Initbasic_filebufDUchar_traitsD,_Initcvtbasic_filebufDUchar_trai,_Splitbasic_stringDUchar_traitsD,_Tidybasic_stringDUchar_traitsDs,_XlenstdYAXXZ,_XranstdYAXXZ,__FiopenstdYAPAU_iobufPBDHZ,_type_info_dtor_internal_methodt,appendbasic_stringDUchar_traitsD,appendbasic_stringDUchar_traitsD,appendbasic_stringDUchar_traitsD,assignbasic_stringDUchar_traitsD,assignbasic_stringDUchar_traitsD,c_strbasic_stringDUchar_traitsDs,clearbasic_iosDUchar_traitsDstds,clearios_basestdQAEXH_NZ,closebasic_filebufDUchar_traitsD,closebasic_fstreamDUchar_traitsD,coutstd3Vbasic_ostreamDUchar_tra,endlstdYAAAVbasic_ostreamDUchar_,erasebasic_stringDUchar_traitsDs,gcountbasic_istreamDUchar_traits,insertbasic_stringDUchar_traitsD,is_openbasic_fstreamDUchar_trait,max_sizebasic_stringDUchar_trait,nposbasic_stringDUchar_traitsDst,openbasic_filebufDUchar_traitsDs,openbasic_fstreamDUchar_traitsDs,readbasic_istreamDUchar_traitsDs,seekgbasic_istreamDUchar_traitsD,seekpbasic_ostreamDUchar_traitsD,setstatebasic_iosDUchar_traitsDs,tellpbasic_ostreamDUchar_traitsD,terminateYAXXZ,writebasic_ostreamDUchar_traitsD,ADVAPI32,AcquireCredentialsHandleA,AdjustTokenPrivileges,AllocConsole,AllocateAndInitializeSid,AssignProcessToJobObject,AttachConsole,BeginPaint,BitBlt,BuildExplicitAccessWithNameA,BuildSecurityDescriptorW,CLSIDFromString,COMCTL32,COMDLG32,CRYPT32,CertAddCertificateContextToStore,CertCloseStore,CertEnumCertificatesInStore,CertEnumSystemStore,CertFindCertificateInStore,CertFreeCertificateChain,CertFreeCertificateContext,CertGetCertificateChain,CertGetCertificateContextPropert,CertGetNameStringW,CertNameToStrA,CertOpenStore,CertOpenSystemStoreA,CertVerifyCertificateChainPolicy,ChangeServiceConfig2A,ChangeServiceConfigA,CharLowerA,CharToOemA,CharToOemBuffA,CharToOemBuffW,CharUpperA,CharUpperW,CloseDesktop,CloseHandle,CloseServiceHandle,CloseWindowStation,CmUnRegisterCallback,CoCreateInstance,CoInitialize,CodeView,CommDlgExtendedError,CompareFileTime,CompareStringA,CompareStringW,ConnectNamedPipe,ContinueDebugEvent,ControlService,ConvertSidToStringSidW,CopyFileA,CopyFileW,CopyRect,CreateCompatibleBitmap,CreateCompatibleDC,CreateDCA,CreateDirectoryA,CreateDirectoryW,CreateEventA,CreateFileA,CreateFileMappingA,CreateFileMappingW,CreateFileW,CreateJobObjectW,CreateMutexA,CreateNamedPipeA,CreateNamedPipeW,CreatePipe,CreateProcessA,CreateProcessAsUserA,CreateProcessW,CreateProcessWithLogonW,CreateRemoteThread,CreateServiceA,CreateServiceW,CreateStreamOnHGlobal,CreateThread,CreateToolhelp32Snapshot,CreateWaitableTimerA,CreateWindowExA,CreateWindowExW,CredEnumerateW,CredFree,CryptAcquireCertificatePrivateKe,CryptAcquireContextA,CryptAcquireContextW,CryptBinaryToStringA,CryptCreateHash,CryptDecrypt,CryptDeriveKey,CryptDestroyHash,CryptDestroyKey,CryptDuplicateKey,CryptEncrypt,CryptEnumProvidersW,CryptExportKey,CryptGenRandom,CryptGetHashParam,CryptGetKeyParam,CryptGetProvParam,CryptGetUserKey,CryptHashData,CryptImportKey,CryptReleaseContext,CryptSetHashParam,CryptSetKeyParam,CryptStringToBinaryA,DNSAPI,DbgPrint,DebugActiveProcess,DebugBreak,DebugSetProcessKillOnExit,DecodePointer,DecryptMessage,DefWindowProcA,DefWindowProcW,DeleteCriticalSection,DeleteDC,DeleteFileA,DeleteFileW,DeleteObject,DeleteSecurityContext,DeleteService,DeleteUrlCacheEntry,DeregisterEventSource,DestroyIcon,DestroyWindow,DeviceIoControl,DialogBoxParamA,DialogBoxParamW,DisconnectNamedPipe,DispatchMessageA,DispatchMessageW,DnsFree,DnsQuery_A,DosDateTimeToFileTime,DrawIcon,DuplicateHandle,DuplicateTokenEx,EnableWindow,EncodePointer,EncryptMessage,EndDialog,EndPaint,EnterCriticalSection,EnumProcessModules,EnumProcesses,EnumServicesStatusExA,EnumServicesStatusExW,EnumSystemLocalesA,EnumWindows,ExAllocatePoolWithQuotaTag,ExAllocatePoolWithTag,ExFreePoolWithTag,ExitProcess,ExitThread,ExitWindowsEx,ExpandEnvironmentStringsA,ExpandEnvironmentStringsW,FLTMGR,FileTimeToLocalFileTime,FileTimeToSystemTime,FillConsoleOutputCharacterW,FindClose,FindFirstFileA,FindFirstFileW,FindFirstVolumeA,FindNextFileA,FindNextFileW,FindResourceA,FindResourceExA,FindResourceW,FindVolumeClose,FindWindowExA,FindWindowExW,FlsAlloc,FlsFree,FlsGetValue,FlsSetValue,FltEnumerateFilters,FltEnumerateInstances,FltGetFilterInformation,FltGetVolumeFromInstance,FltObjectDereference,FlushConsoleInputBuffer,FlushFileBuffers,FormatMessageA,FormatMessageW,FreeConsole,FreeContextBuffer,FreeCredentialsHandle,FreeEnvironmentStringsA,FreeEnvironmentStringsW,FreeLibrary,FreeLibraryAndExitThread,FreeResource,FreeSid,GDI32,GenerateConsoleCtrlEvent,GetACP,GetAdaptersInfo,GetCPInfo,GetClassNameA,GetClassNameW,GetClientRect,GetCommandLineA,GetCommandLineW,GetComputerNameA,GetComputerNameExW,GetComputerNameW,GetConsoleCP,GetConsoleDisplayMode,GetConsoleMode,GetConsoleOutputCP,GetConsoleScreenBufferInfo,GetCurrentDirectoryA,GetCurrentDirectoryW,GetCurrentProcess,GetCurrentProcessId,GetCurrentThread,GetCurrentThreadId,GetDC,GetDIBits,GetDateFormatA,GetDateFormatW,GetDesktopWindow,GetDeviceCaps,GetDiskFreeSpaceA,GetDiskFreeSpaceExA,GetDiskFreeSpaceExW,GetDiskFreeSpaceW,GetDlgItem,GetDlgItemTextA,GetDlgItemTextW,GetDriveTypeA,GetDriveTypeW,GetEnvironmentStrings,GetEnvironmentStringsW,GetEnvironmentVariableA,GetEnvironmentVariableW,GetExitCodeProcess,GetExitCodeThread,GetFileAttributesA,GetFileAttributesExA,GetFileAttributesW,GetFileInformationByHandle,GetFileSize,GetFileTime,GetFileTitleA,GetFileType,GetFullPathNameA,GetFullPathNameW,GetKernelObjectSecurity,GetLastError,GetLocalTime,GetLocaleInfoA,GetLocaleInfoW,GetLogicalDriveStringsA,GetLogicalDrives,GetLongPathNameA,GetMessageA,GetMessageW,GetModuleBaseNameA,GetModuleFileNameA,GetModuleFileNameExA,GetModuleFileNameW,GetModuleHandleA,GetModuleHandleW,GetModuleInformation,GetNativeSystemInfo,GetNumberFormatA,GetNumberFormatW,GetOEMCP,GetObjectA,GetObjectW,GetOpenFileNameW,GetParent,GetPrivateProfileStringA,GetProcAddress,GetProcessHeap,GetProcessId,GetProcessTimes,GetProcessWindowStation,GetSaveFileNameW,GetSecurityDescriptorDacl,GetSecurityInfo,GetShortPathNameA,GetShortPathNameW,GetStartupInfoA,GetStartupInfoW,GetStdHandle,GetStockObject,GetStringTypeA,GetStringTypeW,GetSysColor,GetSystemDirectoryA,GetSystemInfo,GetSystemMetrics,GetSystemPowerStatus,GetSystemTime,GetSystemTimeAsFileTime,GetTempFileNameA,GetTempPathA,GetTempPathW,GetThreadContext,GetThreadDesktop,GetTickCount,GetTimeFormatA,GetTimeFormatW,GetTimeZoneInformation,GetTokenInformation,GetUserDefaultLCID,GetUserNameA,GetUserNameExA,GetUserNameExW,GetUserObjectInformationA,GetUserObjectInformationW,GetUserProfileDirectoryA,GetVersion,GetVersionExA,GetVersionExW,GetVolumeInformationA,GetWindow,GetWindowLongA,GetWindowLongW,GetWindowRect,GetWindowTextA,GetWindowTextW,GetWindowThreadProcessId,GetWindowsDirectoryA,GlobalAlloc,GlobalFree,GlobalLock,GlobalMemoryStatus,GlobalUnlock,Heap32ListFirst,Heap32ListNext,HeapAlloc,HeapCreate,HeapDestroy,HeapFree,HeapReAlloc,HeapSetInformation,HeapSize,HeapValidate,HttpAddRequestHeadersA,HttpAddRequestHeadersW,HttpEndRequestA,HttpOpenRequestA,HttpOpenRequestW,HttpQueryInfoA,HttpQueryInfoW,HttpSendRequestA,HttpSendRequestExA,HttpSendRequestExW,INIT,I_RpcGetBuffer,ImpersonateLoggedOnUser,InitCommonControlsEx,InitializeCriticalSection,InitializeCriticalSectionAndSpin,InitializeSecurityContextA,InstallService,InterlockedCompareExchange,InterlockedDecrement,InterlockedExchange,InterlockedIncrement,InternetAttemptConnect,InternetCloseHandle,InternetConnectA,InternetConnectW,InternetCrackUrlA,InternetCrackUrlW,InternetGetConnectedState,InternetOpenA,InternetOpenUrlA,InternetOpenW,InternetQueryDataAvailable,InternetQueryOptionA,InternetReadFile,InternetSetCookieA,InternetSetCookieW,InternetSetOptionA,InternetSetOptionExA,InternetSetOptionW,InternetWriteFile,IoCreateDevice,IoCreateSymbolicLink,IoDeleteDevice,IoDeleteSymbolicLink,IoEnumerateRegisteredFiltersList,IofCompleteRequest,IsBadCodePtr,IsBadReadPtr,IsBadWritePtr,IsDBCSLeadByte,IsDebuggerPresent,IsIconic,IsProcessorFeaturePresent,IsValidCodePage,IsValidLocale,IsWindow,IsWindowVisible,KERNEL32,KeBugCheckEx,Kernel32,KillTimer,LCMapStringA,LCMapStringW,LZ32,LZClose,LZCopy,LZOpenFileA,LeaveCriticalSection,LoadAcceleratorsA,LoadBitmapA,LoadBitmapW,LoadCursorA,LoadCursorW,LoadIconA,LoadIconW,LoadImageA,LoadLibraryA,LoadLibraryExA,LoadLibraryW,LoadResource,LoadStringA,LoadStringW,LocalAlloc,LocalFileTimeToFileTime,LocalFree,LocalReAlloc,LockResource,LockServiceDatabase,LogonUserA,LookupAccountSidA,LookupAccountSidW,LookupPrivilegeNameW,LookupPrivilegeValueA,LookupPrivilegeValueW,LsaClose,LsaEnumerateLogonSessions,LsaFreeMemory,LsaFreeReturnBuffer,LsaGetLogonSessionData,LsaOpenPolicy,LsaQueryInformationPolicy,MFC42,MPR,MSVCP60,MSVCR80,MSVCR90,MSVCRT,MakeAbsoluteSD,MapViewOfFile,MapWindowPoints,Mcdl,MessageBoxA,MessageBoxW,MmGetSystemRoutineAddress,Module32First,Module32FirstW,Module32Next,Module32NextW,MoveFileA,MoveFileExA,MoveFileExW,MoveFileW,MultiByteToWideChar,NETAPI32,NdrConvert,NdrNonConformantStringMarshall,NdrNonConformantStringUnmarshall,NdrServerInitializeNew,NetApiBufferFree,NetServerEnum,NetShareGetInfo,Netbios,NtBuildNumber,OLE32,OLEAUT32,ObOpenObjectByPointer,ObfDereferenceObject,OemToCharA,OemToCharBuffA,OleInitialize,OleUninitialize,OpenDesktopA,OpenFile,OpenFileMappingA,OpenFileMappingW,OpenInputDesktop,OpenProcess,OpenProcessToken,OpenSCManagerA,OpenSCManagerW,OpenServiceA,OpenServiceW,OpenThread,OpenWindowStationA,OutputDebugStringA,PAGE,PFXExportCertStoreEx,PSAPI,PathCanonicalizeW,PathCombineW,PathFileExistsA,PathIsDirectoryA,PathIsRelativeW,PeekMessageA,PeekMessageW,PeekNamedPipe,PostMessageA,PostMessageW,PostQuitMessage,PostThreadMessageW,ProceA,Process32First,Process32FirstW,Process32Next,Process32NextW,PsDereferencePrimaryToken,PsGetProcessId,PsGetProcessImageFileName,PsGetVersion,PsInitialSystemProcess,PsProcessType,PsReferencePrimaryToken,PsSetCreateProcessNotifyRoutine,PsSetCreateThreadNotifyRoutine,PsSetLoadImageNotifyRoutine,QueryContextAttributesA,QueryPerformanceCounter,QueryPerformanceFrequency,QueryServiceConfig2A,QueryServiceConfigA,QueryServiceObjectSecurity,QueryServiceStatus,QueryServiceStatusEx,RPCRT4,RaiseException,ReadConsoleInputA,ReadDirectoryChangesW,ReadFile,ReadProcessMemory,RealizePalette,RegCloseKey,RegConnectRegistryA,RegCreateKeyA,RegCreateKeyExA,RegCreateKeyExW,RegDeleteKeyA,RegDeleteValueA,RegEnumKeyExA,RegEnumKeyExW,RegEnumValueA,RegOpenKeyA,RegOpenKeyExA,RegOpenKeyExW,RegQueryInfoKeyA,RegQueryInfoKeyW,RegQueryValueExA,RegQueryValueExW,RegSaveKeyA,RegSetValueExA,RegSetValueExW,RegisterClassExA,RegisterClassExW,RegisterEventSourceA,RegisterEventSourceW,RegisterServiceCtrlHandlerA,ReleaseDC,ReleaseMutex,RemoveDirectoryA,ReportEventA,ReportEventW,ResetEvent,ResumeThread,RevertToSelf,RpcMgmtStopServerListening,RpcRaiseException,RpcServerListen,RpcServerRegisterIf,RpcServerUseProtseqEpA,RtlCaptureContext,RtlCompareMemory,RtlInitUnicodeString,RtlLookupFunctionEntry,RtlPcToFileHeader,RtlUnwind,RtlUnwindEx,RtlVirtualUnwind,RundllInstall,RundllInstallA,RundllUninstall,RundllUninstallA,SHAutoComplete,SHBrowseForFolderA,SHBrowseForFolderW,SHChangeNotify,SHCreateDirectoryExA,SHELL32,SHFileOperationA,SHFileOperationW,SHGetFileInfoA,SHGetFileInfoW,SHGetMalloc,SHGetPathFromIDListA,SHGetPathFromIDListW,SHGetSpecialFolderLocation,SHGetSpecialFolderPathA,SHGetSpecialFolderPathW,SHLWAPI,Secur32,SelectObject,SelectPalette,SendDlgItemMessageA,SendDlgItemMessageW,SendMessageA,SendMessageW,ServiceInstall,ServiceMain,SetConsoleCtrlHandler,SetConsoleCursorPosition,SetConsoleMode,SetConsoleTitleW,SetCurrentDirectoryA,SetCurrentDirectoryW,SetCursorPos,SetDlgItemTextA,SetDlgItemTextW,SetEndOfFile,SetEntriesInAclA,SetEnvironmentVariableA,SetEnvironmentVariableW,SetErrorMode,SetEvent,SetFileAttributesA,SetFileAttributesW,SetFilePointer,SetFileSecurityA,SetFileSecurityW,SetFileTime,SetFocus,SetForegroundWindow,SetHandleCount,SetKernelObjectSecurity,SetLastError,SetMenu,SetNamedPipeHandleState,SetPriorityClass,SetProcessPriorityBoost,SetProcessWindowStation,SetSecurityDescriptorDacl,SetSecurityInfo,SetServiceObjectSecurity,SetServiceStatus,SetStdHandle,SetThreadContext,SetThreadDesktop,SetThreadPriority,SetTimer,SetUnhandledExceptionFilter,SetVolumeLabelA,SetWaitableTimer,SetWindowLongA,SetWindowLongW,SetWindowPos,SetWindowTextA,SetWindowTextW,ShellExecuteA,ShellExecuteExA,ShellExecuteExW,ShellExecuteW,ShowWindow,SizeofResource,Sleep,StartServiceA,StartServiceCtrlDispatcherA,StartServiceW,StrToIntA,StretchBlt,SuspendThread,SystemTimeToFileTime,TerminateJobObject,TerminateProcess,TerminateThread,Thread32First,Thread32Next,TlsAlloc,TlsFree,TlsGetValue,TlsSetValue,Toolhelp32ReadProcessMemory,TranslateAcceleratorA,TranslateMessage,URLDownloadToFileA,USER32,USERENV,UnServiceInstall,UnhandledExceptionFilter,UninstallService,UnlockServiceDatabase,UnmapViewOfFile,UpdateWindow,VirtualAlloc,VirtualAllocEx,VirtualFree,VirtualFreeEx,VirtualProtect,VirtualProtectEx,VirtualQueryEx,WINHTTP,WININET,WINMM,WNetAddConnection2A,WNetCancelConnection2A,WS2_32,WSACloseEvent,WSACreateEvent,WSAEnumNetworkEvents,WSAEventSelect,WSAResetEvent,WSASocketA,WSAWaitForMultipleEvents,WTSAPI32,WTSCloseServer,WTSEnumerateProcessesW,WTSEnumerateSessionsW,WTSFreeMemory,WTSOpenServerW,WaitForDebugEvent,WaitForInputIdle,WaitForMultipleObjects,WaitForSingleObject,WaitNamedPipeW,WideCharToMultiByte,WinExec,WinHttpAddRequestHeaders,WinHttpCloseHandle,WinHttpConnect,WinHttpCrackUrl,WinHttpGetIEProxyConfigForCurren,WinHttpGetProxyForUrl,WinHttpOpen,WinHttpOpenRequest,WinHttpQueryDataAvailable,WinHttpQueryHeaders,WinHttpQueryOption,WinHttpReadData,WinHttpReceiveResponse,WinHttpSendRequest,WinHttpSetOption,WinHttpSetTimeouts,WinHttpWriteData,WriteConsoleA,WriteConsoleInputA,WriteConsoleW,WriteFile,WritePrivateProfileStringA,WriteProcessMemory,ZwClose,ZwDuplicateToken,ZwOpenProcessTokenEx,ZwQuerySystemInformation,ZwSetInformationProcess,_CxxThrowException,_EH_prolog,_XcptFilter,__CppXcptFilter,__CxxFrameHandler,__CxxFrameHandler3,__clean_type_info_names_internal,__dllonexit,__getmainargs,__iob_func,__p___initenv,__p__commode,__p__environ,__p__fmode,__set_app_type,__setusermatherr,_access,_acmdln,_adjust_fdiv,_alldiv,_allmul,_alloca_probe,_allshl,_amsg_exit,_atoi64,_aulldiv,_aullrem,_aullshr,_beginthread,_beginthreadex,_cexit,_chdir,_chkstk,_close,_configthreadlocale,_controlfp,_controlfp_s,_crt_debugger_hook,_decode_pointer,_encode_pointer,_encoded_null,_endthreadex,_errno,_except_handler3,_except_handler4_common,_exit,_fdopen,_filelength,_fileno,_findclose,_findfirst,_findfirst64i32,_findnext,_findnext64i32,_ftime,_ftime64,_ftol,_get_osfhandle,_getch,_getcwd,_gmtime32,_initterm,_initterm_e,_invoke_watson,_iob,_ismbblead,_itoa,_lclose,_local_unwind2,_localtime64,_localtime64_s,_lock,_lopen,_malloc_crt,_mbschr,_mbscmp,_mbsicmp,_mbsinc,_mbsnbcat,_mbsnbcmp,_mbsnbcpy,_mbsrchr,_mkdir,_onexit,_open,_purecall,_read,_setmbcp,_setmode,_snprintf,_splitpath,_stat,_stat32,_strcmpi,_strdate,_stricmp,_strlwr,_strnicmp,_strtime,_strupr,_time32,_time64,_ui64toa,_ultoa,_unlink,_unlock,_vsnprintf,_vsnwprintf,_wcsicmp,_wcsnicmp,_wfopen,abort,abs,addLogonSession,asctime_s,atexit,atof,atoi,atol,ceil,comdlg32,delLogonSession,exit,fclose,fflush,fgetc,fgetpos,fgets,floor,fopen,fprintf,fputc,fputs,fread,free,freopen,fseek,ftell,fwprintf,fwrite,getDescription,getLocalAccounts,getLogonPasswords,getLogonSessions,getMSV,getMSVFunctions,getSAMFunctions,getSECFunctions,getSecrets,getTsPkg,getTsPkgFunctions,getWDigest,getWDigestFunctions,getchar,getenv,gmtime,install,installA,iphlpapi,isalnum,isdigit,islower,isprint,isspace,isupper,isxdigit,keybd_event,localtime,lstrcatA,lstrcatW,lstrcmpiA,lstrcpyA,lstrcpynA,lstrlenA,lstrlenW,malloc,mbstowcs,memchr,memcmp,memcpy,memmove,memset,mouse_event,msvcrt,nNOTES,ntdll,ntoskrnl,ole32,ping,printf,qsort,raise,rand,realloc,rewind,scanf,setlocale,signal,sprintf,srand,sscanf,strcat,strchr,strcmp,strcpy,strerror,strftime,strlen,strncat,strncmp,strncpy,strpbrk,strrchr,strstr,strtok,strtoul,system,time,timeGetTime,tolower,uninstall,uninstallA,urlmon,vfprintf,vsprintf,wcscat,wcschr,wcscmp,wcscpy,wcslen,wcsrchr,wcsstr,wcstombs,wprintf,wsprintfA,wsprintfW,wvsprintfA,wvsprintfW

Completed combine of 275 PE header file features.



In [ ]:

    
out_file = 'pe-header-features-vs251.csv'
token_file = 'pe-header-tokens-vs251.csv'
combine_feature_files(out_file, token_file)

Test PE/COFF ASM Feature Extraction.



In [15]:

    
# Test PE ASM feature extraction.

x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
                 'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
                 'eax','ebp','ebx','ecx','edi','esp']

x86_opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'
                ,'dd','dec','dw','endp','ends','faddp','fchs','fdiv','fdivp','fdivr','fild'
                ,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
                ,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
                ,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
                ,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
                ,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
                ,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
                ,'wait','xchg','xor']


def count_asm_registers(asm_code):
    registers_values = [0]*len(x86_registers) # Need to optimise this init stuff to global vars.
    
    for row in asm_code:
        parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
                    .replace('-',' ').split()

        for idx, register in enumerate(x86_registers):
            registers_values[idx] += parts.count(register)

    return registers_values


def count_asm_opcodes(asm_code):
    opcodes_values = [0]*len(x86_opcodes)
    
    for row in asm_code:
        parts = row.split()

        for idx, opcode in enumerate(x86_opcodes):
            if opcode in parts:
                opcodes_values[idx] += 1
                break

    return opcodes_values

def extract_asm_features(multi_param):
    
    pid = os.getpid()
    feature_file = 'data/' + str(pid) + "-" + multi_param.temp_file
    print('Process id: {:d} feature file: {:s}'.format(pid,feature_file))

    # Do this in call graph feature extraction.
    #fapi = open("data/APIs.txt")
    #defined_apis = fapi.readlines()
    #defined_apis = defined_apis[0].split(',')

    asm_files = [i for i in tfiles if '.asm' in i]
    ftot = len(asm_files)
    
    feature_counts = []
    with open(feature_file, 'w') as f:
        
        fw = writer(f)
        
        for idx, fname in enumerate(asm_files):
            
            fasm = open(ext_drive + fname, 'r')
            content = fasm.readlines()
            fasm.close()
            
            fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.
            
            reg_vals = count_asm_registers(content)
            opc_vals = count_asm_opcodes(content)
            #api_vals = count_asm_APIs(content, defined_apis) put in Call Graph features.
            #sec_vals = count_asm_sections(content) already in PE header features.
            #mis_vals = count_asm_misc(content) mostly already in PE header and call graph features.
            count_vals = reg_vals + opc_vals # + api_vals + mis_vals
            
            feature_counts.append([fname[:fname.find('.asm')]] + count_vals)   
            
            # Writing rows after every 10 files processed
            if (idx+1) % 10 == 0:
                print("{:d} {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
                fw.writerows(feature_counts)
                feature_counts = []
                
        # Writing remaining files
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []

    return



def combine_asm_files(out_file, temp_file):
    # Function to combine the newly generated asm feature files into one file:
    # 1. list data directory
    # 2. For each file in file list that matches (\d\d\d\d-asm-features.csv)
    # 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
    # 4. Concatenate the unsorted asm feature files.
    # 5. Sort and write to data/sorted-asm-features.csv
    fop = open('data/' + out_file,'w')
    colnames = "file_name," + ",".join(x86_registers) + "," + ",".join(x86_opcodes) + "\n"
    fop.write(colnames)
    
    print("Column names: {:s}".format(colnames))
    
    p1 = re.compile('\d{3,5}-' + temp_file) # This is the PID prefix for each file.
    file_list = os.listdir('data/')
    counter = 0
    
    for file_name in file_list:
        if p1.match(file_name):
            fip = open('data/' + file_name, 'r')
            in_lines = fip.readlines()
            fop.writelines(in_lines)
            counter += len(in_lines)
            fip.close()
            
    print('Completed combine of {:d} ASM features.'.format(counter))  
    
    fop.close()
    
    asms = pd.read_csv('data/' + out_file)
    # DataFrame.sort() is deprecated, but this is an old version of pandas, does not have sort_values().
    sorted_asms = asms.sort('file_name')
    sorted_asms.to_csv('data/sorted-' + out_file, index=False)
    sorted_asms.head(20)
    
    return


class Multi_Params(object):
    def __init__(self, featurefile="", tempfile="", filelist=[]):
        self.feature_file = featurefile
        self.temp_file = tempfile
        self.file_list = filelist



In [2]:

    
# Includes x86 and amd64 registers and opcodes.
x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
                 'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
                 'eax','ebp','ebx','ecx','edi','esp','rax','rbx','rcx','rdx','rsi',
                 'rdi','rbp','rsp','r8','r9','r10','r11','r12','r13','r14','r15']

x86_opcodes = ['mov','movabs','movbe','movsbl','movsbw','movswl','movsbq','movswq','movslq','movsx',
               'movsxd','movzb','movzw','movzx','push','pusha','pop','popa','xchg','in','out','lea',
               'lds','les','lfs','lgs','lss','clc','cld','cli','clts','cmc','lahf','sahf','pushf',
               'popf','stc','std','sti','add','inc','sub','dec','sbb','cmp','test','and','or','xor',
               'clr','adc','neg','not','aaa','aas','daa','das','aad','aam','cbw','cdqe','cwde','cwd',
               'cdq','cqo','cbtw','cltq','cwtl','cwtd','cltd','cqto','mul','imul','div','idiv',
               'rol','ror','rcl','rcr','sal','shl','shr','sar','shld','shrd','call','lcall',
               'jmp','ljmp','ret','lret','retf','enter','leave','jo','jno','jb','jc','jnae','jnb',
               'jnc','jae','je','jz','jne','jnz','jbe','jna','jnbe','ja','js','jns','jp','jpe',
               'jnp','jpo','jl','jnge','jnl','jge','jle','jng','jnle','jg','jcxz','jecxz','jrcxz',
               'loop','loopz','loope','loopnz','loopne','seto','setno','setb','setc','setnae',
               'setnb','setnc','setae','sete','setz','setne','setnz','setbe','setna','setnbe',
               'seta','sets','setns','setp','setpe','setnp','setpo','setl','setnge','setnl',
               'setge','setle','setng','setnle','setg','cmps','scmp','ins','outs','lods',
               'slod','movs','smov','scas','ssca','stos','ssto','xlat','bsf','bsr','bt',
               'btc','btr','bts','int','int3','into','iret','rsm','bound','hlt','nop','arpl',
               'lar','lgdt','lidt','lldt','lmsw','lsl','ltr','sgdt','sidt','sldt','smsw','str',
               'verr','verw','fld','fild','fildll','fldt','fbld','fst','fist','fstp','fistp',
               'fistpll','fstpt','fbstp','fxch','fcom','ficom','fcomp','ficomp','fcompp','fucom',
               'fucomp','fucompp','ftst','fxam','fld1','fldl2t','fldl2e','fldpi','fldlg2','fldln2',
               'fldz','fadd','fiadd','faddp','fsub','fisub','fsubp','fsubr','fisubr','fsubrp',
               'fmul','fimul','fmulp','fdiv','fidiv','fdivp','fdivr','fidivr','fdivrp','f2xm1',
               'fyl2x','fptan','fpatan','fxtract','fprem1','fdecstp','fincstp','fprem','fyl2xp1',
               'fsqrt','fsincos','frndint','fscale','fsin','fcos','fchs','fabs','fninit','finit',
               'fldcw','fnstcw','fstcw','fnstsw','fstsw','fnclex','fclex','fnstenv','fstenv',
               'fldenv','fnsave','fsave','frstor','fneni','feni','fndisi','fdisi','fnsetpm',
               'fsetpm','frstpm','ffree','ffreep','fnop','fwait','addr16','addr32','aword',
               'adword','data16','data32','word','dword','lock','wait','cs','ds','es','fs',
               'gs','ss','rep','repe','repz','repne','repnz','ht','hnt','rex','rexz','rexy',
               'rexyz','rexx','rexxz','rexxy','rexxyz','rex64','rex64z','rex64y','rex64yz',
               'rex64x','rex64xz','rex64xy','rex64xyz','bswap','xadd','cmpxchg','invd','wbinvd',
               'invlpg','cpuid','wrmsr','rdtsc','rdmsr','cmpxchg8b','sysenter','sysexit','fxsave',
               'fxsave64','fxrstor','fxrstor64','rdpmc','ud2','ud2a','ud1','ud2b','cmovo','cmovno',
               'cmovb','cmovc','cmovnae','cmovae','cmovnc','cmovnb','cmove','cmovz','cmovne',
               'cmovnz','cmovbe','cmovna','cmova','cmovnbe','cmovs','cmovns','cmovp','cmovnp',
               'cmovl','cmovnge','cmovge','cmovnl','cmovle','cmovng','cmovg','cmovnle','cmovpe',
               'cmovpo','fcmovb','fcmovnae','fcmove','fcmovbe','fcmovna','fcmovu','fcmovae',
               'fcmovnb','fcmovne','fcmova','fcmovnbe','fcmovnu','fcomi','fucomi','fcomip',
               'fcompi','fucomip','fucompi','movnti','clflush','lfence','mfence','pause','emms',
               'movd','movq','packssdw','packsswb','packuswb','paddb','paddw','paddd','paddq',
               'paddsb','paddsw','paddusb','paddusw','pand','pandn','pcmpeqb','pcmpeqw','pcmpeqd',
               'pcmpgtb','pcmpgtw','pcmpgtd','pmaddwd','pmulhw','pmullw','por','psllw','pslld',
               'psllq','psraw','psrad','psrlw','psrld','psrlq','psubb','psubw','psubd','psubq',
               'psubsb','psubsw','psubusb','psubusw','punpckhbw','punpckhwd','punpckhdq',
               'punpcklbw','punpcklwd','punpckldq','pxor','addps','addss','andnps','andps',
               'cmpeqps','cmpeqss','cmpleps','cmpless','cmpltps','cmpltss','cmpneqps','cmpneqss',
               'cmpnleps','cmpnless','cmpnltps','cmpnltss','cmpordps','cmpordss','cmpunordps',
               'cmpunordss','cmpps','cmpss','comiss','cvtpi2ps','cvtps2pi','cvtsi2ss','cvtss2si',
               'cvttps2pi','cvttss2si','divps','divss','ldmxcsr','maskmovq','maxps','maxss',
               'minps','minss','movaps','movhlps','movhps','movlhps','movlps','movmskps','movntps',
               'movntq','movntdq','movss','movups','mulps','mulss','orps','pavgb','pavgw','pextrw',
               'pinsrw','pmaxsw','pmaxub','pminsw','pminub','pmovmskb','pmulhuw','prefetchnta',
               'prefetcht0','prefetcht1','prefetcht2','psadbw','pshufw','rcpps','rcpss','rsqrtps',
               'rsqrtss','sfence','shufps','sqrtps','sqrtss','stmxcsr','subps','subss','ucomiss',
               'unpckhps','unpcklps','xorps','addpd','addsd','andnpd','andpd','cmpeqpd','cmpeqsd',
               'cmplepd','cmplesd','cmpltpd','cmpltsd','cmpneqpd','cmpneqsd','cmpnlepd','cmpnlesd',
               'cmpnltpd','cmpnltsd','cmpordpd','cmpordsd','cmpunordpd','cmpunordsd','cmppd',
               'cmpsd','comisd','cvtpi2pd','cvtsi2sd','divpd','divsd','maxpd','maxsd','minpd',
               'minsd','movapd','movhpd','movlpd','movmskpd','movntpd','movsd','movupd','mulpd',
               'mulsd','orpd','shufpd','sqrtpd','sqrtsd','subpd','subsd','ucomisd','unpckhpd',
               'unpcklpd','xorpd','cvtdq2pd','cvtpd2dq','cvtdq2ps','cvtpd2pi','cvtpd2ps','cvtps2pd',
               'cvtps2dq','cvtsd2si','cvtsd2ss','cvtss2sd','cvttpd2pi','cvttsd2si','cvttpd2dq',
               'cvttps2dq','maskmovdqu','movdqa','movdqu','movdq2q','movq2dq','pmuludq','pshufd',
               'pshufhw','pshuflw','pslldq','psrldq','punpckhqdq','punpcklqdq','addsubpd','addsubps',
               'cmpxchg16b','fisttp','fisttpll','haddpd','haddps','hsubpd','hsubps','lddqu',
               'monitor','movddup','movshdup','movsldup','mwait','vmcall','vmclear','vmlaunch',
               'vmresume','vmptrld','vmptrst','vmread','vmwrite','vmxoff','vmxon','vmfunc','getsec',
               'invept','invvpid','invpcid','phaddw','phaddd','phaddsw','phsubw','phsubd','phsubsw',
               'pmaddubsw','pmulhrsw','pshufb','psignb','psignw','psignd','palignr','pabsb','pabsw',
               'pabsd','blendpd','blendps','blendvpd','blendvps','dppd','dpps','extractps','insertps',
               'movntdqa','mpsadbw','packusdw','pblendvb','pblendw','pcmpeqq','pextrb','pextrd',
               'pextrq','phminposuw','pinsrb','pinsrd','pinsrq','pmaxsb','pmaxsd','pmaxud','pmaxuw',
               'pminsb','pminsd','pminud','pminuw','pmovsxbw','pmovsxbd','pmovsxbq','pmovsxwd',
               'pmovsxwq','pmovsxdq','pmovzxbw','pmovzxbd','pmovzxbq','pmovzxwd','pmovzxwq',
               'pmovzxdq','pmuldq','pmulld','ptest','roundpd','roundps','roundsd','roundss',
               'pcmpgtq','pcmpestri','pcmpestrm','pcmpistri','pcmpistrm','crc32','xsave',
               'xsave64','xrstor','xrstor64','xgetbv','xsetbv','xsaveopt','xsaveopt64','aesdec',
               'aesdeclast','aesenc','aesenclast','aesimc','aeskeygenassist','pclmulqdq',
               'pclmullqlqdq','pclmulhqlqdq','pclmullqhqdq','pclmulhqhqdq','vaddpd','vaddps',
               'vaddsd','vaddss','vaddsubpd','vaddsubps','vandnpd','vandnps','vandpd','vandps',
               'vblendpd','vblendps','vblendvpd','vblendvps','vbroadcastf128','vbroadcastsd',
               'vbroadcastss','vcmpeq_ospd','vcmpeq_osps','vcmpeq_ossd','vcmpeq_osss','vcmpeqpd',
               'vcmpeqps','vcmpeqsd','vcmpeqss','vcmpeq_uqpd','vcmpeq_uqps','vcmpeq_uqsd',
               'vcmpeq_uqss','vcmpeq_uspd','vcmpeq_usps','vcmpeq_ussd','vcmpeq_usss','vcmpfalse_ospd',
               'vcmpfalse_osps','vcmpfalse_ossd','vcmpfalse_osss','vcmpfalsepd','vcmpfalseps',
               'vcmpfalsesd','vcmpfalsess','vcmpge_oqpd','vcmpge_oqps','vcmpge_oqsd','vcmpge_oqss',
               'vcmpgepd','vcmpgeps','vcmpgesd','vcmpgess','vcmpgt_oqpd','vcmpgt_oqps',
               'vcmpgt_oqsd','vcmpgt_oqss','vcmpgtpd','vcmpgtps','vcmpgtsd','vcmpgtss',
               'vcmple_oqpd','vcmple_oqps','vcmple_oqsd','vcmple_oqss','vcmplepd','vcmpleps',
               'vcmplesd','vcmpless','vcmplt_oqpd','vcmplt_oqps','vcmplt_oqsd','vcmplt_oqss',
               'vcmpltpd','vcmpltps','vcmpltsd','vcmpltss','vcmpneq_oqpd','vcmpneq_oqps',
               'vcmpneq_oqsd','vcmpneq_oqss','vcmpneq_ospd','vcmpneq_osps','vcmpneq_ossd',
               'vcmpneq_osss','vcmpneqpd','vcmpneqps','vcmpneqsd','vcmpneqss','vcmpneq_uspd',
               'vcmpneq_usps','vcmpneq_ussd','vcmpneq_usss','vcmpngepd','vcmpngeps','vcmpngesd',
               'vcmpngess','vcmpnge_uqpd','vcmpnge_uqps','vcmpnge_uqsd','vcmpnge_uqss','vcmpngtpd',
               'vcmpngtps','vcmpngtsd','vcmpngtss','vcmpngt_uqpd','vcmpngt_uqps','vcmpngt_uqsd',
               'vcmpngt_uqss','vcmpnlepd','vcmpnleps','vcmpnlesd','vcmpnless','vcmpnle_uqpd',
               'vcmpnle_uqps','vcmpnle_uqsd','vcmpnle_uqss','vcmpnltpd','vcmpnltps','vcmpnltsd',
               'vcmpnltss','vcmpnlt_uqpd','vcmpnlt_uqps','vcmpnlt_uqsd','vcmpnlt_uqss','vcmpordpd',
               'vcmpordps','vcmpordsd','vcmpord_spd','vcmpord_sps','vcmpordss','vcmpord_ssd',
               'vcmpord_sss','vcmppd','vcmpps','vcmpsd','vcmpss','vcmptruepd','vcmptrueps',
               'vcmptruesd','vcmptruess','vcmptrue_uspd','vcmptrue_usps','vcmptrue_ussd',
               'vcmptrue_usss','vcmpunordpd','vcmpunordps','vcmpunordsd','vcmpunord_spd',
               'vcmpunord_sps','vcmpunordss','vcmpunord_ssd','vcmpunord_sss','vcomisd',
               'vcomiss','vcvtdq2pd','vcvtdq2ps','vcvtpd2dq','vcvtpd2dqx','vcvtpd2dqy','vcvtpd2ps',
               'vcvtpd2psx','vcvtpd2psy','vcvtps2dq','vcvtps2pd','vcvtsd2si','vcvtsd2ss',
               'vcvtsi2sd','vcvtsi2ss','vcvtss2sd','vcvtss2si','vcvttpd2dq','vcvttpd2dqx',
               'vcvttpd2dqy','vcvttps2dq','vcvttsd2si','vcvttss2si','vdivpd','vdivps','vdivsd',
               'vdivss','vdppd','vdpps','vextractf128','vextractps','vhaddpd','vhaddps',
               'vhsubpd','vhsubps','vinsertf128','vinsertps','vlddqu','vldmxcsr','vmaskmovdqu',
               'vmaskmovpd','vmaskmovps','vmaxpd','vmaxps','vmaxsd','vmaxss','vminpd','vminps',
               'vminsd','vminss','vmovapd','vmovaps','vmovd','vmovddup','vmovdqa','vmovdqu',
               'vmovhlps','vmovhpd','vmovhps','vmovlhps','vmovlpd','vmovlps','vmovmskpd','vmovmskps',
               'vmovntdq','vmovntdqa','vmovntpd','vmovntps','vmovq','vmovsd','vmovshdup',
               'vmovsldup','vmovss','vmovupd','vmovups','vmpsadbw','vmulpd','vmulps','vmulsd',
               'vmulss','vorpd','vorps','vpabsb','vpabsd','vpabsw','vpackssdw','vpacksswb',
               'vpackusdw','vpackuswb','vpaddsb','vpaddsw','vpaddb','vpaddd','vpaddq','vpaddw',
               'vpaddusb','vpaddusw','vpalignr','vpand','vpandn','vpavgb','vpavgw','vpblendvb',
               'vpblendw','vpcmpeqb','vpcmpeqd','vpcmpeqq','vpcmpeqw','vpcmpestri','vpcmpestrm',
               'vpcmpgtb','vpcmpgtd','vpcmpgtq','vpcmpgtw','vpcmpistri','vpcmpistrm','vperm2f128',
               'vpermilpd','vpermilps','vpextrb','vpextrd','vpextrq','vpextrw','vphaddd','vphaddsw',
               'vphaddw','vphminposuw','vphsubd','vphsubsw','vphsubw','vpinsrb','vpinsrd','vpinsrq',
               'vpinsrw','vpmaddubsw','vpmaddwd','vpmaxsb','vpmaxsd','vpmaxsw','vpmaxub','vpmaxud',
               'vpmaxuw','vpminsb','vpminsd','vpminsw','vpminub','vpminud','vpminuw','vpmovmskb',
               'vpmovsxbd','vpmovsxbq','vpmovsxbw','vpmovsxdq','vpmovsxwd','vpmovsxwq','vpmovzxbd',
               'vpmovzxbq','vpmovzxbw','vpmovzxdq','vpmovzxwd','vpmovzxwq','vpmuldq','vpmulhrsw',
               'vpmulhuw','vpmulhw','vpmulld','vpmullw','vpmuludq','vpor','vpsadbw','vpshufb',
               'vpshufd','vpshufhw','vpshuflw','vpsignb','vpsignd','vpsignw','vpslld','vpslldq',
               'vpsllq','vpsllw','vpsrad','vpsraw','vpsrld','vpsrldq','vpsrlq','vpsrlw','vpsubb',
               'vpsubd','vpsubq','vpsubsb','vpsubsw','vpsubusb','vpsubusw','vpsubw','vptest',
               'vpunpckhbw','vpunpckhdq','vpunpckhqdq','vpunpckhwd','vpunpcklbw','vpunpckldq',
               'vpunpcklqdq','vpunpcklwd','vpxor','vrcpps','vrcpss','vroundpd','vroundps',
               'vroundsd','vroundss','vrsqrtps','vrsqrtss','vshufpd','vshufps','vsqrtpd','vsqrtps',
               'vsqrtsd','vsqrtss','vstmxcsr','vsubpd','vsubps','vsubsd','vsubss','vtestpd',
               'vtestps','vucomisd','vucomiss','vunpckhpd','vunpckhps','vunpcklpd','vunpcklps',
               'vxorpd','vxorps','vzeroall','vzeroupper','vbroadcasti128','vpblendd','vpbroadcastb',
               'vpbroadcastd','vpbroadcastq','vpbroadcastw','vperm2i128','vpermd','vpermpd',
               'vpermps','vpermq','vextracti128','vinserti128','vpmaskmovd','vpmaskmovq','vpsllvd',
               'vpsllvq','vpsravd','vpsrlvd','vpsrlvq','vgatherdpd','vgatherdps','vgatherqpd',
               'vgatherqps','vpgatherdd','vpgatherdq','vpgatherqd','vpgatherqq','vaesdec','vaesdeclast',
               'vaesenc','vaesenclast','vaesimc','vaeskeygenassist','vpclmulqdq','vpclmullqlqdq',
               'vpclmulhqlqdq','vpclmullqhqdq','vpclmulhqhqdq','rdfsbase','rdgsbase','rdrand',
               'wrfsbase','wrgsbase','vcvtph2ps','vcvtps2ph','vfmadd132pd','vfmadd132ps','vfmadd213pd',
               'vfmadd213ps','vfmadd231pd','vfmadd231ps','vfmadd132sd','vfmadd132ss','vfmadd213sd',
               'vfmadd213ss','vfmadd231sd','vfmadd231ss','vfmaddsub132pd','vfmaddsub132ps',
               'vfmaddsub213pd','vfmaddsub213ps','vfmaddsub231pd','vfmaddsub231ps','vfmsubadd132pd',
               'vfmsubadd132ps','vfmsubadd213pd','vfmsubadd213ps','vfmsubadd231pd','vfmsubadd231ps',
               'vfmsub132pd','vfmsub132ps','vfmsub213pd','vfmsub213ps','vfmsub231pd','vfmsub231ps',
               'vfmsub132sd','vfmsub132ss','vfmsub213sd','vfmsub213ss','vfmsub231sd','vfmsub231ss',
               'vfnmadd132pd','vfnmadd132ps','vfnmadd213pd','vfnmadd213ps','vfnmadd231pd',
               'vfnmadd231ps','vfnmadd132sd','vfnmadd132ss','vfnmadd213sd','vfnmadd213ss',
               'vfnmadd231sd','vfnmadd231ss','vfnmsub132pd','vfnmsub132ps','vfnmsub213pd',
               'vfnmsub213ps','vfnmsub231pd','vfnmsub231ps','vfnmsub132sd','vfnmsub132ss',
               'vfnmsub213sd','vfnmsub213ss','vfnmsub231sd','vfnmsub231ss','xacquire','xrelease',
               'xabort','xbegin','xend','xtest','bzhi','mulx','pdep','pext','rorx','sarx','shlx',
               'shrx','vfmaddpd','vfmaddps','vfmaddsd','vfmaddss','vfmaddsubpd','vfmaddsubps',
               'vfmsubaddpd','vfmsubaddps','vfmsubpd','vfmsubps','vfmsubsd','vfmsubss','vfnmaddpd',
               'vfnmaddps','vfnmaddsd','vfnmaddss','vfnmsubpd','vfnmsubps','vfnmsubsd','vfnmsubss',
               'vfrczpd','vfrczps','vfrczsd','vfrczss','vpcmov','vpcomb','vpcomd','vpcomq',
               'vpcomub','vpcomud','vpcomuq','vpcomuw','vpcomw','vpermil2pd','vpermil2ps',
               'vpcomltb','vpcomltd','vpcomltq','vpcomltub','vpcomltud','vpcomltuq','vpcomltuw',
               'vpcomltw','vpcomleb','vpcomled','vpcomleq','vpcomleub','vpcomleud','vpcomleuq',
               'vpcomleuw','vpcomlew','vpcomgtb','vpcomgtd','vpcomgtq','vpcomgtub','vpcomgtud',
               'vpcomgtuq','vpcomgtuw','vpcomgtw','vpcomgeb','vpcomged','vpcomgeq','vpcomgeub',
               'vpcomgeud','vpcomgeuq','vpcomgeuw','vpcomgew','vpcomeqb','vpcomeqd','vpcomeqq',
               'vpcomequb','vpcomequd','vpcomequq','vpcomequw','vpcomeqw','vpcomneqb','vpcomneqd',
               'vpcomneqq','vpcomnequb','vpcomnequd','vpcomnequq','vpcomnequw','vpcomneqw',
               'vpcomfalseb','vpcomfalsed','vpcomfalseq','vpcomfalseub','vpcomfalseud',
               'vpcomfalseuq','vpcomfalseuw','vpcomfalsew','vpcomtrueb','vpcomtrued','vpcomtrueq',
               'vpcomtrueub','vpcomtrueud','vpcomtrueuq','vpcomtrueuw','vpcomtruew','vphaddbd',
               'vphaddbq','vphaddbw','vphadddq','vphaddubd','vphaddubq','vphaddubw','vphaddudq',
               'vphadduwd','vphadduwq','vphaddwd','vphaddwq','vphsubbw','vphsubdq','vphsubwd',
               'vpmacsdd','vpmacsdqh','vpmacsdql','vpmacssdd','vpmacssdqh','vpmacssdql',
               'vpmacsswd','vpmacssww','vpmacswd','vpmacsww','vpmadcsswd','vpmadcswd','vpperm',
               'vprotb','vprotd','vprotq','vprotw','vpshab','vpshad','vpshaq','vpshaw','vpshlb',
               'vpshld','vpshlq','vpshlw','llwpcb','slwpcb','lwpval','lwpins','andn','bextr','blsi',
               'blsmsk','blsr','tzcnt','blcfill','blci','blcic','blcmsk','blcs','blsfill','blsic',
               't1mskc','tzmsk','prefetch','prefetchw','femms','pavgusb','pf2id','pf2iw','pfacc',
               'pfadd','pfcmpeq','pfcmpge','pfcmpgt','pfmax','pfmin','pfmul','pfnacc','pfpnacc',
               'pfrcp','pfrcpit1','pfrcpit2','pfrsqit1','pfrsqrt','pfsub','pfsubr','pi2fd','pi2fw',
               'pmulhrw','pswapd','syscall','sysret','swapgs','rdtscp','clgi','invlpga','skinit',
               'stgi','vmload','vmmcall','vmrun','vmsave','movntsd','movntss','extrq','insertq', 
               'popcnt','lzcnt','xstore','xcrypt','montmul','xsha1','xsha256','xstorerng',
               'xcryptecb','xcryptcbc','xcryptctr','xcryptcfb','xcryptofb','xstore','adcx','adox',
               'rdseed','clac','stac','bnd','bndmk','bndmov','bndcl','bndcu','bndcn','bndstx',
               'bndldx','sha1rnds4','sha1nexte','sha1msg1','sha1msg2','sha256rnds2','sha256msg1','sha256msg2',
               'kandnw','kandw','korw','kxnorw','kxorw','kmovw','knotw','kortestw','kshiftlw',
               'kshiftrw','kunpckbw','valignd','vpternlogd','valignq','vpternlogq','vblendmpd',
               'vpblendmq','vpermi2pd','vpermi2q','vpermt2pd','vpermt2q','vpmaxsq','vpmaxuq',
               'vpminsq','vpminuq','vprolvq','vprorvq','vpsravq','vblendmps','vpblendmd',
               'vpermi2d','vpermi2ps','vpermt2d','vpermt2ps','vprolvd','vprorvd','vbroadcastf32x4',
               'vbroadcasti32x4','vbroadcastf64x4','vbroadcasti64x4','vcmpeq_oqpd','vcmpfalse_oqpd',
               'vcmpge_ospd','vcmpgt_ospd','vcmple_ospd','vcmplt_ospd','vcmpneq_uqpd','vcmpnge_uspd',
               'vcmpngt_uspd','vcmpnle_uspd','vcmpnlt_uspd','vcmpord_qpd','vcmptrue_uqpd',
               'vcmpunord_qpd','vcmpeq_oqps','vcmpfalse_oqps','vcmpge_osps','vcmpgt_osps',
               'vcmple_osps','vcmplt_osps','vcmpneq_uqps','vcmpnge_usps','vcmpngt_usps',
               'vcmpnle_usps','vcmpnlt_usps','vcmpord_qps','vcmptrue_uqps','vcmpunord_qps',
               'vcmpeq_oqsd','vcmpfalse_oqsd','vcmpge_ossd','vcmpgt_ossd','vcmple_ossd','vcmplt_ossd',
               'vcmpneq_uqsd','vcmpnge_ussd','vcmpngt_ussd','vcmpnle_ussd','vcmpnlt_ussd',
               'vcmpord_qsd','vcmptrue_uqsd','vcmpunord_qsd','vcmpeq_oqss','vcmpfalse_oqss',
               'vcmpge_osss','vcmpgt_osss','vcmple_osss','vcmplt_osss','vcmpneq_uqss',
               'vcmpnge_usss','vcmpngt_usss','vcmpnle_usss','vcmpnlt_usss','vcmpord_qss',
               'vcmptrue_uqss','vcmpunord_qss','vcompresspd','vpcompressq','vpscatterdq',
               'vpscatterqq','vscatterdpd','vscatterqpd','vcompressps','vpcompressd','vpscatterdd',
               'vscatterdps','vcvtudq2pd','vcvtps2udq','vcvtpd2udq','vcvtsd2usi','vcvtusi2sd',
               'vcvtusi2ss','vcvtss2usi','vcvttpd2udq','vcvttps2udq','vcvttsd2usi','vcvttss2usi',
               'vcvtudq2ps','vexpandpd','vpexpandq','vexpandps','vpexpandd','vextractf32x4',
               'vextracti32x4','vextractf64x4','vextracti64x4','vfixupimmpd','vfixupimmps',
               'vfixupimmsd','vgetmantsd','vrndscalesd','vfixupimmss','vgetmantss','vrndscaless',
               'vscalefpd','vscalefps','vscalefsd','vscalefss','vgetexppd','vgetexpps',
               'vgetexpsd','vgetexpss','vgetmantpd','vrndscalepd','vgetmantps','vrndscaleps',
               'vinsertf32x4','vinserti32x4','vinsertf64x4','vinserti64x4','vmovdqa64',
               'vmovdqa32','vmovdqu32','vmovdqu64','vrcp14ps','vrsqrt14ps','vpabsq',
               'vrcp14pd','vrsqrt14pd','vpandd','vpandnd','vpord','vpxord','vpandnq',
               'vpandq','vporq','vpxorq','vpcmpd','vpcmpled','vpcmpltd','vpcmpneqd',
               'vpcmpnled','vpcmpnltd','vpcmpud','vpcmpequd','vpcmpleud','vpcmpltud',
               'vpcmpnequd','vpcmpnleud','vpcmpnltud','vpcmpq','vpcmpleq','vpcmpltq',
               'vpcmpneqq','vpcmpnleq','vpcmpnltq','vpcmpuq','vpcmpequq','vpcmpleuq',
               'vpcmpltuq','vpcmpnequq','vpcmpnleuq','vpcmpnltuq','vptestmq','vpmovdb',
               'vpmovsdb','vpmovusdb','vpmovdw','vpmovsdw','vpmovusdw','vpmovqb','vpmovsqb',
               'vpmovusqb','vpmovqd','vpmovsqd','vpmovusqd','vpmovqw','vpmovsqw','vpmovusqw',
               'vprold','vprord','vprolq','vprorq','vpscatterqd','vscatterqps','vpsraq',
               'vptestmd','vrcp14sd','vrsqrt14sd','vrcp14ss','vrsqrt14ss','vshuff32x4',
               'vshufi32x4','vshuff64x2','vshufi64x2','vptestnmd','vptestnmq','vpbroadcastmb2q',
               'vpbroadcastmw2d','vpconflictd','vpconflictq','vplzcntd','vplzcntq','vexp2pd',
               'vexp2ps','vrcp28pd','vrsqrt28pd','vrcp28ps','vrsqrt28ps','vrcp28sd',
               'vrsqrt28sd','vrcp28ss','vrsqrt28ss','vgatherpf0dpd','vgatherpf0qpd',
               'vgatherpf1dpd','vgatherpf1qpd','vscatterpf0dpd','vscatterpf0qpd',
               'vscatterpf1dpd','vscatterpf1qpd','vgatherpf0dps','vgatherpf0qps',
               'vgatherpf1dps','vgatherpf1qps','vscatterpf0dps','vscatterpf0qps',
               'vscatterpf1dps','vscatterpf1qps','prefetchwt1','clflushopt','xrstors',
               'xrstors64','xsaves','xsaves64','xsavec','xsavec64','encls','enclu',
               'vcvtpd2udqx','vcvtpd2udqy','vcvttpd2udqx','vcvttpd2udqy','kaddd','kandd',
               'kandnd','kmovd','knotd','kord','kortestd','ktestd','kxnord','kxord','kaddq',
               'kandnq','kandq','kmovq','knotq','korq','kortestq','ktestq','kunpckdq',
               'kunpckwd','kxnorq','kxorq','kshiftld','kshiftlq','kshiftrd','kshiftrq',
               'vdbpsadbw','vmovdqu16','vmovdqu8','vpblendmb','vpblendmw','vpermi2w',
               'vpermt2w','vpermw','vpsllvw','vpsravw','vpsrlvw','vpcmpb','vpcmpub',
               'vpcmpuw','vpcmpw','vpmovb2m','vpmovm2b','vpmovm2w','vpmovswb','vpmovuswb',
               'vpmovwb','vpmovw2m','vptestmb','vptestmw','vptestnmb','vptestnmw','kaddb',
               'kandb','kandnb','kmovb','knotb','korb','kortestb','ktestb','kxnorb','kxorb',
               'kaddw','ktestw','kshiftlb','kshiftrb','vbroadcastf32x2','vbroadcastf32x8',
               'vbroadcasti32x2','vbroadcasti32x8','vbroadcastf64x2','vbroadcasti64x2',
               'vcvtpd2qq','vcvtpd2uqq','vcvtps2qq','vcvtps2uqq','vcvtqq2pd','vcvtuqq2pd',
               'vcvtqq2ps','vcvtqq2psx','vcvtqq2psy','vcvttpd2qq','vcvttpd2uqq','vcvttps2qq',
               'vcvttps2uqq','vcvtuqq2ps','vcvtuqq2psx','vcvtuqq2psy','vextractf32x8',
               'vextracti32x8','vinsertf32x8','vinserti32x8','vfpclassss','vextractf64x2',
               'vextracti64x2','vfpclasssd','vinsertf64x2','vinserti64x2','vfpclasspd',
               'vfpclasspdz','vfpclasspdx','vfpclasspdy','vfpclassps','vfpclasspsz','vfpclasspsx',
               'vfpclasspsy','vpmovd2m','vpmovm2d','vpmovm2q','vpmovq2m','vpmullq','vrangepd',
               'vreducepd','vrangeps','vreduceps','vrangesd','vreducesd','vrangess','vreducess',
               'clwb','pcommit','vpmadd52huq','vpmadd52luq','vpmultishiftqb','vpermb','vpermi2b',
               'vpermt2b','clzero','monitorx','mwaitx','rdpkru','wrpkru','rdpid']



def count_asm_symbols(asm_code):
    symbols = [0]*7
    for row in asm_code:
        if '*' in row:
            symbols[0] += 1
        if '-' in row:
            symbols[1] += 1
        if '+' in row:
            symbols[2] += 1
        if '[' in row:
            symbols[3] += 1
        if ']' in row:
            symbols[4] += 1
        if '@' in row:
            symbols[5] += 1
        if '?' in row:
            symbols[6] += 1

    return symbols


def count_asm_APIs(asm_code, apis):
    apis_values = [0]*len(apis)
    for row in asm_code:
        for i in range(len(apis)):
            if apis[i] in row:
                apis_values[i] += 1
                break
    return apis_values


def count_asm_misc(asm_code):
    keywords_values = [0]*len(keywords)
    for row in asm_code:
        for i in range(len(keywords)):
            if keywords[i] in row:
                keywords_values[i] += 1
                break
    return keywords_values


def count_asm_registers(asm_code):
    registers_values = [0]*len(x86_registers) # Need to optimise this init stuff to global vars.
    
    for row in asm_code:
        parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
                    .replace('-',' ').split()

        for idx, register in enumerate(x86_registers):
            registers_values[idx] += parts.count(register)

    return registers_values


def count_asm_opcodes(asm_code):
    opcodes_values = [0]*len(x86_opcodes)
    
    for row in asm_code:
        parts = row.split()

        for idx, opcode in enumerate(x86_opcodes):
            if opcode in parts:
                opcodes_values[idx] += 1
                break

    return opcodes_values


def extract_asm_features(multi_param):
    
    pid = os.getpid()
    feature_file = 'data/' + str(pid) + "-" + multi_param.temp_file
    ext_drive = multi_para.ext_drive
    
    lmsg = 'Process id: {:d} feature file: {:s}'.format(pid, feature_file)
    print(lmsg)

    # Do this in call graph feature extraction.
    #fapi = open("data/APIs.txt")
    #defined_apis = fapi.readlines()
    #defined_apis = defined_apis[0].split(',')
    
    flog = open("data/" + str(pid) + "-pe-asm-log.txt", "w")
    flog.write(lmsg + "\n")

    asm_files = [i for i in tfiles if '.pe.asm' in i]
    ftot = len(asm_files)
    
    feature_counts = []
    with open(feature_file, 'w') as f:
        
        fw = writer(f)
        
        for idx, fname in enumerate(asm_files):
            
            fasm = open(ext_drive + fname, 'r')
            content = fasm.readlines()
            fasm.close()
            
            fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.
            
            reg_vals = count_asm_registers(content)
            opc_vals = count_asm_opcodes(content)
            #api_vals = count_asm_APIs(content, defined_apis) put in Call Graph features.
            #sec_vals = count_asm_sections(content) already in PE header features.
            #mis_vals = count_asm_misc(content) mostly already in PE header and call graph features.
            count_vals = reg_vals + opc_vals # + api_vals + mis_vals
            
            feature_counts.append([fname[:fname.find('.asm')]] + count_vals)   
            
            # Writing rows after every 10 files processed
            if (idx+1) % 10 == 0:
                lmsg = "{:d} {:d} of {:d} files processed.".format(pid, idx + 1, ftot)
                print(lmsg)
                flog.write(lmsg + "\n")
                fw.writerows(feature_counts)
                feature_counts = []
                
        # Writing remaining files
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []

    lmsg = "{:d} Completed processing {:d} PE ASM files.".format(pid, ftot)
    print(lmsg)
    flog.write(lmsg + "\n")
    flog.close()
    
    return



def combine_asm_files(out_file, temp_file):
    # Function to combine the newly generated asm feature files into one file:
    # 1. list data directory
    # 2. For each file in file list that matches (\d\d\d\d-asm-features.csv)
    # 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
    # 4. Concatenate the unsorted asm feature files.
    # 5. Sort and write to data/sorted-asm-features.csv
    
    fop = open('data/' + out_file, 'w')
    colnames = "file_name," + ",".join(x86_registers) + "," + ",".join(x86_opcodes) + "\n"
    fop.write(colnames)
    
    print("Column names: {:s}".format(colnames))
    
    p1 = re.compile('\d{3,5}-' + temp_file) # This is the PID prefix for each file.
    file_list = os.listdir('data/')
    counter = 0
    
    for file_name in file_list:
        if p1.match(file_name):
            fip = open('data/' + file_name, 'r')
            in_lines = fip.readlines()
            fop.writelines(in_lines)
            counter += len(in_lines)
            fip.close()
            
    print('Completed combine of {:d} ASM features.'.format(counter))  
    
    fop.close()
    
    asms = pd.read_csv('data/' + out_file)
    # DataFrame.sort() is deprecated, but this is an old version of pandas, does not have sort_values().
    sorted_asms = asms.sort('file_name')
    sorted_asms.to_csv('data/sorted-' + out_file, index=False)
    sorted_asms.head(20)
    
    return


class Multi_Params(object):
    def __init__(self, featurefile="", tempfile="", extdrive="", filelist=[]):
        self.feature_file = featurefile
        self.temp_file = tempfile
        self.ext_drive = extdrive
        self.file_list = filelist



In [ ]:

    
out_file = 'pe-asm-features-apt.csv'
temp_file = 'pe-asm-temp-apt.csv'
ext_drive = '/opt/vs/asm/'
tfiles = os.listdir(ext_drive)

mp1 = Multi_Params(out_file, temp_file, tfiles)


extract_asm_features(mp1)

combine_asm_files(out_file, temp_file)



In [3]:

    
# find out what is going on with the IDA Pro disassembly.
fip = open('/opt/vs/asm/filetypes.txt', 'r')
lines = fip.readlines()
dir_list = os.listdir('/opt/vs/asm/')
file_list = []
counter = 0

for line in lines:
    tokens = line.split(':')
    file_name = tokens[0]
    counter += 1
    file_list.append(file_name)
    
print("PE32 files: {:d} directory files: {:d}".format(counter, len(dir_list)))

for fname in dir_list:
    if fname not in file_list:
        print("File not in PE32 list: {:s}".format(fname))









    



PE32 files: 239 directory files: 272
File not in PE32 list: VirusShare_2bd02b41817d227058522cca40acd390.asm
File not in PE32 list: VirusShare_2daa4a4574ba06aa3203ae0e0b45b3b8.asm
File not in PE32 list: VirusShare_827040a5f5ae8de281a63899224b2f3a.asm
File not in PE32 list: VirusShare_67504a0c2c2bf47efccdab5ca981ad7d.asm
File not in PE32 list: VirusShare_1328eaceb140a3863951d18661b097af.asm
File not in PE32 list: VirusShare_95f25d3afc5370f5d9fd8e65c17d3599.asm
File not in PE32 list: VirusShare_a5d4ebc0285f0213e0c29d23bc410889.asm
File not in PE32 list: VirusShare_02c65973b6018f5d473d701b3e7508b2.asm
File not in PE32 list: VirusShare_fc1937c1aa536b3744ebdfb1716fd54d.asm
File not in PE32 list: VirusShare_6e442c5ef460bee4c9457c6bf7a132d6.asm
File not in PE32 list: VirusShare_31e5e58dbdfad05175613e795298ebb5.asm
File not in PE32 list: VirusShare_933b11bc4799f8d9f65466fb2e3ea659.asm
File not in PE32 list: VirusShare_7cb055ac3acbf53e07e20b65ec9126a1.asm
File not in PE32 list: VirusShare_c91eacab7655870764d13ba741aa9a73.asm
File not in PE32 list: VirusShare_4e551abcd14506092a0f8d54a45f3569.asm
File not in PE32 list: VirusShare_6f9992c486195edcf0bf2f6ee6c3ec74.asm
File not in PE32 list: VirusShare_4a54d7878d4170c3d4e3c3606365c42c.asm
File not in PE32 list: VirusShare_7712d05c8b499fc7a1f4a6a6b6dee825.asm
File not in PE32 list: VirusShare_123505024f9e5ff74cb6aa67d7fcc392.asm
File not in PE32 list: VirusShare_00dbb9e1c09dbdafb360f3163ba5a3de.asm
File not in PE32 list: VirusShare_9675827a495f4ba6a4efd4dd70932b7c.asm
File not in PE32 list: VirusShare_ea1b44094ae4d8e2b63a1771a3e61fd5.asm
File not in PE32 list: filetypes.txt
File not in PE32 list: VirusShare_ca327bc83fbe38b3689cd1a5505dfc33.asm
File not in PE32 list: VirusShare_6808ec6dbb23f0fa7637c108f44c5c80.asm
File not in PE32 list: VirusShare_e476e4a24f8b4ff4c8a0b260aa35fc9f.asm
File not in PE32 list: VirusShare_4f763b07a7b8a80f1f9408e590f79532.asm
File not in PE32 list: VirusShare_0b506c6dde8d07f9eeb82fd01a6f97d4.asm
File not in PE32 list: VirusShare_8934aeed5d213fe29e858eee616a6ec7.asm
File not in PE32 list: VirusShare_3de1bd0f2107198931177b2b23877df4.asm
File not in PE32 list: VirusShare_c99fa835350aa9e2427ce69323b061a9.asm
File not in PE32 list: VirusShare_3107de21e480ab1f2d67725f419b28d0.asm
File not in PE32 list: VirusShare_0908d8b3e459551039bade50930e4c1b.asm



In [6]:

    
def get_unpacked_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
    # Load the malware packer id features and file id features from the sample set.
    packer_id_features = pd.read_csv(packer_id_feature_file)
    file_id_features = pd.read_csv(file_id_feature_file)
    trid_id_features = pd.read_csv(trid_id_feature_file)
    
    # Get a list of unpacked PE files that are not .NET CIL format.
    # IDA Pro cannot disassemble .NET files, have to use Ildisasm.exe in Visual Studio.
    unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
    unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
    not_dot_net = []
    counter = 0
    dot_net_counter = 0
    
    # Get the trid and file rows that are for unpacked PE files.
    trids = trid_id_features[trid_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
    fids = file_id_features[file_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
    
    # Iterate over the unpacked PE file list and check if each is a .NET file.
    # If not a .NET file then add to file list.
    pe_names_list = unpacked_pe_files['file_name']
    
    for idx, file_name in enumerate(pe_names_list):
        trid_name = trids.iloc[idx, 1]
        fid_name = fids.iloc[idx, 1]
        trid_name = trid_name.lower()
        fid_name = fid_name.lower()
        
        if trid_name.find('.net') > -1 or fid_name.find('.net') > -1:
            #print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            dot_net_counter += 1
            continue
            
        #print('Found: {:s} - {:s}'.format(trid_name, fid_name))
        not_dot_net.append(file_name)
        counter += 1
    
    file_list = []
    write_list = []
    counter = 0
    
    # Iterate over the file list and prepend the full file name.
    for file_name in not_dot_net:
        full_name = "VirusShare_" + file_name
        file_list.append(full_name)
        write_list.append(full_name + "\n")
        counter += 1

    if (len(file_list) > 0):   
        fop = open('data/temp-unpacked-pe-non-dot-net.txt','w')
        fop.writelines(write_list)
        fop.close()
    
    print("Got {:d} unpacked PE filenames and {:d} .NET filenames.".format(counter, dot_net_counter))

    return file_list



In [8]:

    
packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'
ext_drive = '/opt/vs/train1/'

unflist = get_unpacked_file_list(packer_id_file, file_id_file, trid_id_file)

file_list = []
completed_list = os.listdir('/opt/vs/train1asm/')
print("Got {:d} completed ASM files.".format(len(completed_list)))
for idx, fname in enumerate(completed_list):
    completed_list[idx] = fname[0:fname.find(".asm")]

for idx, fname in enumerate(unflist):
    if fname not in completed_list:
        file_list.append(ext_drive + fname)

print("Processing {:d} files out of {:d} total unpacked PE files.".format(len(file_list), len(unflist)))









    



Got 55128 unpacked PE filenames and 348 .NET filenames.
Got 21315 completed ASM files.
Processing 33814 files out of 55128 total unpacked PE files.



In [9]:

    
33814 + 21315









    Out[9]:





55129



In [8]:

    
def get_unpacked_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
    # Load the malware packer id features and file id features from the sample set.
    packer_id_features = pd.read_csv(packer_id_feature_file)
    file_id_features = pd.read_csv(file_id_feature_file)
    trid_id_features = pd.read_csv(trid_id_feature_file)
    
    # Get a list of unpacked PE files that are not .NET CIL format.
    # IDA Pro cannot disassemble .NET files, have to use Ildisasm.exe in Visual Studio.
    unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
    unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
    not_dot_net = []
    counter = 0
    dot_net_counter = 0
    amd64_bit_counter = 0
    
    # Get the trid and file rows that are for unpacked PE files.
    trids = trid_id_features[trid_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
    fids = file_id_features[file_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
    
    # Iterate over the unpacked PE file list and check if each is a .NET file.
    # If not a .NET file then add to file list.
    pe_names_list = unpacked_pe_files['file_name']
    
    for idx, file_name in enumerate(pe_names_list):
        trid_name = trids.iloc[idx, 1]
        fid_name = fids.iloc[idx, 1]
        trid_name = trid_name.lower()
        fid_name = fid_name.lower()
        
        #print("Trid: {:s}".format(trid_name))
        #print("Fid: {:s}".format(fid_name))
        
        if trid_name.find('.net') > -1 or fid_name.find('.net') > -1:
            print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            dot_net_counter += 1
            continue
            
        if trid_name.find('win64') > -1 or fid_name.startswith('pe32+'):
            print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            amd64_bit_counter += 1
            continue
            
        #print('Found: {:s} - {:s}'.format(trid_name, fid_name))
        not_dot_net.append(file_name)
        counter += 1
    
    file_list = []
    write_list = []
    counter = 0
    
    # Iterate over the file list and prepend the full file name.
    for file_name in not_dot_net:
        full_name = "VirusShare_" + file_name
        file_list.append(full_name)
        write_list.append(full_name + "\n")
        counter += 1

    if (len(file_list) > 0):   
        fop = open('data/temp-unpacked-pe-non-dot-net.txt','w')
        fop.writelines(write_list)
        fop.close()
    
    print("Got {:d} unpacked PE files.".format(counter))
    print("Got {:d} .NET file and {:d} 64 Bit files.".format(dot_net_counter, amd64_bit_counter))

    return file_list



In [ ]:

    
packer_id_file = 'data/sorted-packer-id-features-apt.csv'
file_id_file = 'data/sorted-file-id-features-apt.csv'
trid_id_file = 'data/sorted-trid-id-features-apt.csv'
ext_drive = '/opt/vs/apt/'

unflist = get_unpacked_file_list(packer_id_file, file_id_file, trid_id_file)



In [ ]:

Test ELF Disassembly.



In [13]:

    
# Test ELF disassembly.

def get_elf_file_list(ext_drive, packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
    # Load the malware packer id features and file id features from the sample set.
    packer_id_features = pd.read_csv(packer_id_feature_file)
    file_id_features = pd.read_csv(file_id_feature_file)
    trid_id_features = pd.read_csv(trid_id_feature_file)
    
    counter = 0

    file_names_list = file_id_features['file_name']
    file_list = []
    write_list = []
    fid_list = []
    
    for idx, file_name in enumerate(file_names_list):
        trid_name = trid_id_features.iloc[idx, 1]
        fid_name = file_id_features.iloc[idx, 1]
        
        if trid_name.find('ELF') > -1 or fid_name.find('ELF') > -1:
            print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            counter += 1
            full_name = ext_drive + "VirusShare_" + file_name
            write_list =  full_name + "\n"
            file_list.append(full_name)
            fid_list.append(fid_name)


        
    fop = open('data/elf-file-list.txt','w')
    fop.writelines(write_list)
    fop.close()
    
    print("Got {:d} ELF filenames.".format(counter))

    return file_list, fid_list



def disassemble_elf_binaries(file_list, fid_list):
    # Use the command "objdump -d -M intel file_name" to dump out all 
    # the code sections of the ELF binary and generate assembly code in Intel
    # format as this is easier to read and better for machine learning 
    # feature extraction.
    # Use the command "objdump -g -x file_name -o file_name.txt to dump out
    # all header sections.
    
    counter = 0
    disassed = 0
    error_count = 0
    
    print("Disassembling {:d} binary ELF files.".format(len(file_list)))
    
    for idx, file_name in enumerate(file_list):
        file_path = file_name.rstrip() # remove the newlines or else !!!
        asm_file_name = file_path + ".elf.asm"
        hdr_file_name = file_path + ".elf.txt"
        fid_name = fid_list[idx]
        
        if (os.path.isfile(file_path)):
            fopasm = open(asm_file_name, "w")
            # Dump the assembly code listing.
            if "Intel" in fid_name:
                sub.call(["objdump", "-d", "-M intel", file_path], stdout=fopasm)
                #sub.call(["ndisasm", "-d", "-M intel", file_path], stdout=fopasm)
            elif "x86" in fid_name:
                sub.call(["objdump", "-d", "-M intel", file_path], stdout=fopasm)
            elif "ARM" in fid_name:
                sub.call(["objdump", "-d", "-marm", file_path], stdout=fopasm)
            elif "PowerPC" in fid_name:
                sub.call(["objdump", "-d", "-mpowerpc", file_path], stdout=fopasm)
            elif "Motorola" in fid_name:
                sub.call(["objdump", "-d", "-mm68k", file_path], stdout=fopasm)
            elif "SPARC" in fid_name:
                sub.call(["objdump", "-d", "-msparc", file_path], stdout=fopasm)
            elif "MIPS" in fid_name:
                sub.call(["objdump", "-d", "-mmips", file_path], stdout=fopasm)
            elif "Renesas" in fid_name: # SuperH
                sub.call(["objdump", "-d", "-msh", file_path], stdout=fopasm)
                
            # Dump the ELF section headers.
            fophdr = open(hdr_file_name, "w")
            sub.call(["readelf", "-e", file_path], stdout=fophdr)
            fophdr.close()
            
            fopasm.close()
            
            # now delete the binary, we do not need it anymore.
            # sub.call(["rm", file_path1])
            
            disassed += 1

        else:
            #print("Error: file does not exist - {:s}".format(file_path))
            error_count += 1
           
        counter += 1
        if (counter % 1000) == 0: # print progress
            print('Disassembled: {:d} - {:s}'.format(counter, file_path))    
 

    print("Disassembled {:d} ELF binaries with {:d} file path errors.".format(disassed, error_count))
    
    #sub.call(["mv", "*.asm", "/opt/vs/asm"])
    
    return



In [7]:

    
ext_drive = '/opt/vs/train1/'
packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'
    
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)

disassemble_elf_binaries(unflist, fidlist)









    



Found: unknown - ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped
Found: unknown - ELF 32-bit LSB  executable MIPS MIPS-I version 1 (SYSV) statically linked not stripped
Got 2 ELF filenames.
Disassembling 2 binary ELF files.
Disassembled 2 ELF binaries with 0 file path errors.



In [8]:

    
ext_drive = '/opt/vs/train2/'
packer_id_file = 'data/sorted-packer-id-features-vs252.csv'
file_id_file = 'data/sorted-file-id-features-vs252.csv'
trid_id_file = 'data/sorted-trid-id-features-vs252.csv'
    
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)

disassemble_elf_binaries(unflist, fidlist)









    



Found: unknown - ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped
Found: unknown - ELF 32-bit LSB  executable ARM version 1 statically linked not stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) dynamically linked (uses shared libs) for GNU/Linux 2.0.0 stripped
Got 3 ELF filenames.
Disassembling 3 binary ELF files.
Disassembled 3 ELF binaries with 0 file path errors.



In [14]:

    
ext_drive = '/opt/vs/train3/'
packer_id_file = 'data/sorted-packer-id-features-vs263.csv'
file_id_file = 'data/sorted-file-id-features-vs263.csv'
trid_id_file = 'data/sorted-trid-id-features-vs263.csv'
    
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)

disassemble_elf_binaries(unflist, fidlist)









    



Found: unknown - ELF 32-bit LSB  executable ARM EABI4 version 1 (SYSV) statically linked for GNU/Linux 2.6.14 stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.2.5 not stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (GNU/Linux) statically linked for GNU/Linux 2.6.18 not stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.6.9 not stripped
Found: unknown - ELF 32-bit LSB  executable ARM EABI5 version 1 (SYSV) statically linked for GNU/Linux 2.6.16 not stripped
Got 5 ELF filenames.
Disassembling 5 binary ELF files.
Disassembled 5 ELF binaries with 0 file path errors.



In [ ]:



In [ ]:

Generate Instruction Sets for Various Computer Architectures/Processors/JVM.



In [ ]:

    
ext_drive = '/opt/vs/train4/'
packer_id_file = 'data/sorted-packer-id-features-vs264.csv'
file_id_file = 'data/sorted-file-id-features-vs264.csv'
trid_id_file = 'data/sorted-trid-id-features-vs264.csv'
    
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)

disassemble_elf_binaries(unflist, fidlist)



In [ ]:

    
fip = open('data/amd64-instruction-set.txt')
inlines = fip.readlines()
inlines



In [ ]:

    
opcode_list = []
for line in inlines:
    tokens = line.rstrip()
    opcode_list.append(tokens.lower())
    
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 
opcode_str



In [ ]:

    
fip = open('data/arm-instruction-set.txt')
inlines = fip.readlines()
inlines



In [4]:

    
opcode_list = []
for line in inlines:
    tokens = line.split()
    opcode_list.append(tokens[0].lower())
    
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 
opcode_str









    Out[4]:





"['adc','msr','add','mul','and','mvn','b','orr','bic','rsb','bl','rsc','bx','sbc','cdp','smlal','cmn','smull','cmp','stc','eor','stm','ldc','str','ldm','strb','ldr','strbt','ldrb','strh','ldrbt','strt','ldrh','sub','ldrsb','swi','ldrsh','swp','ldrt','swpb','mcr','teq','mla','tst','mov','umlal','mrc','umull','mrs']"



In [ ]:

    
fip = open('data/sparc-instruction-set.txt')
inlines = fip.readlines()
inlines



In [ ]:

    
opcode_list = []
for line in inlines:
    tokens = line.split()
    opcode_list.append(tokens[0].lower())
    
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 
opcode_str



In [ ]:

    
fip = open('data/powerpc-instruction-set.txt')
inlines = fip.readlines()
inlines



In [ ]:

    
opcode_list = []
for line in inlines:
    tokens = line.replace('[', ' ').split()
    opcode_list.append(tokens[0].lower())
    
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 
opcode_str



In [6]:

    
len(opcode_list)









    Out[6]:





223



In [ ]:

    
fip = open('data/powerpc-version-202-instruction-set.txt')
inlines = fip.readlines()
inlines



In [ ]:

    
opcode_list = []
for line in inlines:
    tokens = line.replace('[', ' ').rstrip().split()
    opcode_list.append(tokens[0].lower())
    
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 
opcode_str



In [9]:

    
len(opcode_list)









    Out[9]:





233



In [ ]:



In [ ]:

    
fip = open('data/motorola-instruction-set.txt')
inlines = fip.readlines()
inlines



In [ ]:



In [ ]:

    
fip = open('data/mips-instruction-set.txt')
inlines = fip.readlines()
inlines



In [ ]:



In [ ]:



In [ ]:



In [2]:

    
# Check file id strings for ELF executables
fip = open('data/sorted-file-id-features-vs251.csv')
inlines = fip.readlines()
for line in inlines:
    if "ELF" in line:
        print("-> {:s}".format(line))









    



-> abbde81d7f4733c16046cbd8ee7409d3,ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped,56

-> f04f278048fc082dd5d0f34efa3c05f8,ELF 32-bit LSB  executable MIPS MIPS-I version 1 (SYSV) statically linked not stripped,475



In [3]:

    
fip = open('data/sorted-file-id-features-vs252.csv')
inlines = fip.readlines()
for line in inlines:
    if "ELF" in line:
        print("-> {:s}".format(line))









    



-> c6813bcaf9a2801973e9c44fe75ef75b,ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped,56

-> cbb492024bdd2484f39893ab77da0cae,ELF 32-bit LSB  executable ARM version 1 statically linked not stripped,216

-> fa390c69553d757c3a10737a0a8604dc,ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) dynamically linked (uses shared libs) for GNU/Linux 2.0.0 stripped,463



In [4]:

    
fip = open('data/sorted-file-id-features-vs263.csv')
inlines = fip.readlines()
for line in inlines:
    if "ELF" in line:
        print("-> {:s}".format(line))









    



-> 480813ec6548a4e55245a0e446e63c36,ELF 32-bit LSB  executable ARM EABI4 version 1 (SYSV) statically linked for GNU/Linux 2.6.14 stripped,188

-> 5b88e0490dd764e66e13c8a543099c9d,ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.2.5 not stripped,192

-> 62d33be03ef3bc9c81d703898fc0e18c,ELF 32-bit LSB  executable Intel 80386 version 1 (GNU/Linux) statically linked for GNU/Linux 2.6.18 not stripped,349

-> 7a891a96d6af45865e5fe6142b40eb77,ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.6.9 not stripped,447

-> af8970eb045a77ad1c427eb6333c9efd,ELF 32-bit LSB  executable ARM EABI5 version 1 (SYSV) statically linked for GNU/Linux 2.6.16 not stripped,176



In [ ]:

    
fip = open('data/sorted-file-id-features-vs264.csv')
inlines = fip.readlines()
for line in inlines:
    if "ELF" in line:
        print("-> {:s}".format(line))



In [ ]:

Generate Assembly Instruction Sets for Various Computer Architectures.



In [ ]:

    
fip = open("/home/derek/binutils.txt")
inlines = fip.readlines()
inlines



In [9]:

    
package_list = []
for line in inlines:
    tokens = line.replace(":", " ").split()
    package_list.append(tokens[1])
    
command = "apt install " + " ".join(package_list)
command









    Out[9]:





'apt install binutils binutils binutils-aarch64-linux-gnu binutils-aarch64-linux-gnu binutils-arm-linux-gnueabi binutils-arm-linux-gnueabi binutils-arm-linux-gnueabihf binutils-arm-linux-gnueabihf binutils-arm-none-eabi binutils-arm-none-eabi binutils-avr binutils-avr binutils-dev binutils-dev binutils-doc binutils-gold binutils-gold binutils-h8300-hms binutils-h8300-hms binutils-m68hc1x binutils-m68hc1x binutils-mingw-w64 binutils-mingw-w64-i686 binutils-mingw-w64-i686 binutils-mingw-w64-x86-64 binutils-mingw-w64-x86-64 binutils-msp430 binutils-msp430 binutils-multiarch binutils-multiarch binutils-multiarch-dev binutils-multiarch-dev binutils-powerpc-linux-gnu binutils-powerpc-linux-gnu binutils-powerpc64le-linux-gnu binutils-powerpc64le-linux-gnu binutils-source binutils-static binutils-static binutils-z80 binutils-z80 elf-binutils elf-binutils mingw32-binutils mingw32-binutils'



In [ ]:

    
fip = open("/home/derek/binutils.txt")
inlines = fip.readlines()
inlines



In [5]:

    
package_list = []
for line in inlines:
    if ":i386" not in line:
        tokens = line.split()
        package_list.append(tokens[1])
    
command = "apt install " + " ".join(package_list)
command









    Out[5]:





'apt install binutils binutils-aarch64-linux-gnu binutils-alpha-linux-gnu binutils-arm-linux-gnueabi binutils-arm-linux-gnueabihf binutils-arm-linux-gnueabihf:i3 binutils-arm-none-eabi binutils-avr binutils-dev binutils-doc binutils-gold binutils-h8300-hms binutils-hppa-linux-gnu binutils-hppa64 binutils-hppa64-linux-gnu binutils-m68hc1x binutils-m68k-linux-gnu binutils-mingw-w64 binutils-mingw-w64-i686 binutils-mingw-w64-x86-64 binutils-mips-linux-gnu binutils-mips64-linux-gnuabi64 binutils-mips64-linux-gnuabi64: binutils-mips64el-linux-gnuabi6 binutils-mips64el-linux-gnuabi6 binutils-mipsel-linux-gnu binutils-msp430 binutils-multiarch binutils-multiarch-dev binutils-powerpc-linux-gnu binutils-powerpc-linux-gnuspe binutils-powerpc-linux-gnuspe:i binutils-powerpc64-linux-gnu binutils-powerpc64-linux-gnu:i3 binutils-powerpc64le-linux-gnu binutils-powerpc64le-linux-gnu: binutils-s390x-linux-gnu binutils-sh4-linux-gnu binutils-source binutils-sparc64-linux-gnu binutils-z80 elf-binutils'



In [6]:

    
x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
                 'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
                 'eax','ebp','ebx','ecx','edi','esp']

x86_opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'
                ,'dd','dec','dw','endp','ends','faddp','fchs','fdiv','fdivp','fdivr','fild'
                ,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
                ,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
                ,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
                ,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
                ,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
                ,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
                ,'wait','xchg','xor']

amd64_registers = ['rax','rbx','rcx','rdx','rsi','rdi','rbp','rsp','r8','r9','r10','r11','r12','r13','r14','r15']

amd64_opcodes = ['aaa','aad','aam','aas','adc','add','and','andn','bextr','bextr','blcfill','blci','blcic',
                 'blcmsk','blcs','blsfill','blsi','blsic','blsmsk','blsr','bound','bsf','bsr','bswap','bt',
                 'btc','btr','bts','bzhi','call','cbw','cwde','cdqe','cwd','cdq','cqo','clc','cld','clflush','cmc','cmov',
                 'cmp','cmps','cmpsb','cmpsw','cmpsd','cmpsq','cmpxchg','cmpxchg8b','cmpxchg16b','cpuid',
                 'crc32','daa','das','dec','div','enter','idiv','imul','in','inc','ins','insb','insw','insd',
                 'int','into','jcxz','jecxz','jrcxz','jmp','lahf','lds','les','lfs','lgs','lss','lea','leave','lfence',
                 'llwpcb','lods','lodsb','lodsw','lodsd','lodsq','loop','loope','loopne','loopnz','loopz','lwpins',
                 'lwpval','lzcnt','mfence','mov','movbe','movd','movmskpd','movmskps','movnti','movs','movsb',
                 'movsw','movsd','movsq','movsx','movsxd','movzx','mul','mulx','neg','nop','not','or','out',
                 'outs','outsb','outsw','outsd','pause','pdep','pext','pop','popa','popad','popcnt','popf','popfd',
                 'popfq','prefetch','prefetchw','prefetch','push','pusha','pushad','pushf','pushfd','pushfq',
                 'rcl','rcr','rdfsbase','rdgsbase','rdrand','ret','rol','ror','rorx','sahf','sal','shl','sar','sarx',
                 'sbb','scas','scasb','scasw','scasd','scasq','set','sfence','shl','shld','shlx',
                 'shr','shrd','shrx','slwpcb','stc','std','stos','stosb','stosw','stosd','stosq','sub','t1mskc',
                 'test','tzcnt','tzmsk','wrfsbase','wrgsbase','xadd','xchg','xlat','xlatb','xor','arpl','clgi','cli',
                 'clts','hlt','int','invd','invlpg','invlpga','iret','iretd','iretq','lar','lgdt','lidt','lldt',
                 'lmsw','lsl','ltr','monitor','monitorx','mwait','mwaitx','rdmsr','rdpmc','rdtsc','rdtscp',
                 'rsm','sgdt','sidt','skinit','sldt','smsw','sti','stgi','str','swapgs',
                 'syscall','sysenter','sysexit','sysret','ud2','verr','verw',
                 'vmload','vmmcall','vmrun','vmsave','wbinvd','wrmsr']

MIPS_registers = []

MIPS_opcodes = []

SPARC_registers = []

SPARC_opcodes = []

ARM_registers = ['r0','r1','r2','r3','r4','r5','r6','r7','r8','r9','r10','r11','r12','r13','r14','r15','cpsr']

ARM_opcodes = ['adc','msr','add','mul','and','mvn','b','orr','bic','rsb','bl','rsc','bx','sbc','cdp','smlal','cmn','smull',
               'cmp','stc','eor','stm','ldc','str','ldm','strb','ldr','strbt','ldrb','strh','ldrbt','strt','ldrh','sub','ldrsb','swi',
               'ldrsh','swp','ldrt','swpb','mcr','teq','mla','tst','mov','umlal','mrc','umull','mrs']

Motorola_registers = ['d0','d1','d2','d3','d4','d5','d6','d7','a0','a1','a2','a3','a4','a5','a6','a7','usp','ssp']

Motorola_opcodes = []

PowerPC_registers = ['r0','r1','r2','r3','r4','r5','r6','r7','r8','r9','r10','r11','r12','r13','r14','r15',
                    'r16','r17','r18','r19','r20','r21','r22','r23','r24','r25','r26','r27','r28','r29','r30','r31']

PowerPC_opcodes = ['add','addc','adde','addi','addic','addic.','addis','addme','addze','and','andc','andi.','andis.',
                   'b','bc','bcctr','bclr','cmp','cmpi','cmpl','cmpli','cntlzd','cntlzw','crand','crandc','creqv',
                   'crnand','crnor','cror','crorc','crxor','dcbf','dcbst','dcbt','dcbtst','dcbz','divd','divdu',
                   'divw','divwu','eciwx','ecowx','eieio','eqv','extsb','extsh','extsw','fabs','fadd','fadds',
                   'fcfid','fcmpo','fcmpu','fctid','fctidz','fctiw','fctiwz','fdiv','fdivs','fmadd','fmadds',
                   'fmr','fmsub','fmsubs','fmul','fmuls','fnabs','fneg','fnmadd','fnmadds','fnmsub','fnmsubs',
                   'fre','fres','frsp','frsqrte','frsqrtes','fsel','fsqrt','fsqrts','fsub','fsubs','hrfid','icbi',
                   'isync','lbz','lbzu','lbzux','lbzx','ld','ldarx','ldu','ldux','ldx','lfd','lfdu','lfdux','lfdx',
                   'lfs','lfsu','lfsux','lfsx','lha','lhau','lhaux','lhax','lhbrx','lhz','lhzu','lhzux','lhzx','lmw',
                   'lswi','lswx','lwa','lwarx','lwaux','lwax','lwbrx','lwz','lwzu','lwzux','lwzx','mcrf','mcrfs',
                   'mcrxr','mfcr','mfocrf','mffs','mfmsr','mfspr','mfsr','mfsrin','mftb','mtcrf','mtocrf','mtfsb0',
                   'mtfsb1','mtfsf','mtfsfi','mtmsr','mtmsrd','mtspr','mtsr','mtsrin','mulhd','mulhdu','mulhw','mulhwu',
                   'mulld','mulli','mullw','nand','neg','nor','or','orc','ori','oris','popcntb','rfid','rldcl','rldcr',
                   'rldic','rldicl','rldicr','rldimi','rlwimi','rlwinm','rlwnm','sc','slbia','slbie','slbmfee',
                   'slbmfev','slbmte','sld','slw','srad','sradi','sraw','srawi','srd','srw','stb','stbu','stbux',
                   'stbx','std','stdcx.','stdu','stdux','stdx','stfd','stfdu','stfdux','stfdx','stfiwx','stfs',
                   'stfsu','stfsux','stfsx','sth','sthbrx','sthu','sthux','sthx','stmw','stswi','stswx','stw',
                   'stwbrx','stwcx.','stwu','stwux','stwx','subf','subfc','subfe','subfic','subfme','subfze',
                   'sync','td','tdi','tlbia','tlbie','tlbsync','tw','twi','xor','xori','xoris']



In [7]:

    
fip = open("data/arm-listing.txt")
inlines = fip.readlines()
inlines[:20]









    Out[7]:





['  /* XScale instructions.  */\n',
 '  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
 '    0x0e200010, 0x0fff0ff0,\n',
 '    "mia%c\\tacc0, %0-3r, %12-15r"},\n',
 '  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
 '    0x0e280010, 0x0fff0ff0,\n',
 '    "miaph%c\\tacc0, %0-3r, %12-15r"},\n',
 '  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
 '    0x0e2c0010, 0x0ffc0ff0, "mia%17\'T%17`B%16\'T%16`B%c\\tacc0, %0-3r, %12-15r"},\n',
 '  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
 '    0x0c400000, 0x0ff00fff, "mar%c\\tacc0, %12-15r, %16-19r"},\n',
 '  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
 '    0x0c500000, 0x0ff00fff, "mra%c\\t%12-15r, %16-19r, acc0"},\n',
 '\n',
 '  /* Intel Wireless MMX technology instructions.  */\n',
 '  {ARM_FEATURE_CORE_LOW (0), SENTINEL_IWMMXT_START, 0, "" },\n',
 '  {ARM_FEATURE_COPROC (ARM_CEXT_IWMMXT),\n',
 '    0x0e130130, 0x0f3f0fff, "tandc%22-23w%c\\t%12-15r"},\n',
 '  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
 '    0x0e400010, 0x0ff00f3f, "tbcst%6-7w%c\\t%16-19g, %12-15r"},\n']



In [ ]:

    
# Parse the contents of arm-dis.c in binutils and extract all the ARM opcodes.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
    line = line.lstrip()
    line = line.replace('\\t',' ')
    if len(line) < 10:
        continue
    if line.startswith('{'):
        continue
    if line.startswith('"'):
        idx = line.find('%')
        if idx > 0:
            opcode = line[1:idx]
        else:
            continue
    else:
        tokens = line.split()
        if len(tokens) < 3:
            continue
        opcodestr = tokens[2]
        idx = opcodestr.find('%')
        if idx > 0:
            opcode = opcodestr[1:idx]
        else:
            continue
            
    if opcode not in opcode_list:
        opcode_list.append(opcode)
    
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 
opcode_str



In [ ]:

    
fip = open("data/m68k-opc.c")
inlines = fip.readlines()
inlines[:20]



In [ ]:

    
# Parse the contents of m68k-opc.c in binutils and extract all the Motorola opcodes.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
    line = line.lstrip()
    if len(line) < 10:
        continue
    if line.startswith("{"):
        line = line[2:]
        idx = line.find("\"")
        if idx > 0:
            opcode = line[:idx]
        else:
            continue
    else:
        continue
            
    if opcode not in opcode_list:
        opcode_list.append(opcode)
    
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 
opcode_str



In [ ]:

    
fip = open("data/mips-instruction-set.txt")
inlines = fip.readlines()
inlines[:20]



In [ ]:

    
# Parse the contents of mips-instruction-set.txt and extract all the things.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
    line = line.lstrip()
    if len(line) < 10:
        continue
    if line.startswith("{"):
        line = line[2:]
        idx = line.find("\"")
        if idx > 0:
            opcode = line[:idx]
        else:
            continue
    else:
        continue
    
    idx = opcode.find(".")
    if idx > 0:
        opcode = opcode[:idx]
        
    if opcode not in opcode_list:
        opcode_list.append(opcode)
    
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 
opcode_str



In [ ]:

    
fip = open("data/i386-opc.tbl")
inlines = fip.readlines()
inlines[:20]



In [ ]:

    
# Parse the contents of i386-opc.tbl from binutils and extract all the things.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
    if len(line) < 10:
        continue
    if line.startswith("//"):
        continue
        
    line = line.replace(',', ' ')
    tokens = line.split()
    if len(tokens) > 0:
        opcode = tokens[0]
    else:
        continue
    
    idx = opcode.find(".")
    if idx > 0:
        opcode = opcode[:idx]
        
    if opcode not in opcode_list:
        opcode_list.append(opcode)
    
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 
opcode_str



In [2]:

    
fip = open("data/Java-bytecode-instruction-listing.txt")
inlines = fip.readlines()
inlines[:20]









    Out[2]:





['aaload \t32 \t0011 0010 \t\tarrayref, index \xe2\x86\x92 value \tload onto the stack a reference from an array\n',
 'aastore \t53 \t0101 0011 \t\tarrayref, index, value \xe2\x86\x92 \tstore into a reference in an array\n',
 'aconst_null \t01 \t0000 0001 \t\t\xe2\x86\x92 null \tpush a /null/ reference onto the stack\n',
 'aload \t19 \t0001 1001 \t1: index \t\xe2\x86\x92 objectref \tload a reference onto the stack from a local variable /#index/\n',
 'aload_0 \t2a \t0010 1010 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 0\n',
 'aload_1 \t2b \t0010 1011 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 1\n',
 'aload_2 \t2c \t0010 1100 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 2\n',
 'aload_3 \t2d \t0010 1101 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 3\n',
 'anewarray \tbd \t1011 1101 \t2: indexbyte1, indexbyte2 \tcount \xe2\x86\x92 arrayref create a new array of references of length /count/ and component type\n',
 'areturn \tb0 \t1011 0000 \t\tobjectref \xe2\x86\x92 [empty] \treturn a reference from a method\n',
 'arraylength \tbe \t1011 1110 \t\tarrayref \xe2\x86\x92 length \tget the length of an array\n',
 'astore \t3a \t0011 1010 \t1: index \tobjectref \xe2\x86\x92 \tstore a reference into a local variable /#index/\n',
 'astore_0 \t4b \t0100 1011 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 0\n',
 'astore_1 \t4c \t0100 1100 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 1\n',
 'astore_2 \t4d \t0100 1101 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 2\n',
 'astore_3 \t4e \t0100 1110 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 3\n',
 'athrow \tbf \t1011 1111 \t\tobjectref \xe2\x86\x92 [empty], objectref \tthrows an error\n',
 'baload \t33 \t0011 0011 \t\tarrayref, index \xe2\x86\x92 value \tload a byte or Boolean value from an array\n',
 'bastore \t54 \t0101 0100 \t\tarrayref, index, value \xe2\x86\x92 \tstore a byte or Boolean value into an array\n',
 'bipush \t10 \t0001 0000 \t1: byte \t\xe2\x86\x92 value \tpush a /byte/ onto the stack as an integer /value/\n']



In [3]:

    
# Parse the contents of Java-bytecode-instruction-listing.txt and extract all the things.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
    if len(line) < 10:
        continue
        
    line = line.replace('\t', ' ')
    tokens = line.split()
    if len(tokens) > 0:
        opcode = tokens[0]
    else:
        continue

        
    if opcode not in opcode_list:
        opcode_list.append(opcode)
    
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 
opcode_str









    Out[3]:





"['aaload','aastore','aconst_null','aload','aload_0','aload_1','aload_2','aload_3','anewarray','areturn','arraylength','astore','astore_0','astore_1','astore_2','astore_3','athrow','baload','bastore','bipush','breakpoint','caload','castore','checkcast','d2f','d2i','d2l','dadd','daload','dastore','dcmpg','dcmpl','dconst_0','dconst_1','ddiv','dload','dload_0','dload_1','dload_2','dload_3','dmul','dneg','drem','dreturn','dstore','dstore_0','dstore_1','dstore_2','dstore_3','dsub','dup','dup_x1','dup_x2','dup2','dup2_x1','dup2_x2','f2d','f2i','f2l','fadd','faload','fastore','fcmpg','fcmpl','fconst_0','fconst_1','fconst_2','fdiv','fload','fload_0','fload_1','fload_2','fload_3','fmul','fneg','frem','freturn','fstore','fstore_0','fstore_1','fstore_2','fstore_3','fsub','getfield','getstatic','goto','goto_w','i2b','i2c','i2d','i2f','i2l','i2s','iadd','iaload','iand','iastore','iconst_m1','iconst_0','iconst_1','iconst_2','iconst_3','iconst_4','iconst_5','idiv','if_acmpeq','if_acmpne','if_icmpeq','if_icmpge','if_icmpgt','if_icmple','if_icmplt','if_icmpne','ifeq','ifge','ifgt','ifle','iflt','ifne','ifnonnull','ifnull','iinc','iload','iload_0','iload_1','iload_2','iload_3','impdep1','impdep2','imul','ineg','instanceof','invokedynamic','invokeinterface','invokespecial','invokestatic','invokevirtual','ior','irem','ireturn','ishl','ishr','istore','istore_0','istore_1','istore_2','istore_3','isub','iushr','ixor','jsr','jsr_w','l2d','l2f','l2i','ladd','laload','land','lastore','lcmp','lconst_0','lconst_1','ldc','ldc_w','ldc2_w','ldiv','lload','lload_0','lload_1','lload_2','lload_3','lmul','lneg','lookupswitch','lor','lrem','lreturn','lshl','lshr','lstore','lstore_0','lstore_1','lstore_2','lstore_3','lsub','lushr','lxor','monitorenter','monitorexit','multianewarray','new','newarray','nop','pop','pop2','putfield','putstatic','ret','return','saload','sastore','sipush','swap','tableswitch','wide']"



In [4]:

    
len(opcode_list)









    Out[4]:





205



In [ ]:



In [ ]:

Test PDF Keywords/Names Feature Extraction.



In [ ]:

    
import os
from csv import writer
import numpy as np
import pandas as pd


# Start of Script

target_dir = "/opt/vs/legitware/"
out_file = "data/pdf-features-legit.csv"
pdf_token_file = "data/2716-pdf-token-counts-non-malicious-set.csv"
#out_file = "data/pdf-features-vs251.csv"

 

file_list = os.listdir(target_dir)
pdflist = []

for fname in enumerate(file_list):
    if fname.endswith('.pdf'):
        pdflist.append(target_dir + fname)
    
print("Got {:d} PDF files.".format(len(pdflist)))



In [ ]:



In [ ]:



In [ ]:

	file_name	entropy	file_size
0	00002e640cafb741bea9a48eaee27d6f	0.992174	208860
1	000118d12cbf9ad6103e8b914a6e1ac3	0.834382	201600
2	0001776237ac37a69fcef93c1bac0988	0.966021	682192
65536	00027c21667d9119a454df8cef2dc1c7	0.666599	18390
65537	0003887ab64b8ae19ffa988638decac2	0.903260	1134320
3	000403e4e488356b7535cc613fbeb80b	0.773787	199168
65538	0004376a62e22f6ad359467eb742b8ff	0.803515	149720
4	0004c8b2a0f4680a5694d74199b40ea2	0.985592	1165440
5	000595d8b586915c12053104cf845097	0.841920	264240
65539	000634f03457d088c71dbffb897b1315	0.957584	1725502
65540	00072ed24314e91b63b425b3dc572f50	0.486112	328093
65541	00092d369958b67557da8661cc9093bc	0.845657	522936
6	00093d5fa5cb7ce77f6eaf39962daa12	0.803481	742064
7	00099926d51b44c6f8c93a48c2567891	0.997032	725288
65542	0009a64f786fa29bfa6423278cc74f02	0.996663	671280
8	000a2db4762dc06628a086c9e117f884	0.535436	61551
65543	000ac11fa7587b2316470b154254a219	0.997824	1874471
9	000ae2c63ba69fc93dfc395b40bfe03a	0.899481	487386
65544	000ae90736a51c47543dcc6d8a735362	0.863887	260144
65545	000b41258d624ef2d6e430822d0c0c8f	0.992772	590824