1. Data Preparation and Extraction of Features

Feature sets will consist of:
- Entropy and file size from packed binaries.
- Entropy and file size from unpacked binaries.
- ASM features from disassembled unpacked binaries.
- Executable header features.
- Call Graph Features.
- Sample Statistics.
- PE packer type.
- Behavioural features from Cuckoo Sandbox reports.
- Memory features from Volatility reports.

Training labels will be generated from ClamAV, Windows Defender and VirusTotal.com reports.

In [1]:
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import re
import subprocess as sub

In [2]:
# Base directory of the sample store; the initial file list is taken from its
# "train" sub-directory. Both names are reassigned by later cells.
ext_drive = '/opt/vs/'
tfiles = os.listdir(ext_drive + "train")

2. Generate Entropy and File Size of Packed Binaries and Non-Binary Files

Script: feature_extraction_entropy.py

In [12]:
# Calculate Shannon's Entropy, https://en.wikipedia.org/wiki/Entropy_(information_theory)

def calculate_entropy(byte_counts, total):
    """Return the Shannon entropy of a byte histogram using log base 256.

    With base 256 the result lies in [0, 1]: 0 for a single repeated byte
    value, 1 for a uniform distribution over all 256 values.

    Args:
        byte_counts: iterable of occurrence counts, one per byte value.
        total: total number of bytes counted (the sum of ``byte_counts``).

    Returns:
        The entropy as a float; 0.0 when ``total`` is not positive.
    """
    # An empty input has no information content and would divide by zero.
    if total <= 0:
        return 0.0

    entropy = 0.0
    for count in byte_counts:
        # A byte value that never occurs does not affect the entropy, and
        # log(0) is undefined, so skip it.
        if count == 0:
            continue
        # p is the probability of seeing this byte in the file.
        p = 1.0 * count / total
        entropy -= p * math.log(p, 256)

    return entropy

def entropy_counter(byte_code):
    """Build a 256-bin histogram of ``byte_code`` and return its entropy.

    Each element of ``byte_code`` is converted with ``int()`` and used as the
    histogram index; the histogram and the input length are then handed to
    ``calculate_entropy``.
    """
    histogram = [0] * 256
    for value in byte_code:
        histogram[int(value)] += 1

    return calculate_entropy(histogram, len(byte_code))

def sort_and_save_entropy_feature_file():
    """Sort data/entropy-features.csv by file name and save the result.

    Reads ``data/entropy-features.csv`` and writes the rows ordered by the
    ``file_name`` column to ``data/sorted-entropy-features.csv`` (no index
    column).
    """
    entropys = pd.read_csv('data/entropy-features.csv')
    # DataFrame.sort() only exists in old pandas releases; newer ones provide
    # sort_values() instead. Support both so the notebook keeps running.
    if hasattr(entropys, 'sort_values'):
        sorted_entropys = entropys.sort_values('file_name')
    else:
        sorted_entropys = entropys.sort('file_name')
    sorted_entropys.to_csv('data/sorted-entropy-features.csv', index=False)

def combine_entropy_files():
    """Concatenate the per-process entropy feature files into one CSV.

    Every file in ``data/`` whose name starts with a 3-5 digit process id
    followed by ``-entropy-features-bin.csv`` is appended to
    ``data/entropy-features.csv``. The per-process files carry no header, so
    the column names are written here; ``sort_and_save_entropy_feature_file``
    relies on the ``file_name`` column being present.
    """
    # The numeric prefix is the PID of the worker that produced the file.
    p1 = re.compile(r'\d{3,5}-entropy-features-bin\.csv')
    counter = 0
    with open('data/entropy-features.csv', 'w') as fop:
        fop.write('file_name,entropy,file_size\n')
        for file_name in os.listdir('data/'):
            if p1.match(file_name):
                with open('data/' + file_name, 'r') as fip:
                    in_lines = fip.readlines()
                fop.writelines(in_lines)
                counter += len(in_lines)
    print('Completed combine of {:d} entropy features.'.format(counter))  

# feature extraction for the binary files

def extract_binary_features(tfiles):
    """Write entropy and file size of each sample to a per-process CSV.

    Args:
        tfiles: file names relative to the module-level ``ext_drive``
            directory.

    Each row is ``[name, entropy, file_size]`` where ``name`` is the file name
    with everything up to and including the first underscore removed. Rows go
    to ``data/<pid>-entropy-features-bin.csv`` without a header; the column
    names are added when the per-process files are combined.

    NOTE(review): the output name depends only on the PID and the file is
    opened with mode 'w', so a worker that handles two chunks would overwrite
    its first result -- confirm against how the pool distributes work.
    """
    ftot = len(tfiles)
    pid = os.getpid()
    print('Process id:', pid)
    feature_file = 'data/' + str(pid) + '-entropy-features-bin.csv' # entropy, file size, ngrams...   
    print('feature file:', feature_file)
    feature_counts = []
    with open(feature_file, 'w') as f:
        fw = writer(f)
        # Iterate through the file list and extract the features from each file.
        for idx, fname in enumerate(tfiles):
            sample_path = ext_drive + fname
            filesize = os.path.getsize(sample_path)
            with open(sample_path, 'rb') as fasm:
                # bytearray yields ints on iteration, which is what
                # entropy_counter needs as histogram indices.
                in_bytes = bytearray(fasm.read())
            # TODO: Do ngram extraction
            entropy = entropy_counter(in_bytes)
            count_vals = [entropy, filesize]
            feature_counts.append([fname[fname.find('_')+1:]] + count_vals)   
            # Flush the buffered rows and print progress every 1000 files.
            if (idx+1) % 1000 == 0:
                print("{:d} - {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
                fw.writerows(feature_counts)
                feature_counts = []
        # Write remaining files
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []

        print("Completed processing {:d} rows for feature file {:s}".format(ftot,feature_file))

# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/vs/train/'
tfiles = os.listdir(ext_drive)
# Floor division keeps `quart` an int on Python 3 as well; a float would break
# both the slices below and the {:d} format.
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]   # the last group also takes the remainder
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_binary_features, trains)
# Release the worker processes once the map has finished.
p.close()
p.join()

# Now divide the train2 files into four groups for multiprocessing
ext_drive = '/opt/vs/train2/'
tfiles = os.listdir(ext_drive)
# Floor division keeps `quart` an int on Python 3 as well; a float would break
# both the slices below and the {:d} format.
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]   # the last group also takes the remainder
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_binary_features, trains)
# Release the worker processes once the map has finished.
p.close()
p.join()

In [15]:
entropys = pd.read_csv('data/sorted-entropy-features.csv')
# DataFrame.sort() only exists in old pandas releases; use sort_values() when
# it is available.
if hasattr(entropys, 'sort_values'):
    sorted_entropys = entropys.sort_values('file_name')
else:
    sorted_entropys = entropys.sort('file_name')
sorted_entropys.to_csv('data/sorted-entropy-features-vs251-252.csv', index=False)

file_name entropy file_size
0 00002e640cafb741bea9a48eaee27d6f 0.992174 208860
1 000118d12cbf9ad6103e8b914a6e1ac3 0.834382 201600
2 0001776237ac37a69fcef93c1bac0988 0.966021 682192
65536 00027c21667d9119a454df8cef2dc1c7 0.666599 18390
65537 0003887ab64b8ae19ffa988638decac2 0.903260 1134320
3 000403e4e488356b7535cc613fbeb80b 0.773787 199168
65538 0004376a62e22f6ad359467eb742b8ff 0.803515 149720
4 0004c8b2a0f4680a5694d74199b40ea2 0.985592 1165440
5 000595d8b586915c12053104cf845097 0.841920 264240
65539 000634f03457d088c71dbffb897b1315 0.957584 1725502
65540 00072ed24314e91b63b425b3dc572f50 0.486112 328093
65541 00092d369958b67557da8661cc9093bc 0.845657 522936
6 00093d5fa5cb7ce77f6eaf39962daa12 0.803481 742064
7 00099926d51b44c6f8c93a48c2567891 0.997032 725288
65542 0009a64f786fa29bfa6423278cc74f02 0.996663 671280
8 000a2db4762dc06628a086c9e117f884 0.535436 61551
65543 000ac11fa7587b2316470b154254a219 0.997824 1874471
9 000ae2c63ba69fc93dfc395b40bfe03a 0.899481 487386
65544 000ae90736a51c47543dcc6d8a735362 0.863887 260144
65545 000b41258d624ef2d6e430822d0c0c8f 0.992772 590824

20 rows × 3 columns

3. Generate Entropy and File Size of Unpacked Binaries

# TODO: everything.

4. Generate PE ASM and Header Features

- PE Header Features from objdump header summaries.
- ASM Features from IDA Pro assembly files.

- Script: feature_extraction_pe_asm.py

keywords = ['Virtual','Offset','loc','Import','Imports','var','Forwarder','UINT','LONG'

known_sections = ['.text', '.data', '.bss', '.rdata', '.edata', '.idata', '.rsrc', '.tls', '.reloc']

registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',

opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'

def count_asm_symbols(asm_code):
    """Count, per symbol, how many rows of ``asm_code`` contain it.

    The returned list has seven entries in the order
    ``*  -  +  [  ]  @  ?``. A row adds at most one to each entry, however
    many times the symbol occurs in it.
    """
    markers = ('*', '-', '+', '[', ']', '@', '?')
    tallies = [0 for _ in markers]
    for line in asm_code:
        for slot, marker in enumerate(markers):
            if marker in line:
                tallies[slot] += 1

    return tallies

def count_asm_registers(asm_code):
    """Count occurrences of each name in the module-level ``registers`` list.

    Every row has the characters ``, + * [ ] -`` replaced by spaces and is
    split on whitespace; each whole-token occurrence of a register name is
    added to the slot given by ``registers.index(name)``.
    """
    totals = [0] * len(registers)
    for line in asm_code:
        cleaned = line
        for separator in (',', '+', '*', '[', ']', '-'):
            cleaned = cleaned.replace(separator, ' ')
        tokens = cleaned.split()
        for name in registers:
            slot = registers.index(name)
            totals[slot] += tokens.count(name)
    return totals

def count_asm_opcodes(asm_code):
    """Count rows containing each name in the module-level ``opcodes`` list.

    A row is split on whitespace; for every opcode that appears as a whole
    token in the row, the slot given by ``opcodes.index(opcode)`` is
    incremented by one.
    """
    tallies = [0] * len(opcodes)
    for line in asm_code:
        words = line.split()

        for mnemonic in opcodes:
            if mnemonic not in words:
                continue
            tallies[opcodes.index(mnemonic)] += 1
    return tallies

def count_asm_APIs(asm_code, apis):
    """Count, for each entry of ``apis``, the rows that contain it.

    Matching is a plain substring test, so a row adds at most one to each
    API's count. The result is aligned with ``apis``.
    """
    hits = [0] * len(apis)
    for line in asm_code:
        for pos, api_name in enumerate(apis):
            if api_name in line:
                hits[pos] += 1
    return hits

def count_asm_misc(asm_code):
    """Count rows containing each entry of the module-level ``keywords`` list.

    Matching is a substring test per row; the result is aligned with
    ``keywords``.
    """
    hits = [0] * len(keywords)
    for line in asm_code:
        for pos, word in enumerate(keywords):
            if word in line:
                hits[pos] += 1
    return hits

# Extract features from test/training asm files, file list is passed in as a parameter

def extract_asm_features(tfiles):
    """Extract register, opcode, API and keyword counts from .asm files.

    Args:
        tfiles: file names relative to the module-level ``ext_drive``
            directory; only names containing ``.asm`` are processed.

    Writes ``data/<pid>-malware-features-asm.csv`` with a header row followed
    by one row per file: the file name up to ``.asm`` and the counts produced
    by the ``count_asm_*`` helpers. API names are read from the first line of
    ``data/APIs.txt`` (comma separated).
    """
    pid = os.getpid()
    print('Process id:', pid)
    feature_file = 'data/' + str(pid) + '-malware-features-asm.csv' # Windows API, symbols, registers, opcodes, etc...   
    print('feature file:', feature_file)

    with open("data/APIs.txt") as fapi:
        # Strip the line ending so the last API name does not carry a newline.
        defined_apis = [api for api in fapi.readline().rstrip().split(',') if api]

    asm_files = [i for i in tfiles if '.asm' in i]
    ftot = len(asm_files)
    feature_counts = []
    with open(feature_file, 'w') as f:
        # write the csv header
        fw = writer(f)
        colnames = ['file_name'] + registers + opcodes + defined_apis + keywords
        fw.writerow(colnames)
        for idx, fname in enumerate(asm_files):
            with open(ext_drive + fname, 'r') as fasm:
                content = fasm.readlines()
            reg_vals = count_asm_registers(content)
            opc_vals = count_asm_opcodes(content)
            api_vals = count_asm_APIs(content, defined_apis)
            #sec_vals = count_asm_sections(content)
            mis_vals = count_asm_misc(content)
            count_vals = reg_vals + opc_vals + api_vals + mis_vals
            feature_counts.append([fname[:fname.find('.asm')]] + count_vals)   
            # Writing rows after every 10 files processed
            if (idx+1) % 10 == 0:
                print(pid, idx + 1, 'of', ftot, 'files processed.')
                fw.writerows(feature_counts)
                feature_counts = []
        # Writing remaining files
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []


8. Clean and Sort Function Names

- script: function_name_clean.py

In [ ]:
# Need to clean up and sort these function names for ASM feature extraction.
# Reads every line of the raw function-name file into `function_names`.
fip = open('data/all-function-column-names-multiline.csv')
function_names = fip.readlines()


# NOTE(review): the output file is opened (and truncated) here, but no write
# or close is visible in this cell -- the cleaning/sorting statements appear
# to be missing from this export; confirm against function_name_clean.py.
fop = open('data/sorted-function-names-multiline.txt','w')

In [22]:
# Load the sorted function names (one per line, line endings still attached)
# and the comma-separated API names from the first line of APIs.txt.
with open('data/sorted-function-names-multiline.txt','r') as fip:
    sorted_function_names = fip.readlines()
with open('data/APIs.txt','r') as fip:
    api_names_str = fip.readline()
api_names_str = api_names_str.rstrip()
api_names = api_names_str.split(',')


In [23]:
# Strip line endings in place so the names can be compared directly.
sorted_function_names[:] = [name.rstrip() for name in sorted_function_names]

# NOTE(review): the body of the original `if aname not in ...` check is
# missing from this export. This version only collects and reports the API
# names that are absent; it does not modify sorted_function_names. Confirm
# the intended action against function_name_clean.py.
known_function_names = set(sorted_function_names)
missing_api_names = [aname for aname in api_names if aname not in known_function_names]
print("{:d} API names are not in the sorted function name list.".format(len(missing_api_names)))


In [25]:
function_count = len(sorted_function_names)
total_chars = sum(len(func_name) for func_name in sorted_function_names)
# Guard against an empty name list.
avg_name_len = int(total_chars / function_count) if function_count else 0


# truncate function names to reduce the size of the huge sparse matrix.
function_column_names = []
seen_names = set()   # O(1) duplicate check; the list keeps first-seen order
for func in sorted_function_names:
    if func.startswith(('sub', 'loc', 'unk')):
        func = func[:5] # lets try to reduce the vast number of functions.
    elif func.startswith(('eax+', 'ebx+', 'ecx+', 'edx+', 'edi+', 'esi+')):
        func = func[:3]
    elif func.startswith(('byte_', 'word_', 'off_')):
        func = func[:4]
    # The two-character prefixes must be tested before the one-character
    # ones, otherwise this branch can never be reached.
    elif func.startswith(('__', '$$')):
        func = func[2:]
    elif func.startswith(('_', '$')):
        func = func[1:]
    #else: need a regex here to match a bunch of random crap 
    #    func = func[:33]
    if len(func) > 32: # Clamp the function name length to 32 characters.
        func = func[:32]
    if func not in seen_names:
        seen_names.add(func)
        function_column_names.append(func)

In [27]:
# Write the reduced names one per line; the with-block flushes and closes the
# file so the next cell can read it back.
with open('data/sorted-reduced-function-names.txt','w') as fop:
    for fname in function_column_names:
        fop.write(fname + "\n")



# Use a regex to remove function names that are just hexadecimal addresses.
# NOTE(review): with match() this drops any name that starts with a digit and
# later contains an 'h', not only pure hex addresses -- confirm that is wanted.
p1 = re.compile(r'\d\w+h')
reduced_function_names = []
with open('data/sorted-reduced-function-names.txt','r') as fip:
    function_column_names = fip.readlines()

with open('data/sorted-reduced-function-names-hexless.txt','w') as fop:
    for fname in function_column_names:
        fname = fname.rstrip()
        m = p1.match(fname)
        if m is None:
            fop.write(fname + "\n")

10. Test Code Only

In [4]:
signat = sub.check_output(["file",'-b', '/opt/vs/agobot.exe'])

In [5]:

'PE32 executable (GUI) Intel 80386, for MS Windows, UPX compressed\n'

Generate C Library API Tokens for ASM and Header Feature Extraction.

In [13]:
# Generate libc function calls for ELF API feature extraction.
# Put in feature_extraction_elf_asm.py

def generate_libc_api():
    """Extract libc function names from data/libc-function-index.txt.

    After tabs and newlines are removed, lines starting with ``|`` are
    expected to look like ``|funcname|``; the text between the first two bars
    is taken as the name. The names are written one per line to
    ``data/elf-libc-api.txt``.

    Returns:
        The list of extracted function names, in file order.
    """
    with open('data/libc-function-index.txt', 'r') as fipfunc:
        funclines = fipfunc.readlines()
    func_list = []
    for fline in funclines:
        fline = fline.replace('\t','').replace('\n','')
        if fline.startswith('|'):
            tokens = fline.split('|')   # The function names are |funcname|
            funcname = tokens[1]
            if funcname:
                func_list.append(funcname)
    counter = len(func_list)
    print("Found {:d} function definitions for libc api.".format(counter))
    with open('data/elf-libc-api.txt', 'w') as fop:
        for func_name in func_list:
            fop.write(func_name + "\n")
    return func_list

def generate_libc_var():
    """Extract libc variable names from data/libc-variable-index.txt.

    After tabs and newlines are removed, lines starting with ``|`` are
    expected to look like ``|varname|``; the text between the first two bars
    is taken as the name. The names are written one per line to
    ``data/elf-libc-var.txt``.

    Returns:
        The list of extracted variable names, in file order.
    """
    with open('data/libc-variable-index.txt', 'r') as fipvar:
        varlines = fipvar.readlines()
    var_list = []
    for vline in varlines:
        vline = vline.replace('\t','').replace('\n','')
        if vline.startswith('|'):
            tokens = vline.split('|')   # The variable names are |varname|
            varname = tokens[1]
            if varname:
                var_list.append(varname)
    counter = len(var_list)
    print("Found {:d} variable definitions for libc api.".format(counter))
    with open('data/elf-libc-var.txt', 'w') as fop:
        for var_name in var_list:
            fop.write(var_name + "\n")
    return var_list

func_list = generate_libc_api()

Validate Disassembly Results.

In [18]:
# Check interrupted disassembly results for train1 feature set.
# def validate_disassembly():
# put in disassemble_pe.py

t1asm = os.listdir('/opt/vs/train1asm/')
t1hdr = os.listdir('/opt/vs/train1hdr/')
# Keep only the disassembly (.asm) and header dump (.txt) files.
asm_files = [fname for fname in t1asm if fname.endswith('.asm')]
hdr_files = [fname for fname in t1hdr if fname.endswith('.txt')]
print("asm dir: {:d} asm files {:d} hdr dir {:d} hdr files {:d}".format(len(t1asm),len(asm_files),len(t1hdr),len(hdr_files)))

asm dir: 21314 asm files 21314 hdr dir 21409 hdr files 21409

In [19]:
len(t1hdr) - len(t1asm)


In [20]:
counter = 0
missing_hdr_list = []
hdr_file_set = set(hdr_files)   # O(1) membership instead of scanning the list

for fname in asm_files:
    hdr_name = fname.replace('.asm', '.txt')
    if hdr_name not in hdr_file_set:
        print("{:s} not in header file list.".format(hdr_name))
        missing_hdr_list.append(hdr_name)
        counter += 1
print("{:d} missing header files.".format(counter))

0 missing header files.

counter = 0
missing_asm_list = []
asm_file_set = set(asm_files)   # O(1) membership instead of scanning the list

for fname in hdr_files:
    asm_name = fname.replace('.txt','.asm')
    if asm_name not in asm_file_set:
        print("{:s} not in asm file list.".format(asm_name))
        # Recorded so the next cell can write the names to disk.
        missing_asm_list.append(asm_name)
        counter += 1
print("{:d} missing assembly files.".format(counter))

In [23]:
counter = 0
# The with-block guarantees the list is flushed and the file closed.
with open('data/disass-train1-missing-asm-files.txt', 'w') as fop:
    for fname in missing_asm_list:
        fop.write(fname + "\n")
        counter += 1
print("Wrote {:d} missing asm file names.".format(counter))

Wrote 95 missing asm file names.

In [25]:
counter = 0
bad_hdr_list = []

# A header dump smaller than 1000 bytes is treated as failed output.
for fname in hdr_files:
    fsize = os.path.getsize('/opt/vs/train1hdr/' + fname)
    if fsize < 1000:
        print("{:s} bad output, filesize = {:d}.".format(fname, fsize))
        bad_hdr_list.append(fname)
        counter += 1
print("{:d} bad header files.".format(counter))

VirusShare_d5eff38b212286c46db007aa7159ffd8.txt bad output, filesize = 0.
VirusShare_592d7ac775519110d58e9ce1975c1b5b.txt bad output, filesize = 0.
VirusShare_4a0c79f6ad27b0a674b08005d102e16d.txt bad output, filesize = 0.
VirusShare_c80d9b2dbf9b7953a3b6e9b51a39a0c2.txt bad output, filesize = 0.
4 bad header files.

counter = 0
bad_asm_list = []

# A disassembly smaller than 1000 bytes is treated as failed output.
for fname in asm_files:
    fsize = os.path.getsize('/opt/vs/train1asm/' + fname)
    if fsize < 1000:
        print("{:s} bad output, filesize = {:d}.".format(fname, fsize))
        bad_asm_list.append(fname)
        counter += 1
print("{:d} bad asm files.".format(counter))

In [3]:
apt_df = pd.read_csv('data/sorted-entropy-features-apt.csv')
apt_file_list = apt_df['file_name']


In [5]:

0    001dd76872d80801692ff942308c64e6
1    002325a0a67fded0381b5648d7fe9b8e
2    00dbb9e1c09dbdafb360f3163ba5a3de
3    0149b7bd7218aab4e257d28469fddb0d
4    01e0dc079d4e33d8edd050c4900818da
Name: file_name, dtype: object

In [7]:
f_list = os.listdir('/home/derek/project/temp/train/')
# Keep files whose name starts with 'Virus', dropping everything up to and
# including the first underscore so the names match the feature file.
file_list = [fname[fname.find('_') + 1:] for fname in f_list if fname.startswith('Virus')]
counter = len(file_list)
print("Got {:d} files in training directory.".format(counter))

apt_list = np.array(apt_file_list)
apt_name_set = set(apt_file_list)   # O(1) membership for the comparison below
for fname in file_list:
    if fname not in apt_name_set:
        print("Extra file: {:s}".format(fname))

Got 294 files in training directory.
Extra file: 00248ef21706d78c1f0e1eca3cab72c3

Rename Header and ASM Files Generated by IDA Pro.

In [5]:
def rename_header_files(ext_dir):
    """Rename PE header dumps in ``ext_dir`` from ``<name>.txt`` to ``<name>.pe.txt``.

    Only files whose name starts with ``Virus`` are touched. Files already
    ending in ``.pe.txt`` are left alone, so the function can be re-run
    safely. A name without a ``.txt`` suffix simply gets ``.pe.txt`` appended.

    Args:
        ext_dir: directory path including the trailing separator.
    """
    file_list = os.listdir(ext_dir)
    counter = 0
    for fname in file_list:
        if not fname.startswith('Virus') or fname.endswith('.pe.txt'):
            continue
        # Strip only a real '.txt' suffix; slicing with the result of find()
        # would cut off the last character when the suffix is absent.
        trunc_name = fname[:-len('.txt')] if fname.endswith('.txt') else fname
        # Source and target are in the same directory, so a plain rename is
        # sufficient.
        os.rename(ext_dir + fname, ext_dir + trunc_name + '.pe.txt')
        counter += 1

        # Report progress only when a file was actually renamed.
        if (counter % 1000) == 0:
            print('Renamed {:d} header files.'.format(counter))

    print('Completed move of {:d} header files.'.format(counter))

ext_dir = '/home/derek/project/temp/'

In [2]:
def rename_asm_files(ext_dir, new_dir):
    """Move every ``*.asm`` file from ``ext_dir`` to ``new_dir`` as ``*.pe.asm``.

    The new name is the part of the file name before its first ``.asm`` plus
    ``.pe.asm``. Files are moved with the external ``mv`` command. Both
    directory arguments must include the trailing separator.
    """
    entries = os.listdir(ext_dir)
    print("Got {:d} files in directory {:s}".format(len(entries), ext_dir))
    moved = 0
    for entry in entries:
        if not entry.endswith('.asm'):
            continue
        stem = entry[:entry.find('.asm')]
        sub.check_call(['mv', ext_dir + entry, new_dir + stem + '.pe.asm'])
        moved += 1

        if moved % 1000 == 0:
            print('Renamed {:d} ASM files.'.format(moved))

    print('Completed rename of {:d} ASM files.'.format(moved))

In [4]:
def rename_asm_files_fix(ext_dir):
    """Rename the remaining ``*.asm`` files in ``ext_dir`` to ``*.pe.asm`` in place.

    Files already ending in ``.pe.asm`` are only counted. Every other ``.asm``
    file is renamed with the external ``mv`` command. ``ext_dir`` must include
    the trailing separator.
    """
    entries = os.listdir(ext_dir)
    print("Got total files: {:d}".format(len(entries)))
    already_renamed = 0
    renamed = 0
    for entry in entries:
        if entry.endswith('.pe.asm'):
            already_renamed += 1
            continue
        if not entry.endswith('.asm'):
            continue
        source = ext_dir + entry
        target = ext_dir + entry[:entry.find('.asm')] + '.pe.asm'
        sub.check_call(['mv', source, target])
        renamed += 1

        if renamed % 1000 == 0:
            print('Renamed {:d} ASM files.'.format(renamed))

    print('Completed move of {:d} ASM files with {:d} files already renamed.'.format(renamed, already_renamed))

Validate Disassembly Results Part 2

- disassemble_pe.py

In [4]:
# Moved to feature-extraction-validation.ipynb

Find ELF Disassembly Files.

In [6]:
def find_elf_train1asm(ext_dir):
    """Report the files in ``ext_dir`` whose name contains ``elf``.

    Prints the total number of directory entries, the full path of each
    matching file and a final count. Nothing is moved or renamed: the ``mv``
    call is commented out in the original, although the final message still
    says "move".
    """
    entries = os.listdir(ext_dir)
    print("Got total files: {:d}".format(len(entries)))
    elf_paths = [ext_dir + entry for entry in entries if 'elf' in entry]
    for path in elf_paths:
        print('ELF ASM file {:s}.'.format(path))

    print('Completed move of {:d} ELF ASM files.'.format(len(elf_paths)))

In [7]:

Got total files: 54911
Completed move of 0 ELF ASM files.

In [8]:

Got total files: 46166
Completed move of 0 ELF ASM files.

Test Generation of PE/COFF Header Tokens.

- generate_pe_header_tokens.py

In [27]:
def save_token_counts(token_counter_map, out_file):
    """Write the token counts to ``out_file`` as CSV, sorted by token name.

    Args:
        token_counter_map: dict mapping token name to its count.
        out_file: path of the CSV file to create.

    The file starts with the header ``token_name,count``. Rows are written in
    batches of 100, with a progress message after each batch.
    """
    cols = ['token_name','count'] # write out the column names.
    sorted_keys = sorted(token_counter_map)
    with open(out_file, 'w') as fop:
        csv_wouter = writer(fop)
        csv_wouter.writerow(cols)
        outlines = []
        for counter, key in enumerate(sorted_keys, 1):
            outlines.append([key, token_counter_map[key]])
            if (counter % 100) == 0: # write out some lines
                csv_wouter.writerows(outlines)
                outlines = []
                print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))

        # Finish off.
        if outlines:
            csv_wouter.writerows(outlines)

    print("Completed writing {:d} tokens.".format(len(sorted_keys)))    


def get_token_count_map(token_df):
    """Build a dict from a token-count DataFrame.

    Args:
        token_df: DataFrame whose first column holds the token names and whose
            second column holds the counts.

    Returns:
        A dict mapping each value of the first column to the value of the
        second column in the same row; for repeated names the last row wins.
    """
    names = token_df.iloc[:, 0].tolist()
    counts = token_df.iloc[:, 1].tolist()
    return dict(zip(names, counts))

def combine_token_files():
    """Placeholder; a version taking file names is defined in a later cell."""
    # TODO: everything
    pass

def generate_pe_tokens(file_list, out_token_file, out_count_file):
    """Count section, DLL, import and export names found in PE header dumps.

    Args:
        file_list: paths of the header dump files to parse.
        out_token_file: unused here; kept for interface compatibility.
        out_count_file: passed to ``save_token_counts`` with the final counts.

    Each line is tested against the patterns in a fixed order (relocation,
    section, DLL, import function, export); the first match decides the token.
    Relocation lines and lines matching none of the patterns are skipped.
    """
    psections = re.compile(r'\s+\d{1,2}\s+(\.\w+|\w+)\s+\d+')  # Pattern for section names.
    pdlls = re.compile(r'\s+DLL Name: (\w+)')                  # Pattern for import DLL names.
    pfunctions = re.compile(r'\s+\w+\s+\d{1,4}\s+(.+)')        # Pattern for import function names.
    preloc = re.compile(r'\s+reloc')                           # Pattern for relocation entries.
    pexports = re.compile(r'\s+\[\s*\d+\]\s+(\w+)')            # Pattern for exported function names.
    token_counter_map = {}
    counter = 0
    pid = os.getpid()
    for fname in file_list:

        with open(fname, 'r') as fip:
            in_lines = fip.readlines()
        counter += 1
        for line in in_lines:

            line = line.rstrip() # get rid of newlines they are annoying.
            # Relocation entries carry no token.
            if preloc.match(line) is not None:
                continue

            m = psections.match(line)
            if m is not None:
                token_val = m.group(1)
                print("Section: {:s}".format(token_val))
            else:
                m = pdlls.match(line) or pfunctions.match(line)
                if m is not None:
                    token_val = m.group(1)
                else:
                    m = pexports.match(line)
                    if m is None:
                        continue   # nothing recognised on this line
                    token_val = m.group(1)
                    print("Export: {:s}".format(token_val))
            # Count the token type.
            token_counter_map[token_val] = token_counter_map.get(token_val, 0) + 1

        if (counter % 100) == 0:
            print("{:d} Processed {:d} header files.".format(pid, counter))

    save_token_counts(token_counter_map, out_count_file)

In [ ]:
# Build the full path of every entry in the PE header dump directory.
ext_drive = '/opt/vs/hdr/'
file_list = os.listdir(ext_drive)
file_paths = []

# ext_drive ends with a separator, so plain concatenation yields a valid path.
for fname in file_list:
    file_paths.append(ext_drive + fname)

In [6]:
# Testing PE header token generation.

def save_token_counts(token_counter_map, out_file_name):
    """Write this process's PE header token counts to a CSV in ``data/``.

    Args:
        token_counter_map: dict mapping token name to its count.
        out_file_name: base file name; the file is created as
            ``data/<pid>-<out_file_name>``.

    Rows are ``token,count`` sorted by token name. No header row is written
    because ``combine_token_files`` parses every line as data. Rows are
    written in batches of 100, with a progress message after each batch.
    """
    pid = os.getpid()
    out_file = "data/" + str(pid) + "-" + out_file_name
    sorted_keys = sorted(token_counter_map)
    with open(out_file, 'w') as fop:
        csv_wouter = writer(fop)

        outlines = []
        for counter, key in enumerate(sorted_keys, 1):
            outlines.append([key, token_counter_map[key]])
            if (counter % 100) == 0: # write out some lines
                csv_wouter.writerows(outlines)
                outlines = []
                print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))

        # Finish off.
        if outlines:
            csv_wouter.writerows(outlines)

    print("Completed writing {:d} tokens.".format(len(sorted_keys)))    


def get_token_count_map(token_df):
    """Build a dict from a token-count DataFrame.

    Args:
        token_df: DataFrame whose first column holds the token names and whose
            second column holds the counts.

    Returns:
        A dict mapping each value of the first column to the value of the
        second column in the same row; for repeated names the last row wins.
    """
    names = token_df.iloc[:, 0].tolist()
    counts = token_df.iloc[:, 1].tolist()
    return dict(zip(names, counts))

def generate_pe_tokens(mp_params):
    """Count section, DLL, import and export names found in PE header dumps.

    Parses PE/COFF headers dumped by objdump.

    Args:
        mp_params: object with a ``file_list`` attribute (paths of the header
            dumps) and a ``count_file`` attribute (passed to
            ``save_token_counts`` with the final counts).

    Each line is tested against the patterns in a fixed order (relocation,
    section, DLL, import function, export); the first match decides the token.
    Relocation lines and lines matching none of the patterns are skipped.
    """
    file_list = mp_params.file_list
    out_count_file = mp_params.count_file
    psections = re.compile(r'\s+\d{1,2}\s+(\.\w+|\w+)\s+\d+')  # Pattern for section names.
    pdlls = re.compile(r'\s+DLL Name: (\w+)')                  # Pattern for import DLL names.
    pfunctions = re.compile(r'\s+\w+\s+\d{1,4}\s+(.+)')        # Pattern for import function names.
    preloc = re.compile(r'\s+reloc')                           # Pattern for relocation entries.
    pexports = re.compile(r'\s+\[\s*\d+\]\s+(\w+)')            # Pattern for exported function names.
    patterns = (psections, pdlls, pfunctions, pexports)
    token_counter_map = {}
    counter = 0
    pid = os.getpid()
    for fname in file_list:

        with open(fname, 'r') as fip:
            in_lines = fip.readlines()
        counter += 1
        for line in in_lines:

            line = line.rstrip() # get rid of newlines they are annoying.
            # Relocation entries carry no token.
            if preloc.match(line) is not None:
                continue

            # The first pattern that matches supplies the token.
            token_val = None
            for pattern in patterns:
                m = pattern.match(line)
                if m is not None:
                    token_val = m.group(1)
                    break
            if token_val is None:
                continue   # nothing recognised on this line

            # Count the token type.
            token_counter_map[token_val] = token_counter_map.get(token_val, 0) + 1

        if (counter % 100) == 0:
            print("{:d} Processed {:d} header files.".format(pid, counter))

    save_token_counts(token_counter_map, out_count_file)

def save_combine(token_counter_map, out_file_name):
    """Save the combined token counts to ``data/<out_file_name>`` as CSV.

    Args:
        token_counter_map: dict mapping token name to its total count.
        out_file_name: name of the file to create inside ``data/``.

    The file starts with the header ``token_name,count`` followed by one row
    per token, sorted by name. Rows are written in batches of 100, with a
    progress message after each batch.
    """
    out_file = "data/" + out_file_name
    cols = ['token_name','count'] 
    sorted_keys = sorted(token_counter_map)
    with open(out_file, 'w') as fop:
        csv_wouter = writer(fop)
        csv_wouter.writerow(cols)
        outlines = []
        for counter, key in enumerate(sorted_keys, 1):
            outlines.append([key, token_counter_map[key]])
            if (counter % 100) == 0: # write out some lines
                csv_wouter.writerows(outlines)
                outlines = []
                print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))

        # Finish off.
        if outlines:
            csv_wouter.writerows(outlines)

    print("Completed writing {:d} tokens.".format(len(sorted_keys)))  

def combine_token_files(token_file, count_file):
    """Merge the per-process token count files into one combined file.

    Args:
        token_file: name of the combined output file, passed to
            ``save_combine``.
        count_file: base name of the per-process files; every file in
            ``data/`` named ``<3-5 digit pid>-<count_file>`` is read.

    Counts for the same token are summed across files.
    """
    # Only `writer` is imported at file level, so bring `reader` in locally.
    from csv import reader

    # The numeric prefix is the PID of the worker that wrote the file. The
    # file name is escaped so its '.' is matched literally.
    p1 = re.compile(r'\d{3,5}-' + re.escape(count_file))
    counter = 0
    token_map = {}
    for file_name in os.listdir('data/'):
        if p1.match(file_name):
            with open('data/' + file_name, 'r') as fip:
                # csv.reader handles quoted token names that contain commas.
                for row in reader(fip):
                    if len(row) < 2:
                        continue   # blank or malformed line
                    token_map[row[0]] = token_map.get(row[0], 0) + int(row[1])
                    counter += 1

    save_combine(token_map, token_file)
    print('Completed combine of {:d} PE/COFF header tokens.'.format(counter)) 

class Multi_Params(object):
    """Bundle of parameters handed to a multiprocessing worker.

    Attributes set by the constructor: ``token_file``, ``count_file`` and
    ``file_list``.
    """

    def __init__(self, tokenfile="", countfile="", filelist=None):
        self.token_file = tokenfile
        self.count_file = countfile
        # A fresh list per instance; a shared `[]` default would leak entries
        # between instances.
        self.file_list = [] if filelist is None else filelist

In [3]:
# File names for the APT sample set: combined token list, per-process count
# file base name, and the directory holding the header dumps.
token_file = 'pe-header-tokens-apt.txt'
count_file = 'pe-header-token-counts-apt.csv'
ext_drive = '/opt/vs/apthdr/'
file_list = os.listdir(ext_drive)
tfiles = []

# ext_drive ends with a separator, so plain concatenation yields a valid path.
for fname in file_list:
    tfiles.append(ext_drive + fname)

In [ ]:
mp1 = Multi_Params(token_file, count_file, tfiles)


In [7]:
combine_token_files(token_file, count_file)

Processed token AdjustTokenPrivileges -> 63.
Processed token CryptGetUserKey -> 3.
Processed token GetAdaptersInfo -> 2.
Processed token GetSystemTimeAsFileTime -> 29.
Processed token IsValidLocale -> 7.
Processed token OutputDebugStringA -> 32.
Processed token SHCreateDirectoryExA -> 7.
Processed token UnhandledExceptionFilter -> 64.
Processed token _configthreadlocale -> 2.
Processed token free -> 120.
Processed token wsprintfA -> 20.
Completed writing 1103 tokens.
Completed combine of 1103 PE/COFF header tokens.

Test PE Header Feature Extraction

- feature_extraction_pe_header.py

In [6]:
# Testing PE header feature extraction.

field_list = [ "Characteristics","Time/Date","Magic","MajorLinkerVersion","MinorLinkerVersion",
"SizeOfStackCommit","SizeOfHeapReserve","SizeOfHeapCommit","LoaderFlags","NumberOfRvaAndSizes" ]

field_list_len = len(field_list)

ptime = re.compile(r"Time/Date\s+(.+)") # Time/Date pattern for PE Header field.

def get_field_values(header_lines):
    """Extract the numeric PE header fields from a header dump.

    Args:
        header_lines: list of lines of one header dump file.

    Returns:
        A list aligned with ``field_list``. ``Time/Date`` is converted to a
        local-time epoch integer (0 if it cannot be parsed); every other field
        is the second token of its line parsed as hexadecimal. Fields that are
        not found stay 0.
    """
    field_vals = [0] * field_list_len
    # The PE header fields are in the first 44 lines of the file; slicing also
    # copes with shorter (for example empty) dumps.
    for raw_line in header_lines[:44]:
        line = raw_line.rstrip()
        tokens = line.split()
        for idx2, field_name in enumerate(field_list):
            if field_name not in tokens:
                continue
            if field_name.startswith("Time"):
                time_epoch = 0
                # The date follows the field name on the line itself.
                time_match = ptime.search(line)
                if time_match is not None:
                    try:
                        time_s = tm.strptime(time_match.group(1), "%a %b %d %H:%M:%S %Y")
                        time_epoch = int(tm.mktime(time_s)) # Convert time string to epoch int.
                    except (ValueError, OverflowError):
                        time_epoch = 0
                field_vals[idx2] = time_epoch

            elif len(tokens) > 1:
                field_vals[idx2] = int(tokens[1], 16) # Convert the hex value of the field to int.
    return field_vals
def count_header_keywords(asm_code, keywords, klen):
    """Count how many rows contain each of the first ``klen`` keywords.

    The check is a substring test, and a row adds at most one to a keyword's
    tally no matter how many times the keyword occurs in that row.

    Args:
        asm_code: iterable of text rows to scan.
        keywords: sequence of keyword strings.
        klen: number of leading entries of ``keywords`` to count.

    Returns:
        A list of ``klen`` integer tallies, index-aligned with ``keywords``.
    """
    hits = [0] * klen
    slots = range(klen)
    for line in asm_code:
        for slot in slots:
            if keywords[slot] in line:
                hits[slot] = hits[slot] + 1
    return hits

def extract_header_features(multi_parameters):
    """Write PE header features for every '.pe.txt' file in the global ``tfiles``.

    Steps:
      1. Build the per-process feature file name and the token file name.
      2. Load the token/keyword list and normalise the names.
      3. For each header dump, collect the field values and keyword counts
         and write them as one CSV row.

    Each row is: file name (text up to the first underscore, i.e. the
    VirusShare_ prefix, and the '.pe.txt' suffix removed), the values from
    ``get_field_values`` and the counts from ``count_header_keywords``.

    Args:
        multi_parameters: object exposing ``out_file`` and ``token_file``
            (file names relative to ``data/``).

    Reads the module-level ``tfiles`` (file names) and ``ext_drive``
    (directory prefix). Output goes to ``data/<pid>-<out_file>``.
    """
    pid = os.getpid()
    feature_file = 'data/' + str(pid) + "-" + multi_parameters.out_file
    token_file = 'data/' + multi_parameters.token_file
    print('Process id: {:d} - Feature file: {:s} - Keyword file: {:s}'.format(pid, feature_file, token_file))

    hdr_pd = pd.read_csv(token_file)
    tokens = list(hdr_pd['token_name'])
    tlen = len(tokens)

    for idx, token in enumerate(tokens): # Clamp the token name length and demangle C++ names, they are annoying.
        token = token.replace('@','').replace('$','').replace('?','')
        # Slicing clamps long names and leaves short ones untouched.
        tokens[idx] = token[:32]

    asm_files = [i for i in tfiles if '.pe.txt' in i]
    ftot = len(asm_files)
    feature_counts = []
    with open(feature_file, 'w') as f:

        fw = writer(f)
        for idx, fname in enumerate(asm_files):
            with open(ext_drive + fname, 'r') as fasm:
                content = fasm.readlines()
            fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.
            field_vals = get_field_values(content)
            keyword_vals = count_header_keywords(content, tokens, tlen)
            feature_counts.append([fname[0:fname.find('.pe.txt')]] + field_vals + keyword_vals)
            # Flush the buffered rows after every 1000 files processed.
            if (idx+1) % 1000 == 0:
                print("{:d} - {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
                fw.writerows(feature_counts)
                feature_counts = []
        # Writing remaining features
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []

    print("{:d} Completed processing {:d} PE header files.".format(pid, ftot))

def combine_feature_files(feature_file_name, token_file):
    """Merge the per-process PE header feature files into one sorted CSV.

    Steps:
      1. Build the header row: ``file_name``, the ``field_list`` names, then
         the normalised token names from ``data/<token_file>``.
      2. Write the header to ``data/<feature_file_name>`` and append the lines
         of every file in ``data/`` named ``<pid>-<feature_file_name>``.
      3. Re-read the combined file, sort it by ``file_name`` and write it to
         ``data/sorted-<feature_file_name>``.

    Args:
        feature_file_name: base name of the feature file (without PID prefix).
        token_file: name of the token CSV (column ``token_name``) under ``data/``.
    """
    hdr_pd = pd.read_csv('data/' + token_file)
    tokens = list(hdr_pd['token_name'])
    for idx, token in enumerate(tokens): # Clamp the token name length and demangle C++ names, they are annoying.
        token = token.replace('@','').replace('$','').replace('?','')
        # Slicing clamps long names and leaves short ones untouched.
        tokens[idx] = token[:32]

    colnames = "file_name," + ",".join(field_list) + "," + ",".join(tokens) + "\n"
    print("Column names: {:s}".format(colnames))

    # PID prefix for each per-process file. Any digit count is accepted because
    # PIDs can be shorter than 3 digits (containers) or longer than 5; the file
    # name is escaped so its dots are matched literally.
    p1 = re.compile(r'\d+-' + re.escape(feature_file_name) + '$')
    file_list = os.listdir('data/')
    counter = 0
    with open('data/' + feature_file_name, 'w') as fop:
        fop.write(colnames)
        for file_name in file_list:
            if p1.match(file_name):
                with open('data/' + file_name, 'r') as fip:
                    in_lines = fip.readlines()
                fop.writelines(in_lines)
                counter += len(in_lines)

    features = pd.read_csv('data/' + feature_file_name)
    # DataFrame.sort() only exists in old pandas; newer releases provide sort_values().
    if hasattr(features, 'sort_values'):
        sorted_features = features.sort_values('file_name')
    else:
        sorted_features = features.sort('file_name')
    sorted_features.to_csv('data/sorted-' + feature_file_name, index=False)
    print('Completed combine of {:d} PE header file features.'.format(counter))

class Multi_Params(object):
    """Parameter bundle read by the PE header feature-extraction functions.

    Attributes are plain values copied from the constructor arguments:
    ``out_file``, ``token_file``, ``field_names`` and ``file_list``.
    """
    def __init__(self, outfile="", tokenfile="", fieldnames=None, filelist=None):
        self.out_file = outfile
        self.token_file = tokenfile
        # None stands in for "empty list" so instances never share one default list.
        self.field_names = [] if fieldnames is None else fieldnames
        self.file_list = [] if filelist is None else filelist

In [ ]:
# Driver cell for the APT sample set: names of the input/output files and the
# directory that holds the PE header dumps.
header_field_names = 'pe-coff-header-field-names.txt'
out_file = 'pe-header-features-apt.csv'
token_file = 'pe-header-tokens-apt.csv'
ext_drive = '/opt/vs/apthdr/'
tfiles = os.listdir(ext_drive)

# NOTE(review): mp1 is built here but nothing in this cell (as exported) uses it;
# presumably the extraction call that consumed it was dropped - confirm.
mp1 = Multi_Params(out_file, token_file, header_field_names, tfiles)


# Merge the per-process feature files under data/ into one sorted CSV.
combine_feature_files(out_file, token_file)

In [7]:
# Re-run the combine step; the column names and row count it prints follow below.
combine_feature_files(out_file, token_file)

Column names: file_name,Characteristics,Time/Date,Magic,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOSystemVersion,MinorOSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,Win32Version,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,.CRT,.bss,.data,.idata,.pdata,.rdata,.reloc,.rsrc,.text,.upx,<none>,0basic_fstreamDUchar_traitsDstds,0basic_fstreamDUchar_traitsDstds,0basic_iostreamDUchar_traitsDstd,0basic_streambufDUchar_traitsDst,0basic_stringDUchar_traitsDstdVa,0basic_stringDUchar_traitsDstdVa,0Initios_basestdQAEXZ,0_LockitstdQAEXZ,0_WinitstdQAEXZ,0exceptionQAEABQBDZ,0exceptionQAEABV0Z,0ios_basestdIAEXZ,0localestdQAEXZ,1basic_filebufDUchar_traitsDstds,1basic_fstreamDUchar_traitsDstds,1basic_iosDUchar_traitsDstdstdUA,1basic_iostreamDUchar_traitsDstd,1basic_streambufDUchar_traitsDst,1basic_stringDUchar_traitsDstdVa,1Initios_basestdQAEXZ,1_LockitstdQAEXZ,1_WinitstdQAEXZ,1exceptionUAEXZ,1ios_basestdUAEXZ,1localestdQAEXZ,1type_infoUAEXZ,2YAPAXIZ,3YAXPAXZ,6basic_ostreamDUchar_traitsDstds,6stdYAAAVbasic_ostreamDUchar_tra,7ios_basestdQBE_NXZ,8stdYA_NABVbasic_stringDUchar_tr,8stdYA_NABVbasic_stringDUchar_tr,9stdYA_NABVbasic_stringDUchar_tr,HstdYAAVbasic_stringDUchar_trait,HstdYAAVbasic_stringDUchar_trait,MstdYA_NABVbasic_stringDUchar_tr,_7basic_filebufDUchar_traitsDstd,_7basic_fstreamDUchar_traitsDstd,_7basic_iosDUchar_traitsDstdstd6,_8basic_fstreamDUchar_traitsDstd,_8basic_fstreamDUchar_traitsDstd,_Dbasic_fstreamDUchar_traitsDstd,_UYAPAXIZ,_VYAXPAXZ,_C1_Nullstrbasic_stringDUchar_tr,_Copybasic_stringDUchar_traitsDs,_Eosbasic_stringDUchar_traitsDst,_Fpzstd3_JB,_Freezebasic_stringDUchar_traits,_Growbasic_stringDUchar_traitsDs,_Initbasic_filebufDUchar_traitsD,_Initcvtbasic_filebufD
Uchar_trai,_Splitbasic_stringDUchar_traitsD,_Tidybasic_stringDUchar_traitsDs,_XlenstdYAXXZ,_XranstdYAXXZ,__FiopenstdYAPAU_iobufPBDHZ,_type_info_dtor_internal_methodt,appendbasic_stringDUchar_traitsD,appendbasic_stringDUchar_traitsD,appendbasic_stringDUchar_traitsD,assignbasic_stringDUchar_traitsD,assignbasic_stringDUchar_traitsD,c_strbasic_stringDUchar_traitsDs,clearbasic_iosDUchar_traitsDstds,clearios_basestdQAEXH_NZ,closebasic_filebufDUchar_traitsD,closebasic_fstreamDUchar_traitsD,coutstd3Vbasic_ostreamDUchar_tra,endlstdYAAAVbasic_ostreamDUchar_,erasebasic_stringDUchar_traitsDs,gcountbasic_istreamDUchar_traits,insertbasic_stringDUchar_traitsD,is_openbasic_fstreamDUchar_trait,max_sizebasic_stringDUchar_trait,nposbasic_stringDUchar_traitsDst,openbasic_filebufDUchar_traitsDs,openbasic_fstreamDUchar_traitsDs,readbasic_istreamDUchar_traitsDs,seekgbasic_istreamDUchar_traitsD,seekpbasic_ostreamDUchar_traitsD,setstatebasic_iosDUchar_traitsDs,tellpbasic_ostreamDUchar_traitsD,terminateYAXXZ,writebasic_ostreamDUchar_traitsD,ADVAPI32,AcquireCredentialsHandleA,AdjustTokenPrivileges,AllocConsole,AllocateAndInitializeSid,AssignProcessToJobObject,AttachConsole,BeginPaint,BitBlt,BuildExplicitAccessWithNameA,BuildSecurityDescriptorW,CLSIDFromString,COMCTL32,COMDLG32,CRYPT32,CertAddCertificateContextToStore,CertCloseStore,CertEnumCertificatesInStore,CertEnumSystemStore,CertFindCertificateInStore,CertFreeCertificateChain,CertFreeCertificateContext,CertGetCertificateChain,CertGetCertificateContextPropert,CertGetNameStringW,CertNameToStrA,CertOpenStore,CertOpenSystemStoreA,CertVerifyCertificateChainPolicy,ChangeServiceConfig2A,ChangeServiceConfigA,CharLowerA,CharToOemA,CharToOemBuffA,CharToOemBuffW,CharUpperA,CharUpperW,CloseDesktop,CloseHandle,CloseServiceHandle,CloseWindowStation,CmUnRegisterCallback,CoCreateInstance,CoInitialize,CodeView,CommDlgExtendedError,CompareFileTime,CompareStringA,CompareStringW,ConnectNamedPipe,ContinueDebugEvent,ControlService,ConvertSidToStringSidW,CopyFi
leA,CopyFileW,CopyRect,CreateCompatibleBitmap,CreateCompatibleDC,CreateDCA,CreateDirectoryA,CreateDirectoryW,CreateEventA,CreateFileA,CreateFileMappingA,CreateFileMappingW,CreateFileW,CreateJobObjectW,CreateMutexA,CreateNamedPipeA,CreateNamedPipeW,CreatePipe,CreateProcessA,CreateProcessAsUserA,CreateProcessW,CreateProcessWithLogonW,CreateRemoteThread,CreateServiceA,CreateServiceW,CreateStreamOnHGlobal,CreateThread,CreateToolhelp32Snapshot,CreateWaitableTimerA,CreateWindowExA,CreateWindowExW,CredEnumerateW,CredFree,CryptAcquireCertificatePrivateKe,CryptAcquireContextA,CryptAcquireContextW,CryptBinaryToStringA,CryptCreateHash,CryptDecrypt,CryptDeriveKey,CryptDestroyHash,CryptDestroyKey,CryptDuplicateKey,CryptEncrypt,CryptEnumProvidersW,CryptExportKey,CryptGenRandom,CryptGetHashParam,CryptGetKeyParam,CryptGetProvParam,CryptGetUserKey,CryptHashData,CryptImportKey,CryptReleaseContext,CryptSetHashParam,CryptSetKeyParam,CryptStringToBinaryA,DNSAPI,DbgPrint,DebugActiveProcess,DebugBreak,DebugSetProcessKillOnExit,DecodePointer,DecryptMessage,DefWindowProcA,DefWindowProcW,DeleteCriticalSection,DeleteDC,DeleteFileA,DeleteFileW,DeleteObject,DeleteSecurityContext,DeleteService,DeleteUrlCacheEntry,DeregisterEventSource,DestroyIcon,DestroyWindow,DeviceIoControl,DialogBoxParamA,DialogBoxParamW,DisconnectNamedPipe,DispatchMessageA,DispatchMessageW,DnsFree,DnsQuery_A,DosDateTimeToFileTime,DrawIcon,DuplicateHandle,DuplicateTokenEx,EnableWindow,EncodePointer,EncryptMessage,EndDialog,EndPaint,EnterCriticalSection,EnumProcessModules,EnumProcesses,EnumServicesStatusExA,EnumServicesStatusExW,EnumSystemLocalesA,EnumWindows,ExAllocatePoolWithQuotaTag,ExAllocatePoolWithTag,ExFreePoolWithTag,ExitProcess,ExitThread,ExitWindowsEx,ExpandEnvironmentStringsA,ExpandEnvironmentStringsW,FLTMGR,FileTimeToLocalFileTime,FileTimeToSystemTime,FillConsoleOutputCharacterW,FindClose,FindFirstFileA,FindFirstFileW,FindFirstVolumeA,FindNextFileA,FindNextFileW,FindResourceA,FindResourceExA,FindResourceW,FindVolum
eClose,FindWindowExA,FindWindowExW,FlsAlloc,FlsFree,FlsGetValue,FlsSetValue,FltEnumerateFilters,FltEnumerateInstances,FltGetFilterInformation,FltGetVolumeFromInstance,FltObjectDereference,FlushConsoleInputBuffer,FlushFileBuffers,FormatMessageA,FormatMessageW,FreeConsole,FreeContextBuffer,FreeCredentialsHandle,FreeEnvironmentStringsA,FreeEnvironmentStringsW,FreeLibrary,FreeLibraryAndExitThread,FreeResource,FreeSid,GDI32,GenerateConsoleCtrlEvent,GetACP,GetAdaptersInfo,GetCPInfo,GetClassNameA,GetClassNameW,GetClientRect,GetCommandLineA,GetCommandLineW,GetComputerNameA,GetComputerNameExW,GetComputerNameW,GetConsoleCP,GetConsoleDisplayMode,GetConsoleMode,GetConsoleOutputCP,GetConsoleScreenBufferInfo,GetCurrentDirectoryA,GetCurrentDirectoryW,GetCurrentProcess,GetCurrentProcessId,GetCurrentThread,GetCurrentThreadId,GetDC,GetDIBits,GetDateFormatA,GetDateFormatW,GetDesktopWindow,GetDeviceCaps,GetDiskFreeSpaceA,GetDiskFreeSpaceExA,GetDiskFreeSpaceExW,GetDiskFreeSpaceW,GetDlgItem,GetDlgItemTextA,GetDlgItemTextW,GetDriveTypeA,GetDriveTypeW,GetEnvironmentStrings,GetEnvironmentStringsW,GetEnvironmentVariableA,GetEnvironmentVariableW,GetExitCodeProcess,GetExitCodeThread,GetFileAttributesA,GetFileAttributesExA,GetFileAttributesW,GetFileInformationByHandle,GetFileSize,GetFileTime,GetFileTitleA,GetFileType,GetFullPathNameA,GetFullPathNameW,GetKernelObjectSecurity,GetLastError,GetLocalTime,GetLocaleInfoA,GetLocaleInfoW,GetLogicalDriveStringsA,GetLogicalDrives,GetLongPathNameA,GetMessageA,GetMessageW,GetModuleBaseNameA,GetModuleFileNameA,GetModuleFileNameExA,GetModuleFileNameW,GetModuleHandleA,GetModuleHandleW,GetModuleInformation,GetNativeSystemInfo,GetNumberFormatA,GetNumberFormatW,GetOEMCP,GetObjectA,GetObjectW,GetOpenFileNameW,GetParent,GetPrivateProfileStringA,GetProcAddress,GetProcessHeap,GetProcessId,GetProcessTimes,GetProcessWindowStation,GetSaveFileNameW,GetSecurityDescriptorDacl,GetSecurityInfo,GetShortPathNameA,GetShortPathNameW,GetStartupInfoA,GetStartupInfoW,GetStdHandle,G
etStockObject,GetStringTypeA,GetStringTypeW,GetSysColor,GetSystemDirectoryA,GetSystemInfo,GetSystemMetrics,GetSystemPowerStatus,GetSystemTime,GetSystemTimeAsFileTime,GetTempFileNameA,GetTempPathA,GetTempPathW,GetThreadContext,GetThreadDesktop,GetTickCount,GetTimeFormatA,GetTimeFormatW,GetTimeZoneInformation,GetTokenInformation,GetUserDefaultLCID,GetUserNameA,GetUserNameExA,GetUserNameExW,GetUserObjectInformationA,GetUserObjectInformationW,GetUserProfileDirectoryA,GetVersion,GetVersionExA,GetVersionExW,GetVolumeInformationA,GetWindow,GetWindowLongA,GetWindowLongW,GetWindowRect,GetWindowTextA,GetWindowTextW,GetWindowThreadProcessId,GetWindowsDirectoryA,GlobalAlloc,GlobalFree,GlobalLock,GlobalMemoryStatus,GlobalUnlock,Heap32ListFirst,Heap32ListNext,HeapAlloc,HeapCreate,HeapDestroy,HeapFree,HeapReAlloc,HeapSetInformation,HeapSize,HeapValidate,HttpAddRequestHeadersA,HttpAddRequestHeadersW,HttpEndRequestA,HttpOpenRequestA,HttpOpenRequestW,HttpQueryInfoA,HttpQueryInfoW,HttpSendRequestA,HttpSendRequestExA,HttpSendRequestExW,INIT,I_RpcGetBuffer,ImpersonateLoggedOnUser,InitCommonControlsEx,InitializeCriticalSection,InitializeCriticalSectionAndSpin,InitializeSecurityContextA,InstallService,InterlockedCompareExchange,InterlockedDecrement,InterlockedExchange,InterlockedIncrement,InternetAttemptConnect,InternetCloseHandle,InternetConnectA,InternetConnectW,InternetCrackUrlA,InternetCrackUrlW,InternetGetConnectedState,InternetOpenA,InternetOpenUrlA,InternetOpenW,InternetQueryDataAvailable,InternetQueryOptionA,InternetReadFile,InternetSetCookieA,InternetSetCookieW,InternetSetOptionA,InternetSetOptionExA,InternetSetOptionW,InternetWriteFile,IoCreateDevice,IoCreateSymbolicLink,IoDeleteDevice,IoDeleteSymbolicLink,IoEnumerateRegisteredFiltersList,IofCompleteRequest,IsBadCodePtr,IsBadReadPtr,IsBadWritePtr,IsDBCSLeadByte,IsDebuggerPresent,IsIconic,IsProcessorFeaturePresent,IsValidCodePage,IsValidLocale,IsWindow,IsWindowVisible,KERNEL32,KeBugCheckEx,Kernel32,KillTimer,LCMapStringA,LCMapStr
ingW,LZ32,LZClose,LZCopy,LZOpenFileA,LeaveCriticalSection,LoadAcceleratorsA,LoadBitmapA,LoadBitmapW,LoadCursorA,LoadCursorW,LoadIconA,LoadIconW,LoadImageA,LoadLibraryA,LoadLibraryExA,LoadLibraryW,LoadResource,LoadStringA,LoadStringW,LocalAlloc,LocalFileTimeToFileTime,LocalFree,LocalReAlloc,LockResource,LockServiceDatabase,LogonUserA,LookupAccountSidA,LookupAccountSidW,LookupPrivilegeNameW,LookupPrivilegeValueA,LookupPrivilegeValueW,LsaClose,LsaEnumerateLogonSessions,LsaFreeMemory,LsaFreeReturnBuffer,LsaGetLogonSessionData,LsaOpenPolicy,LsaQueryInformationPolicy,MFC42,MPR,MSVCP60,MSVCR80,MSVCR90,MSVCRT,MakeAbsoluteSD,MapViewOfFile,MapWindowPoints,Mcdl,MessageBoxA,MessageBoxW,MmGetSystemRoutineAddress,Module32First,Module32FirstW,Module32Next,Module32NextW,MoveFileA,MoveFileExA,MoveFileExW,MoveFileW,MultiByteToWideChar,NETAPI32,NdrConvert,NdrNonConformantStringMarshall,NdrNonConformantStringUnmarshall,NdrServerInitializeNew,NetApiBufferFree,NetServerEnum,NetShareGetInfo,Netbios,NtBuildNumber,OLE32,OLEAUT32,ObOpenObjectByPointer,ObfDereferenceObject,OemToCharA,OemToCharBuffA,OleInitialize,OleUninitialize,OpenDesktopA,OpenFile,OpenFileMappingA,OpenFileMappingW,OpenInputDesktop,OpenProcess,OpenProcessToken,OpenSCManagerA,OpenSCManagerW,OpenServiceA,OpenServiceW,OpenThread,OpenWindowStationA,OutputDebugStringA,PAGE,PFXExportCertStoreEx,PSAPI,PathCanonicalizeW,PathCombineW,PathFileExistsA,PathIsDirectoryA,PathIsRelativeW,PeekMessageA,PeekMessageW,PeekNamedPipe,PostMessageA,PostMessageW,PostQuitMessage,PostThreadMessageW,ProceA,Process32First,Process32FirstW,Process32Next,Process32NextW,PsDereferencePrimaryToken,PsGetProcessId,PsGetProcessImageFileName,PsGetVersion,PsInitialSystemProcess,PsProcessType,PsReferencePrimaryToken,PsSetCreateProcessNotifyRoutine,PsSetCreateThreadNotifyRoutine,PsSetLoadImageNotifyRoutine,QueryContextAttributesA,QueryPerformanceCounter,QueryPerformanceFrequency,QueryServiceConfig2A,QueryServiceConfigA,QueryServiceObjectSecurity,QueryServiceStatus,Q
ueryServiceStatusEx,RPCRT4,RaiseException,ReadConsoleInputA,ReadDirectoryChangesW,ReadFile,ReadProcessMemory,RealizePalette,RegCloseKey,RegConnectRegistryA,RegCreateKeyA,RegCreateKeyExA,RegCreateKeyExW,RegDeleteKeyA,RegDeleteValueA,RegEnumKeyExA,RegEnumKeyExW,RegEnumValueA,RegOpenKeyA,RegOpenKeyExA,RegOpenKeyExW,RegQueryInfoKeyA,RegQueryInfoKeyW,RegQueryValueExA,RegQueryValueExW,RegSaveKeyA,RegSetValueExA,RegSetValueExW,RegisterClassExA,RegisterClassExW,RegisterEventSourceA,RegisterEventSourceW,RegisterServiceCtrlHandlerA,ReleaseDC,ReleaseMutex,RemoveDirectoryA,ReportEventA,ReportEventW,ResetEvent,ResumeThread,RevertToSelf,RpcMgmtStopServerListening,RpcRaiseException,RpcServerListen,RpcServerRegisterIf,RpcServerUseProtseqEpA,RtlCaptureContext,RtlCompareMemory,RtlInitUnicodeString,RtlLookupFunctionEntry,RtlPcToFileHeader,RtlUnwind,RtlUnwindEx,RtlVirtualUnwind,RundllInstall,RundllInstallA,RundllUninstall,RundllUninstallA,SHAutoComplete,SHBrowseForFolderA,SHBrowseForFolderW,SHChangeNotify,SHCreateDirectoryExA,SHELL32,SHFileOperationA,SHFileOperationW,SHGetFileInfoA,SHGetFileInfoW,SHGetMalloc,SHGetPathFromIDListA,SHGetPathFromIDListW,SHGetSpecialFolderLocation,SHGetSpecialFolderPathA,SHGetSpecialFolderPathW,SHLWAPI,Secur32,SelectObject,SelectPalette,SendDlgItemMessageA,SendDlgItemMessageW,SendMessageA,SendMessageW,ServiceInstall,ServiceMain,SetConsoleCtrlHandler,SetConsoleCursorPosition,SetConsoleMode,SetConsoleTitleW,SetCurrentDirectoryA,SetCurrentDirectoryW,SetCursorPos,SetDlgItemTextA,SetDlgItemTextW,SetEndOfFile,SetEntriesInAclA,SetEnvironmentVariableA,SetEnvironmentVariableW,SetErrorMode,SetEvent,SetFileAttributesA,SetFileAttributesW,SetFilePointer,SetFileSecurityA,SetFileSecurityW,SetFileTime,SetFocus,SetForegroundWindow,SetHandleCount,SetKernelObjectSecurity,SetLastError,SetMenu,SetNamedPipeHandleState,SetPriorityClass,SetProcessPriorityBoost,SetProcessWindowStation,SetSecurityDescriptorDacl,SetSecurityInfo,SetServiceObjectSecurity,SetServiceStatus,SetStdHandle,S
etThreadContext,SetThreadDesktop,SetThreadPriority,SetTimer,SetUnhandledExceptionFilter,SetVolumeLabelA,SetWaitableTimer,SetWindowLongA,SetWindowLongW,SetWindowPos,SetWindowTextA,SetWindowTextW,ShellExecuteA,ShellExecuteExA,ShellExecuteExW,ShellExecuteW,ShowWindow,SizeofResource,Sleep,StartServiceA,StartServiceCtrlDispatcherA,StartServiceW,StrToIntA,StretchBlt,SuspendThread,SystemTimeToFileTime,TerminateJobObject,TerminateProcess,TerminateThread,Thread32First,Thread32Next,TlsAlloc,TlsFree,TlsGetValue,TlsSetValue,Toolhelp32ReadProcessMemory,TranslateAcceleratorA,TranslateMessage,URLDownloadToFileA,USER32,USERENV,UnServiceInstall,UnhandledExceptionFilter,UninstallService,UnlockServiceDatabase,UnmapViewOfFile,UpdateWindow,VirtualAlloc,VirtualAllocEx,VirtualFree,VirtualFreeEx,VirtualProtect,VirtualProtectEx,VirtualQueryEx,WINHTTP,WININET,WINMM,WNetAddConnection2A,WNetCancelConnection2A,WS2_32,WSACloseEvent,WSACreateEvent,WSAEnumNetworkEvents,WSAEventSelect,WSAResetEvent,WSASocketA,WSAWaitForMultipleEvents,WTSAPI32,WTSCloseServer,WTSEnumerateProcessesW,WTSEnumerateSessionsW,WTSFreeMemory,WTSOpenServerW,WaitForDebugEvent,WaitForInputIdle,WaitForMultipleObjects,WaitForSingleObject,WaitNamedPipeW,WideCharToMultiByte,WinExec,WinHttpAddRequestHeaders,WinHttpCloseHandle,WinHttpConnect,WinHttpCrackUrl,WinHttpGetIEProxyConfigForCurren,WinHttpGetProxyForUrl,WinHttpOpen,WinHttpOpenRequest,WinHttpQueryDataAvailable,WinHttpQueryHeaders,WinHttpQueryOption,WinHttpReadData,WinHttpReceiveResponse,WinHttpSendRequest,WinHttpSetOption,WinHttpSetTimeouts,WinHttpWriteData,WriteConsoleA,WriteConsoleInputA,WriteConsoleW,WriteFile,WritePrivateProfileStringA,WriteProcessMemory,ZwClose,ZwDuplicateToken,ZwOpenProcessTokenEx,ZwQuerySystemInformation,ZwSetInformationProcess,_CxxThrowException,_EH_prolog,_XcptFilter,__CppXcptFilter,__CxxFrameHandler,__CxxFrameHandler3,__clean_type_info_names_internal,__dllonexit,__getmainargs,__iob_func,__p___initenv,__p__commode,__p__environ,__p__fmode,__set_app_typ
e,__setusermatherr,_access,_acmdln,_adjust_fdiv,_alldiv,_allmul,_alloca_probe,_allshl,_amsg_exit,_atoi64,_aulldiv,_aullrem,_aullshr,_beginthread,_beginthreadex,_cexit,_chdir,_chkstk,_close,_configthreadlocale,_controlfp,_controlfp_s,_crt_debugger_hook,_decode_pointer,_encode_pointer,_encoded_null,_endthreadex,_errno,_except_handler3,_except_handler4_common,_exit,_fdopen,_filelength,_fileno,_findclose,_findfirst,_findfirst64i32,_findnext,_findnext64i32,_ftime,_ftime64,_ftol,_get_osfhandle,_getch,_getcwd,_gmtime32,_initterm,_initterm_e,_invoke_watson,_iob,_ismbblead,_itoa,_lclose,_local_unwind2,_localtime64,_localtime64_s,_lock,_lopen,_malloc_crt,_mbschr,_mbscmp,_mbsicmp,_mbsinc,_mbsnbcat,_mbsnbcmp,_mbsnbcpy,_mbsrchr,_mkdir,_onexit,_open,_purecall,_read,_setmbcp,_setmode,_snprintf,_splitpath,_stat,_stat32,_strcmpi,_strdate,_stricmp,_strlwr,_strnicmp,_strtime,_strupr,_time32,_time64,_ui64toa,_ultoa,_unlink,_unlock,_vsnprintf,_vsnwprintf,_wcsicmp,_wcsnicmp,_wfopen,abort,abs,addLogonSession,asctime_s,atexit,atof,atoi,atol,ceil,comdlg32,delLogonSession,exit,fclose,fflush,fgetc,fgetpos,fgets,floor,fopen,fprintf,fputc,fputs,fread,free,freopen,fseek,ftell,fwprintf,fwrite,getDescription,getLocalAccounts,getLogonPasswords,getLogonSessions,getMSV,getMSVFunctions,getSAMFunctions,getSECFunctions,getSecrets,getTsPkg,getTsPkgFunctions,getWDigest,getWDigestFunctions,getchar,getenv,gmtime,install,installA,iphlpapi,isalnum,isdigit,islower,isprint,isspace,isupper,isxdigit,keybd_event,localtime,lstrcatA,lstrcatW,lstrcmpiA,lstrcpyA,lstrcpynA,lstrlenA,lstrlenW,malloc,mbstowcs,memchr,memcmp,memcpy,memmove,memset,mouse_event,msvcrt,nNOTES,ntdll,ntoskrnl,ole32,ping,printf,qsort,raise,rand,realloc,rewind,scanf,setlocale,signal,sprintf,srand,sscanf,strcat,strchr,strcmp,strcpy,strerror,strftime,strlen,strncat,strncmp,strncpy,strpbrk,strrchr,strstr,strtok,strtoul,system,time,timeGetTime,tolower,uninstall,uninstallA,urlmon,vfprintf,vsprintf,wcscat,wcschr,wcscmp,wcscpy,wcslen,wcsrchr,wcsstr,wcstom
bs,wprintf,wsprintfA,wsprintfW,wvsprintfA,wvsprintfW

Completed combine of 275 PE header file features.

# Same combine step for the vs251 sample set.
out_file = 'pe-header-features-vs251.csv'
token_file = 'pe-header-tokens-vs251.csv'
combine_feature_files(out_file, token_file)

Test PE/COFF ASM Feature Extraction.

In [15]:
# Test PE ASM feature extraction.

x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',

x86_opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'

def count_asm_registers(asm_code):
    """Tally occurrences of each register in ``x86_registers`` across the rows.

    Operand punctuation (``, + * [ ] -``) is turned into spaces first so that
    registers are compared as whole tokens; every occurrence on a row counts.

    Returns:
        A list of counts index-aligned with the global ``x86_registers``.
    """
    totals = [0] * len(x86_registers)
    for line in asm_code:
        for sep in (',', '+', '*', '[', ']', '-'):
            line = line.replace(sep, ' ')
        words = line.split()

        for pos, reg in enumerate(x86_registers):
            totals[pos] = totals[pos] + words.count(reg)

    return totals

def count_asm_opcodes(asm_code):
    """Count the rows on which each opcode in ``x86_opcodes`` appears as a token.

    A row adds at most one to an opcode's tally, however often it repeats.

    Returns:
        A list of counts index-aligned with the global ``x86_opcodes``.
    """
    totals = [0] * len(x86_opcodes)
    for line in asm_code:
        words = line.split()

        for pos, mnemonic in enumerate(x86_opcodes):
            totals[pos] += int(mnemonic in words)

    return totals

def extract_asm_features(multi_param):
    """Write register and opcode counts for every '.asm' file in the global ``tfiles``.

    Each CSV row is the file name (text up to the first underscore, i.e. the
    VirusShare_ prefix, and the '.asm' suffix removed) followed by the counts
    from ``count_asm_registers`` and ``count_asm_opcodes``.

    Args:
        multi_param: object exposing ``temp_file``; rows are written to
            ``data/<pid>-<temp_file>``.

    Reads the module-level ``tfiles`` (file names) and ``ext_drive``
    (directory prefix).
    """
    pid = os.getpid()
    feature_file = 'data/' + str(pid) + "-" + multi_param.temp_file
    print('Process id: {:d} feature file: {:s}'.format(pid,feature_file))

    # API, section and misc keyword counts are deliberately left out here:
    # they belong to the call graph and PE header feature sets.

    asm_files = [i for i in tfiles if '.asm' in i]
    ftot = len(asm_files)
    feature_counts = []
    with open(feature_file, 'w') as f:
        fw = writer(f)
        for idx, fname in enumerate(asm_files):
            with open(ext_drive + fname, 'r') as fasm:
                content = fasm.readlines()
            fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.
            reg_vals = count_asm_registers(content)
            opc_vals = count_asm_opcodes(content)
            count_vals = reg_vals + opc_vals
            feature_counts.append([fname[:fname.find('.asm')]] + count_vals)
            # Writing rows after every 10 files processed
            if (idx+1) % 10 == 0:
                print("{:d} {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
                fw.writerows(feature_counts)
                feature_counts = []
        # Writing remaining files
        if len(feature_counts) > 0:
            fw.writerows(feature_counts)
            feature_counts = []


def combine_asm_files(out_file, temp_file):
    """Merge the per-process ASM feature files into one sorted CSV.

    Steps:
      1. Write a header row (``file_name``, register names, opcode names) to
         ``data/<out_file>``.
      2. Append the lines of every file in ``data/`` named ``<pid>-<temp_file>``.
      3. Re-read the combined file, sort by ``file_name`` and write it to
         ``data/sorted-<out_file>``.

    Args:
        out_file: name of the combined feature file under ``data/``.
        temp_file: base name of the per-process files (without PID prefix).
    """
    colnames = "file_name," + ",".join(x86_registers) + "," + ",".join(x86_opcodes) + "\n"
    print("Column names: {:s}".format(colnames))
    # PID prefix for each per-process file. Any digit count is accepted because
    # PIDs can be shorter than 3 digits (containers) or longer than 5; the file
    # name is escaped so its dots are matched literally.
    p1 = re.compile(r'\d+-' + re.escape(temp_file) + '$')
    file_list = os.listdir('data/')
    counter = 0
    with open('data/' + out_file, 'w') as fop:
        fop.write(colnames)
        for file_name in file_list:
            if p1.match(file_name):
                with open('data/' + file_name, 'r') as fip:
                    in_lines = fip.readlines()
                fop.writelines(in_lines)
                counter += len(in_lines)
    print('Completed combine of {:d} ASM features.'.format(counter))
    asms = pd.read_csv('data/' + out_file)
    # DataFrame.sort() only exists in old pandas; newer releases provide sort_values().
    if hasattr(asms, 'sort_values'):
        sorted_asms = asms.sort_values('file_name')
    else:
        sorted_asms = asms.sort('file_name')
    sorted_asms.to_csv('data/sorted-' + out_file, index=False)

class Multi_Params(object):
    """Parameter bundle read by the ASM feature-extraction functions.

    Attributes are plain values copied from the constructor arguments:
    ``feature_file``, ``temp_file`` and ``file_list``.
    """
    def __init__(self, featurefile="", tempfile="", filelist=None):
        self.feature_file = featurefile
        self.temp_file = tempfile
        # None stands in for "empty list" so instances never share one default list.
        self.file_list = [] if filelist is None else filelist

In [2]:
# Includes x86 and amd64 registers and opcodes.
x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',

x86_opcodes = ['mov','movabs','movbe','movsbl','movsbw','movswl','movsbq','movswq','movslq','movsx',

def count_asm_symbols(asm_code):
    """Count the rows that contain each of seven punctuation symbols.

    Returns:
        A 7-element list of row counts in the order
        ``*``, ``-``, ``+``, ``[``, ``]``, ``@``, ``?``. A row adds at most one
        to each symbol's count.
    """
    marks = ('*', '-', '+', '[', ']', '@', '?')
    tallies = [0] * 7
    for line in asm_code:
        for slot, mark in enumerate(marks):
            if mark in line:
                tallies[slot] += 1

    return tallies

def count_asm_APIs(asm_code, apis):
    """Count the rows in which each API name occurs as a substring.

    Args:
        asm_code: iterable of text rows to scan.
        apis: sequence of API name strings.

    Returns:
        A list of row counts index-aligned with ``apis``; a row adds at most
        one to each name's count.
    """
    hits = [0] * len(apis)
    for line in asm_code:
        for slot, api_name in enumerate(apis):
            if api_name in line:
                hits[slot] = hits[slot] + 1
    return hits

def count_asm_misc(asm_code):
    """Count the rows in which each entry of the global ``keywords`` occurs.

    The check is a substring test and a row adds at most one per keyword.

    Returns:
        A list of row counts index-aligned with ``keywords``.
    """
    totals = [0] * len(keywords)
    for line in asm_code:
        for pos in range(len(keywords)):
            totals[pos] += int(keywords[pos] in line)
    return totals

def count_asm_registers(asm_code):
    """Tally occurrences of each register in ``x86_registers`` across the rows.

    Operand punctuation (``, + * [ ] -``) is turned into spaces first so that
    registers are compared as whole tokens; every occurrence on a row counts.

    Returns:
        A list of counts index-aligned with the global ``x86_registers``.
    """
    totals = [0] * len(x86_registers)
    for line in asm_code:
        for sep in (',', '+', '*', '[', ']', '-'):
            line = line.replace(sep, ' ')
        words = line.split()

        for pos, reg in enumerate(x86_registers):
            totals[pos] = totals[pos] + words.count(reg)

    return totals

def count_asm_opcodes(asm_code):
    """Count the rows on which each opcode in ``x86_opcodes`` appears as a token.

    A row adds at most one to an opcode's tally, however often it repeats.

    Returns:
        A list of counts index-aligned with the global ``x86_opcodes``.
    """
    totals = [0] * len(x86_opcodes)
    for line in asm_code:
        words = line.split()

        for pos, mnemonic in enumerate(x86_opcodes):
            totals[pos] += int(mnemonic in words)

    return totals

def extract_asm_features(multi_param):
    """Write register and opcode counts for every '.pe.asm' file in the global ``tfiles``.

    Each CSV row is the file name (text up to the first underscore, i.e. the
    VirusShare_ prefix, and everything from '.asm' onwards removed) followed
    by the counts from ``count_asm_registers`` and ``count_asm_opcodes``.
    Progress messages go to ``data/<pid>-pe-asm-log.txt``.

    Args:
        multi_param: object exposing ``temp_file`` (rows are written to
            ``data/<pid>-<temp_file>``) and ``ext_drive`` (directory prefix of
            the input files).

    Reads the module-level ``tfiles`` for the list of file names.
    """
    pid = os.getpid()
    feature_file = 'data/' + str(pid) + "-" + multi_param.temp_file
    ext_drive = multi_param.ext_drive
    lmsg = 'Process id: {:d} feature file: {:s}'.format(pid, feature_file)

    # API, section and misc keyword counts are deliberately left out here:
    # they belong to the call graph and PE header feature sets.

    asm_files = [i for i in tfiles if '.pe.asm' in i]
    ftot = len(asm_files)
    feature_counts = []
    with open("data/" + str(pid) + "-pe-asm-log.txt", "w") as flog:
        flog.write(lmsg + "\n")

        with open(feature_file, 'w') as f:
            fw = writer(f)
            for idx, fname in enumerate(asm_files):
                with open(ext_drive + fname, 'r') as fasm:
                    content = fasm.readlines()
                fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.
                reg_vals = count_asm_registers(content)
                opc_vals = count_asm_opcodes(content)
                count_vals = reg_vals + opc_vals
                feature_counts.append([fname[:fname.find('.asm')]] + count_vals)
                # Writing rows after every 10 files processed
                if (idx+1) % 10 == 0:
                    lmsg = "{:d} {:d} of {:d} files processed.".format(pid, idx + 1, ftot)
                    flog.write(lmsg + "\n")
                    fw.writerows(feature_counts)
                    feature_counts = []
            # Writing remaining files
            if len(feature_counts) > 0:
                fw.writerows(feature_counts)
                feature_counts = []

        lmsg = "{:d} Completed processing {:d} PE ASM files.".format(pid, ftot)
        flog.write(lmsg + "\n")

def combine_asm_files(out_file, temp_file):
    """Merge the per-process ASM feature files into one sorted CSV.

    Steps:
      1. Write a header row (``file_name``, register names, opcode names) to
         ``data/<out_file>``.
      2. Append the lines of every file in ``data/`` named ``<pid>-<temp_file>``.
      3. Re-read the combined file, sort by ``file_name`` and write it to
         ``data/sorted-<out_file>``.

    Args:
        out_file: name of the combined feature file under ``data/``.
        temp_file: base name of the per-process files (without PID prefix).
    """
    colnames = "file_name," + ",".join(x86_registers) + "," + ",".join(x86_opcodes) + "\n"
    print("Column names: {:s}".format(colnames))
    # PID prefix for each per-process file. Any digit count is accepted because
    # PIDs can be shorter than 3 digits (containers) or longer than 5; the file
    # name is escaped so its dots are matched literally.
    p1 = re.compile(r'\d+-' + re.escape(temp_file) + '$')
    file_list = os.listdir('data/')
    counter = 0
    with open('data/' + out_file, 'w') as fop:
        fop.write(colnames)
        for file_name in file_list:
            if p1.match(file_name):
                with open('data/' + file_name, 'r') as fip:
                    in_lines = fip.readlines()
                fop.writelines(in_lines)
                counter += len(in_lines)
    print('Completed combine of {:d} ASM features.'.format(counter))
    asms = pd.read_csv('data/' + out_file)
    # DataFrame.sort() only exists in old pandas; newer releases provide sort_values().
    if hasattr(asms, 'sort_values'):
        sorted_asms = asms.sort_values('file_name')
    else:
        sorted_asms = asms.sort('file_name')
    sorted_asms.to_csv('data/sorted-' + out_file, index=False)

class Multi_Params(object):
    """Plain container for the settings handed to a feature-extraction run.

    Attributes:
      feature_file -- name of the feature output file.
      temp_file    -- name of the temporary per-process file.
      ext_drive    -- directory that holds the input files.
      file_list    -- list of input file names.
    """
    def __init__(self, featurefile="", tempfile="", extdrive="", filelist=None):
        self.feature_file = featurefile
        self.temp_file = tempfile
        self.ext_drive = extdrive
        # A fresh list per instance: a literal [] default would be shared by
        # every instance created without an explicit filelist.
        self.file_list = [] if filelist is None else filelist

In [ ]:
out_file = 'pe-asm-features-apt.csv'
temp_file = 'pe-asm-temp-apt.csv'
ext_drive = '/opt/vs/asm/'
tfiles = os.listdir(ext_drive)

# Multi_Params takes (featurefile, tempfile, extdrive, filelist); pass ext_drive
# explicitly so that tfiles ends up in file_list instead of in ext_drive.
mp1 = Multi_Params(out_file, temp_file, ext_drive, tfiles)


combine_asm_files(out_file, temp_file)

In [3]:
# Find out what is going on with the IDA Pro disassembly: compare the names
# listed in filetypes.txt with the files actually present in the directory.
with open('/opt/vs/asm/filetypes.txt', 'r') as fip:
    lines = fip.readlines()
dir_list = os.listdir('/opt/vs/asm/')
file_list = []
counter = 0

for line in lines:
    # Each line is "<file name>:<description>"; keep the name part.
    tokens = line.split(':')
    file_name = tokens[0]
    # NOTE(review): without this append file_list stays empty and every
    # directory entry is reported below; confirm the names in filetypes.txt
    # match the directory entries exactly.
    file_list.append(file_name)
    counter += 1
print("PE32 files: {:d} directory files: {:d}".format(counter, len(dir_list)))

known_files = set(file_list)
for fname in dir_list:
    if fname not in known_files:
        print("File not in PE32 list: {:s}".format(fname))

PE32 files: 239 directory files: 272
File not in PE32 list: VirusShare_2bd02b41817d227058522cca40acd390.asm
File not in PE32 list: VirusShare_2daa4a4574ba06aa3203ae0e0b45b3b8.asm
File not in PE32 list: VirusShare_827040a5f5ae8de281a63899224b2f3a.asm
File not in PE32 list: VirusShare_67504a0c2c2bf47efccdab5ca981ad7d.asm
File not in PE32 list: VirusShare_1328eaceb140a3863951d18661b097af.asm
File not in PE32 list: VirusShare_95f25d3afc5370f5d9fd8e65c17d3599.asm
File not in PE32 list: VirusShare_a5d4ebc0285f0213e0c29d23bc410889.asm
File not in PE32 list: VirusShare_02c65973b6018f5d473d701b3e7508b2.asm
File not in PE32 list: VirusShare_fc1937c1aa536b3744ebdfb1716fd54d.asm
File not in PE32 list: VirusShare_6e442c5ef460bee4c9457c6bf7a132d6.asm
File not in PE32 list: VirusShare_31e5e58dbdfad05175613e795298ebb5.asm
File not in PE32 list: VirusShare_933b11bc4799f8d9f65466fb2e3ea659.asm
File not in PE32 list: VirusShare_7cb055ac3acbf53e07e20b65ec9126a1.asm
File not in PE32 list: VirusShare_c91eacab7655870764d13ba741aa9a73.asm
File not in PE32 list: VirusShare_4e551abcd14506092a0f8d54a45f3569.asm
File not in PE32 list: VirusShare_6f9992c486195edcf0bf2f6ee6c3ec74.asm
File not in PE32 list: VirusShare_4a54d7878d4170c3d4e3c3606365c42c.asm
File not in PE32 list: VirusShare_7712d05c8b499fc7a1f4a6a6b6dee825.asm
File not in PE32 list: VirusShare_123505024f9e5ff74cb6aa67d7fcc392.asm
File not in PE32 list: VirusShare_00dbb9e1c09dbdafb360f3163ba5a3de.asm
File not in PE32 list: VirusShare_9675827a495f4ba6a4efd4dd70932b7c.asm
File not in PE32 list: VirusShare_ea1b44094ae4d8e2b63a1771a3e61fd5.asm
File not in PE32 list: filetypes.txt
File not in PE32 list: VirusShare_ca327bc83fbe38b3689cd1a5505dfc33.asm
File not in PE32 list: VirusShare_6808ec6dbb23f0fa7637c108f44c5c80.asm
File not in PE32 list: VirusShare_e476e4a24f8b4ff4c8a0b260aa35fc9f.asm
File not in PE32 list: VirusShare_4f763b07a7b8a80f1f9408e590f79532.asm
File not in PE32 list: VirusShare_0b506c6dde8d07f9eeb82fd01a6f97d4.asm
File not in PE32 list: VirusShare_8934aeed5d213fe29e858eee616a6ec7.asm
File not in PE32 list: VirusShare_3de1bd0f2107198931177b2b23877df4.asm
File not in PE32 list: VirusShare_c99fa835350aa9e2427ce69323b061a9.asm
File not in PE32 list: VirusShare_3107de21e480ab1f2d67725f419b28d0.asm
File not in PE32 list: VirusShare_0908d8b3e459551039bade50930e4c1b.asm

In [6]:
def get_unpacked_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
    """Return the "VirusShare_"-prefixed names of unpacked PE files that are not .NET.

    IDA Pro cannot disassemble .NET CIL files (Ildisasm.exe in Visual Studio
    is needed for those), so they are counted and left out.

    Parameters:
      packer_id_feature_file -- CSV with file_name, is_packed and valid_pe columns.
      file_id_feature_file   -- CSV with file_name and, in its second column,
                                the file id description.
      trid_id_feature_file   -- CSV with file_name and, in its second column,
                                the TrID description.

    Returns:
      List of "VirusShare_<file_name>" strings. When the list is not empty it
      is also written, one name per line, to
      data/temp-unpacked-pe-non-dot-net.txt.
    """
    # Load the malware packer id features and file id features from the sample set.
    packer_id_features = pd.read_csv(packer_id_feature_file)
    file_id_features = pd.read_csv(file_id_feature_file)
    trid_id_features = pd.read_csv(trid_id_feature_file)
    unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
    unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
    # Look the descriptions up by file name rather than by row position, so a
    # missing or re-ordered row in one CSV cannot pair a file with the wrong
    # description.
    trid_by_name = dict(zip(trid_id_features['file_name'], trid_id_features.iloc[:, 1]))
    fid_by_name = dict(zip(file_id_features['file_name'], file_id_features.iloc[:, 1]))
    not_dot_net = []
    dot_net_counter = 0
    for file_name in unpacked_pe_files['file_name']:
        # str() guards against empty cells, which pandas reads as float NaN.
        trid_name = str(trid_by_name.get(file_name, '')).lower()
        fid_name = str(fid_by_name.get(file_name, '')).lower()
        if trid_name.find('.net') > -1 or fid_name.find('.net') > -1:
            dot_net_counter += 1
        else:
            not_dot_net.append(file_name)
    # Prepend the full file name prefix.
    file_list = ["VirusShare_" + str(file_name) for file_name in not_dot_net]
    counter = len(file_list)

    if file_list:
        with open('data/temp-unpacked-pe-non-dot-net.txt', 'w') as fop:
            fop.writelines(full_name + "\n" for full_name in file_list)
    print("Got {:d} unpacked PE filenames and {:d} .NET filenames.".format(counter, dot_net_counter))

    return file_list

In [8]:
packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'
ext_drive = '/opt/vs/train1/'

unflist = get_unpacked_file_list(packer_id_file, file_id_file, trid_id_file)

file_list = []
completed_list = os.listdir('/opt/vs/train1asm/')
print("Got {:d} completed ASM files.".format(len(completed_list)))
for idx, fname in enumerate(completed_list):
    # Cut at ".asm" only when it is present: find() returns -1 otherwise,
    # which would silently drop the last character of the name.
    asm_pos = fname.find(".asm")
    if asm_pos > -1:
        completed_list[idx] = fname[0:asm_pos]

# Set membership keeps the comparison linear for tens of thousands of names.
completed_names = set(completed_list)
for idx, fname in enumerate(unflist):
    if fname not in completed_names:
        file_list.append(ext_drive + fname)

print("Processing {:d} files out of {:d} total unpacked PE files.".format(len(file_list), len(unflist)))

Got 55128 unpacked PE filenames and 348 .NET filenames.
Got 21315 completed ASM files.
Processing 33814 files out of 55128 total unpacked PE files.

In [9]:
33814 + 21315


In [8]:
def get_unpacked_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
    """Return the "VirusShare_"-prefixed names of unpacked PE files that are not .NET.

    IDA Pro cannot disassemble .NET CIL files (Ildisasm.exe in Visual Studio
    is needed for those), so they are counted and left out. 64-bit files are
    counted and reported as well.

    Parameters:
      packer_id_feature_file -- CSV with file_name, is_packed and valid_pe columns.
      file_id_feature_file   -- CSV with file_name and, in its second column,
                                the file id description.
      trid_id_feature_file   -- CSV with file_name and, in its second column,
                                the TrID description.

    Returns:
      List of "VirusShare_<file_name>" strings. When the list is not empty it
      is also written, one name per line, to
      data/temp-unpacked-pe-non-dot-net.txt.
    """
    # Load the malware packer id features and file id features from the sample set.
    packer_id_features = pd.read_csv(packer_id_feature_file)
    file_id_features = pd.read_csv(file_id_feature_file)
    trid_id_features = pd.read_csv(trid_id_feature_file)
    unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
    unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
    # Look the descriptions up by file name rather than by row position, so a
    # missing or re-ordered row in one CSV cannot pair a file with the wrong
    # description.
    trid_by_name = dict(zip(trid_id_features['file_name'], trid_id_features.iloc[:, 1]))
    fid_by_name = dict(zip(file_id_features['file_name'], file_id_features.iloc[:, 1]))
    not_dot_net = []
    dot_net_counter = 0
    amd64_bit_counter = 0
    for file_name in unpacked_pe_files['file_name']:
        # str() guards against empty cells, which pandas reads as float NaN.
        trid_name = str(trid_by_name.get(file_name, '')).lower()
        fid_name = str(fid_by_name.get(file_name, '')).lower()
        is_dot_net = trid_name.find('.net') > -1 or fid_name.find('.net') > -1
        if is_dot_net:
            print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            dot_net_counter += 1
        if trid_name.find('win64') > -1 or fid_name.startswith('pe32+'):
            print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            amd64_bit_counter += 1
        # NOTE(review): only .NET files are left out, matching the output file
        # name; 64-bit files are counted but kept -- confirm that is intended.
        if not is_dot_net:
            not_dot_net.append(file_name)
    # Prepend the full file name prefix.
    file_list = ["VirusShare_" + str(file_name) for file_name in not_dot_net]
    counter = len(file_list)

    if file_list:
        with open('data/temp-unpacked-pe-non-dot-net.txt', 'w') as fop:
            fop.writelines(full_name + "\n" for full_name in file_list)
    print("Got {:d} unpacked PE files.".format(counter))
    print("Got {:d} .NET file and {:d} 64 Bit files.".format(dot_net_counter, amd64_bit_counter))

    return file_list

In [ ]:
# APT sample set: collect the unpacked PE files found under /opt/vs/apt/.
ext_drive = '/opt/vs/apt/'
packer_id_file, file_id_file, trid_id_file = (
    'data/sorted-packer-id-features-apt.csv',
    'data/sorted-file-id-features-apt.csv',
    'data/sorted-trid-id-features-apt.csv',
)

unflist = get_unpacked_file_list(packer_id_file, file_id_file, trid_id_file)

Test ELF Disassembly.

In [13]:
# Test ELF disassembly.

def get_elf_file_list(ext_drive, packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
    """Find the samples whose file id or TrID description mentions "ELF".

    Parameters:
      ext_drive              -- directory prefix prepended to every file name.
      packer_id_feature_file -- not read; kept so existing callers keep working.
      file_id_feature_file   -- CSV with file_name and, in its second column,
                                the file id description.
      trid_id_feature_file   -- CSV with file_name and, in its second column,
                                the TrID description.

    Returns:
      (file_list, fid_list): the full paths
      "<ext_drive>VirusShare_<file_name>" and, index-aligned with them, the
      file id descriptions. The paths are also written, one per line, to
      data/elf-file-list.txt.
    """
    file_id_features = pd.read_csv(file_id_feature_file)
    trid_id_features = pd.read_csv(trid_id_feature_file)
    # Match TrID rows by file name rather than by row position.
    trid_by_name = dict(zip(trid_id_features['file_name'], trid_id_features.iloc[:, 1]))

    file_list = []
    fid_list = []
    for idx, file_name in enumerate(file_id_features['file_name']):
        # str() guards against empty cells, which pandas reads as float NaN.
        trid_name = str(trid_by_name.get(file_name, ''))
        fid_name = str(file_id_features.iloc[idx, 1])
        if trid_name.find('ELF') > -1 or fid_name.find('ELF') > -1:
            print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            file_list.append(ext_drive + "VirusShare_" + str(file_name))
            fid_list.append(fid_name)
    counter = len(file_list)

    with open('data/elf-file-list.txt', 'w') as fop:
        fop.writelines(full_name + "\n" for full_name in file_list)
    print("Got {:d} ELF filenames.".format(counter))

    return file_list, fid_list

def disassemble_elf_binaries(file_list, fid_list):
    """Disassemble each ELF binary and dump its headers next to it.

    For every path in file_list that exists:
      * "objdump -d <arch options> <path>" is written to <path>.elf.asm.
        Intel syntax is requested for x86 because it is easier to read and
        better for machine learning feature extraction.
      * "readelf -e <path>" is written to <path>.elf.txt.
    Paths that do not exist are counted as file path errors.

    Parameters:
      file_list -- paths of the ELF binaries; trailing newlines are stripped.
      fid_list  -- file id descriptions, index-aligned with file_list, used
                   to pick the objdump architecture options.
    """
    # Ordered (marker, objdump options) pairs; the first marker found in the
    # file id description wins. "-M" and "intel" are separate argv entries so
    # that objdump does not receive " intel" (with a leading space) as the
    # disassembler option.
    objdump_options = [
        ("Intel", ["-M", "intel"]),
        ("x86", ["-M", "intel"]),
        ("ARM", ["-marm"]),
        ("PowerPC", ["-mpowerpc"]),
        ("Motorola", ["-mm68k"]),
        ("SPARC", ["-msparc"]),
        ("MIPS", ["-mmips"]),
        ("Renesas", ["-msh"]),  # SuperH
    ]
    counter = 0
    disassed = 0
    error_count = 0
    print("Disassembling {:d} binary ELF files.".format(len(file_list)))
    for idx, file_name in enumerate(file_list):
        file_path = file_name.rstrip() # remove the newlines or else !!!
        asm_file_name = file_path + ".elf.asm"
        hdr_file_name = file_path + ".elf.txt"
        fid_name = fid_list[idx]
        if os.path.isfile(file_path):
            # Dump the assembly code listing; the file stays empty when no
            # architecture marker matches.
            with open(asm_file_name, "w") as fopasm:
                for marker, options in objdump_options:
                    if marker in fid_name:
                        sub.call(["objdump", "-d"] + options + [file_path], stdout=fopasm)
                        break
            # Dump the ELF section headers.
            with open(hdr_file_name, "w") as fophdr:
                sub.call(["readelf", "-e", file_path], stdout=fophdr)
            disassed += 1
        else:
            # Only missing files count as errors.
            error_count += 1
        counter += 1
        if (counter % 1000) == 0: # print progress
            print('Disassembled: {:d} - {:s}'.format(counter, file_path))

    print("Disassembled {:d} ELF binaries with {:d} file path errors.".format(disassed, error_count))

In [7]:
# Sample set vs251 (train1): list its ELF binaries, then disassemble them.
ext_drive = '/opt/vs/train1/'
packer_id_file, file_id_file, trid_id_file = (
    'data/sorted-packer-id-features-vs251.csv',
    'data/sorted-file-id-features-vs251.csv',
    'data/sorted-trid-id-features-vs251.csv',
)
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)
disassemble_elf_binaries(unflist, fidlist)

Found: unknown - ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped
Found: unknown - ELF 32-bit LSB  executable MIPS MIPS-I version 1 (SYSV) statically linked not stripped
Got 2 ELF filenames.
Disassembling 2 binary ELF files.
Disassembled 2 ELF binaries with 0 file path errors.

In [8]:
# Sample set vs252 (train2): list its ELF binaries, then disassemble them.
ext_drive = '/opt/vs/train2/'
packer_id_file, file_id_file, trid_id_file = (
    'data/sorted-packer-id-features-vs252.csv',
    'data/sorted-file-id-features-vs252.csv',
    'data/sorted-trid-id-features-vs252.csv',
)
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)
disassemble_elf_binaries(unflist, fidlist)

Found: unknown - ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped
Found: unknown - ELF 32-bit LSB  executable ARM version 1 statically linked not stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) dynamically linked (uses shared libs) for GNU/Linux 2.0.0 stripped
Got 3 ELF filenames.
Disassembling 3 binary ELF files.
Disassembled 3 ELF binaries with 0 file path errors.

In [14]:
# Sample set vs263 (train3): list its ELF binaries, then disassemble them.
ext_drive = '/opt/vs/train3/'
packer_id_file, file_id_file, trid_id_file = (
    'data/sorted-packer-id-features-vs263.csv',
    'data/sorted-file-id-features-vs263.csv',
    'data/sorted-trid-id-features-vs263.csv',
)
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)
disassemble_elf_binaries(unflist, fidlist)

Found: unknown - ELF 32-bit LSB  executable ARM EABI4 version 1 (SYSV) statically linked for GNU/Linux 2.6.14 stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.2.5 not stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (GNU/Linux) statically linked for GNU/Linux 2.6.18 not stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.6.9 not stripped
Found: unknown - ELF 32-bit LSB  executable ARM EABI5 version 1 (SYSV) statically linked for GNU/Linux 2.6.16 not stripped
Got 5 ELF filenames.
Disassembling 5 binary ELF files.
Disassembled 5 ELF binaries with 0 file path errors.

Generate Instruction Sets for Various Computer Architectures/Processors/JVM.

In [ ]:
# Sample set vs264 (train4): list its ELF binaries, then disassemble them.
ext_drive = '/opt/vs/train4/'
packer_id_file, file_id_file, trid_id_file = (
    'data/sorted-packer-id-features-vs264.csv',
    'data/sorted-file-id-features-vs264.csv',
    'data/sorted-trid-id-features-vs264.csv',
)
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)
disassemble_elf_binaries(unflist, fidlist)

In [ ]:
# Read the AMD64 instruction-set listing; the with-block closes the handle.
with open('data/amd64-instruction-set.txt') as fip:
    inlines = fip.readlines()

In [ ]:
# Build a printable Python list literal from the lines held in inlines.
opcode_list = []
for line in inlines:
    tokens = line.rstrip()
    # NOTE(review): nothing is appended to opcode_list here, so opcode_str
    # below is always "['']"; the append step appears to be missing -- confirm
    # what should be collected.
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 

In [ ]:
# Read the ARM instruction-set listing; the with-block closes the handle.
with open('data/arm-instruction-set.txt') as fip:
    inlines = fip.readlines()

In [4]:
# Build a printable Python list literal from the lines held in inlines.
opcode_list = []
for line in inlines:
    tokens = line.split()
    # NOTE(review): nothing is appended to opcode_list here, so opcode_str
    # below is always "['']"; the append step appears to be missing -- confirm
    # which token should be collected.
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 


# Read the SPARC instruction-set listing; the with-block closes the handle.
with open('data/sparc-instruction-set.txt') as fip:
    inlines = fip.readlines()

In [ ]:
# Build a printable Python list literal from the lines held in inlines.
opcode_list = []
for line in inlines:
    tokens = line.split()
    # NOTE(review): nothing is appended to opcode_list here, so opcode_str
    # below is always "['']"; the append step appears to be missing -- confirm
    # which token should be collected.
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 

In [ ]:
# Read the PowerPC instruction-set listing; the with-block closes the handle.
with open('data/powerpc-instruction-set.txt') as fip:
    inlines = fip.readlines()

In [ ]:
# Build a printable Python list literal from the lines held in inlines.
opcode_list = []
for line in inlines:
    # "[" is turned into a space before splitting on whitespace.
    tokens = line.replace('[', ' ').split()
    # NOTE(review): nothing is appended to opcode_list here, so opcode_str
    # below is always "['']"; the append step appears to be missing -- confirm
    # which token should be collected.
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 

In [ ]:
# Read the PowerPC v2.02 instruction-set listing; the with-block closes the handle.
with open('data/powerpc-version-202-instruction-set.txt') as fip:
    inlines = fip.readlines()

In [ ]:
# Build a printable Python list literal from the lines held in inlines.
opcode_list = []
for line in inlines:
    # "[" is turned into a space before splitting on whitespace.
    tokens = line.replace('[', ' ').rstrip().split()
    # NOTE(review): nothing is appended to opcode_list here, so opcode_str
    # below is always "['']"; the append step appears to be missing -- confirm
    # which token should be collected.
opcode_str = "[\'" + "','".join(opcode_list) + "\']" 

In [ ]:
# Read the Motorola instruction-set listing; the with-block closes the handle.
with open('data/motorola-instruction-set.txt') as fip:
    inlines = fip.readlines()

In [ ]:
# Read the MIPS instruction-set listing; the with-block closes the handle.
with open('data/mips-instruction-set.txt') as fip:
    inlines = fip.readlines()

In [2]:
# Check file id strings for ELF executables
with open('data/sorted-file-id-features-vs251.csv') as fip:
    inlines = fip.readlines()
for line in inlines:
    if "ELF" in line:
        print("-> {:s}".format(line))

-> abbde81d7f4733c16046cbd8ee7409d3,ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped,56

-> f04f278048fc082dd5d0f34efa3c05f8,ELF 32-bit LSB  executable MIPS MIPS-I version 1 (SYSV) statically linked not stripped,475

In [3]:
# Print every vs252 file id row that mentions ELF.
with open('data/sorted-file-id-features-vs252.csv') as fip:
    inlines = fip.readlines()
for line in inlines:
    if "ELF" in line:
        print("-> {:s}".format(line))

-> c6813bcaf9a2801973e9c44fe75ef75b,ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped,56

-> cbb492024bdd2484f39893ab77da0cae,ELF 32-bit LSB  executable ARM version 1 statically linked not stripped,216

-> fa390c69553d757c3a10737a0a8604dc,ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) dynamically linked (uses shared libs) for GNU/Linux 2.0.0 stripped,463

In [4]:
# Print every vs263 file id row that mentions ELF.
with open('data/sorted-file-id-features-vs263.csv') as fip:
    inlines = fip.readlines()
for line in inlines:
    if "ELF" in line:
        print("-> {:s}".format(line))

-> 480813ec6548a4e55245a0e446e63c36,ELF 32-bit LSB  executable ARM EABI4 version 1 (SYSV) statically linked for GNU/Linux 2.6.14 stripped,188

-> 5b88e0490dd764e66e13c8a543099c9d,ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.2.5 not stripped,192

-> 62d33be03ef3bc9c81d703898fc0e18c,ELF 32-bit LSB  executable Intel 80386 version 1 (GNU/Linux) statically linked for GNU/Linux 2.6.18 not stripped,349

-> 7a891a96d6af45865e5fe6142b40eb77,ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.6.9 not stripped,447

-> af8970eb045a77ad1c427eb6333c9efd,ELF 32-bit LSB  executable ARM EABI5 version 1 (SYSV) statically linked for GNU/Linux 2.6.16 not stripped,176

In [ ]:
# Print every vs264 file id row that mentions ELF.
with open('data/sorted-file-id-features-vs264.csv') as fip:
    inlines = fip.readlines()
for line in inlines:
    if "ELF" in line:
        print("-> {:s}".format(line))

Generate Assembly Instruction Sets for Various Computer Architectures.

In [ ]:
# Read the binutils package listing; the with-block closes the handle.
with open("/home/derek/binutils.txt") as fip:
    inlines = fip.readlines()

In [9]:
# Build an "apt install ..." command line from the package listing in inlines.
package_list = []
for line in inlines:
    # ":" is turned into a space before splitting on whitespace.
    tokens = line.replace(":", " ").split()
    # NOTE(review): nothing is appended to package_list here, so command below
    # is always "apt install "; the append step appears to be missing --
    # confirm which token holds the package name.
command = "apt install " + " ".join(package_list)

'apt install binutils binutils binutils-aarch64-linux-gnu binutils-aarch64-linux-gnu binutils-arm-linux-gnueabi binutils-arm-linux-gnueabi binutils-arm-linux-gnueabihf binutils-arm-linux-gnueabihf binutils-arm-none-eabi binutils-arm-none-eabi binutils-avr binutils-avr binutils-dev binutils-dev binutils-doc binutils-gold binutils-gold binutils-h8300-hms binutils-h8300-hms binutils-m68hc1x binutils-m68hc1x binutils-mingw-w64 binutils-mingw-w64-i686 binutils-mingw-w64-i686 binutils-mingw-w64-x86-64 binutils-mingw-w64-x86-64 binutils-msp430 binutils-msp430 binutils-multiarch binutils-multiarch binutils-multiarch-dev binutils-multiarch-dev binutils-powerpc-linux-gnu binutils-powerpc-linux-gnu binutils-powerpc64le-linux-gnu binutils-powerpc64le-linux-gnu binutils-source binutils-static binutils-static binutils-z80 binutils-z80 elf-binutils elf-binutils mingw32-binutils mingw32-binutils'

In [ ]:
# Read the binutils package listing; the with-block closes the handle.
with open("/home/derek/binutils.txt") as fip:
    inlines = fip.readlines()

In [5]:
# Build an "apt install ..." command line from the package listing in inlines,
# ignoring lines that mention ":i386".
package_list = []
for line in inlines:
    if ":i386" not in line:
        tokens = line.split()
        # NOTE(review): nothing is appended to package_list here, so command
        # below is always "apt install "; the append step appears to be
        # missing -- confirm which token holds the package name.
command = "apt install " + " ".join(package_list)

'apt install binutils binutils-aarch64-linux-gnu binutils-alpha-linux-gnu binutils-arm-linux-gnueabi binutils-arm-linux-gnueabihf binutils-arm-linux-gnueabihf:i3 binutils-arm-none-eabi binutils-avr binutils-dev binutils-doc binutils-gold binutils-h8300-hms binutils-hppa-linux-gnu binutils-hppa64 binutils-hppa64-linux-gnu binutils-m68hc1x binutils-m68k-linux-gnu binutils-mingw-w64 binutils-mingw-w64-i686 binutils-mingw-w64-x86-64 binutils-mips-linux-gnu binutils-mips64-linux-gnuabi64 binutils-mips64-linux-gnuabi64: binutils-mips64el-linux-gnuabi6 binutils-mips64el-linux-gnuabi6 binutils-mipsel-linux-gnu binutils-msp430 binutils-multiarch binutils-multiarch-dev binutils-powerpc-linux-gnu binutils-powerpc-linux-gnuspe binutils-powerpc-linux-gnuspe:i binutils-powerpc64-linux-gnu binutils-powerpc64-linux-gnu:i3 binutils-powerpc64le-linux-gnu binutils-powerpc64le-linux-gnu: binutils-s390x-linux-gnu binutils-sh4-linux-gnu binutils-source binutils-sparc64-linux-gnu binutils-z80 elf-binutils'

In [6]:
x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',

x86_opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'

amd64_registers = ['rax','rbx','rcx','rdx','rsi','rdi','rbp','rsp','r8','r9','r10','r11','r12','r13','r14','r15']

amd64_opcodes = ['aaa','aad','aam','aas','adc','add','and','andn','bextr','bextr','blcfill','blci','blcic',

MIPS_registers = []

MIPS_opcodes = []

SPARC_registers = []

SPARC_opcodes = []

ARM_registers = ['r0','r1','r2','r3','r4','r5','r6','r7','r8','r9','r10','r11','r12','r13','r14','r15','cpsr']

ARM_opcodes = ['adc','msr','add','mul','and','mvn','b','orr','bic','rsb','bl','rsc','bx','sbc','cdp','smlal','cmn','smull',

Motorola_registers = ['d0','d1','d2','d3','d4','d5','d6','d7','a0','a1','a2','a3','a4','a5','a6','a7','usp','ssp']

Motorola_opcodes = []

PowerPC_registers = ['r0','r1','r2','r3','r4','r5','r6','r7','r8','r9','r10','r11','r12','r13','r14','r15',

PowerPC_opcodes = ['add','addc','adde','addi','addic','addic.','addis','addme','addze','and','andc','andi.','andis.',

In [7]:
# Read the ARM opcode table listing; the with-block closes the handle.
with open("data/arm-listing.txt") as fip:
    inlines = fip.readlines()

['  /* XScale instructions.  */\n',
 '    0x0e200010, 0x0fff0ff0,\n',
 '    "mia%c\\tacc0, %0-3r, %12-15r"},\n',
 '    0x0e280010, 0x0fff0ff0,\n',
 '    "miaph%c\\tacc0, %0-3r, %12-15r"},\n',
 '    0x0e2c0010, 0x0ffc0ff0, "mia%17\'T%17`B%16\'T%16`B%c\\tacc0, %0-3r, %12-15r"},\n',
 '    0x0c400000, 0x0ff00fff, "mar%c\\tacc0, %12-15r, %16-19r"},\n',
 '    0x0c500000, 0x0ff00fff, "mra%c\\t%12-15r, %16-19r, acc0"},\n',
 '  /* Intel Wireless MMX technology instructions.  */\n',
 '    0x0e130130, 0x0f3f0fff, "tandc%22-23w%c\\t%12-15r"},\n',
 '    0x0e400010, 0x0ff00f3f, "tbcst%6-7w%c\\t%16-19g, %12-15r"},\n']

In [ ]:
# Parse the contents of arm-dis.c in binutils and extract all the ARM opcodes.
# NOTE(review): the `continue`, `else` and `append` statements were missing
# (the cell did not parse); they are restored here from the surviving
# indentation -- confirm against the original notebook.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
    line = line.lstrip()
    # Turn the literal backslash-t inside the C strings into a space.
    line = line.replace('\\t',' ')
    if len(line) < 10:
        continue
    if line.startswith('{'):
        continue
    if line.startswith('"'):
        # The format string is on its own line: the opcode runs up to the first "%".
        idx = line.find('%')
        if idx > 0:
            opcode = line[1:idx]
    else:
        # "<value>, <mask>, "<format>"": the format string is the third token.
        tokens = line.split()
        if len(tokens) < 3:
            continue
        opcodestr = tokens[2]
        idx = opcodestr.find('%')
        if idx > 0:
            opcode = opcodestr[1:idx]
    if opcode not in opcode_list:
        opcode_list.append(opcode)
opcode_str = "[\'" + "','".join(opcode_list) + "\']"

In [ ]:
# Read the binutils m68k opcode table; the with-block closes the handle.
with open("data/m68k-opc.c") as fip:
    inlines = fip.readlines()

In [ ]:
# Parse the contents of m68k-opc.c in binutils and extract all the Motorola opcodes.
# NOTE(review): the `continue` and `append` statements were missing (the cell
# did not parse); they are restored here -- confirm against the original notebook.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
    line = line.lstrip()
    if len(line) < 10:
        continue
    if line.startswith("{"):
        # Drop the leading '{"' and keep the text up to the closing quote.
        line = line[2:]
        idx = line.find("\"")
        if idx > 0:
            opcode = line[:idx]
    if opcode not in opcode_list:
        opcode_list.append(opcode)
opcode_str = "[\'" + "','".join(opcode_list) + "\']"

In [ ]:
# Read the MIPS instruction-set listing; the with-block closes the handle.
with open("data/mips-instruction-set.txt") as fip:
    inlines = fip.readlines()

In [ ]:
# Parse the contents of mips-instruction-set.txt and extract all the things.
# NOTE(review): the `continue` and `append` statements were missing (the cell
# did not parse); they are restored here -- confirm against the original notebook.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
    line = line.lstrip()
    if len(line) < 10:
        continue
    if line.startswith("{"):
        # Drop the leading '{"' and keep the text up to the closing quote.
        line = line[2:]
        idx = line.find("\"")
        if idx > 0:
            opcode = line[:idx]
    # Keep only the part before the first "." (e.g. a format suffix).
    idx = opcode.find(".")
    if idx > 0:
        opcode = opcode[:idx]
    if opcode not in opcode_list:
        opcode_list.append(opcode)
opcode_str = "[\'" + "','".join(opcode_list) + "\']"

In [ ]:
# Read the binutils i386 opcode table; the with-block closes the handle.
with open("data/i386-opc.tbl") as fip:
    inlines = fip.readlines()

In [ ]:
# Parse the contents of i386-opc.tbl from binutils and extract all the things.
# NOTE(review): the `continue` and `append` statements were missing (the cell
# did not parse); they are restored here -- confirm against the original notebook.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
    if len(line) < 10:
        continue
    if line.startswith("//"):
        # Comment line in the table.
        continue
    line = line.replace(',', ' ')
    tokens = line.split()
    if len(tokens) > 0:
        opcode = tokens[0]
    # Keep only the part before the first ".".
    idx = opcode.find(".")
    if idx > 0:
        opcode = opcode[:idx]
    if opcode not in opcode_list:
        opcode_list.append(opcode)
opcode_str = "[\'" + "','".join(opcode_list) + "\']"

In [2]:
# Read the Java bytecode instruction listing; the with-block closes the handle.
with open("data/Java-bytecode-instruction-listing.txt") as fip:
    inlines = fip.readlines()

['aaload \t32 \t0011 0010 \t\tarrayref, index \xe2\x86\x92 value \tload onto the stack a reference from an array\n',
 'aastore \t53 \t0101 0011 \t\tarrayref, index, value \xe2\x86\x92 \tstore into a reference in an array\n',
 'aconst_null \t01 \t0000 0001 \t\t\xe2\x86\x92 null \tpush a /null/ reference onto the stack\n',
 'aload \t19 \t0001 1001 \t1: index \t\xe2\x86\x92 objectref \tload a reference onto the stack from a local variable /#index/\n',
 'aload_0 \t2a \t0010 1010 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 0\n',
 'aload_1 \t2b \t0010 1011 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 1\n',
 'aload_2 \t2c \t0010 1100 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 2\n',
 'aload_3 \t2d \t0010 1101 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 3\n',
 'anewarray \tbd \t1011 1101 \t2: indexbyte1, indexbyte2 \tcount \xe2\x86\x92 arrayref create a new array of references of length /count/ and component type\n',
 'areturn \tb0 \t1011 0000 \t\tobjectref \xe2\x86\x92 [empty] \treturn a reference from a method\n',
 'arraylength \tbe \t1011 1110 \t\tarrayref \xe2\x86\x92 length \tget the length of an array\n',
 'astore \t3a \t0011 1010 \t1: index \tobjectref \xe2\x86\x92 \tstore a reference into a local variable /#index/\n',
 'astore_0 \t4b \t0100 1011 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 0\n',
 'astore_1 \t4c \t0100 1100 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 1\n',
 'astore_2 \t4d \t0100 1101 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 2\n',
 'astore_3 \t4e \t0100 1110 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 3\n',
 'athrow \tbf \t1011 1111 \t\tobjectref \xe2\x86\x92 [empty], objectref \tthrows an error\n',
 'baload \t33 \t0011 0011 \t\tarrayref, index \xe2\x86\x92 value \tload a byte or Boolean value from an array\n',
 'bastore \t54 \t0101 0100 \t\tarrayref, index, value \xe2\x86\x92 \tstore a byte or Boolean value into an array\n',
 'bipush \t10 \t0001 0000 \t1: byte \t\xe2\x86\x92 value \tpush a /byte/ onto the stack as an integer /value/\n']

In [3]:
# Parse the contents of Java-bytecode-instruction-listing.txt and extract all the things.
# NOTE(review): the `continue` and `append` statements were missing (the cell
# did not parse); they are restored here -- confirm against the original notebook.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
    if len(line) < 10:
        continue
    line = line.replace('\t', ' ')
    tokens = line.split()
    if len(tokens) > 0:
        # The mnemonic is the first column of each row.
        opcode = tokens[0]

    if opcode not in opcode_list:
        opcode_list.append(opcode)
opcode_str = "[\'" + "','".join(opcode_list) + "\']"


Test PDF Keywords/Names Feature Extraction.

In [ ]:
import os
from csv import writer
import numpy as np
import pandas as pd

# Start of Script

target_dir = "/opt/vs/legitware/"
out_file = "data/pdf-features-legit.csv"
pdf_token_file = "data/2716-pdf-token-counts-non-malicious-set.csv"
#out_file = "data/pdf-features-vs251.csv"


file_list = os.listdir(target_dir)
pdflist = []

# Iterate the names directly: enumerate() yields (index, name) tuples, and a
# tuple has no endswith() method.
for fname in file_list:
    if fname.endswith('.pdf'):
        pdflist.append(target_dir + fname)
print("Got {:d} PDF files.".format(len(pdflist)))

In [ ]: