Feature sets will consist of:
- Entropy and file size from packed binaries.
- Entropy and file size from unpacked binaries.
- ASM features from disassembled unpacked binaries.
- Executable header features.
- Call graph features.
- Sample statistics.
- PE packer type.
- Behavioural features from Cuckoo Sandbox reports.
- Memory features from Volatility reports.
Training labels will be generated from ClamAV, Windows Defender and VirusTotal.com reports.
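A minimal sketch of the labelling step (the helper name and the majority-vote rule are illustrative assumptions, not the final pipeline): collapse the per-scanner verdicts into a single training label.
In [ ]:
# Hypothetical helper: each argument is one scanner's boolean verdict for a
# sample; label it malicious (1) when at least two of the three agree.
def majority_label(clamav_hit, defender_hit, virustotal_hit):
    votes = [clamav_hit, defender_hit, virustotal_hit]
    return 1 if sum(votes) >= 2 else 0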
In [1]:
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import re
import subprocess as sub
In [2]:
ext_drive = '/opt/vs/'
tfiles = os.listdir(ext_drive + "train")
In [4]:
len(tfiles)
Out[4]:
In [12]:
# Calculate Shannon's Entropy, https://en.wikipedia.org/wiki/Entropy_(information_theory)
def calculate_entropy(byte_counts, total):
entropy = 0.0
for count in byte_counts:
        # If no bytes of this value were seen in the file, it doesn't affect
# the entropy of the file.
if count == 0:
continue
# p is the probability of seeing this byte in the file, as a floating-point number
p = 1.0 * count / total
entropy -= p * math.log(p, 256)
return entropy
def entropy_counter(byte_code):
    # Tally the occurrences of each byte value, then compute the entropy.
    byte_counts = [0] * 256
    code_length = len(byte_code)
    for byte_val in byte_code:  # Iterating a bytearray yields ints directly.
        byte_counts[byte_val] += 1
    entropy = calculate_entropy(byte_counts, code_length)
    return entropy
def sort_and_save_entropy_feature_file():
entropys = pd.read_csv('data/entropy-features.csv')
    # DataFrame.sort() is deprecated, but this old version of pandas does not have sort_values().
sorted_entropys = entropys.sort('file_name')
sorted_entropys.to_csv('data/sorted-entropy-features.csv', index=False)
sorted_entropys.head(20)
return
def combine_entropy_files():
# Function to combine the newly generated entropy files into one file:
# 1. list data directory
    # 2. For each file in file list that matches (\d{3,5}-entropy-features-bin.csv)
    # 3. Trim the filenames if necessary (should remove VirusShare_ prefix).
    # 4. Concatenate the unsorted entropy feature files.
    # 5. Sort and write to data/sorted-entropy-features.csv
fop = open('data/entropy-features.csv','w')
fop.write('file_name,entropy,file_size\n')
    p1 = re.compile(r'\d{3,5}-entropy-features-bin\.csv') # This is the PID prefix for each file.
file_list = os.listdir('data/')
counter = 0
for file_name in file_list:
if p1.match(file_name):
fip = open('data/' + file_name, 'r')
in_lines = fip.readlines()
fop.writelines(in_lines)
counter += len(in_lines)
fip.close()
print('Completed combine of {:d} entropy features.'.format(counter))
fop.close()
sort_and_save_entropy_feature_file()
return
# feature extraction for the binary files
def extract_binary_features(tfiles):
#byte_files = [i for i in tfiles if '.bytes' in i]
ftot = len(tfiles)
pid = os.getpid()
print('Process id:', pid)
feature_file = 'data/' + str(pid) + '-entropy-features-bin.csv' # entropy, file size, ngrams...
print('feature file:', feature_file)
feature_counts = []
with open(feature_file, 'w') as f:
# Write the column names for the csv file
fw = writer(f)
# Do this when combining the files.
#colnames = ['file_name'] + ['entropy'] + ['file_size']
#fw.writerow(colnames)
# Now iterate through the file list and extract the features from each file.
for idx, fname in enumerate(tfiles):
fasm = open(ext_drive + fname, 'rb')
filesize = os.path.getsize(ext_drive + fname)
in_bytes = fasm.read()
# TODO: Do ngram extraction
# First do entropy calculations and filesize
# Convert the input array into a byte array to prevent type errors
# in entropy counter function.
in_bytes = bytearray(in_bytes)
#print("Type = {:s}").format(type(in_bytes))
entropy = entropy_counter(in_bytes)
count_vals = [entropy, filesize]
feature_counts.append([fname[fname.find('_')+1:]] + count_vals)
fasm.close()
# Print progress
if (idx+1) % 1000 == 0:
print("{:d} - {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
fw.writerows(feature_counts)
feature_counts = []
# Write remaining files
if len(feature_counts) > 0:
fw.writerows(feature_counts)
feature_counts = []
print("Completed processing {:d} rows for feature file {:s}".format(ftot,feature_file))
In [ ]:
# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/vs/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4  # Integer division, the slice indices must be ints.
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_binary_features, trains)
combine_entropy_files()
In [ ]:
# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/vs/train2/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4  # Integer division, the slice indices must be ints.
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_binary_features, trains)
combine_entropy_files()
In [15]:
entropys = pd.read_csv('data/sorted-entropy-features.csv')
sorted_entropys = entropys.sort('file_name')
sorted_entropys.to_csv('data/sorted-entropy-features-vs251-252.csv', index=False)
sorted_entropys.head(20)
Out[15]:
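The deprecated DataFrame.sort() calls above will break on newer pandas releases; a small compatibility shim (a sketch) keeps the sorting cells runnable on either version:
In [ ]:
def sort_by_file_name(df):
    # pandas >= 0.17 provides sort_values(); older releases only have sort().
    if hasattr(df, 'sort_values'):
        return df.sort_values('file_name')
    return df.sort('file_name')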
In [ ]:
# TODO: everything.
In [ ]:
keywords = ['Virtual','Offset','loc','Import','Imports','var','Forwarder','UINT','LONG'
,'BOOL','WORD','BYTES','large','short','dd','db','dw','XREF','ptr','DATA'
,'FUNCTION','extrn','byte','word','dword','char','DWORD','stdcall','arg'
,'locret','asc','align','WinMain','unk','cookie','off','nullsub','DllEntryPoint'
,'System32','dll','CHUNK','BASS','HMENU','DLL','LPWSTR','void','HRESULT','HDC'
,'LRESULT','HANDLE','HWND','LPSTR','int','HLOCAL','FARPROC','ATOM','HMODULE'
,'WPARAM','HGLOBAL','entry','rva','COLLAPSED','config','exe','Software'
,'CurrentVersion','__imp_','INT_PTR','UINT_PTR','---Seperator','PCCTL_CONTEXT'
,'__IMPORT_','INTERNET_STATUS_CALLBACK','.rdata:','.data:','.text:','case'
,'installdir','market','microsoft','policies','proc','scrollwindow','search'
,'trap','visualc','___security_cookie','assume','callvirtualalloc','exportedentry'
,'hardware','hkey_current_user','hkey_local_machine','sp-analysisfailed','unableto']
known_sections = ['.text', '.data', '.bss', '.rdata', '.edata', '.idata', '.rsrc', '.tls', '.reloc']
registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
'eax','ebp','ebx','ecx','edi','esp']
opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'
,'dd','dec','dw','endp','ends','faddp','fchs','fdiv','fdivp','fdivr','fild'
,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
,'wait','xchg','xor']
def count_asm_symbols(asm_code):
symbols = [0]*7
for row in asm_code:
if '*' in row:
symbols[0] += 1
if '-' in row:
symbols[1] += 1
if '+' in row:
symbols[2] += 1
if '[' in row:
symbols[3] += 1
if ']' in row:
symbols[4] += 1
if '@' in row:
symbols[5] += 1
if '?' in row:
symbols[6] += 1
return symbols
def count_asm_registers(asm_code):
registers_values = [0]*len(registers)
for row in asm_code:
parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
.replace('-',' ').split()
for register in registers:
registers_values[registers.index(register)] += parts.count(register)
return registers_values
def count_asm_opcodes(asm_code):
opcodes_values = [0]*len(opcodes)
for row in asm_code:
parts = row.split()
for opcode in opcodes:
if opcode in parts:
opcodes_values[opcodes.index(opcode)] += 1
break
return opcodes_values
def count_asm_APIs(asm_code, apis):
apis_values = [0]*len(apis)
for row in asm_code:
for i in range(len(apis)):
if apis[i] in row:
apis_values[i] += 1
break
return apis_values
def count_asm_misc(asm_code):
keywords_values = [0]*len(keywords)
for row in asm_code:
for i in range(len(keywords)):
if keywords[i] in row:
keywords_values[i] += 1
break
return keywords_values
# Extract features from test/training asm files, file list is passed in as a parameter
def extract_asm_features(tfiles):
pid = os.getpid()
print('Process id:', pid)
feature_file = 'data/' + str(pid) + '-malware-features-asm.csv' # Windows API, symbols, registers, opcodes, etc...
print('feature file:', feature_file)
fapi = open("data/APIs.txt")
defined_apis = fapi.readlines()
defined_apis = defined_apis[0].split(',')
asm_files = [i for i in tfiles if '.asm' in i]
ftot = len(asm_files)
feature_counts = []
with open(feature_file, 'w') as f:
# write the csv header
fw = writer(f)
colnames = ['file_name'] + registers + opcodes + defined_apis + keywords
fw.writerow(colnames)
for idx, fname in enumerate(asm_files):
fasm = open(ext_drive + fname, 'r')
content = fasm.readlines()
reg_vals = count_asm_registers(content)
opc_vals = count_asm_opcodes(content)
api_vals = count_asm_APIs(content, defined_apis)
#sec_vals = count_asm_sections(content)
mis_vals = count_asm_misc(content)
count_vals = reg_vals + opc_vals + api_vals + mis_vals
feature_counts.append([fname[:fname.find('.asm')]] + count_vals)
# Writing rows after every 10 files processed
if (idx+1) % 10 == 0:
print(pid, idx + 1, 'of', ftot, 'files processed.')
fw.writerows(feature_counts)
feature_counts = []
# Writing remaining files
if len(feature_counts) > 0:
fw.writerows(feature_counts)
feature_counts = []
return
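A usage sketch for the ASM extraction above, mirroring the entropy cells (the train1asm directory is an assumption about the disk layout):
In [ ]:
# Split the ASM file list four ways and run the extraction in parallel.
ext_drive = '/opt/vs/train1asm/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles) // 4
chunks = [tfiles[i * quart:(i + 1) * quart] for i in range(3)] + [tfiles[3 * quart:]]
p = Pool(4)
p.map(extract_asm_features, chunks)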
In [ ]:
# Need to clean up and sort these function names for ASM feature extraction.
fip = open('data/all-function-column-names-multiline.csv')
function_names = fip.readlines()
fip.close()
function_names.sort()
function_names[:50]
fop = open('data/sorted-function-names-multiline.txt','w')
fop.writelines(function_names)
fop.close()
In [22]:
fip = open('data/sorted-function-names-multiline.txt','r')
sorted_function_names = fip.readlines()
fip.close()
fip = open('data/APIs.txt','r')
api_names_str = fip.readline()
fip.close()
api_names_str = api_names_str.rstrip()
api_names = api_names_str.split(',')
api_names.sort()
len(api_names)
Out[22]:
In [23]:
for idx in range(len(sorted_function_names)):
sorted_function_names[idx] = sorted_function_names[idx].rstrip()
for aname in api_names:
if aname not in sorted_function_names:
sorted_function_names.append(aname)
sorted_function_names.sort()
len(sorted_function_names)
Out[23]:
In [ ]:
sorted_function_names[:50]
In [25]:
function_count = len(sorted_function_names)
total_chars = 0
for func_name in sorted_function_names:
total_chars += len(func_name)
avg_name_len = int(total_chars / function_count)
avg_name_len
Out[25]:
In [ ]:
# truncate function names to reduce the size of the huge sparse matrix.
function_column_names = []
for func in sorted_function_names:
if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
        func = func[:5] # Let's try to reduce the vast number of functions.
elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
func = func[:3]
elif func.startswith('edi+') or func.startswith('esi+'):
func = func[:3]
elif func.startswith('byte_') or func.startswith('word_') or func.startswith('off_'):
func = func[:4]
    elif func.startswith('__') or func.startswith('$$'):
        func = func[2:] # Check the double prefix first, or the single-prefix branch swallows it.
    elif func.startswith('_') or func.startswith('$'):
        func = func[1:]
    #else: need a regex here to match a bunch of random crap
    # func = func[:33]
    if len(func) > 32: # Reduce the function name length to a max of 32 characters.
        func = func[:32]
if func not in function_column_names:
function_column_names.append(func)
function_column_names[:50]
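One design note on the cell above: the `func not in function_column_names` test rescans the whole list for every name, which is quadratic over tens of thousands of functions. A seen-set makes the membership test O(1) while preserving first-seen order (a sketch of the dedup pattern; the [:32] clamp stands in for the full truncation rules):
In [ ]:
# Set-backed dedup: O(1) membership test, keeps first-seen order.
seen = set()
unique_names = []
for func in sorted_function_names:
    name = func[:32]  # Stand-in for the truncation rules above.
    if name not in seen:
        seen.add(name)
        unique_names.append(name)
len(unique_names)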
In [27]:
fop = open('data/sorted-reduced-function-names.txt','w')
for fname in function_column_names:
fop.write(fname + "\n")
fop.close()
len(function_column_names)
Out[27]:
In [ ]:
# Use a regex to remove function names that are just hexadecimal addresses.
p1 = re.compile(r'\d\w+h')
reduced_function_names = []
fip = open('data/sorted-reduced-function-names.txt','r')
function_column_names = fip.readlines()
fip.close()
fop = open('data/sorted-reduced-function-names-hexless.txt','w')
for fname in function_column_names:
fname = fname.rstrip()
m = p1.match(fname)
    if m is None:
fop.write(fname + "\n")
reduced_function_names.append(fname)
fop.close()
reduced_function_names[:50]
In [29]:
len(reduced_function_names)
Out[29]:
In [4]:
signat = sub.check_output(["file",'-b', '/opt/vs/agobot.exe'])
In [5]:
signat
Out[5]:
In [13]:
# Generate libc function calls for ELF API feature extraction.
# Put in feature_extraction_elf_asm.py
def generate_libc_api():
# Extract libc function and variable names from GNU Libc Documentation.
# Put in feature_extraction.py
fipfunc = open('data/libc-function-index.txt', 'r')
funclines = fipfunc.readlines()
fipfunc.close()
counter = 0
func_list = []
for idx, fline in enumerate(funclines):
fline = fline.replace('\t','').replace('\n','')
if fline.startswith('|'):
tokens = fline.split('|') # The function names are |funcname|
funcname = tokens[1]
func_list.append(funcname)
counter += 1
print("Found {:d} function definitions for libc api.".format(counter))
fop = open('data/elf-libc-api.txt', 'w')
for func_name in func_list:
fop.write(func_name + "\n")
fop.close()
return func_list
def generate_libc_var():
# Extract libc function and variable names from GNU Libc Documentation.
# Put in feature_extraction.py
fipvar = open('data/libc-variable-index.txt', 'r')
varlines = fipvar.readlines()
fipvar.close()
counter = 0
var_list = []
for idx, vline in enumerate(varlines):
vline = vline.replace('\t','').replace('\n','')
if vline.startswith('|'):
            tokens = vline.split('|') # The variable names are |varname|
varname = tokens[1]
var_list.append(varname)
counter += 1
print("Found {:d} variable definitions for libc api.".format(counter))
fop = open('data/elf-libc-var.txt', 'w')
for var_name in var_list:
fop.write(var_name + "\n")
fop.close()
return var_list
In [ ]:
func_list = generate_libc_api()
func_list
In [18]:
# Check interrupted disassembly results for train1 feature set.
# def validate_disassembly():
# put in disassemble_pe.py
t1asm = os.listdir('/opt/vs/train1asm/')
t1hdr = os.listdir('/opt/vs/train1hdr/')
asm_files = []
hdr_files = []
for fname in t1asm:
if fname.endswith('.asm'):
asm_files.append(fname)
for fname in t1hdr:
if fname.endswith('.txt'):
hdr_files.append(fname)
print("asm dir: {:d} asm files {:d} hdr dir {:d} hdr files {:d}".format(len(t1asm),len(asm_files),len(t1hdr),len(hdr_files)))
In [19]:
len(t1hdr) - len(t1asm)
Out[19]:
In [14]:
os.path.getsize('/opt/vs/train1asm/VirusShare_5ac1817d757a27edb90cdf887ba66870.asm')
Out[14]:
In [20]:
counter = 0
missing_hdr_list = []
for fname in asm_files:
hdr_name = fname.replace('.asm', '.txt')
if hdr_name not in hdr_files:
print("{:s} not in header file list.".format(hdr_name))
counter += 1
        missing_hdr_list.append(fname)
print("{:d} missing header files.".format(counter))
In [ ]:
counter = 0
missing_asm_list = []
for fname in hdr_files:
asm_name = fname.replace('.txt','.asm')
if asm_name not in asm_files:
print("{:s} not in asm file list.".format(asm_name))
counter += 1
missing_asm_list.append(fname)
print("{:d} missing assembly files.".format(counter))
In [23]:
counter = 0
fop = open('data/disass-train1-missing-asm-files.txt', 'w')
for fname in missing_asm_list:
fop.write(fname + "\n")
counter += 1
fop.close()
print("Wrote {:d} missing asm file names.".format(counter))
In [25]:
counter = 0
bad_hdr_list = []
for fname in hdr_files:
fsize = os.path.getsize('/opt/vs/train1hdr/' + fname)
if fsize < 1000:
print("{:s} bad output, filesize = {:d}.".format(fname, fsize))
counter += 1
bad_hdr_list.append(fname)
print("{:d} bad header files.".format(counter))
In [ ]:
counter = 0
bad_asm_list = []
for fname in asm_files:
fsize = os.path.getsize('/opt/vs/train1asm/' + fname)
if fsize < 1000:
print("{:s} bad output, filesize = {:d}.".format(fname, fsize))
counter += 1
bad_asm_list.append(fname)
print("{:d} bad asm files.".format(counter))
In [3]:
apt_df = pd.read_csv('data/sorted-entropy-features-apt.csv')
apt_file_list = apt_df['file_name']
apt_file_list.shape
Out[3]:
In [5]:
apt_file_list.head()
Out[5]:
In [7]:
f_list = os.listdir('/home/derek/project/temp/train/')
counter = 0
file_list = []
for fname in f_list:
if fname.startswith('Virus'):
tname = fname[fname.find('_') + 1:]
file_list.append(tname)
counter += 1
print("Got {:d} files in training directory.".format(counter))
apt_list = np.array(apt_file_list)
for fname in file_list:
if fname not in apt_list:
print("Extra file: {:s}".format(fname))
In [5]:
def rename_header_files(ext_dir):
# Rename all the PE headers files so it is easier to process them.
file_list = os.listdir(ext_dir)
counter = 0
for fname in file_list:
if fname.startswith('Virus'):
file_path = ext_dir + fname
trunc_name = fname[0:fname.find('.txt')]
new_path = ext_dir + trunc_name + '.pe.txt'
result = sub.check_call(['mv', file_path, new_path])
counter += 1
if (counter % 1000) == 0:
print('Renamed {:d} header files.'.format(counter))
print('Completed move of {:d} header files.'.format(counter))
return
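Spawning an external mv process for every file is slow over hundreds of thousands of samples; os.rename() performs the same move in-process. A sketch with the same directory assumptions:
In [ ]:
def rename_header_files_fast(ext_dir):
    # Same renaming rule as above, but without forking a process per file.
    counter = 0
    for fname in os.listdir(ext_dir):
        if fname.startswith('Virus'):
            trunc_name = fname[0:fname.find('.txt')]
            os.rename(ext_dir + fname, ext_dir + trunc_name + '.pe.txt')
            counter += 1
            if (counter % 1000) == 0:
                print('Renamed {:d} header files.'.format(counter))
    print('Completed move of {:d} header files.'.format(counter))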
In [ ]:
ext_dir = '/home/derek/project/temp/'
rename_header_files(ext_dir)
In [2]:
def rename_asm_files(ext_dir, new_dir):
# Rename all the PE ASM files and move to a new directory
# so it is easier to process them.
file_list = os.listdir(ext_dir)
counter = 0
print("Got {:d} files in directory {:s}".format(len(file_list), ext_dir))
for fname in file_list:
if fname.endswith('.asm'):
file_path = ext_dir + fname
trunc_name = fname[0:fname.find('.asm')]
new_path = new_dir + trunc_name + '.pe.asm'
result = sub.check_call(['mv', file_path, new_path])
counter += 1
if (counter % 1000) == 0:
print('Renamed {:d} ASM files.'.format(counter))
print('Completed rename of {:d} ASM files.'.format(counter))
return
In [ ]:
rename_asm_files('/opt/vs/train3/', '/opt/vs/train3asm/')
In [4]:
def rename_asm_files_fix(ext_dir):
    # Rename any remaining PE ASM files so it is easier to process them.
file_list = os.listdir(ext_dir)
pe_counter = 0
unpe_counter = 0
print("Got total files: {:d}".format(len(file_list)))
for fname in file_list:
if fname.endswith('.pe.asm'):
pe_counter += 1
elif fname.endswith('.asm'):
file_path = ext_dir + fname
trunc_name = fname[0:fname.find('.asm')]
new_path = ext_dir + trunc_name + '.pe.asm'
result = sub.check_call(['mv', file_path, new_path])
unpe_counter += 1
if (unpe_counter % 1000) == 0:
print('Renamed {:d} ASM files.'.format(unpe_counter))
print('Completed move of {:d} ASM files with {:d} files already renamed.'.format(unpe_counter, pe_counter))
return
In [ ]:
rename_asm_files_fix('/opt/vs/train3/')
In [4]:
# Moved to feature-extraction-validation.ipynb
In [6]:
def find_elf_train1asm(ext_dir):
    # Find any ELF ASM files that ended up in the PE ASM directories.
file_list = os.listdir(ext_dir)
elf_counter = 0
print("Got total files: {:d}".format(len(file_list)))
for fname in file_list:
if 'elf' in fname:
elf_counter += 1
file_path = ext_dir + fname
#trunc_name = fname[0:fname.find('.elf')]
new_path = '/opt/vs/' + fname
#result = sub.check_call(['mv', file_path, new_path])
#if (elf_counter % 1000) == 0:
print('ELF ASM file {:s}.'.format(file_path))
print('Completed move of {:d} ELF ASM files.'.format(elf_counter))
return
In [7]:
find_elf_train1asm('/opt/vs/train1asm/')
In [8]:
find_elf_train1asm('/opt/vs/train2asm/')
In [ ]:
find_elf_train1asm('/opt/vs/train3asm/')
In [ ]:
find_elf_train1asm('/opt/vs/train4asm/')
In [ ]:
find_elf_train1asm('/opt/vs/aptasm/')
In [27]:
def save_token_counts(token_counter_map, out_file):
# Output the malware sample classification counts.
fop = open(out_file, 'w')
csv_wouter = writer(fop)
cols = ['token_name','count'] # write out the column names.
csv_wouter.writerow(cols)
outlines = []
    sorted_keys = sorted(token_counter_map.keys())  # dict views have no .sort() in Python 3.
counter = 0
for key in sorted_keys:
outlines.append([key, token_counter_map[key]])
counter += 1
if (counter % 100) == 0: # write out some lines
csv_wouter.writerows(outlines)
outlines = []
print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))
# Finish off.
if (len(outlines) > 0):
csv_wouter.writerows(outlines)
outlines = []
print("Completed writing {:d} tokens.".format(len(sorted_keys)))
fop.close()
return
def get_token_count_map(token_df):
# Read in the token count file and create a dict.
token_dict = {}
type_y = np.array(token_df['token_name'])
for idx in range(token_df.shape[0]): # First fill the dict with the token counts
token_dict[token_df.iloc[idx,0]] = token_df.iloc[idx,1]
return token_dict
def combine_token_files():
# TODO: everything
return
def generate_pe_tokens(file_list, out_token_file, out_count_file):
    psections = re.compile(r'\s+\d{1,2}\s+(\.\w+|\w+)\s+\d+') # Pattern for section names.
    pdlls = re.compile(r'\s+DLL Name: (\w+)') # Pattern for import DLL names.
    pfunctions = re.compile(r'\s+\w+\s+\d{1,4}\s+(.+)') # Pattern for import function names.
    preloc = re.compile(r'\s+reloc') # Pattern for relocation entries.
    pexports = re.compile(r'\s+\[\s*\d+\]\s+(\w+)') # Pattern for exported function names.
token_counter_map = {}
counter = 0
pid = os.getpid()
for idx, fname in enumerate(file_list):
fip = open(fname, 'r')
in_lines = fip.readlines()
counter += 1
for line in in_lines:
line = line.rstrip() # get rid of newlines they are annoying.
token_val = ""
m = preloc.match(line)
if m != None:
#token_val = m.group(2)
continue
m = psections.match(line)
if m != None:
token_val = m.group(1)
print("Section: {:s}".format(token_val))
else:
m = pdlls.match(line)
if m != None:
token_val = m.group(1)
else:
m = pfunctions.match(line)
if m != None:
token_val = m.group(1)
else:
m = pexports.match(line)
if m != None:
token_val = m.group(1)
print("Export: {:s}".format(token_val))
else:
continue
# Count the token type.
            if token_val in token_counter_map:
token_counter_map[token_val] += 1
else:
token_counter_map[token_val] = 1
if (counter % 100) == 0:
print("{:d} Processed {:d} header files.".format(pid, counter))
fip.close()
save_token_counts(token_counter_map, out_count_file)
return
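The token tally in generate_pe_tokens() can also be kept with collections.Counter, which drops the explicit key test; a sketch of the counting core under the same regex assumptions:
In [ ]:
from collections import Counter

def count_header_tokens(lines, patterns, skip=None):
    # Tally the first capture group of whichever pattern matches each line;
    # lines matching `skip` (e.g. relocation entries) are ignored outright.
    counts = Counter()
    for line in lines:
        line = line.rstrip()
        if skip is not None and skip.match(line):
            continue
        for p in patterns:
            m = p.match(line)
            if m is not None:
                counts[m.group(1)] += 1
                break
    return counts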
In [ ]:
ext_drive = '/opt/vs/hdr/'
file_list = os.listdir(ext_drive)
file_paths = []
for fname in file_list:
file_paths.append(ext_drive + fname)
generate_pe_tokens(file_paths,'data/pe-header-tokens-apt.txt','data/pe-coff-header-token-counts-apt.csv')
In [6]:
# Testing PE header token generation.
def save_token_counts(token_counter_map, out_file_name):
# Output the PE Header token counts.
pid = os.getpid()
out_file = "data/" + str(pid) + "-" + out_file_name
fop = open(out_file, 'w')
csv_wouter = writer(fop)
outlines = []
    sorted_keys = sorted(token_counter_map.keys())  # dict views have no .sort() in Python 3.
counter = 0
for key in sorted_keys:
outlines.append([key, token_counter_map[key]])
counter += 1
if (counter % 100) == 0: # write out some lines
csv_wouter.writerows(outlines)
outlines = []
print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))
# Finish off.
if (len(outlines) > 0):
csv_wouter.writerows(outlines)
outlines = []
print("Completed writing {:d} tokens.".format(len(sorted_keys)))
fop.close()
return
def get_token_count_map(token_df):
# Read in the token count file and create a dict.
token_dict = {}
type_y = np.array(token_df['token_name'])
for idx in range(token_df.shape[0]): # First fill the dict with the token counts
token_dict[token_df.iloc[idx,0]] = token_df.iloc[idx,1]
return token_dict
def generate_pe_tokens(mp_params):
# Parse a bunch of PE/COFF headers dumped by objdump and extract
# section names, import DLLs, import functions and exported functions.
file_list = mp_params.file_list
out_count_file = mp_params.count_file
    psections = re.compile(r'\s+\d{1,2}\s+(\.\w+|\w+)\s+\d+') # Pattern for section names.
    pdlls = re.compile(r'\s+DLL Name: (\w+)') # Pattern for import DLL names.
    pfunctions = re.compile(r'\s+\w+\s+\d{1,4}\s+(.+)') # Pattern for import function names.
    preloc = re.compile(r'\s+reloc') # Pattern for relocation entries.
    pexports = re.compile(r'\s+\[\s*\d+\]\s+(\w+)') # Pattern for exported function names.
token_counter_map = {}
counter = 0
pid = os.getpid()
for idx, fname in enumerate(file_list):
fip = open(fname, 'r')
in_lines = fip.readlines()
counter += 1
for line in in_lines:
line = line.rstrip() # get rid of newlines they are annoying.
token_val = ""
m = preloc.match(line)
if m != None:
#token_val = m.group(2)
continue
m = psections.match(line)
if m != None:
token_val = m.group(1)
#print("Section: {:s}".format(token_val))
else:
m = pdlls.match(line)
if m != None:
token_val = m.group(1)
else:
m = pfunctions.match(line)
if m != None:
token_val = m.group(1)
else:
m = pexports.match(line)
if m != None:
token_val = m.group(1)
#print("Export: {:s}".format(token_val))
else:
continue
# Count the token type.
            if token_val in token_counter_map:
token_counter_map[token_val] += 1
else:
token_counter_map[token_val] = 1
if (counter % 100) == 0:
print("{:d} Processed {:d} header files.".format(pid, counter))
fip.close()
save_token_counts(token_counter_map, out_count_file)
return
def save_combine(token_counter_map, out_file_name):
# Save the combined token counts.
out_file = "data/" + out_file_name
fop = open(out_file, 'w')
csv_wouter = writer(fop)
cols = ['token_name','count']
csv_wouter.writerow(cols)
outlines = []
    sorted_keys = sorted(token_counter_map.keys())  # dict views have no .sort() in Python 3.
counter = 0
for key in sorted_keys:
outlines.append([key, token_counter_map[key]])
counter += 1
if (counter % 100) == 0: # write out some lines
csv_wouter.writerows(outlines)
outlines = []
print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))
# Finish off.
if (len(outlines) > 0):
csv_wouter.writerows(outlines)
outlines = []
fop.close()
print("Completed writing {:d} tokens.".format(len(sorted_keys)))
return
def combine_token_files(token_file, count_file):
# Function to combine the newly generated token files into one file:
# 1. list data directory
    # 2. For each file in file list that matches (\d{3,5}-<count_file>)
    # 3. Trim the filenames if necessary (should remove VirusShare_ prefix).
    # 4. Concatenate the unsorted token count files.
    # 5. Sort the combined tokens and write to data/<token_file>
    p1 = re.compile(r'\d{3,5}-' + re.escape(count_file)) # This is the PID prefix for each file.
file_list = os.listdir('data/')
counter = 0
token_map = {}
for file_name in file_list:
if p1.match(file_name):
fip = open('data/' + file_name, 'r')
in_lines = fip.readlines()
for line in in_lines:
tokens = line.split(',')
                if tokens[0] not in token_map:
token_map[tokens[0]] = int(tokens[1])
else:
token_map[tokens[0]] += int(tokens[1])
counter += len(in_lines)
fip.close()
save_combine(token_map, token_file)
print('Completed combine of {:d} PE/COFF header tokens.'.format(counter))
return
class Multi_Params(object):
def __init__(self, tokenfile="", countfile="", filelist=[]):
self.token_file = tokenfile
self.count_file = countfile
self.file_list = filelist
In [3]:
token_file = 'pe-header-tokens-apt.txt'
count_file = 'pe-header-token-counts-apt.csv'
ext_drive = '/opt/vs/apthdr/'
file_list = os.listdir(ext_drive)
tfiles = []
for fname in file_list:
tfiles.append(ext_drive + fname)
In [ ]:
mp1 = Multi_Params(token_file, count_file, tfiles)
generate_pe_tokens(mp1)
In [7]:
combine_token_files(token_file, count_file)
In [6]:
# Testing PE header feature extraction.
field_list = [ "Characteristics","Time/Date","Magic","MajorLinkerVersion","MinorLinkerVersion",
"SizeOfCode","SizeOfInitializedData","SizeOfUninitializedData","AddressOfEntryPoint",
"BaseOfCode","BaseOfData","ImageBase","SectionAlignment","FileAlignment",
"MajorOSystemVersion","MinorOSystemVersion","MajorImageVersion","MinorImageVersion",
"MajorSubsystemVersion","MinorSubsystemVersion","Win32Version",
"SizeOfImage","SizeOfHeaders","CheckSum","Subsystem","DllCharacteristics","SizeOfStackReserve",
"SizeOfStackCommit","SizeOfHeapReserve","SizeOfHeapCommit","LoaderFlags","NumberOfRvaAndSizes" ]
field_list_len = len(field_list)
ptime = re.compile(r"Time/Date\s+(.+)") # Time/Date pattern for PE Header field.
def get_field_values(header_lines):
field_vals = [0] * field_list_len
for idx1 in range(0,44): # The PE header fields are the first 44 lines of the file.
line = header_lines[idx1].rstrip()
tokens = line.split()
for idx2, field_name in enumerate(field_list):
if field_name in tokens:
if field_name.startswith("Time"):
                    time_match = ptime.match(line) # Match the whole line, not the field name, so the timestamp is captured.
if time_match != None:
time_str = time_match.group(1)
time_s = tm.strptime(time_str, "%a %b %d %H:%M:%S %Y") # Convert time string to epoch int.
time_epoch = tm.mktime(time_s)
else:
time_epoch = 0
field_vals[idx2] = time_epoch
elif len(tokens) > 1:
field_vals[idx2] = int(tokens[1], 16) # Convert the hex value of the field to int.
return field_vals
def count_header_keywords(asm_code, keywords, klen):
keywords_values = [0] * klen
for row in asm_code:
for i in range(klen):
if keywords[i] in row:
keywords_values[i] += 1
break
return keywords_values
def extract_header_features(multi_parameters):
# 1. Get the feature file and token/keyword file names
# 2. Create an array of token/keyword values.
    # 3. Iterate through the PE header file list and count the occurrences of the keywords in each file.
pid = os.getpid()
feature_file = 'data/' + str(pid) + "-" + multi_parameters.out_file
token_file = 'data/' + multi_parameters.token_file
print('Process id: {:d} - Feature file: {:s} - Keyword file: {:s}'.format(pid, feature_file, token_file))
hdr_pd = pd.read_csv(token_file)
tokens = list(hdr_pd['token_name'])
tlen = len(tokens)
    for idx, token in enumerate(tokens): # Clamp the token name length and strip C++ mangling characters, they are annoying.
token = token.replace('@','').replace('$','').replace('?','')
if len(token) > 32:
tokens[idx] = token[:32]
else:
tokens[idx] = token
    asm_files = [i for i in tfiles if '.pe.txt' in i]  # Header dumps; relies on the global tfiles/ext_drive set below.
ftot = len(asm_files)
feature_counts = []
with open(feature_file, 'w') as f:
fw = writer(f)
for idx, fname in enumerate(asm_files):
fasm = open(ext_drive + fname, 'r')
content = fasm.readlines()
fasm.close()
fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.
field_vals = get_field_values(content)
keyword_vals = count_header_keywords(content, tokens, tlen)
feature_counts.append([fname[0:fname.find('.pe.txt')]] + field_vals + keyword_vals)
            # Writing rows after every 1000 files processed
if (idx+1) % 1000 == 0:
print("{:d} - {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
fw.writerows(feature_counts)
feature_counts = []
# Writing remaining features
if len(feature_counts) > 0:
fw.writerows(feature_counts)
feature_counts = []
print("{:d} Completed processing {:d} PE header files.".format(pid, ftot))
return
def combine_feature_files(feature_file_name, token_file):
# Function to combine the newly generated PE header feature files into one file:
# 1. list data directory
    # 2. For each file in file list that matches (\d{3,5}-<feature_file_name>)
    # 3. Trim the filenames if necessary (should remove VirusShare_ prefix).
    # 4. Concatenate the unsorted pe header feature files.
    # 5. Sort and write to data/sorted-<feature_file_name>
hdr_pd = pd.read_csv('data/' + token_file)
tokens = list(hdr_pd['token_name'])
    for idx, token in enumerate(tokens): # Clamp the token name length and strip C++ mangling characters, they are annoying.
token = token.replace('@','').replace('$','').replace('?','')
if len(token) > 32:
tokens[idx] = token[:32]
else:
tokens[idx] = token
fop = open('data/' + feature_file_name,'w')
colnames = "file_name," + ",".join(field_list) + "," + ",".join(tokens) + "\n"
print("Column names: {:s}".format(colnames))
fop.write(colnames)
    p1 = re.compile(r'\d{3,5}-' + re.escape(feature_file_name)) # This is the PID prefix for each file.
file_list = os.listdir('data/')
counter = 0
for file_name in file_list:
if p1.match(file_name):
fip = open('data/' + file_name, 'r')
in_lines = fip.readlines()
fop.writelines(in_lines)
counter += len(in_lines)
fip.close()
fop.close()
features = pd.read_csv('data/' + feature_file_name)
    # DataFrame.sort() is deprecated, but this old version of pandas does not have sort_values().
sorted_features = features.sort('file_name')
sorted_features.to_csv('data/sorted-' + feature_file_name, index=False)
print('Completed combine of {:d} PE header file features.'.format(counter))
return
class Multi_Params(object):
def __init__(self, outfile="", tokenfile="", fieldnames=[], filelist=[]):
self.out_file = outfile
self.token_file = tokenfile
self.field_names = fieldnames
self.file_list = filelist
In [ ]:
header_field_names = 'pe-coff-header-field-names.txt'
out_file = 'pe-header-features-apt.csv'
token_file = 'pe-header-tokens-apt.csv'
ext_drive = '/opt/vs/apthdr/'
tfiles = os.listdir(ext_drive)
mp1 = Multi_Params(out_file, token_file, header_field_names, tfiles)
extract_header_features(mp1)
combine_feature_files(out_file, token_file)
In [7]:
combine_feature_files(out_file, token_file)
In [ ]:
out_file = 'pe-header-features-vs251.csv'
token_file = 'pe-header-tokens-vs251.csv'
combine_feature_files(out_file, token_file)
In [15]:
# Test PE ASM feature extraction.
x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
'eax','ebp','ebx','ecx','edi','esp']
x86_opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'
,'dd','dec','dw','endp','ends','faddp','fchs','fdiv','fdivp','fdivr','fild'
,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
,'wait','xchg','xor']
def count_asm_registers(asm_code):
registers_values = [0]*len(x86_registers) # Need to optimise this init stuff to global vars.
for row in asm_code:
parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
.replace('-',' ').split()
for idx, register in enumerate(x86_registers):
registers_values[idx] += parts.count(register)
return registers_values
def count_asm_opcodes(asm_code):
opcodes_values = [0]*len(x86_opcodes)
for row in asm_code:
parts = row.split()
for idx, opcode in enumerate(x86_opcodes):
if opcode in parts:
opcodes_values[idx] += 1
break
return opcodes_values
def extract_asm_features(multi_param):
pid = os.getpid()
feature_file = 'data/' + str(pid) + "-" + multi_param.temp_file
print('Process id: {:d} feature file: {:s}'.format(pid,feature_file))
# Do this in call graph feature extraction.
#fapi = open("data/APIs.txt")
#defined_apis = fapi.readlines()
#defined_apis = defined_apis[0].split(',')
asm_files = [i for i in tfiles if '.asm' in i]
ftot = len(asm_files)
feature_counts = []
with open(feature_file, 'w') as f:
fw = writer(f)
for idx, fname in enumerate(asm_files):
fasm = open(ext_drive + fname, 'r')
content = fasm.readlines()
fasm.close()
fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.
reg_vals = count_asm_registers(content)
opc_vals = count_asm_opcodes(content)
#api_vals = count_asm_APIs(content, defined_apis) put in Call Graph features.
#sec_vals = count_asm_sections(content) already in PE header features.
#mis_vals = count_asm_misc(content) mostly already in PE header and call graph features.
count_vals = reg_vals + opc_vals # + api_vals + mis_vals
feature_counts.append([fname[:fname.find('.asm')]] + count_vals)
# Writing rows after every 10 files processed
if (idx+1) % 10 == 0:
print("{:d} {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
fw.writerows(feature_counts)
feature_counts = []
# Writing remaining files
if len(feature_counts) > 0:
fw.writerows(feature_counts)
feature_counts = []
return
def combine_asm_files(out_file, temp_file):
# Function to combine the newly generated asm feature files into one file:
# 1. list data directory
    # 2. For each file in file list that matches (\d{3,5}-<temp_file>)
    # 3. Trim the filenames if necessary (should remove VirusShare_ prefix).
    # 4. Concatenate the unsorted asm feature files.
    # 5. Sort and write to data/sorted-<out_file>
fop = open('data/' + out_file,'w')
colnames = "file_name," + ",".join(x86_registers) + "," + ",".join(x86_opcodes) + "\n"
fop.write(colnames)
print("Column names: {:s}".format(colnames))
    p1 = re.compile(r'\d{3,5}-' + re.escape(temp_file)) # This is the PID prefix for each file.
file_list = os.listdir('data/')
counter = 0
for file_name in file_list:
if p1.match(file_name):
fip = open('data/' + file_name, 'r')
in_lines = fip.readlines()
fop.writelines(in_lines)
counter += len(in_lines)
fip.close()
print('Completed combine of {:d} ASM features.'.format(counter))
fop.close()
asms = pd.read_csv('data/' + out_file)
    # DataFrame.sort() is deprecated, but this old version of pandas does not have sort_values().
sorted_asms = asms.sort('file_name')
sorted_asms.to_csv('data/sorted-' + out_file, index=False)
sorted_asms.head(20)
return
class Multi_Params(object):
def __init__(self, featurefile="", tempfile="", filelist=[]):
self.feature_file = featurefile
self.temp_file = tempfile
self.file_list = filelist
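A usage sketch for this Multi_Params version (the directory and file names are assumptions): run one extraction pass, then merge and sort the per-process output.
In [ ]:
ext_drive = '/opt/vs/train1asm/'
tfiles = os.listdir(ext_drive)
mp1 = Multi_Params(featurefile='pe-asm-features.csv',
                   tempfile='pe-asm-features.tmp.csv',
                   filelist=tfiles)
extract_asm_features(mp1)
combine_asm_files(mp1.feature_file, mp1.temp_file)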
In [2]:
# Includes x86 and amd64 registers and opcodes.
x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
'eax','ebp','ebx','ecx','edi','esp','rax','rbx','rcx','rdx','rsi',
'rdi','rbp','rsp','r8','r9','r10','r11','r12','r13','r14','r15']
x86_opcodes = ['mov','movabs','movbe','movsbl','movsbw','movswl','movsbq','movswq','movslq','movsx',
'movsxd','movzb','movzw','movzx','push','pusha','pop','popa','xchg','in','out','lea',
'lds','les','lfs','lgs','lss','clc','cld','cli','clts','cmc','lahf','sahf','pushf',
'popf','stc','std','sti','add','inc','sub','dec','sbb','cmp','test','and','or','xor',
'clr','adc','neg','not','aaa','aas','daa','das','aad','aam','cbw','cdqe','cwde','cwd',
'cdq','cqo','cbtw','cltq','cwtl','cwtd','cltd','cqto','mul','imul','div','idiv',
'rol','ror','rcl','rcr','sal','shl','shr','sar','shld','shrd','call','lcall',
'jmp','ljmp','ret','lret','retf','enter','leave','jo','jno','jb','jc','jnae','jnb',
'jnc','jae','je','jz','jne','jnz','jbe','jna','jnbe','ja','js','jns','jp','jpe',
'jnp','jpo','jl','jnge','jnl','jge','jle','jng','jnle','jg','jcxz','jecxz','jrcxz',
'loop','loopz','loope','loopnz','loopne','seto','setno','setb','setc','setnae',
'setnb','setnc','setae','sete','setz','setne','setnz','setbe','setna','setnbe',
'seta','sets','setns','setp','setpe','setnp','setpo','setl','setnge','setnl',
'setge','setle','setng','setnle','setg','cmps','scmp','ins','outs','lods',
'slod','movs','smov','scas','ssca','stos','ssto','xlat','bsf','bsr','bt',
'btc','btr','bts','int','int3','into','iret','rsm','bound','hlt','nop','arpl',
'lar','lgdt','lidt','lldt','lmsw','lsl','ltr','sgdt','sidt','sldt','smsw','str',
'verr','verw','fld','fild','fildll','fldt','fbld','fst','fist','fstp','fistp',
'fistpll','fstpt','fbstp','fxch','fcom','ficom','fcomp','ficomp','fcompp','fucom',
'fucomp','fucompp','ftst','fxam','fld1','fldl2t','fldl2e','fldpi','fldlg2','fldln2',
'fldz','fadd','fiadd','faddp','fsub','fisub','fsubp','fsubr','fisubr','fsubrp',
'fmul','fimul','fmulp','fdiv','fidiv','fdivp','fdivr','fidivr','fdivrp','f2xm1',
'fyl2x','fptan','fpatan','fxtract','fprem1','fdecstp','fincstp','fprem','fyl2xp1',
'fsqrt','fsincos','frndint','fscale','fsin','fcos','fchs','fabs','fninit','finit',
'fldcw','fnstcw','fstcw','fnstsw','fstsw','fnclex','fclex','fnstenv','fstenv',
'fldenv','fnsave','fsave','frstor','fneni','feni','fndisi','fdisi','fnsetpm',
'fsetpm','frstpm','ffree','ffreep','fnop','fwait','addr16','addr32','aword',
'adword','data16','data32','word','dword','lock','wait','cs','ds','es','fs',
'gs','ss','rep','repe','repz','repne','repnz','ht','hnt','rex','rexz','rexy',
'rexyz','rexx','rexxz','rexxy','rexxyz','rex64','rex64z','rex64y','rex64yz',
'rex64x','rex64xz','rex64xy','rex64xyz','bswap','xadd','cmpxchg','invd','wbinvd',
'invlpg','cpuid','wrmsr','rdtsc','rdmsr','cmpxchg8b','sysenter','sysexit','fxsave',
'fxsave64','fxrstor','fxrstor64','rdpmc','ud2','ud2a','ud1','ud2b','cmovo','cmovno',
'cmovb','cmovc','cmovnae','cmovae','cmovnc','cmovnb','cmove','cmovz','cmovne',
'cmovnz','cmovbe','cmovna','cmova','cmovnbe','cmovs','cmovns','cmovp','cmovnp',
'cmovl','cmovnge','cmovge','cmovnl','cmovle','cmovng','cmovg','cmovnle','cmovpe',
'cmovpo','fcmovb','fcmovnae','fcmove','fcmovbe','fcmovna','fcmovu','fcmovae',
'fcmovnb','fcmovne','fcmova','fcmovnbe','fcmovnu','fcomi','fucomi','fcomip',
'fcompi','fucomip','fucompi','movnti','clflush','lfence','mfence','pause','emms',
'movd','movq','packssdw','packsswb','packuswb','paddb','paddw','paddd','paddq',
'paddsb','paddsw','paddusb','paddusw','pand','pandn','pcmpeqb','pcmpeqw','pcmpeqd',
'pcmpgtb','pcmpgtw','pcmpgtd','pmaddwd','pmulhw','pmullw','por','psllw','pslld',
'psllq','psraw','psrad','psrlw','psrld','psrlq','psubb','psubw','psubd','psubq',
'psubsb','psubsw','psubusb','psubusw','punpckhbw','punpckhwd','punpckhdq',
'punpcklbw','punpcklwd','punpckldq','pxor','addps','addss','andnps','andps',
'cmpeqps','cmpeqss','cmpleps','cmpless','cmpltps','cmpltss','cmpneqps','cmpneqss',
'cmpnleps','cmpnless','cmpnltps','cmpnltss','cmpordps','cmpordss','cmpunordps',
'cmpunordss','cmpps','cmpss','comiss','cvtpi2ps','cvtps2pi','cvtsi2ss','cvtss2si',
'cvttps2pi','cvttss2si','divps','divss','ldmxcsr','maskmovq','maxps','maxss',
'minps','minss','movaps','movhlps','movhps','movlhps','movlps','movmskps','movntps',
'movntq','movntdq','movss','movups','mulps','mulss','orps','pavgb','pavgw','pextrw',
'pinsrw','pmaxsw','pmaxub','pminsw','pminub','pmovmskb','pmulhuw','prefetchnta',
'prefetcht0','prefetcht1','prefetcht2','psadbw','pshufw','rcpps','rcpss','rsqrtps',
'rsqrtss','sfence','shufps','sqrtps','sqrtss','stmxcsr','subps','subss','ucomiss',
'unpckhps','unpcklps','xorps','addpd','addsd','andnpd','andpd','cmpeqpd','cmpeqsd',
'cmplepd','cmplesd','cmpltpd','cmpltsd','cmpneqpd','cmpneqsd','cmpnlepd','cmpnlesd',
'cmpnltpd','cmpnltsd','cmpordpd','cmpordsd','cmpunordpd','cmpunordsd','cmppd',
'cmpsd','comisd','cvtpi2pd','cvtsi2sd','divpd','divsd','maxpd','maxsd','minpd',
'minsd','movapd','movhpd','movlpd','movmskpd','movntpd','movsd','movupd','mulpd',
'mulsd','orpd','shufpd','sqrtpd','sqrtsd','subpd','subsd','ucomisd','unpckhpd',
'unpcklpd','xorpd','cvtdq2pd','cvtpd2dq','cvtdq2ps','cvtpd2pi','cvtpd2ps','cvtps2pd',
'cvtps2dq','cvtsd2si','cvtsd2ss','cvtss2sd','cvttpd2pi','cvttsd2si','cvttpd2dq',
'cvttps2dq','maskmovdqu','movdqa','movdqu','movdq2q','movq2dq','pmuludq','pshufd',
'pshufhw','pshuflw','pslldq','psrldq','punpckhqdq','punpcklqdq','addsubpd','addsubps',
'cmpxchg16b','fisttp','fisttpll','haddpd','haddps','hsubpd','hsubps','lddqu',
'monitor','movddup','movshdup','movsldup','mwait','vmcall','vmclear','vmlaunch',
'vmresume','vmptrld','vmptrst','vmread','vmwrite','vmxoff','vmxon','vmfunc','getsec',
'invept','invvpid','invpcid','phaddw','phaddd','phaddsw','phsubw','phsubd','phsubsw',
'pmaddubsw','pmulhrsw','pshufb','psignb','psignw','psignd','palignr','pabsb','pabsw',
'pabsd','blendpd','blendps','blendvpd','blendvps','dppd','dpps','extractps','insertps',
'movntdqa','mpsadbw','packusdw','pblendvb','pblendw','pcmpeqq','pextrb','pextrd',
'pextrq','phminposuw','pinsrb','pinsrd','pinsrq','pmaxsb','pmaxsd','pmaxud','pmaxuw',
'pminsb','pminsd','pminud','pminuw','pmovsxbw','pmovsxbd','pmovsxbq','pmovsxwd',
'pmovsxwq','pmovsxdq','pmovzxbw','pmovzxbd','pmovzxbq','pmovzxwd','pmovzxwq',
'pmovzxdq','pmuldq','pmulld','ptest','roundpd','roundps','roundsd','roundss',
'pcmpgtq','pcmpestri','pcmpestrm','pcmpistri','pcmpistrm','crc32','xsave',
'xsave64','xrstor','xrstor64','xgetbv','xsetbv','xsaveopt','xsaveopt64','aesdec',
'aesdeclast','aesenc','aesenclast','aesimc','aeskeygenassist','pclmulqdq',
'pclmullqlqdq','pclmulhqlqdq','pclmullqhqdq','pclmulhqhqdq','vaddpd','vaddps',
'vaddsd','vaddss','vaddsubpd','vaddsubps','vandnpd','vandnps','vandpd','vandps',
'vblendpd','vblendps','vblendvpd','vblendvps','vbroadcastf128','vbroadcastsd',
'vbroadcastss','vcmpeq_ospd','vcmpeq_osps','vcmpeq_ossd','vcmpeq_osss','vcmpeqpd',
'vcmpeqps','vcmpeqsd','vcmpeqss','vcmpeq_uqpd','vcmpeq_uqps','vcmpeq_uqsd',
'vcmpeq_uqss','vcmpeq_uspd','vcmpeq_usps','vcmpeq_ussd','vcmpeq_usss','vcmpfalse_ospd',
'vcmpfalse_osps','vcmpfalse_ossd','vcmpfalse_osss','vcmpfalsepd','vcmpfalseps',
'vcmpfalsesd','vcmpfalsess','vcmpge_oqpd','vcmpge_oqps','vcmpge_oqsd','vcmpge_oqss',
'vcmpgepd','vcmpgeps','vcmpgesd','vcmpgess','vcmpgt_oqpd','vcmpgt_oqps',
'vcmpgt_oqsd','vcmpgt_oqss','vcmpgtpd','vcmpgtps','vcmpgtsd','vcmpgtss',
'vcmple_oqpd','vcmple_oqps','vcmple_oqsd','vcmple_oqss','vcmplepd','vcmpleps',
'vcmplesd','vcmpless','vcmplt_oqpd','vcmplt_oqps','vcmplt_oqsd','vcmplt_oqss',
'vcmpltpd','vcmpltps','vcmpltsd','vcmpltss','vcmpneq_oqpd','vcmpneq_oqps',
'vcmpneq_oqsd','vcmpneq_oqss','vcmpneq_ospd','vcmpneq_osps','vcmpneq_ossd',
'vcmpneq_osss','vcmpneqpd','vcmpneqps','vcmpneqsd','vcmpneqss','vcmpneq_uspd',
'vcmpneq_usps','vcmpneq_ussd','vcmpneq_usss','vcmpngepd','vcmpngeps','vcmpngesd',
'vcmpngess','vcmpnge_uqpd','vcmpnge_uqps','vcmpnge_uqsd','vcmpnge_uqss','vcmpngtpd',
'vcmpngtps','vcmpngtsd','vcmpngtss','vcmpngt_uqpd','vcmpngt_uqps','vcmpngt_uqsd',
'vcmpngt_uqss','vcmpnlepd','vcmpnleps','vcmpnlesd','vcmpnless','vcmpnle_uqpd',
'vcmpnle_uqps','vcmpnle_uqsd','vcmpnle_uqss','vcmpnltpd','vcmpnltps','vcmpnltsd',
'vcmpnltss','vcmpnlt_uqpd','vcmpnlt_uqps','vcmpnlt_uqsd','vcmpnlt_uqss','vcmpordpd',
'vcmpordps','vcmpordsd','vcmpord_spd','vcmpord_sps','vcmpordss','vcmpord_ssd',
'vcmpord_sss','vcmppd','vcmpps','vcmpsd','vcmpss','vcmptruepd','vcmptrueps',
'vcmptruesd','vcmptruess','vcmptrue_uspd','vcmptrue_usps','vcmptrue_ussd',
'vcmptrue_usss','vcmpunordpd','vcmpunordps','vcmpunordsd','vcmpunord_spd',
'vcmpunord_sps','vcmpunordss','vcmpunord_ssd','vcmpunord_sss','vcomisd',
'vcomiss','vcvtdq2pd','vcvtdq2ps','vcvtpd2dq','vcvtpd2dqx','vcvtpd2dqy','vcvtpd2ps',
'vcvtpd2psx','vcvtpd2psy','vcvtps2dq','vcvtps2pd','vcvtsd2si','vcvtsd2ss',
'vcvtsi2sd','vcvtsi2ss','vcvtss2sd','vcvtss2si','vcvttpd2dq','vcvttpd2dqx',
'vcvttpd2dqy','vcvttps2dq','vcvttsd2si','vcvttss2si','vdivpd','vdivps','vdivsd',
'vdivss','vdppd','vdpps','vextractf128','vextractps','vhaddpd','vhaddps',
'vhsubpd','vhsubps','vinsertf128','vinsertps','vlddqu','vldmxcsr','vmaskmovdqu',
'vmaskmovpd','vmaskmovps','vmaxpd','vmaxps','vmaxsd','vmaxss','vminpd','vminps',
'vminsd','vminss','vmovapd','vmovaps','vmovd','vmovddup','vmovdqa','vmovdqu',
'vmovhlps','vmovhpd','vmovhps','vmovlhps','vmovlpd','vmovlps','vmovmskpd','vmovmskps',
'vmovntdq','vmovntdqa','vmovntpd','vmovntps','vmovq','vmovsd','vmovshdup',
'vmovsldup','vmovss','vmovupd','vmovups','vmpsadbw','vmulpd','vmulps','vmulsd',
'vmulss','vorpd','vorps','vpabsb','vpabsd','vpabsw','vpackssdw','vpacksswb',
'vpackusdw','vpackuswb','vpaddsb','vpaddsw','vpaddb','vpaddd','vpaddq','vpaddw',
'vpaddusb','vpaddusw','vpalignr','vpand','vpandn','vpavgb','vpavgw','vpblendvb',
'vpblendw','vpcmpeqb','vpcmpeqd','vpcmpeqq','vpcmpeqw','vpcmpestri','vpcmpestrm',
'vpcmpgtb','vpcmpgtd','vpcmpgtq','vpcmpgtw','vpcmpistri','vpcmpistrm','vperm2f128',
'vpermilpd','vpermilps','vpextrb','vpextrd','vpextrq','vpextrw','vphaddd','vphaddsw',
'vphaddw','vphminposuw','vphsubd','vphsubsw','vphsubw','vpinsrb','vpinsrd','vpinsrq',
'vpinsrw','vpmaddubsw','vpmaddwd','vpmaxsb','vpmaxsd','vpmaxsw','vpmaxub','vpmaxud',
'vpmaxuw','vpminsb','vpminsd','vpminsw','vpminub','vpminud','vpminuw','vpmovmskb',
'vpmovsxbd','vpmovsxbq','vpmovsxbw','vpmovsxdq','vpmovsxwd','vpmovsxwq','vpmovzxbd',
'vpmovzxbq','vpmovzxbw','vpmovzxdq','vpmovzxwd','vpmovzxwq','vpmuldq','vpmulhrsw',
'vpmulhuw','vpmulhw','vpmulld','vpmullw','vpmuludq','vpor','vpsadbw','vpshufb',
'vpshufd','vpshufhw','vpshuflw','vpsignb','vpsignd','vpsignw','vpslld','vpslldq',
'vpsllq','vpsllw','vpsrad','vpsraw','vpsrld','vpsrldq','vpsrlq','vpsrlw','vpsubb',
'vpsubd','vpsubq','vpsubsb','vpsubsw','vpsubusb','vpsubusw','vpsubw','vptest',
'vpunpckhbw','vpunpckhdq','vpunpckhqdq','vpunpckhwd','vpunpcklbw','vpunpckldq',
'vpunpcklqdq','vpunpcklwd','vpxor','vrcpps','vrcpss','vroundpd','vroundps',
'vroundsd','vroundss','vrsqrtps','vrsqrtss','vshufpd','vshufps','vsqrtpd','vsqrtps',
'vsqrtsd','vsqrtss','vstmxcsr','vsubpd','vsubps','vsubsd','vsubss','vtestpd',
'vtestps','vucomisd','vucomiss','vunpckhpd','vunpckhps','vunpcklpd','vunpcklps',
'vxorpd','vxorps','vzeroall','vzeroupper','vbroadcasti128','vpblendd','vpbroadcastb',
'vpbroadcastd','vpbroadcastq','vpbroadcastw','vperm2i128','vpermd','vpermpd',
'vpermps','vpermq','vextracti128','vinserti128','vpmaskmovd','vpmaskmovq','vpsllvd',
'vpsllvq','vpsravd','vpsrlvd','vpsrlvq','vgatherdpd','vgatherdps','vgatherqpd',
'vgatherqps','vpgatherdd','vpgatherdq','vpgatherqd','vpgatherqq','vaesdec','vaesdeclast',
'vaesenc','vaesenclast','vaesimc','vaeskeygenassist','vpclmulqdq','vpclmullqlqdq',
'vpclmulhqlqdq','vpclmullqhqdq','vpclmulhqhqdq','rdfsbase','rdgsbase','rdrand',
'wrfsbase','wrgsbase','vcvtph2ps','vcvtps2ph','vfmadd132pd','vfmadd132ps','vfmadd213pd',
'vfmadd213ps','vfmadd231pd','vfmadd231ps','vfmadd132sd','vfmadd132ss','vfmadd213sd',
'vfmadd213ss','vfmadd231sd','vfmadd231ss','vfmaddsub132pd','vfmaddsub132ps',
'vfmaddsub213pd','vfmaddsub213ps','vfmaddsub231pd','vfmaddsub231ps','vfmsubadd132pd',
'vfmsubadd132ps','vfmsubadd213pd','vfmsubadd213ps','vfmsubadd231pd','vfmsubadd231ps',
'vfmsub132pd','vfmsub132ps','vfmsub213pd','vfmsub213ps','vfmsub231pd','vfmsub231ps',
'vfmsub132sd','vfmsub132ss','vfmsub213sd','vfmsub213ss','vfmsub231sd','vfmsub231ss',
'vfnmadd132pd','vfnmadd132ps','vfnmadd213pd','vfnmadd213ps','vfnmadd231pd',
'vfnmadd231ps','vfnmadd132sd','vfnmadd132ss','vfnmadd213sd','vfnmadd213ss',
'vfnmadd231sd','vfnmadd231ss','vfnmsub132pd','vfnmsub132ps','vfnmsub213pd',
'vfnmsub213ps','vfnmsub231pd','vfnmsub231ps','vfnmsub132sd','vfnmsub132ss',
'vfnmsub213sd','vfnmsub213ss','vfnmsub231sd','vfnmsub231ss','xacquire','xrelease',
'xabort','xbegin','xend','xtest','bzhi','mulx','pdep','pext','rorx','sarx','shlx',
'shrx','vfmaddpd','vfmaddps','vfmaddsd','vfmaddss','vfmaddsubpd','vfmaddsubps',
'vfmsubaddpd','vfmsubaddps','vfmsubpd','vfmsubps','vfmsubsd','vfmsubss','vfnmaddpd',
'vfnmaddps','vfnmaddsd','vfnmaddss','vfnmsubpd','vfnmsubps','vfnmsubsd','vfnmsubss',
'vfrczpd','vfrczps','vfrczsd','vfrczss','vpcmov','vpcomb','vpcomd','vpcomq',
'vpcomub','vpcomud','vpcomuq','vpcomuw','vpcomw','vpermil2pd','vpermil2ps',
'vpcomltb','vpcomltd','vpcomltq','vpcomltub','vpcomltud','vpcomltuq','vpcomltuw',
'vpcomltw','vpcomleb','vpcomled','vpcomleq','vpcomleub','vpcomleud','vpcomleuq',
'vpcomleuw','vpcomlew','vpcomgtb','vpcomgtd','vpcomgtq','vpcomgtub','vpcomgtud',
'vpcomgtuq','vpcomgtuw','vpcomgtw','vpcomgeb','vpcomged','vpcomgeq','vpcomgeub',
'vpcomgeud','vpcomgeuq','vpcomgeuw','vpcomgew','vpcomeqb','vpcomeqd','vpcomeqq',
'vpcomequb','vpcomequd','vpcomequq','vpcomequw','vpcomeqw','vpcomneqb','vpcomneqd',
'vpcomneqq','vpcomnequb','vpcomnequd','vpcomnequq','vpcomnequw','vpcomneqw',
'vpcomfalseb','vpcomfalsed','vpcomfalseq','vpcomfalseub','vpcomfalseud',
'vpcomfalseuq','vpcomfalseuw','vpcomfalsew','vpcomtrueb','vpcomtrued','vpcomtrueq',
'vpcomtrueub','vpcomtrueud','vpcomtrueuq','vpcomtrueuw','vpcomtruew','vphaddbd',
'vphaddbq','vphaddbw','vphadddq','vphaddubd','vphaddubq','vphaddubw','vphaddudq',
'vphadduwd','vphadduwq','vphaddwd','vphaddwq','vphsubbw','vphsubdq','vphsubwd',
'vpmacsdd','vpmacsdqh','vpmacsdql','vpmacssdd','vpmacssdqh','vpmacssdql',
'vpmacsswd','vpmacssww','vpmacswd','vpmacsww','vpmadcsswd','vpmadcswd','vpperm',
'vprotb','vprotd','vprotq','vprotw','vpshab','vpshad','vpshaq','vpshaw','vpshlb',
'vpshld','vpshlq','vpshlw','llwpcb','slwpcb','lwpval','lwpins','andn','bextr','blsi',
'blsmsk','blsr','tzcnt','blcfill','blci','blcic','blcmsk','blcs','blsfill','blsic',
't1mskc','tzmsk','prefetch','prefetchw','femms','pavgusb','pf2id','pf2iw','pfacc',
'pfadd','pfcmpeq','pfcmpge','pfcmpgt','pfmax','pfmin','pfmul','pfnacc','pfpnacc',
'pfrcp','pfrcpit1','pfrcpit2','pfrsqit1','pfrsqrt','pfsub','pfsubr','pi2fd','pi2fw',
'pmulhrw','pswapd','syscall','sysret','swapgs','rdtscp','clgi','invlpga','skinit',
'stgi','vmload','vmmcall','vmrun','vmsave','movntsd','movntss','extrq','insertq',
'popcnt','lzcnt','xstore','xcrypt','montmul','xsha1','xsha256','xstorerng',
'xcryptecb','xcryptcbc','xcryptctr','xcryptcfb','xcryptofb','xstore','adcx','adox',
'rdseed','clac','stac','bnd','bndmk','bndmov','bndcl','bndcu','bndcn','bndstx',
'bndldx','sha1rnds4','sha1nexte','sha1msg1','sha1msg2','sha256rnds2','sha256msg1','sha256msg2',
'kandnw','kandw','korw','kxnorw','kxorw','kmovw','knotw','kortestw','kshiftlw',
'kshiftrw','kunpckbw','valignd','vpternlogd','valignq','vpternlogq','vblendmpd',
'vpblendmq','vpermi2pd','vpermi2q','vpermt2pd','vpermt2q','vpmaxsq','vpmaxuq',
'vpminsq','vpminuq','vprolvq','vprorvq','vpsravq','vblendmps','vpblendmd',
'vpermi2d','vpermi2ps','vpermt2d','vpermt2ps','vprolvd','vprorvd','vbroadcastf32x4',
'vbroadcasti32x4','vbroadcastf64x4','vbroadcasti64x4','vcmpeq_oqpd','vcmpfalse_oqpd',
'vcmpge_ospd','vcmpgt_ospd','vcmple_ospd','vcmplt_ospd','vcmpneq_uqpd','vcmpnge_uspd',
'vcmpngt_uspd','vcmpnle_uspd','vcmpnlt_uspd','vcmpord_qpd','vcmptrue_uqpd',
'vcmpunord_qpd','vcmpeq_oqps','vcmpfalse_oqps','vcmpge_osps','vcmpgt_osps',
'vcmple_osps','vcmplt_osps','vcmpneq_uqps','vcmpnge_usps','vcmpngt_usps',
'vcmpnle_usps','vcmpnlt_usps','vcmpord_qps','vcmptrue_uqps','vcmpunord_qps',
'vcmpeq_oqsd','vcmpfalse_oqsd','vcmpge_ossd','vcmpgt_ossd','vcmple_ossd','vcmplt_ossd',
'vcmpneq_uqsd','vcmpnge_ussd','vcmpngt_ussd','vcmpnle_ussd','vcmpnlt_ussd',
'vcmpord_qsd','vcmptrue_uqsd','vcmpunord_qsd','vcmpeq_oqss','vcmpfalse_oqss',
'vcmpge_osss','vcmpgt_osss','vcmple_osss','vcmplt_osss','vcmpneq_uqss',
'vcmpnge_usss','vcmpngt_usss','vcmpnle_usss','vcmpnlt_usss','vcmpord_qss',
'vcmptrue_uqss','vcmpunord_qss','vcompresspd','vpcompressq','vpscatterdq',
'vpscatterqq','vscatterdpd','vscatterqpd','vcompressps','vpcompressd','vpscatterdd',
'vscatterdps','vcvtudq2pd','vcvtps2udq','vcvtpd2udq','vcvtsd2usi','vcvtusi2sd',
'vcvtusi2ss','vcvtss2usi','vcvttpd2udq','vcvttps2udq','vcvttsd2usi','vcvttss2usi',
'vcvtudq2ps','vexpandpd','vpexpandq','vexpandps','vpexpandd','vextractf32x4',
'vextracti32x4','vextractf64x4','vextracti64x4','vfixupimmpd','vfixupimmps',
'vfixupimmsd','vgetmantsd','vrndscalesd','vfixupimmss','vgetmantss','vrndscaless',
'vscalefpd','vscalefps','vscalefsd','vscalefss','vgetexppd','vgetexpps',
'vgetexpsd','vgetexpss','vgetmantpd','vrndscalepd','vgetmantps','vrndscaleps',
'vinsertf32x4','vinserti32x4','vinsertf64x4','vinserti64x4','vmovdqa64',
'vmovdqa32','vmovdqu32','vmovdqu64','vrcp14ps','vrsqrt14ps','vpabsq',
'vrcp14pd','vrsqrt14pd','vpandd','vpandnd','vpord','vpxord','vpandnq',
'vpandq','vporq','vpxorq','vpcmpd','vpcmpled','vpcmpltd','vpcmpneqd',
'vpcmpnled','vpcmpnltd','vpcmpud','vpcmpequd','vpcmpleud','vpcmpltud',
'vpcmpnequd','vpcmpnleud','vpcmpnltud','vpcmpq','vpcmpleq','vpcmpltq',
'vpcmpneqq','vpcmpnleq','vpcmpnltq','vpcmpuq','vpcmpequq','vpcmpleuq',
'vpcmpltuq','vpcmpnequq','vpcmpnleuq','vpcmpnltuq','vptestmq','vpmovdb',
'vpmovsdb','vpmovusdb','vpmovdw','vpmovsdw','vpmovusdw','vpmovqb','vpmovsqb',
'vpmovusqb','vpmovqd','vpmovsqd','vpmovusqd','vpmovqw','vpmovsqw','vpmovusqw',
'vprold','vprord','vprolq','vprorq','vpscatterqd','vscatterqps','vpsraq',
'vptestmd','vrcp14sd','vrsqrt14sd','vrcp14ss','vrsqrt14ss','vshuff32x4',
'vshufi32x4','vshuff64x2','vshufi64x2','vptestnmd','vptestnmq','vpbroadcastmb2q',
'vpbroadcastmw2d','vpconflictd','vpconflictq','vplzcntd','vplzcntq','vexp2pd',
'vexp2ps','vrcp28pd','vrsqrt28pd','vrcp28ps','vrsqrt28ps','vrcp28sd',
'vrsqrt28sd','vrcp28ss','vrsqrt28ss','vgatherpf0dpd','vgatherpf0qpd',
'vgatherpf1dpd','vgatherpf1qpd','vscatterpf0dpd','vscatterpf0qpd',
'vscatterpf1dpd','vscatterpf1qpd','vgatherpf0dps','vgatherpf0qps',
'vgatherpf1dps','vgatherpf1qps','vscatterpf0dps','vscatterpf0qps',
'vscatterpf1dps','vscatterpf1qps','prefetchwt1','clflushopt','xrstors',
'xrstors64','xsaves','xsaves64','xsavec','xsavec64','encls','enclu',
'vcvtpd2udqx','vcvtpd2udqy','vcvttpd2udqx','vcvttpd2udqy','kaddd','kandd',
'kandnd','kmovd','knotd','kord','kortestd','ktestd','kxnord','kxord','kaddq',
'kandnq','kandq','kmovq','knotq','korq','kortestq','ktestq','kunpckdq',
'kunpckwd','kxnorq','kxorq','kshiftld','kshiftlq','kshiftrd','kshiftrq',
'vdbpsadbw','vmovdqu16','vmovdqu8','vpblendmb','vpblendmw','vpermi2w',
'vpermt2w','vpermw','vpsllvw','vpsravw','vpsrlvw','vpcmpb','vpcmpub',
'vpcmpuw','vpcmpw','vpmovb2m','vpmovm2b','vpmovm2w','vpmovswb','vpmovuswb',
'vpmovwb','vpmovw2m','vptestmb','vptestmw','vptestnmb','vptestnmw','kaddb',
'kandb','kandnb','kmovb','knotb','korb','kortestb','ktestb','kxnorb','kxorb',
'kaddw','ktestw','kshiftlb','kshiftrb','vbroadcastf32x2','vbroadcastf32x8',
'vbroadcasti32x2','vbroadcasti32x8','vbroadcastf64x2','vbroadcasti64x2',
'vcvtpd2qq','vcvtpd2uqq','vcvtps2qq','vcvtps2uqq','vcvtqq2pd','vcvtuqq2pd',
'vcvtqq2ps','vcvtqq2psx','vcvtqq2psy','vcvttpd2qq','vcvttpd2uqq','vcvttps2qq',
'vcvttps2uqq','vcvtuqq2ps','vcvtuqq2psx','vcvtuqq2psy','vextractf32x8',
'vextracti32x8','vinsertf32x8','vinserti32x8','vfpclassss','vextractf64x2',
'vextracti64x2','vfpclasssd','vinsertf64x2','vinserti64x2','vfpclasspd',
'vfpclasspdz','vfpclasspdx','vfpclasspdy','vfpclassps','vfpclasspsz','vfpclasspsx',
'vfpclasspsy','vpmovd2m','vpmovm2d','vpmovm2q','vpmovq2m','vpmullq','vrangepd',
'vreducepd','vrangeps','vreduceps','vrangesd','vreducesd','vrangess','vreducess',
'clwb','pcommit','vpmadd52huq','vpmadd52luq','vpmultishiftqb','vpermb','vpermi2b',
'vpermt2b','clzero','monitorx','mwaitx','rdpkru','wrpkru','rdpid']
def count_asm_symbols(asm_code):
symbols = [0]*7
for row in asm_code:
if '*' in row:
symbols[0] += 1
if '-' in row:
symbols[1] += 1
if '+' in row:
symbols[2] += 1
if '[' in row:
symbols[3] += 1
if ']' in row:
symbols[4] += 1
if '@' in row:
symbols[5] += 1
if '?' in row:
symbols[6] += 1
return symbols
def count_asm_APIs(asm_code, apis):
apis_values = [0]*len(apis)
for row in asm_code:
for i in range(len(apis)):
if apis[i] in row:
apis_values[i] += 1
break # count at most one API match per line
return apis_values
def count_asm_misc(asm_code):
keywords_values = [0]*len(keywords) # uses the globally defined keywords list
for row in asm_code:
for i in range(len(keywords)):
if keywords[i] in row:
keywords_values[i] += 1
break
return keywords_values
def count_asm_registers(asm_code):
registers_values = [0]*len(x86_registers) # Need to optimise this init stuff to global vars.
for row in asm_code:
parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
.replace('-',' ').split()
for idx, register in enumerate(x86_registers):
registers_values[idx] += parts.count(register)
return registers_values
def count_asm_opcodes(asm_code):
opcodes_values = [0]*len(x86_opcodes)
for row in asm_code:
parts = row.split()
for idx, opcode in enumerate(x86_opcodes):
if opcode in parts:
opcodes_values[idx] += 1
break # count at most one opcode match per line
return opcodes_values
def extract_asm_features(multi_param):
pid = os.getpid()
feature_file = 'data/' + str(pid) + "-" + multi_param.temp_file
ext_drive = multi_param.ext_drive
lmsg = 'Process id: {:d} feature file: {:s}'.format(pid, feature_file)
print(lmsg)
# Do this in call graph feature extraction.
#fapi = open("data/APIs.txt")
#defined_apis = fapi.readlines()
#defined_apis = defined_apis[0].split(',')
flog = open("data/" + str(pid) + "-pe-asm-log.txt", "w")
flog.write(lmsg + "\n")
asm_files = [i for i in multi_param.file_list if '.pe.asm' in i] # use the file list passed in, not the tfiles global
ftot = len(asm_files)
feature_counts = []
with open(feature_file, 'w') as f:
fw = writer(f)
for idx, fname in enumerate(asm_files):
fasm = open(ext_drive + fname, 'r')
content = fasm.readlines()
fasm.close()
fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.
reg_vals = count_asm_registers(content)
opc_vals = count_asm_opcodes(content)
#api_vals = count_asm_APIs(content, defined_apis) put in Call Graph features.
#sec_vals = count_asm_sections(content) already in PE header features.
#mis_vals = count_asm_misc(content) mostly already in PE header and call graph features.
count_vals = reg_vals + opc_vals # + api_vals + mis_vals
feature_counts.append([fname[:fname.find('.pe.asm')]] + count_vals) # strip the full .pe.asm extension
# Writing rows after every 10 files processed
if (idx+1) % 10 == 0:
lmsg = "{:d} {:d} of {:d} files processed.".format(pid, idx + 1, ftot)
print(lmsg)
flog.write(lmsg + "\n")
fw.writerows(feature_counts)
feature_counts = []
# Writing remaining files
if len(feature_counts) > 0:
fw.writerows(feature_counts)
feature_counts = []
lmsg = "{:d} Completed processing {:d} PE ASM files.".format(pid, ftot)
print(lmsg)
flog.write(lmsg + "\n")
flog.close()
return
def combine_asm_files(out_file, temp_file):
# Function to combine the newly generated asm feature files into one file:
# 1. list data directory
# 2. For each file in file list that matches (\d\d\d\d-asm-features.csv)
# 3. Trim the filenames if necessary (should remove VirusShare_ prefix).
# 4. Concatenate the unsorted asm feature files.
# 5. Sort and write to data/sorted-asm-features.csv
fop = open('data/' + out_file, 'w')
colnames = "file_name," + ",".join(x86_registers) + "," + ",".join(x86_opcodes) + "\n"
fop.write(colnames)
print("Column names: {:s}".format(colnames))
p1 = re.compile('\d{3,5}-' + temp_file) # This is the PID prefix for each file.
file_list = os.listdir('data/')
counter = 0
for file_name in file_list:
if p1.match(file_name):
fip = open('data/' + file_name, 'r')
in_lines = fip.readlines()
fop.writelines(in_lines)
counter += len(in_lines)
fip.close()
print('Completed combine of {:d} ASM features.'.format(counter))
fop.close()
asms = pd.read_csv('data/' + out_file)
# DataFrame.sort() is deprecated, but this old version of pandas does not have sort_values().
# On pandas >= 0.17 this would be: sorted_asms = asms.sort_values('file_name')
sorted_asms = asms.sort('file_name')
sorted_asms.to_csv('data/sorted-' + out_file, index=False)
sorted_asms.head(20)
return
class Multi_Params(object):
def __init__(self, featurefile="", tempfile="", extdrive="", filelist=None):
self.feature_file = featurefile
self.temp_file = tempfile
self.ext_drive = extdrive
self.file_list = filelist if filelist is not None else [] # avoid a shared mutable default argument
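As a quick sanity check, the counter functions can be exercised on a few hand-written ASM rows. This is a minimal sketch: the sample lines are hypothetical rather than taken from a real IDA Pro listing, and it assumes the x86_registers and x86_opcodes cells further down have already been run.
In [ ]:
sample_asm = ['mov eax, [ebp+8]',
              'add eax, ebx',
              'call ds:GetProcAddress ; resolve?']
print(count_asm_symbols(sample_asm))    # counts of * - + [ ] @ ? per row
print(count_asm_registers(sample_asm))  # per-register occurrence counts
print(count_asm_opcodes(sample_asm))    # first matching opcode per row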
In [ ]:
out_file = 'pe-asm-features-apt.csv'
temp_file = 'pe-asm-temp-apt.csv'
ext_drive = '/opt/vs/asm/'
tfiles = os.listdir(ext_drive)
mp1 = Multi_Params(out_file, temp_file, ext_drive, tfiles)
extract_asm_features(mp1)
combine_asm_files(out_file, temp_file)
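Multi_Params packs everything a worker needs into the single argument that Pool.map can pass, so the extraction above can also be fanned out across processes. A minimal sketch; the run_parallel helper and the four-process count are assumptions, not part of the original pipeline.
In [ ]:
# Parallel version of the extraction above (sketch). Each worker writes
# its own PID-prefixed temp file, which combine_asm_files() then merges.
def run_parallel(tfiles, temp_file, ext_drive, n_procs=4):
    chunk = int(math.ceil(len(tfiles) / float(n_procs)))
    params = [Multi_Params(tempfile=temp_file, extdrive=ext_drive,
                           filelist=tfiles[i:i + chunk])
              for i in range(0, len(tfiles), chunk)]
    pool = Pool(n_procs)
    pool.map(extract_asm_features, params)
    pool.close()
    pool.join()
run_parallel(tfiles, temp_file, ext_drive)
combine_asm_files(out_file, temp_file)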
In [3]:
# find out what is going on with the IDA Pro disassembly.
fip = open('/opt/vs/asm/filetypes.txt', 'r')
lines = fip.readlines()
dir_list = os.listdir('/opt/vs/asm/')
file_list = []
counter = 0
for line in lines:
tokens = line.split(':')
file_name = tokens[0]
counter += 1
file_list.append(file_name)
print("PE32 files: {:d} directory files: {:d}".format(counter, len(dir_list)))
for fname in dir_list:
if fname not in file_list:
print("File not in PE32 list: {:s}".format(fname))
In [6]:
def get_unpacked_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
# Load the malware packer id features and file id features from the sample set.
packer_id_features = pd.read_csv(packer_id_feature_file)
file_id_features = pd.read_csv(file_id_feature_file)
trid_id_features = pd.read_csv(trid_id_feature_file)
# Get a list of unpacked PE files that are not .NET CIL format.
# IDA Pro cannot disassemble .NET files, have to use ildasm.exe from Visual Studio.
unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
not_dot_net = []
counter = 0
dot_net_counter = 0
# Get the trid and file rows that are for unpacked PE files.
trids = trid_id_features[trid_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
fids = file_id_features[file_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
# Iterate over the unpacked PE file list and check if each is a .NET file.
# If not a .NET file then add to file list.
pe_names_list = unpacked_pe_files['file_name']
for idx, file_name in enumerate(pe_names_list):
trid_name = trids.iloc[idx, 1]
fid_name = fids.iloc[idx, 1]
trid_name = trid_name.lower()
fid_name = fid_name.lower()
if trid_name.find('.net') > -1 or fid_name.find('.net') > -1:
#print('Found: {:s} - {:s}'.format(trid_name, fid_name))
dot_net_counter += 1
continue
#print('Found: {:s} - {:s}'.format(trid_name, fid_name))
not_dot_net.append(file_name)
counter += 1
file_list = []
write_list = []
counter = 0
# Iterate over the file list and prepend the full file name.
for file_name in not_dot_net:
full_name = "VirusShare_" + file_name
file_list.append(full_name)
write_list.append(full_name + "\n")
counter += 1
if (len(file_list) > 0):
fop = open('data/temp-unpacked-pe-non-dot-net.txt','w')
fop.writelines(write_list)
fop.close()
print("Got {:d} unpacked PE filenames and {:d} .NET filenames.".format(counter, dot_net_counter))
return file_list
In [8]:
packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'
ext_drive = '/opt/vs/train1/'
unflist = get_unpacked_file_list(packer_id_file, file_id_file, trid_id_file)
file_list = []
completed_list = os.listdir('/opt/vs/train1asm/')
print("Got {:d} completed ASM files.".format(len(completed_list)))
for idx, fname in enumerate(completed_list):
completed_list[idx] = fname[0:fname.find(".asm")]
for idx, fname in enumerate(unflist):
if fname not in completed_list:
file_list.append(ext_drive + fname)
print("Processing {:d} files out of {:d} total unpacked PE files.".format(len(file_list), len(unflist)))
In [9]:
33814 + 21315 # quick check: completed ASM files plus files still to process should equal the unpacked total
Out[9]:
In [8]:
def get_unpacked_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
# Load the malware packer id features and file id features from the sample set.
packer_id_features = pd.read_csv(packer_id_feature_file)
file_id_features = pd.read_csv(file_id_feature_file)
trid_id_features = pd.read_csv(trid_id_feature_file)
# Get a list of unpacked PE files that are not .NET CIL format.
# IDA Pro cannot disassemble .NET files, have to use ildasm.exe from Visual Studio.
unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
not_dot_net = []
counter = 0
dot_net_counter = 0
amd64_bit_counter = 0
# Get the trid and file rows that are for unpacked PE files.
trids = trid_id_features[trid_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
fids = file_id_features[file_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
# Iterate over the unpacked PE file list and check if each is a .NET file.
# If not a .NET file then add to file list.
pe_names_list = unpacked_pe_files['file_name']
for idx, file_name in enumerate(pe_names_list):
trid_name = trids.iloc[idx, 1]
fid_name = fids.iloc[idx, 1]
trid_name = trid_name.lower()
fid_name = fid_name.lower()
#print("Trid: {:s}".format(trid_name))
#print("Fid: {:s}".format(fid_name))
if trid_name.find('.net') > -1 or fid_name.find('.net') > -1:
print('Found: {:s} - {:s}'.format(trid_name, fid_name))
dot_net_counter += 1
continue
if trid_name.find('win64') > -1 or fid_name.startswith('pe32+'):
print('Found: {:s} - {:s}'.format(trid_name, fid_name))
amd64_bit_counter += 1
continue
#print('Found: {:s} - {:s}'.format(trid_name, fid_name))
not_dot_net.append(file_name)
counter += 1
file_list = []
write_list = []
counter = 0
# Iterate over the file list and prepend the full file name.
for file_name in not_dot_net:
full_name = "VirusShare_" + file_name
file_list.append(full_name)
write_list.append(full_name + "\n")
counter += 1
if (len(file_list) > 0):
fop = open('data/temp-unpacked-pe-non-dot-net.txt','w')
fop.writelines(write_list)
fop.close()
print("Got {:d} unpacked PE files.".format(counter))
print("Got {:d} .NET file and {:d} 64 Bit files.".format(dot_net_counter, amd64_bit_counter))
return file_list
In [ ]:
packer_id_file = 'data/sorted-packer-id-features-apt.csv'
file_id_file = 'data/sorted-file-id-features-apt.csv'
trid_id_file = 'data/sorted-trid-id-features-apt.csv'
ext_drive = '/opt/vs/apt/'
unflist = get_unpacked_file_list(packer_id_file, file_id_file, trid_id_file)
In [13]:
# Test ELF disassembly.
def get_elf_file_list(ext_drive, packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
# Load the malware packer id features and file id features from the sample set.
packer_id_features = pd.read_csv(packer_id_feature_file)
file_id_features = pd.read_csv(file_id_feature_file)
trid_id_features = pd.read_csv(trid_id_feature_file)
counter = 0
file_names_list = file_id_features['file_name']
file_list = []
write_list = []
fid_list = []
for idx, file_name in enumerate(file_names_list):
trid_name = trid_id_features.iloc[idx, 1]
fid_name = file_id_features.iloc[idx, 1]
if trid_name.find('ELF') > -1 or fid_name.find('ELF') > -1:
print('Found: {:s} - {:s}'.format(trid_name, fid_name))
counter += 1
full_name = ext_drive + "VirusShare_" + file_name
write_list.append(full_name + "\n")
file_list.append(full_name)
fid_list.append(fid_name)
fop = open('data/elf-file-list.txt','w')
fop.writelines(write_list)
fop.close()
print("Got {:d} ELF filenames.".format(counter))
return file_list, fid_list
def disassemble_elf_binaries(file_list, fid_list):
# Use "objdump -d -M intel file_name" to dump all the code sections of
# each ELF binary as Intel-syntax assembly, which is easier to read and
# better suited to machine learning feature extraction.
# Use "readelf -e file_name" to dump the ELF, program and section headers
# (this is what the code below actually calls).
counter = 0
disassed = 0
error_count = 0
print("Disassembling {:d} binary ELF files.".format(len(file_list)))
for idx, file_name in enumerate(file_list):
file_path = file_name.rstrip() # strip the trailing newline before building output names
asm_file_name = file_path + ".elf.asm"
hdr_file_name = file_path + ".elf.txt"
fid_name = fid_list[idx]
if (os.path.isfile(file_path)):
fopasm = open(asm_file_name, "w")
# Dump the assembly code listing.
if "Intel" in fid_name:
sub.call(["objdump", "-d", "-M intel", file_path], stdout=fopasm)
#sub.call(["ndisasm", "-d", "-M intel", file_path], stdout=fopasm)
elif "x86" in fid_name:
sub.call(["objdump", "-d", "-M intel", file_path], stdout=fopasm)
elif "ARM" in fid_name:
sub.call(["objdump", "-d", "-marm", file_path], stdout=fopasm)
elif "PowerPC" in fid_name:
sub.call(["objdump", "-d", "-mpowerpc", file_path], stdout=fopasm)
elif "Motorola" in fid_name:
sub.call(["objdump", "-d", "-mm68k", file_path], stdout=fopasm)
elif "SPARC" in fid_name:
sub.call(["objdump", "-d", "-msparc", file_path], stdout=fopasm)
elif "MIPS" in fid_name:
sub.call(["objdump", "-d", "-mmips", file_path], stdout=fopasm)
elif "Renesas" in fid_name: # SuperH
sub.call(["objdump", "-d", "-msh", file_path], stdout=fopasm)
# Dump the ELF section headers.
fophdr = open(hdr_file_name, "w")
sub.call(["readelf", "-e", file_path], stdout=fophdr)
fophdr.close()
fopasm.close()
# now delete the binary, we do not need it anymore.
# sub.call(["rm", file_path])
disassed += 1
else:
#print("Error: file does not exist - {:s}".format(file_path))
error_count += 1
counter += 1
if (counter % 1000) == 0: # print progress
print('Disassembled: {:d} - {:s}'.format(counter, file_path))
print("Disassembled {:d} ELF binaries with {:d} file path errors.".format(disassed, error_count))
#sub.call(["mv", "*.asm", "/opt/vs/asm"])
return
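Before launching a long multi-architecture run, it is worth confirming that the installed objdump actually supports the targets used above (on most systems the non-x86 architectures require the binutils-multiarch or cross-binutils packages built in the apt install cells further down). This verification step is an assumption, not part of the original pipeline.
In [ ]:
# objdump -i lists the supported architectures and object formats.
out = sub.check_output(["objdump", "-i"]).decode("utf-8", "ignore")
tokens = set(out.split())
for arch in ["i386", "arm", "powerpc", "m68k", "sparc", "mips", "sh"]:
    print("{:s}: {:s}".format(arch, "supported" if arch in tokens else "NOT listed"))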
In [7]:
ext_drive = '/opt/vs/train1/'
packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)
disassemble_elf_binaries(unflist, fidlist)
In [8]:
ext_drive = '/opt/vs/train2/'
packer_id_file = 'data/sorted-packer-id-features-vs252.csv'
file_id_file = 'data/sorted-file-id-features-vs252.csv'
trid_id_file = 'data/sorted-trid-id-features-vs252.csv'
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)
disassemble_elf_binaries(unflist, fidlist)
In [14]:
ext_drive = '/opt/vs/train3/'
packer_id_file = 'data/sorted-packer-id-features-vs263.csv'
file_id_file = 'data/sorted-file-id-features-vs263.csv'
trid_id_file = 'data/sorted-trid-id-features-vs263.csv'
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)
disassemble_elf_binaries(unflist, fidlist)
In [ ]:
ext_drive = '/opt/vs/train4/'
packer_id_file = 'data/sorted-packer-id-features-vs264.csv'
file_id_file = 'data/sorted-file-id-features-vs264.csv'
trid_id_file = 'data/sorted-trid-id-features-vs264.csv'
unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)
disassemble_elf_binaries(unflist, fidlist)
In [ ]:
fip = open('data/amd64-instruction-set.txt')
inlines = fip.readlines()
inlines
In [ ]:
opcode_list = []
for line in inlines:
tokens = line.rstrip()
opcode_list.append(tokens.lower())
opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str
In [ ]:
fip = open('data/arm-instruction-set.txt')
inlines = fip.readlines()
inlines
In [4]:
opcode_list = []
for line in inlines:
tokens = line.split()
opcode_list.append(tokens[0].lower())
opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str
Out[4]:
In [ ]:
fip = open('data/sparc-instruction-set.txt')
inlines = fip.readlines()
inlines
In [ ]:
opcode_list = []
for line in inlines:
tokens = line.split()
opcode_list.append(tokens[0].lower())
opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str
In [ ]:
fip = open('data/powerpc-instruction-set.txt')
inlines = fip.readlines()
inlines
In [ ]:
opcode_list = []
for line in inlines:
tokens = line.replace('[', ' ').split()
opcode_list.append(tokens[0].lower())
opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str
In [6]:
len(opcode_list)
Out[6]:
In [ ]:
fip = open('data/powerpc-version-202-instruction-set.txt')
inlines = fip.readlines()
inlines
In [ ]:
opcode_list = []
for line in inlines:
tokens = line.replace('[', ' ').rstrip().split()
opcode_list.append(tokens[0].lower())
opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str
In [9]:
len(opcode_list)
Out[9]:
In [ ]:
fip = open('data/motorola-instruction-set.txt')
inlines = fip.readlines()
inlines
In [ ]:
fip = open('data/mips-instruction-set.txt')
inlines = fip.readlines()
inlines
In [2]:
# Check file id strings for ELF executables
fip = open('data/sorted-file-id-features-vs251.csv')
inlines = fip.readlines()
for line in inlines:
if "ELF" in line:
print("-> {:s}".format(line))
In [3]:
fip = open('data/sorted-file-id-features-vs252.csv')
inlines = fip.readlines()
for line in inlines:
if "ELF" in line:
print("-> {:s}".format(line))
In [4]:
fip = open('data/sorted-file-id-features-vs263.csv')
inlines = fip.readlines()
for line in inlines:
if "ELF" in line:
print("-> {:s}".format(line))
In [ ]:
fip = open('data/sorted-file-id-features-vs264.csv')
inlines = fip.readlines()
for line in inlines:
if "ELF" in line:
print("-> {:s}".format(line))
In [ ]:
fip = open("/home/derek/binutils.txt")
inlines = fip.readlines()
inlines
In [9]:
package_list = []
for line in inlines:
tokens = line.replace(":", " ").split()
package_list.append(tokens[1])
command = "apt install " + " ".join(package_list)
command
Out[9]:
In [ ]:
fip = open("/home/derek/binutils.txt")
inlines = fip.readlines()
inlines
In [5]:
package_list = []
for line in inlines:
if ":i386" not in line:
tokens = line.split()
package_list.append(tokens[1])
command = "apt install " + " ".join(package_list)
command
Out[5]:
In [6]:
x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
'eax','ebp','ebx','ecx','edi','esp']
x86_opcodes = ['add','al','bt','call','cdq','cld','cli','cmc','cmp','const','cwd','daa','db'
,'dd','dec','dw','endp','ends','faddp','fchs','fdiv','fdivp','fdivr','fild'
,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
,'wait','xchg','xor']
amd64_registers = ['rax','rbx','rcx','rdx','rsi','rdi','rbp','rsp','r8','r9','r10','r11','r12','r13','r14','r15']
amd64_opcodes = ['aaa','aad','aam','aas','adc','add','and','andn','bextr','bextr','blcfill','blci','blcic',
'blcmsk','blcs','blsfill','blsi','blsic','blsmsk','blsr','bound','bsf','bsr','bswap','bt',
'btc','btr','bts','bzhi','call','cbw','cwde','cdqe','cwd','cdq','cqo','clc','cld','clflush','cmc','cmov',
'cmp','cmps','cmpsb','cmpsw','cmpsd','cmpsq','cmpxchg','cmpxchg8b','cmpxchg16b','cpuid',
'crc32','daa','das','dec','div','enter','idiv','imul','in','inc','ins','insb','insw','insd',
'int','into','jcxz','jecxz','jrcxz','jmp','lahf','lds','les','lfs','lgs','lss','lea','leave','lfence',
'llwpcb','lods','lodsb','lodsw','lodsd','lodsq','loop','loope','loopne','loopnz','loopz','lwpins',
'lwpval','lzcnt','mfence','mov','movbe','movd','movmskpd','movmskps','movnti','movs','movsb',
'movsw','movsd','movsq','movsx','movsxd','movzx','mul','mulx','neg','nop','not','or','out',
'outs','outsb','outsw','outsd','pause','pdep','pext','pop','popa','popad','popcnt','popf','popfd',
'popfq','prefetch','prefetchw','prefetch','push','pusha','pushad','pushf','pushfd','pushfq',
'rcl','rcr','rdfsbase','rdgsbase','rdrand','ret','rol','ror','rorx','sahf','sal','shl','sar','sarx',
'sbb','scas','scasb','scasw','scasd','scasq','set','sfence','shl','shld','shlx',
'shr','shrd','shrx','slwpcb','stc','std','stos','stosb','stosw','stosd','stosq','sub','t1mskc',
'test','tzcnt','tzmsk','wrfsbase','wrgsbase','xadd','xchg','xlat','xlatb','xor','arpl','clgi','cli',
'clts','hlt','int','invd','invlpg','invlpga','iret','iretd','iretq','lar','lgdt','lidt','lldt',
'lmsw','lsl','ltr','monitor','monitorx','mwait','mwaitx','rdmsr','rdpmc','rdtsc','rdtscp',
'rsm','sgdt','sidt','skinit','sldt','smsw','sti','stgi','str','swapgs',
'syscall','sysenter','sysexit','sysret','ud2','verr','verw',
'vmload','vmmcall','vmrun','vmsave','wbinvd','wrmsr']
MIPS_registers = [] # MIPS and SPARC lists still to be filled from the parsed instruction-set listings below.
MIPS_opcodes = []
SPARC_registers = []
SPARC_opcodes = []
ARM_registers = ['r0','r1','r2','r3','r4','r5','r6','r7','r8','r9','r10','r11','r12','r13','r14','r15','cpsr']
ARM_opcodes = ['adc','msr','add','mul','and','mvn','b','orr','bic','rsb','bl','rsc','bx','sbc','cdp','smlal','cmn','smull',
'cmp','stc','eor','stm','ldc','str','ldm','strb','ldr','strbt','ldrb','strh','ldrbt','strt','ldrh','sub','ldrsb','swi',
'ldrsh','swp','ldrt','swpb','mcr','teq','mla','tst','mov','umlal','mrc','umull','mrs']
Motorola_registers = ['d0','d1','d2','d3','d4','d5','d6','d7','a0','a1','a2','a3','a4','a5','a6','a7','usp','ssp']
Motorola_opcodes = []
PowerPC_registers = ['r0','r1','r2','r3','r4','r5','r6','r7','r8','r9','r10','r11','r12','r13','r14','r15',
'r16','r17','r18','r19','r20','r21','r22','r23','r24','r25','r26','r27','r28','r29','r30','r31']
PowerPC_opcodes = ['add','addc','adde','addi','addic','addic.','addis','addme','addze','and','andc','andi.','andis.',
'b','bc','bcctr','bclr','cmp','cmpi','cmpl','cmpli','cntlzd','cntlzw','crand','crandc','creqv',
'crnand','crnor','cror','crorc','crxor','dcbf','dcbst','dcbt','dcbtst','dcbz','divd','divdu',
'divw','divwu','eciwx','ecowx','eieio','eqv','extsb','extsh','extsw','fabs','fadd','fadds',
'fcfid','fcmpo','fcmpu','fctid','fctidz','fctiw','fctiwz','fdiv','fdivs','fmadd','fmadds',
'fmr','fmsub','fmsubs','fmul','fmuls','fnabs','fneg','fnmadd','fnmadds','fnmsub','fnmsubs',
'fre','fres','frsp','frsqrte','frsqrtes','fsel','fsqrt','fsqrts','fsub','fsubs','hrfid','icbi',
'isync','lbz','lbzu','lbzux','lbzx','ld','ldarx','ldu','ldux','ldx','lfd','lfdu','lfdux','lfdx',
'lfs','lfsu','lfsux','lfsx','lha','lhau','lhaux','lhax','lhbrx','lhz','lhzu','lhzux','lhzx','lmw',
'lswi','lswx','lwa','lwarx','lwaux','lwax','lwbrx','lwz','lwzu','lwzux','lwzx','mcrf','mcrfs',
'mcrxr','mfcr','mfocrf','mffs','mfmsr','mfspr','mfsr','mfsrin','mftb','mtcrf','mtocrf','mtfsb0',
'mtfsb1','mtfsf','mtfsfi','mtmsr','mtmsrd','mtspr','mtsr','mtsrin','mulhd','mulhdu','mulhw','mulhwu',
'mulld','mulli','mullw','nand','neg','nor','or','orc','ori','oris','popcntb','rfid','rldcl','rldcr',
'rldic','rldicl','rldicr','rldimi','rlwimi','rlwinm','rlwnm','sc','slbia','slbie','slbmfee',
'slbmfev','slbmte','sld','slw','srad','sradi','sraw','srawi','srd','srw','stb','stbu','stbux',
'stbx','std','stdcx.','stdu','stdux','stdx','stfd','stfdu','stfdux','stfdx','stfiwx','stfs',
'stfsu','stfsux','stfsx','sth','sthbrx','sthu','sthux','sthx','stmw','stswi','stswx','stw',
'stwbrx','stwcx.','stwu','stwux','stwx','subf','subfc','subfe','subfic','subfme','subfze',
'sync','td','tdi','tlbia','tlbie','tlbsync','tw','twi','xor','xori','xoris']
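With per-architecture register and opcode lists in place, an ELF feature pass can dispatch on the file-id string. A minimal sketch; the dispatch keys mirror the substring checks in disassemble_elf_binaries above, and the x86-64 key is an assumption.
In [ ]:
# Sketch of an architecture dispatch table for the ELF feature pass,
# keyed on substrings of the file-id string.
arch_tables = {
    'x86-64':   (amd64_registers, amd64_opcodes),
    'ARM':      (ARM_registers, ARM_opcodes),
    'PowerPC':  (PowerPC_registers, PowerPC_opcodes),
    'Motorola': (Motorola_registers, Motorola_opcodes),
}
def tables_for(fid_name):
    for key, tables in arch_tables.items():
        if key in fid_name:
            return tables
    return (x86_registers, x86_opcodes)  # default to 32-bit x86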
In [7]:
fip = open("data/arm-listing.txt")
inlines = fip.readlines()
inlines[:20]
Out[7]:
In [ ]:
# Parse the contents of arm-dis.c in binutils and extract all the ARM opcodes.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
line = line.lstrip()
line = line.replace('\\t',' ')
if len(line) < 10:
continue
if line.startswith('{'):
continue
if line.startswith('"'):
idx = line.find('%')
if idx > 0:
opcode = line[1:idx]
else:
continue
else:
tokens = line.split()
if len(tokens) < 3:
continue
opcodestr = tokens[2]
idx = opcodestr.find('%')
if idx > 0:
opcode = opcodestr[1:idx]
else:
continue
if opcode not in opcode_list:
opcode_list.append(opcode)
opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str
In [ ]:
fip = open("data/m68k-opc.c")
inlines = fip.readlines()
inlines[:20]
In [ ]:
# Parse the contents of m68k-opc.c in binutils and extract all the Motorola opcodes.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
line = line.lstrip()
if len(line) < 10:
continue
if line.startswith("{"):
line = line[2:]
idx = line.find("\"")
if idx > 0:
opcode = line[:idx]
else:
continue
else:
continue
if opcode not in opcode_list:
opcode_list.append(opcode)
opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str
In [ ]:
fip = open("data/mips-instruction-set.txt")
inlines = fip.readlines()
inlines[:20]
In [ ]:
# Parse the contents of mips-instruction-set.txt and extract all the MIPS opcodes.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
line = line.lstrip()
if len(line) < 10:
continue
if line.startswith("{"):
line = line[2:]
idx = line.find("\"")
if idx > 0:
opcode = line[:idx]
else:
continue
else:
continue
idx = opcode.find(".")
if idx > 0:
opcode = opcode[:idx]
if opcode not in opcode_list:
opcode_list.append(opcode)
opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str
In [ ]:
fip = open("data/i386-opc.tbl")
inlines = fip.readlines()
inlines[:20]
In [ ]:
# Parse the contents of i386-opc.tbl from binutils and extract all the x86 opcode mnemonics.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
if len(line) < 10:
continue
if line.startswith("//"):
continue
line = line.replace(',', ' ')
tokens = line.split()
if len(tokens) > 0:
opcode = tokens[0]
else:
continue
idx = opcode.find(".")
if idx > 0:
opcode = opcode[:idx]
if opcode not in opcode_list:
opcode_list.append(opcode)
opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str
In [2]:
fip = open("data/Java-bytecode-instruction-listing.txt")
inlines = fip.readlines()
inlines[:20]
Out[2]:
In [3]:
# Parse the contents of Java-bytecode-instruction-listing.txt and extract all the bytecode mnemonics.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
if len(line) < 10:
continue
line = line.replace('\t', ' ')
tokens = line.split()
if len(tokens) > 0:
opcode = tokens[0]
else:
continue
if opcode not in opcode_list:
opcode_list.append(opcode)
opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str
Out[3]:
In [4]:
len(opcode_list)
Out[4]:
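The instruction-set cells above all repeat the same read / tokenise / dedupe pattern, so they could be folded into one helper. A minimal sketch; the first_token argument is an assumption to be adjusted per listing format:
In [ ]:
# Generalised version of the repeated parse cells above (sketch).
# first_token extracts the opcode from one line; dedupe preserves order.
def parse_opcode_listing(path, first_token=lambda line: (line.split() or [''])[0]):
    opcode_list = []
    with open(path) as fip:
        for line in fip:
            line = line.strip()
            if len(line) < 2:
                continue
            opcode = first_token(line).lower()
            if opcode and opcode not in opcode_list:
                opcode_list.append(opcode)
    return opcode_list
# e.g. parse_opcode_listing('data/sparc-instruction-set.txt')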
In [ ]:
import os
from csv import writer
import numpy as np
import pandas as pd
# Start of Script
target_dir = "/opt/vs/legitware/"
out_file = "data/pdf-features-legit.csv"
pdf_token_file = "data/2716-pdf-token-counts-non-malicious-set.csv"
#out_file = "data/pdf-features-vs251.csv"
file_list = os.listdir(target_dir)
pdflist = []
for fname in file_list:
if fname.endswith('.pdf'):
pdflist.append(target_dir + fname)
print("Got {:d} PDF files.".format(len(pdflist)))