## 1. Data Preparation and Extraction of Features

``````Features sets will consist of:
- Entropy and file size from packed binaries.
- Entropy and file size from unpacked binaries.
- ASM features from disassembled unpacked binaries.
- Call Graph Features.
- Sample Statistics.
- PE packer type.
- Behavioural features from Cuckoo Sandbox reports.
- Memory features from Volatility reports.

Training labels will be generated from ClamAV, Windows Defender and VirusTotal.com reports.``````
``````

In [1]:

from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import re
import subprocess as sub

``````
``````

In [2]:

ext_drive = '/opt/vs/'
tfiles = os.listdir(ext_drive + "train")

``````
``````

In [4]:

len(tfiles)

``````
``````

Out[4]:

65536

``````

## 2. Generate Entropy and File Size of Packed Binaries and Non-Binary Files

``Script: feature_extraction_entropy.py``
``````

In [12]:

# Calculate Shannon's Entropy, https://en.wikipedia.org/wiki/Entropy_(information_theory)

def calculate_entropy(byte_counts, total):

entropy = 0.0

for count in byte_counts:
# If no bytes of this value were seen in the value, it doesn't affect
# the entropy of the file.
if count == 0:
continue
# p is the probability of seeing this byte in the file, as a floating-point number
p = 1.0 * count / total
entropy -= p * math.log(p, 256)

return entropy

def entropy_counter(byte_code):

byte_counts = [0] * 256
code_length = len(byte_code)

for i in range(len(byte_code)):
byte_counts[int(byte_code[i])] += 1

entropy = calculate_entropy(byte_counts, code_length)

return entropy

def sort_and_save_entropy_feature_file():
# DataFrame.sort() is deprecated, but this is an old version of pandas, does not have sort_values().
sorted_entropys = entropys.sort('file_name')
sorted_entropys.to_csv('data/sorted-entropy-features.csv', index=False)

return

def combine_entropy_files():
# Function to combine the newly generated entropy files into one file:
# 1. list data directory
# 2. For each file in file list that matches (\d\d\d\d-entropy-features.csv)
# 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
# 4. Concatenate the unsorted packer id feature files.
# 5. Sort and write to data/sorted-packer-id-features.csv
fop = open('data/entropy-features.csv','w')
fop.write('file_name,entropy,file_size\n')
p1 = re.compile('\d{3,5}-entropy-features-bin.csv') # This is the PID prefix for each file.
file_list = os.listdir('data/')
counter = 0
for file_name in file_list:
if p1.match(file_name):
fip = open('data/' + file_name, 'r')
fop.writelines(in_lines)
counter += len(in_lines)
fip.close()

print('Completed combine of {:d} entropy features.'.format(counter))

fop.close()

sort_and_save_entropy_feature_file()

return

# feature extraction for the binary files

def extract_binary_features(tfiles):
#byte_files = [i for i in tfiles if '.bytes' in i]
ftot = len(tfiles)

pid = os.getpid()
print('Process id:', pid)
feature_file = 'data/' + str(pid) + '-entropy-features-bin.csv' # entropy, file size, ngrams...
print('feature file:', feature_file)

feature_counts = []
with open(feature_file, 'w') as f:
# Write the column names for the csv file
fw = writer(f)
# Do this when combining the files.
#colnames = ['file_name'] + ['entropy'] + ['file_size']
#fw.writerow(colnames)

# Now iterate through the file list and extract the features from each file.
for idx, fname in enumerate(tfiles):
fasm = open(ext_drive + fname, 'rb')
filesize = os.path.getsize(ext_drive + fname)

# TODO: Do ngram extraction
# First do entropy calculations and filesize
# Convert the input array into a byte array to prevent type errors
# in entropy counter function.
in_bytes = bytearray(in_bytes)
#print("Type = {:s}").format(type(in_bytes))
entropy = entropy_counter(in_bytes)

count_vals = [entropy, filesize]

feature_counts.append([fname[fname.find('_')+1:]] + count_vals)

fasm.close()

# Print progress
if (idx+1) % 1000 == 0:
print("{:d} - {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
fw.writerows(feature_counts)
feature_counts = []

# Write remaining files
if len(feature_counts) > 0:
fw.writerows(feature_counts)
feature_counts = []

print("Completed processing {:d} rows for feature file {:s}".format(ftot,feature_file))

``````
``````

In [ ]:

# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/vs/train/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)/4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_binary_features, trains)
combine_entropy_files()

``````
``````

In [ ]:

# TRAINING
# Now divide the train files into four groups for multiprocessing
ext_drive = '/opt/vs/train2/'
tfiles = os.listdir(ext_drive)
quart = len(tfiles)/4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(extract_binary_features, trains)
combine_entropy_files()

``````
``````

In [15]:

sorted_entropys = entropys.sort('file_name')
sorted_entropys.to_csv('data/sorted-entropy-features-vs251-252.csv', index=False)

``````
``````

Out[15]:

file_name
entropy
file_size

0
00002e640cafb741bea9a48eaee27d6f
0.992174
208860

1
0.834382
201600

2
0001776237ac37a69fcef93c1bac0988
0.966021
682192

65536
00027c21667d9119a454df8cef2dc1c7
0.666599
18390

65537
0003887ab64b8ae19ffa988638decac2
0.903260
1134320

3
000403e4e488356b7535cc613fbeb80b
0.773787
199168

65538
0.803515
149720

4
0004c8b2a0f4680a5694d74199b40ea2
0.985592
1165440

5
000595d8b586915c12053104cf845097
0.841920
264240

65539
000634f03457d088c71dbffb897b1315
0.957584
1725502

65540
00072ed24314e91b63b425b3dc572f50
0.486112
328093

65541
00092d369958b67557da8661cc9093bc
0.845657
522936

6
00093d5fa5cb7ce77f6eaf39962daa12
0.803481
742064

7
00099926d51b44c6f8c93a48c2567891
0.997032
725288

65542
0009a64f786fa29bfa6423278cc74f02
0.996663
671280

8
000a2db4762dc06628a086c9e117f884
0.535436
61551

65543
000ac11fa7587b2316470b154254a219
0.997824
1874471

9
000ae2c63ba69fc93dfc395b40bfe03a
0.899481
487386

65544
000ae90736a51c47543dcc6d8a735362
0.863887
260144

65545
000b41258d624ef2d6e430822d0c0c8f
0.992772
590824

20 rows × 3 columns

``````

## 3. Generate Entropy and File Size of Unpacked Binaries

``````

In [ ]:

# TODO: everything.

``````
``````

In [ ]:

``````

## 4. Generate PE ASM and Header Features

``````- PE Header Features from objdump header summaries.
- ASM Features from IDA Pro assembly files.

- Script: feature_extraction_pe_asm.py``````
``````

In [ ]:

keywords = ['Virtual','Offset','loc','Import','Imports','var','Forwarder','UINT','LONG'
,'BOOL','WORD','BYTES','large','short','dd','db','dw','XREF','ptr','DATA'
,'FUNCTION','extrn','byte','word','dword','char','DWORD','stdcall','arg'
,'LRESULT','HANDLE','HWND','LPSTR','int','HLOCAL','FARPROC','ATOM','HMODULE'
,'WPARAM','HGLOBAL','entry','rva','COLLAPSED','config','exe','Software'
,'CurrentVersion','__imp_','INT_PTR','UINT_PTR','---Seperator','PCCTL_CONTEXT'
,'__IMPORT_','INTERNET_STATUS_CALLBACK','.rdata:','.data:','.text:','case'
,'installdir','market','microsoft','policies','proc','scrollwindow','search'
,'hardware','hkey_current_user','hkey_local_machine','sp-analysisfailed','unableto']

known_sections = ['.text', '.data', '.bss', '.rdata', '.edata', '.idata', '.rsrc', '.tls', '.reloc']

registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
'eax','ebp','ebx','ecx','edi','esp']

,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
,'wait','xchg','xor']

def count_asm_symbols(asm_code):
symbols = [0]*7
for row in asm_code:
if '*' in row:
symbols[0] += 1
if '-' in row:
symbols[1] += 1
if '+' in row:
symbols[2] += 1
if '[' in row:
symbols[3] += 1
if ']' in row:
symbols[4] += 1
if '@' in row:
symbols[5] += 1
if '?' in row:
symbols[6] += 1

return symbols

def count_asm_registers(asm_code):
registers_values = [0]*len(registers)
for row in asm_code:
parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
.replace('-',' ').split()
for register in registers:
registers_values[registers.index(register)] += parts.count(register)
return registers_values

def count_asm_opcodes(asm_code):
opcodes_values = [0]*len(opcodes)
for row in asm_code:
parts = row.split()

for opcode in opcodes:
if opcode in parts:
opcodes_values[opcodes.index(opcode)] += 1
break
return opcodes_values

def count_asm_APIs(asm_code, apis):
apis_values = [0]*len(apis)
for row in asm_code:
for i in range(len(apis)):
if apis[i] in row:
apis_values[i] += 1
break
return apis_values

def count_asm_misc(asm_code):
keywords_values = [0]*len(keywords)
for row in asm_code:
for i in range(len(keywords)):
if keywords[i] in row:
keywords_values[i] += 1
break
return keywords_values

# Extract features from test/training asm files, file list is passed in as a parameter

def extract_asm_features(tfiles):

pid = os.getpid()
print('Process id:', pid)
feature_file = 'data/' + str(pid) + '-malware-features-asm.csv' # Windows API, symbols, registers, opcodes, etc...
print('feature file:', feature_file)

fapi = open("data/APIs.txt")
defined_apis = defined_apis[0].split(',')

asm_files = [i for i in tfiles if '.asm' in i]
ftot = len(asm_files)

feature_counts = []
with open(feature_file, 'w') as f:
fw = writer(f)
colnames = ['file_name'] + registers + opcodes + defined_apis + keywords
fw.writerow(colnames)

for idx, fname in enumerate(asm_files):
fasm = open(ext_drive + fname, 'r')

reg_vals = count_asm_registers(content)
opc_vals = count_asm_opcodes(content)
api_vals = count_asm_APIs(content, defined_apis)
#sec_vals = count_asm_sections(content)
mis_vals = count_asm_misc(content)
count_vals = reg_vals + opc_vals + api_vals + mis_vals

feature_counts.append([fname[:fname.find('.asm')]] + count_vals)

# Writing rows after every 10 files processed
if (idx+1) % 10 == 0:
print(pid, idx + 1, 'of', ftot, 'files processed.')
fw.writerows(feature_counts)
feature_counts = []

# Writing remaining files
if len(feature_counts) > 0:
fw.writerows(feature_counts)
feature_counts = []

return

``````
``````

In [ ]:

``````
``````

In [ ]:

``````

## 8. Clean and Sort Function Names

``- script: function_name_clean.py``
``````

In [ ]:

# Need to clean up and sort these function names for ASM feature extraction.
fip = open('data/all-function-column-names-multiline.csv')
fip.close()

function_names.sort()
function_names[:50]

fop = open('data/sorted-function-names-multiline.txt','w')
fop.writelines(function_names)
fop.close()

``````
``````

In [22]:

fip = open('data/sorted-function-names-multiline.txt','r')
fip.close()
fip = open('data/APIs.txt','r')
fip.close()
api_names_str = api_names_str.rstrip()
api_names = api_names_str.split(',')
api_names.sort()
len(api_names)

``````
``````

Out[22]:

792

``````
``````

In [23]:

for idx in range(len(sorted_function_names)):
sorted_function_names[idx] = sorted_function_names[idx].rstrip()

for aname in api_names:
if aname not in sorted_function_names:
sorted_function_names.append(aname)

sorted_function_names.sort()
len(sorted_function_names)

``````
``````

Out[23]:

155548

``````
``````

In [ ]:

sorted_function_names[:50]

``````
``````

In [25]:

function_count = len(sorted_function_names)
total_chars = 0
for func_name in sorted_function_names:
total_chars += len(func_name)

avg_name_len = int(total_chars / function_count)
avg_name_len

``````
``````

Out[25]:

32

``````
``````

In [ ]:

# truncate function names to reduce the size of the huge sparse matrix.
function_column_names = []
for func in sorted_function_names:
if func.startswith('sub') or func.startswith('loc') or func.startswith('unk'):
func = func[:5] # lets try to reduce the vast number of functions.
elif func.startswith('eax+') or func.startswith('ebx+') or func.startswith('ecx+') or func.startswith('edx+'):
func = func[:3]
elif func.startswith('edi+') or func.startswith('esi+'):
func = func[:3]
elif func.startswith('byte_') or func.startswith('word_') or func.startswith('off_'):
func = func[:4]
elif func.startswith('_') or func.startswith('\$'):
func = func[1:]
elif func.startswith('__') or func.startswith('\$\$'):
func = func[2:]
#else: need a regex here to match a bunch of random crap
#    func = func[:33]

if len(func) > 32: # Reduce the the function name length to max of average function length.
func = func[:32]

if func not in function_column_names:
function_column_names.append(func)

function_column_names[:50]

``````
``````

In [27]:

fop = open('data/sorted-reduced-function-names.txt','w')

for fname in function_column_names:
fop.write(fname + "\n")

fop.close()

len(function_column_names)

``````
``````

Out[27]:

143048

``````
``````

In [ ]:

# Use a regex to remove function names that are just hexadecimal addresses.
p1 = re.compile('\d\w+h')
reduced_function_names = []
fip = open('data/sorted-reduced-function-names.txt','r')
fip.close()

fop = open('data/sorted-reduced-function-names-hexless.txt','w')
for fname in function_column_names:
fname = fname.rstrip()
m = p1.match(fname)
if m == None:
fop.write(fname + "\n")
reduced_function_names.append(fname)

fop.close()
reduced_function_names[:50]

``````
``````

In [29]:

len(reduced_function_names)

``````
``````

Out[29]:

135436

``````
``````

In [ ]:

``````

## 10. Test Code Only

``````

In [4]:

signat = sub.check_output(["file",'-b', '/opt/vs/agobot.exe'])

``````
``````

In [5]:

signat

``````
``````

Out[5]:

'PE32 executable (GUI) Intel 80386, for MS Windows, UPX compressed\n'

``````

## Generate C Library API Tokens for ASM and Header Feature Extraction.

``````

In [13]:

# Generate libc function calls for ELF API feature extraction.
# Put in feature_extraction_elf_asm.py

def generate_libc_api():
# Extract libc function and variable names from GNU Libc Documentation.
# Put in feature_extraction.py

fipfunc = open('data/libc-function-index.txt', 'r')
fipfunc.close()

counter = 0
func_list = []

for idx, fline in enumerate(funclines):
fline = fline.replace('\t','').replace('\n','')
if fline.startswith('|'):
tokens = fline.split('|')   # The function names are |funcname|
funcname = tokens[1]
func_list.append(funcname)
counter += 1

print("Found {:d} function definitions for libc api.".format(counter))

fop = open('data/elf-libc-api.txt', 'w')

for func_name in func_list:
fop.write(func_name + "\n")

fop.close()

return func_list

def generate_libc_var():
# Extract libc function and variable names from GNU Libc Documentation.
# Put in feature_extraction.py

fipvar = open('data/libc-variable-index.txt', 'r')
fipvar.close()

counter = 0
var_list = []

for idx, vline in enumerate(varlines):
vline = vline.replace('\t','').replace('\n','')
if vline.startswith('|'):
tokens = vline.split('|')   # The function names are |funcname|
varname = tokens[1]
var_list.append(varname)
counter += 1

print("Found {:d} variable definitions for libc api.".format(counter))

fop = open('data/elf-libc-var.txt', 'w')

for var_name in var_list:
fop.write(var_name + "\n")

fop.close()

return var_list

``````
``````

In [ ]:

func_list = generate_libc_api()
func_list

``````

## Validate Disassembly Results.

``````

In [18]:

# Check interrupted disassembly results for train1 feature set.
# def validate_disassembly():
# put in disassemble_pe.py

t1asm = os.listdir('/opt/vs/train1asm/')
t1hdr = os.listdir('/opt/vs/train1hdr/')
asm_files = []
hdr_files = []

for fname in t1asm:
if fname.endswith('.asm'):
asm_files.append(fname)

for fname in t1hdr:
if fname.endswith('.txt'):
hdr_files.append(fname)

print("asm dir: {:d} asm files {:d} hdr dir {:d} hdr files {:d}".format(len(t1asm),len(asm_files),len(t1hdr),len(hdr_files)))

``````
``````

asm dir: 21314 asm files 21314 hdr dir 21409 hdr files 21409

``````
``````

In [19]:

len(t1hdr) - len(t1asm)

``````
``````

Out[19]:

95

``````
``````

In [14]:

os.path.getsize('/opt/vs/train1asm/VirusShare_5ac1817d757a27edb90cdf887ba66870.asm')

``````
``````

Out[14]:

77814888

``````
``````

In [20]:

counter = 0
missing_hdr_list = []

for fname in asm_files:
hdr_name = fname.replace('.asm', '.txt')
if hdr_name not in hdr_files:
print("{:s} not in header file list.".format(hdr_name))
counter += 1
missing_asm_list.append(fname)

``````
``````

``````
``````

In [ ]:

counter = 0
missing_asm_list = []

for fname in hdr_files:
asm_name = fname.replace('.txt','.asm')
if asm_name not in asm_files:
print("{:s} not in asm file list.".format(asm_name))
counter += 1
missing_asm_list.append(fname)

print("{:d} missing assembly files.".format(counter))

``````
``````

In [23]:

counter = 0
fop = open('data/disass-train1-missing-asm-files.txt', 'w')
for fname in missing_asm_list:
fop.write(fname + "\n")
counter += 1

fop.close()
print("Wrote {:d} missing asm file names.".format(counter))

``````
``````

Wrote 95 missing asm file names.

``````
``````

In [ ]:

``````
``````

In [25]:

counter = 0

for fname in hdr_files:
fsize = os.path.getsize('/opt/vs/train1hdr/' + fname)
if fsize < 1000:
print("{:s} bad output, filesize = {:d}.".format(fname, fsize))
counter += 1

``````
``````

VirusShare_d5eff38b212286c46db007aa7159ffd8.txt bad output, filesize = 0.
VirusShare_592d7ac775519110d58e9ce1975c1b5b.txt bad output, filesize = 0.
VirusShare_c80d9b2dbf9b7953a3b6e9b51a39a0c2.txt bad output, filesize = 0.

``````
``````

In [ ]:

counter = 0

for fname in asm_files:
fsize = os.path.getsize('/opt/vs/train1asm/' + fname)
if fsize < 1000:
print("{:s} bad output, filesize = {:d}.".format(fname, fsize))
counter += 1

``````
``````

In [ ]:

``````
``````

In [3]:

apt_file_list = apt_df['file_name']
apt_file_list.shape

``````
``````

Out[3]:

(293,)

``````
``````

In [5]:

``````
``````

Out[5]:

0    001dd76872d80801692ff942308c64e6
1    002325a0a67fded0381b5648d7fe9b8e
2    00dbb9e1c09dbdafb360f3163ba5a3de
3    0149b7bd7218aab4e257d28469fddb0d
4    01e0dc079d4e33d8edd050c4900818da
Name: file_name, dtype: object

``````
``````

In [7]:

f_list = os.listdir('/home/derek/project/temp/train/')
counter = 0
file_list = []
for fname in f_list:
if fname.startswith('Virus'):
tname = fname[fname.find('_') + 1:]
file_list.append(tname)
counter += 1

print("Got {:d} files in training directory.".format(counter))

apt_list = np.array(apt_file_list)
for fname in file_list:
if fname not in apt_list:
print("Extra file: {:s}".format(fname))

``````
``````

Got 294 files in training directory.
Extra file: 00248ef21706d78c1f0e1eca3cab72c3

``````
``````

In [ ]:

``````

## Rename Header and ASM Files Generated by IDA Pro.

``````

In [5]:

# Rename all the PE headers files so it is easier to process them.

file_list = os.listdir(ext_dir)
counter = 0

for fname in file_list:
if fname.startswith('Virus'):
file_path = ext_dir + fname
trunc_name = fname[0:fname.find('.txt')]
new_path = ext_dir + trunc_name + '.pe.txt'
result = sub.check_call(['mv', file_path, new_path])
counter += 1

if (counter % 1000) == 0:

print('Completed move of {:d} header files.'.format(counter))

return

``````
``````

In [ ]:

ext_dir = '/home/derek/project/temp/'

``````
``````

In [2]:

def rename_asm_files(ext_dir, new_dir):
# Rename all the PE ASM files and move to a new directory
# so it is easier to process them.

file_list = os.listdir(ext_dir)
counter = 0

print("Got {:d} files in directory {:s}".format(len(file_list), ext_dir))

for fname in file_list:
if fname.endswith('.asm'):
file_path = ext_dir + fname
trunc_name = fname[0:fname.find('.asm')]
new_path = new_dir + trunc_name + '.pe.asm'
result = sub.check_call(['mv', file_path, new_path])
counter += 1

if (counter % 1000) == 0:
print('Renamed {:d} ASM files.'.format(counter))

print('Completed rename of {:d} ASM files.'.format(counter))

return

``````
``````

In [ ]:

rename_asm_files('/opt/vs/train3/','opt/vs/train3asm/')

``````
``````

In [4]:

def rename_asm_files_fix(ext_dir):
# Rename all the PE headers files so it is easier to process them.

file_list = os.listdir(ext_dir)
pe_counter = 0
unpe_counter = 0

print("Got total files: {:d}".format(len(file_list)))

for fname in file_list:
if fname.endswith('.pe.asm'):
pe_counter += 1
elif fname.endswith('.asm'):
file_path = ext_dir + fname
trunc_name = fname[0:fname.find('.asm')]
new_path = ext_dir + trunc_name + '.pe.asm'
result = sub.check_call(['mv', file_path, new_path])
unpe_counter += 1

if (unpe_counter % 1000) == 0:
print('Renamed {:d} ASM files.'.format(unpe_counter))

print('Completed move of {:d} ASM files with {:d} files already renamed.'.format(unpe_counter, pe_counter))

return

``````
``````

In [ ]:

rename_asm_files_fix('/opt/vs/train3/')

``````
``````

In [ ]:

``````

## Validate Disassembly Results Part 2

``- disassemble_pe.py``
``````

In [4]:

# Moved to feature-extraction-validation.ipynb

``````
``````

In [ ]:

``````
``````

In [ ]:

``````

## Find ELF Disassembly Files.

``````

In [6]:

def find_elf_train1asm(ext_dir):
# Rename all the PE headers files so it is easier to process them.

file_list = os.listdir(ext_dir)
elf_counter = 0

print("Got total files: {:d}".format(len(file_list)))

for fname in file_list:
if 'elf' in fname:
elf_counter += 1
file_path = ext_dir + fname
#trunc_name = fname[0:fname.find('.elf')]
new_path = '/opt/vs/' + fname
#result = sub.check_call(['mv', file_path, new_path])

#if (elf_counter % 1000) == 0:
print('ELF ASM file {:s}.'.format(file_path))

print('Completed move of {:d} ELF ASM files.'.format(elf_counter))

return

``````
``````

In [7]:

find_elf_train1asm('/opt/vs/train1asm/')

``````
``````

Got total files: 54911
Completed move of 0 ELF ASM files.

``````
``````

In [8]:

find_elf_train1asm('/opt/vs/train2asm/')

``````
``````

Got total files: 46166
Completed move of 0 ELF ASM files.

``````
``````

In [ ]:

find_elf_train1asm('/opt/vs/train3asm/')

``````
``````

In [ ]:

find_elf_train1asm('/opt/vs/train4asm/')

``````
``````

In [ ]:

find_elf_train1asm('/opt/vs/aptasm/')

``````

## Test Generation of PE/COFF Header Tokens.

``- generate_pe_header_tokens.py``
``````

In [27]:

def save_token_counts(token_counter_map, out_file):
# Output the malware sample classification counts.
fop = open(out_file, 'w')
csv_wouter = writer(fop)
cols = ['token_name','count'] # write out the column names.
csv_wouter.writerow(cols)
outlines = []
sorted_keys = token_counter_map.keys()
sorted_keys.sort()
counter = 0
for key in sorted_keys:
outlines.append([key, token_counter_map[key]])
counter += 1
if (counter % 100) == 0: # write out some lines
csv_wouter.writerows(outlines)
outlines = []
print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))

# Finish off.
if (len(outlines) > 0):
csv_wouter.writerows(outlines)
outlines = []

print("Completed writing {:d} tokens.".format(len(sorted_keys)))
fop.close()

return

def get_token_count_map(token_df):
# Read in the token count file and create a dict.
token_dict = {}
type_y = np.array(token_df['token_name'])

for idx in range(token_df.shape[0]): # First fill the dict with the token counts
token_dict[token_df.iloc[idx,0]] = token_df.iloc[idx,1]

def combine_token_files():
# TODO: everything

return

def generate_pe_tokens(file_list, out_token_file, out_count_file):

psections = re.compile('\s+\d{1,2}\s+(\.\w+|\w+)\s+\d+')            # Pattern for section names.
pdlls = re.compile('\s+DLL Name: (\w+)')                  # Pattern for import DLL names.
pfunctions = re.compile('\s+\w+\s+\d{1,4}\s+(.+)')        # Pattern for import function names.
preloc = re.compile('\s+reloc')                           # Pattern for relocation entries.
pexports = re.compile('\s+\[\s*\d+\]\s+(\w+)')            # Pattern for exported function names.

token_counter_map = {}
counter = 0
pid = os.getpid()

for idx, fname in enumerate(file_list):

fip = open(fname, 'r')

counter += 1

for line in in_lines:

line = line.rstrip() # get rid of newlines they are annoying.
token_val = ""

m = preloc.match(line)
if m != None:
#token_val = m.group(2)
continue

m = psections.match(line)
if m != None:
token_val = m.group(1)
print("Section: {:s}".format(token_val))
else:
m = pdlls.match(line)
if m != None:
token_val = m.group(1)
else:
m = pfunctions.match(line)
if m != None:
token_val = m.group(1)
else:
m = pexports.match(line)
if m != None:
token_val = m.group(1)
print("Export: {:s}".format(token_val))
else:
continue

# Count the token type.
if token_val in token_counter_map.keys():
token_counter_map[token_val] += 1
else:
token_counter_map[token_val] = 1

if (counter % 100) == 0:
print("{:d} Processed {:d} header files.".format(pid, counter))

fip.close()

save_token_counts(token_counter_map, out_count_file)

return

``````
``````

In [ ]:

``````
``````

In [ ]:

ext_drive = '/opt/vs/hdr/'
file_list = os.listdir(ext_drive)
file_paths = []

for fname in file_list:
file_paths.append(ext_drive + fname)

``````
``````

In [ ]:

``````
``````

In [6]:

# Testing PE header token generation.

def save_token_counts(token_counter_map, out_file_name):
# Output the PE Header token counts.
pid = os.getpid()
out_file = "data/" + str(pid) + "-" + out_file_name
fop = open(out_file, 'w')
csv_wouter = writer(fop)

outlines = []
sorted_keys = token_counter_map.keys()
sorted_keys.sort()
counter = 0

for key in sorted_keys:
outlines.append([key, token_counter_map[key]])
counter += 1
if (counter % 100) == 0: # write out some lines
csv_wouter.writerows(outlines)
outlines = []
print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))

# Finish off.
if (len(outlines) > 0):
csv_wouter.writerows(outlines)
outlines = []

print("Completed writing {:d} tokens.".format(len(sorted_keys)))
fop.close()

return

def get_token_count_map(token_df):
# Read in the token count file and create a dict.
token_dict = {}
type_y = np.array(token_df['token_name'])

for idx in range(token_df.shape[0]): # First fill the dict with the token counts
token_dict[token_df.iloc[idx,0]] = token_df.iloc[idx,1]

def generate_pe_tokens(mp_params):
# Parse a bunch of PE/COFF headers dumped by objdump and extract
# section names, import DLLs, import functions and exported functions.
file_list = mp_params.file_list
out_count_file = mp_params.count_file

psections = re.compile('\s+\d{1,2}\s+(\.\w+|\w+)\s+\d+')  # Pattern for section names.
pdlls = re.compile('\s+DLL Name: (\w+)')                  # Pattern for import DLL names.
pfunctions = re.compile('\s+\w+\s+\d{1,4}\s+(.+)')        # Pattern for import function names.
preloc = re.compile('\s+reloc')                           # Pattern for relocation entries.
pexports = re.compile('\s+\[\s*\d+\]\s+(\w+)')            # Pattern for exported function names.

token_counter_map = {}
counter = 0
pid = os.getpid()

for idx, fname in enumerate(file_list):

fip = open(fname, 'r')

counter += 1

for line in in_lines:

line = line.rstrip() # get rid of newlines they are annoying.
token_val = ""

m = preloc.match(line)
if m != None:
#token_val = m.group(2)
continue

m = psections.match(line)
if m != None:
token_val = m.group(1)
#print("Section: {:s}".format(token_val))
else:
m = pdlls.match(line)
if m != None:
token_val = m.group(1)
else:
m = pfunctions.match(line)
if m != None:
token_val = m.group(1)
else:
m = pexports.match(line)
if m != None:
token_val = m.group(1)
#print("Export: {:s}".format(token_val))
else:
continue

# Count the token type.
if token_val in token_counter_map.keys():
token_counter_map[token_val] += 1
else:
token_counter_map[token_val] = 1

if (counter % 100) == 0:
print("{:d} Processed {:d} header files.".format(pid, counter))

fip.close()

save_token_counts(token_counter_map, out_count_file)

return

def save_combine(token_counter_map, out_file_name):
# Save the combined token counts.

out_file = "data/" + out_file_name
fop = open(out_file, 'w')
csv_wouter = writer(fop)
cols = ['token_name','count']
csv_wouter.writerow(cols)

outlines = []
sorted_keys = token_counter_map.keys()
sorted_keys.sort()
counter = 0

for key in sorted_keys:
outlines.append([key, token_counter_map[key]])
counter += 1
if (counter % 100) == 0: # write out some lines
csv_wouter.writerows(outlines)
outlines = []
print("Processed token {:s} -> {:d}.".format(key, token_counter_map[key]))

# Finish off.
if (len(outlines) > 0):
csv_wouter.writerows(outlines)
outlines = []

fop.close()

print("Completed writing {:d} tokens.".format(len(sorted_keys)))

return

def combine_token_files(token_file, count_file):
# Function to combine the newly generated token files into one file:
# 1. list data directory
# 2. For each file in file list that matches (\d\d\d\d-pe-header-tokens.csv)
# 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
# 4. Concatenate the unsorted token feature files.
# 5. Sort and write to data/sorted-token-features.csv

p1 = re.compile('\d{3,5}-' + count_file) # This is the PID prefix for each file.
file_list = os.listdir('data/')
counter = 0
token_map = {}

for file_name in file_list:
if p1.match(file_name):
fip = open('data/' + file_name, 'r')
for line in in_lines:
tokens = line.split(',')
if tokens[0] not in token_map.keys():
token_map[tokens[0]] = int(tokens[1])
else:
token_map[tokens[0]] += int(tokens[1])

counter += len(in_lines)
fip.close()

save_combine(token_map, token_file)

print('Completed combine of {:d} PE/COFF header tokens.'.format(counter))

return

class Multi_Params(object):
def __init__(self, tokenfile="", countfile="", filelist=[]):
self.token_file = tokenfile
self.count_file = countfile
self.file_list = filelist

``````
``````

In [3]:

ext_drive = '/opt/vs/apthdr/'
file_list = os.listdir(ext_drive)
tfiles = []

for fname in file_list:
tfiles.append(ext_drive + fname)

``````
``````

In [ ]:

mp1 = Multi_Params(token_file, count_file, tfiles)

generate_pe_tokens(mp1)

``````
``````

In [7]:

combine_token_files(token_file, count_file)

``````
``````

Processed token CryptGetUserKey -> 3.
Processed token GetSystemTimeAsFileTime -> 29.
Processed token IsValidLocale -> 7.
Processed token OutputDebugStringA -> 32.
Processed token SHCreateDirectoryExA -> 7.
Processed token UnhandledExceptionFilter -> 64.
Processed token free -> 120.
Processed token wsprintfA -> 20.
Completed writing 1103 tokens.
Completed combine of 1103 PE/COFF header tokens.

``````

## Test PE Header Feature Extraction

``- feature_extraction_pe_header.py``
``````

In [6]:

# Testing PE header feature extraction.

"BaseOfCode","BaseOfData","ImageBase","SectionAlignment","FileAlignment",
"MajorOSystemVersion","MinorOSystemVersion","MajorImageVersion","MinorImageVersion",
"MajorSubsystemVersion","MinorSubsystemVersion","Win32Version",

field_list_len = len(field_list)

ptime = re.compile("Time/Date\s+(.+)") # Time/Date pattern for PE Header field.

field_vals = [0] * field_list_len

for idx1 in range(0,44): # The PE header fields are the first 44 lines of the file.

tokens = line.split()

for idx2, field_name in enumerate(field_list):

if field_name in tokens:
if field_name.startswith("Time"):
time_match = ptime.match(field_name)
if time_match != None:
time_str = time_match.group(1)
time_s = tm.strptime(time_str, "%a %b %d %H:%M:%S %Y") # Convert time string to epoch int.
time_epoch = tm.mktime(time_s)
else:
time_epoch = 0

field_vals[idx2] = time_epoch

elif len(tokens) > 1:
field_vals[idx2] = int(tokens[1], 16) # Convert the hex value of the field to int.

return field_vals

keywords_values = [0] * klen

for row in asm_code:
for i in range(klen):
if keywords[i] in row:
keywords_values[i] += 1
break

return keywords_values

# 1. Get the feature file and token/keyword file names
# 2. Create an array of token/keyword values.
# 3. Iterate throught the PE header file list and counter the occurrence of the keywords in each file.

pid = os.getpid()
feature_file = 'data/' + str(pid) + "-" + multi_parameters.out_file
token_file = 'data/' + multi_parameters.token_file

print('Process id: {:d} - Feature file: {:s} - Keyword file: {:s}'.format(pid, feature_file, token_file))

tokens = list(hdr_pd['token_name'])
tlen = len(tokens)

for idx, token in enumerate(tokens): # Clamp the token name length and demangle C++ names, they are annoying.
token = token.replace('@','').replace('\$','').replace('?','')
if len(token) > 32:
tokens[idx] = token[:32]
else:
tokens[idx] = token

asm_files = [i for i in tfiles if '.pe.txt' in i]
ftot = len(asm_files)

feature_counts = []
with open(feature_file, 'w') as f:

fw = writer(f)

for idx, fname in enumerate(asm_files):

fasm = open(ext_drive + fname, 'r')
fasm.close()

fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.

field_vals = get_field_values(content)

feature_counts.append([fname[0:fname.find('.pe.txt')]] + field_vals + keyword_vals)

# Writing rows after every 10 files processed
if (idx+1) % 1000 == 0:
print("{:d} - {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
fw.writerows(feature_counts)
feature_counts = []

# Writing remaining features
if len(feature_counts) > 0:
fw.writerows(feature_counts)
feature_counts = []

print("{:d} Completed processing {:d} PE header files.".format(pid, ftot))

return

def combine_feature_files(feature_file_name, token_file):
# Function to combine the newly generated PE header feature files into one file:
# 1. list data directory
# 2. For each file in file list that matches (\d\d\d\d-pe-header-features.csv)
# 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
# 4. Concatenate the unsorted pe header feature files.
# 5. Sort and write to data/sorted-pe-header-features.csv

tokens = list(hdr_pd['token_name'])
for idx, token in enumerate(tokens): # Clamp the token name length and demangle C++ names, they are annoying.
token = token.replace('@','').replace('\$','').replace('?','')
if len(token) > 32:
tokens[idx] = token[:32]
else:
tokens[idx] = token

fop = open('data/' + feature_file_name,'w')
colnames = "file_name," + ",".join(field_list) + "," + ",".join(tokens) + "\n"
print("Column names: {:s}".format(colnames))
fop.write(colnames)

p1 = re.compile('\d{3,5}-' + feature_file_name) # This is the PID prefix for each file.
file_list = os.listdir('data/')
counter = 0

for file_name in file_list:
if p1.match(file_name):
fip = open('data/' + file_name, 'r')
fop.writelines(in_lines)
counter += len(in_lines)
fip.close()

fop.close()

# DataFrame.sort() is deprecated, but this is an old version of pandas, does not have sort_values().
sorted_features = features.sort('file_name')
sorted_features.to_csv('data/sorted-' + feature_file_name, index=False)

print('Completed combine of {:d} PE header file features.'.format(counter))

return

class Multi_Params(object):
def __init__(self, outfile="", tokenfile="", fieldnames=[], filelist=[]):
self.out_file = outfile
self.token_file = tokenfile
self.field_names = fieldnames
self.file_list = filelist

``````
``````

In [ ]:

ext_drive = '/opt/vs/apthdr/'
tfiles = os.listdir(ext_drive)

mp1 = Multi_Params(out_file, token_file, header_field_names, tfiles)

combine_feature_files(out_file, token_file)

``````
``````

In [7]:

combine_feature_files(out_file, token_file)

``````
``````

Completed combine of 275 PE header file features.

``````
``````

In [ ]:

combine_feature_files(out_file, token_file)

``````

## Test PE/COFF ASM Feature Extraction.

``````

In [15]:

# Test PE ASM feature extraction.

x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
'eax','ebp','ebx','ecx','edi','esp']

,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
,'wait','xchg','xor']

def count_asm_registers(asm_code):
registers_values = [0]*len(x86_registers) # Need to optimise this init stuff to global vars.

for row in asm_code:
parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
.replace('-',' ').split()

for idx, register in enumerate(x86_registers):
registers_values[idx] += parts.count(register)

return registers_values

def count_asm_opcodes(asm_code):
opcodes_values = [0]*len(x86_opcodes)

for row in asm_code:
parts = row.split()

for idx, opcode in enumerate(x86_opcodes):
if opcode in parts:
opcodes_values[idx] += 1
break

return opcodes_values

def extract_asm_features(multi_param):

pid = os.getpid()
feature_file = 'data/' + str(pid) + "-" + multi_param.temp_file
print('Process id: {:d} feature file: {:s}'.format(pid,feature_file))

# Do this in call graph feature extraction.
#fapi = open("data/APIs.txt")
#defined_apis = defined_apis[0].split(',')

asm_files = [i for i in tfiles if '.asm' in i]
ftot = len(asm_files)

feature_counts = []
with open(feature_file, 'w') as f:

fw = writer(f)

for idx, fname in enumerate(asm_files):

fasm = open(ext_drive + fname, 'r')
fasm.close()

fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.

reg_vals = count_asm_registers(content)
opc_vals = count_asm_opcodes(content)
#api_vals = count_asm_APIs(content, defined_apis) put in Call Graph features.
#mis_vals = count_asm_misc(content) mostly already in PE header and call graph features.
count_vals = reg_vals + opc_vals # + api_vals + mis_vals

feature_counts.append([fname[:fname.find('.asm')]] + count_vals)

# Writing rows after every 10 files processed
if (idx+1) % 10 == 0:
print("{:d} {:d} of {:d} files processed.".format(pid, idx + 1, ftot))
fw.writerows(feature_counts)
feature_counts = []

# Writing remaining files
if len(feature_counts) > 0:
fw.writerows(feature_counts)
feature_counts = []

return

def combine_asm_files(out_file, temp_file):
# Function to combine the newly generated asm feature files into one file:
# 1. list data directory
# 2. For each file in file list that matches (\d\d\d\d-asm-features.csv)
# 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
# 4. Concatenate the unsorted asm feature files.
# 5. Sort and write to data/sorted-asm-features.csv
fop = open('data/' + out_file,'w')
colnames = "file_name," + ",".join(x86_registers) + "," + ",".join(x86_opcodes) + "\n"
fop.write(colnames)

print("Column names: {:s}".format(colnames))

p1 = re.compile('\d{3,5}-' + temp_file) # This is the PID prefix for each file.
file_list = os.listdir('data/')
counter = 0

for file_name in file_list:
if p1.match(file_name):
fip = open('data/' + file_name, 'r')
fop.writelines(in_lines)
counter += len(in_lines)
fip.close()

print('Completed combine of {:d} ASM features.'.format(counter))

fop.close()

# DataFrame.sort() is deprecated, but this is an old version of pandas, does not have sort_values().
sorted_asms = asms.sort('file_name')
sorted_asms.to_csv('data/sorted-' + out_file, index=False)

return

class Multi_Params(object):
def __init__(self, featurefile="", tempfile="", filelist=[]):
self.feature_file = featurefile
self.temp_file = tempfile
self.file_list = filelist

``````
``````

In [2]:

# Includes x86 and amd64 registers and opcodes.
x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
'eax','ebp','ebx','ecx','edi','esp','rax','rbx','rcx','rdx','rsi',
'rdi','rbp','rsp','r8','r9','r10','r11','r12','r13','r14','r15']

x86_opcodes = ['mov','movabs','movbe','movsbl','movsbw','movswl','movsbq','movswq','movslq','movsx',
'movsxd','movzb','movzw','movzx','push','pusha','pop','popa','xchg','in','out','lea',
'lds','les','lfs','lgs','lss','clc','cld','cli','clts','cmc','lahf','sahf','pushf',
'cdq','cqo','cbtw','cltq','cwtl','cwtd','cltd','cqto','mul','imul','div','idiv',
'rol','ror','rcl','rcr','sal','shl','shr','sar','shld','shrd','call','lcall',
'jmp','ljmp','ret','lret','retf','enter','leave','jo','jno','jb','jc','jnae','jnb',
'jnc','jae','je','jz','jne','jnz','jbe','jna','jnbe','ja','js','jns','jp','jpe',
'jnp','jpo','jl','jnge','jnl','jge','jle','jng','jnle','jg','jcxz','jecxz','jrcxz',
'loop','loopz','loope','loopnz','loopne','seto','setno','setb','setc','setnae',
'setnb','setnc','setae','sete','setz','setne','setnz','setbe','setna','setnbe',
'seta','sets','setns','setp','setpe','setnp','setpo','setl','setnge','setnl',
'setge','setle','setng','setnle','setg','cmps','scmp','ins','outs','lods',
'slod','movs','smov','scas','ssca','stos','ssto','xlat','bsf','bsr','bt',
'btc','btr','bts','int','int3','into','iret','rsm','bound','hlt','nop','arpl',
'lar','lgdt','lidt','lldt','lmsw','lsl','ltr','sgdt','sidt','sldt','smsw','str',
'verr','verw','fld','fild','fildll','fldt','fbld','fst','fist','fstp','fistp',
'fistpll','fstpt','fbstp','fxch','fcom','ficom','fcomp','ficomp','fcompp','fucom',
'fucomp','fucompp','ftst','fxam','fld1','fldl2t','fldl2e','fldpi','fldlg2','fldln2',
'fmul','fimul','fmulp','fdiv','fidiv','fdivp','fdivr','fidivr','fdivrp','f2xm1',
'fyl2x','fptan','fpatan','fxtract','fprem1','fdecstp','fincstp','fprem','fyl2xp1',
'fsqrt','fsincos','frndint','fscale','fsin','fcos','fchs','fabs','fninit','finit',
'fldcw','fnstcw','fstcw','fnstsw','fstsw','fnclex','fclex','fnstenv','fstenv',
'fldenv','fnsave','fsave','frstor','fneni','feni','fndisi','fdisi','fnsetpm',
'gs','ss','rep','repe','repz','repne','repnz','ht','hnt','rex','rexz','rexy',
'rexyz','rexx','rexxz','rexxy','rexxyz','rex64','rex64z','rex64y','rex64yz',
'invlpg','cpuid','wrmsr','rdtsc','rdmsr','cmpxchg8b','sysenter','sysexit','fxsave',
'fxsave64','fxrstor','fxrstor64','rdpmc','ud2','ud2a','ud1','ud2b','cmovo','cmovno',
'cmovb','cmovc','cmovnae','cmovae','cmovnc','cmovnb','cmove','cmovz','cmovne',
'cmovnz','cmovbe','cmovna','cmova','cmovnbe','cmovs','cmovns','cmovp','cmovnp',
'cmovl','cmovnge','cmovge','cmovnl','cmovle','cmovng','cmovg','cmovnle','cmovpe',
'cmovpo','fcmovb','fcmovnae','fcmove','fcmovbe','fcmovna','fcmovu','fcmovae',
'fcmovnb','fcmovne','fcmova','fcmovnbe','fcmovnu','fcomi','fucomi','fcomip',
'fcompi','fucomip','fucompi','movnti','clflush','lfence','mfence','pause','emms',
'psubsb','psubsw','psubusb','psubusw','punpckhbw','punpckhwd','punpckhdq',
'cmpeqps','cmpeqss','cmpleps','cmpless','cmpltps','cmpltss','cmpneqps','cmpneqss',
'cmpnleps','cmpnless','cmpnltps','cmpnltss','cmpordps','cmpordss','cmpunordps',
'cmpunordss','cmpps','cmpss','comiss','cvtpi2ps','cvtps2pi','cvtsi2ss','cvtss2si',
'minps','minss','movaps','movhlps','movhps','movlhps','movlps','movmskps','movntps',
'movntq','movntdq','movss','movups','mulps','mulss','orps','pavgb','pavgw','pextrw',
'pinsrw','pmaxsw','pmaxub','pminsw','pminub','pmovmskb','pmulhuw','prefetchnta',
'rsqrtss','sfence','shufps','sqrtps','sqrtss','stmxcsr','subps','subss','ucomiss',
'cmplepd','cmplesd','cmpltpd','cmpltsd','cmpneqpd','cmpneqsd','cmpnlepd','cmpnlesd',
'cmpnltpd','cmpnltsd','cmpordpd','cmpordsd','cmpunordpd','cmpunordsd','cmppd',
'cmpsd','comisd','cvtpi2pd','cvtsi2sd','divpd','divsd','maxpd','maxsd','minpd',
'minsd','movapd','movhpd','movlpd','movmskpd','movntpd','movsd','movupd','mulpd',
'mulsd','orpd','shufpd','sqrtpd','sqrtsd','subpd','subsd','ucomisd','unpckhpd',
'unpcklpd','xorpd','cvtdq2pd','cvtpd2dq','cvtdq2ps','cvtpd2pi','cvtpd2ps','cvtps2pd',
'cvtps2dq','cvtsd2si','cvtsd2ss','cvtss2sd','cvttpd2pi','cvttsd2si','cvttpd2dq',
'monitor','movddup','movshdup','movsldup','mwait','vmcall','vmclear','vmlaunch',
'pabsd','blendpd','blendps','blendvpd','blendvps','dppd','dpps','extractps','insertps',
'pextrq','phminposuw','pinsrb','pinsrd','pinsrq','pmaxsb','pmaxsd','pmaxud','pmaxuw',
'pminsb','pminsd','pminud','pminuw','pmovsxbw','pmovsxbd','pmovsxbq','pmovsxwd',
'pmovsxwq','pmovsxdq','pmovzxbw','pmovzxbd','pmovzxbq','pmovzxwd','pmovzxwq',
'pmovzxdq','pmuldq','pmulld','ptest','roundpd','roundps','roundsd','roundss',
'pcmpgtq','pcmpestri','pcmpestrm','pcmpistri','pcmpistrm','crc32','xsave',
'xsave64','xrstor','xrstor64','xgetbv','xsetbv','xsaveopt','xsaveopt64','aesdec',
'aesdeclast','aesenc','aesenclast','aesimc','aeskeygenassist','pclmulqdq',
'vcmpeqps','vcmpeqsd','vcmpeqss','vcmpeq_uqpd','vcmpeq_uqps','vcmpeq_uqsd',
'vcmpeq_uqss','vcmpeq_uspd','vcmpeq_usps','vcmpeq_ussd','vcmpeq_usss','vcmpfalse_ospd',
'vcmpfalse_osps','vcmpfalse_ossd','vcmpfalse_osss','vcmpfalsepd','vcmpfalseps',
'vcmpfalsesd','vcmpfalsess','vcmpge_oqpd','vcmpge_oqps','vcmpge_oqsd','vcmpge_oqss',
'vcmpgepd','vcmpgeps','vcmpgesd','vcmpgess','vcmpgt_oqpd','vcmpgt_oqps',
'vcmpgt_oqsd','vcmpgt_oqss','vcmpgtpd','vcmpgtps','vcmpgtsd','vcmpgtss',
'vcmple_oqpd','vcmple_oqps','vcmple_oqsd','vcmple_oqss','vcmplepd','vcmpleps',
'vcmplesd','vcmpless','vcmplt_oqpd','vcmplt_oqps','vcmplt_oqsd','vcmplt_oqss',
'vcmpltpd','vcmpltps','vcmpltsd','vcmpltss','vcmpneq_oqpd','vcmpneq_oqps',
'vcmpneq_oqsd','vcmpneq_oqss','vcmpneq_ospd','vcmpneq_osps','vcmpneq_ossd',
'vcmpneq_osss','vcmpneqpd','vcmpneqps','vcmpneqsd','vcmpneqss','vcmpneq_uspd',
'vcmpneq_usps','vcmpneq_ussd','vcmpneq_usss','vcmpngepd','vcmpngeps','vcmpngesd',
'vcmpngess','vcmpnge_uqpd','vcmpnge_uqps','vcmpnge_uqsd','vcmpnge_uqss','vcmpngtpd',
'vcmpngtps','vcmpngtsd','vcmpngtss','vcmpngt_uqpd','vcmpngt_uqps','vcmpngt_uqsd',
'vcmpngt_uqss','vcmpnlepd','vcmpnleps','vcmpnlesd','vcmpnless','vcmpnle_uqpd',
'vcmpnle_uqps','vcmpnle_uqsd','vcmpnle_uqss','vcmpnltpd','vcmpnltps','vcmpnltsd',
'vcmpnltss','vcmpnlt_uqpd','vcmpnlt_uqps','vcmpnlt_uqsd','vcmpnlt_uqss','vcmpordpd',
'vcmpordps','vcmpordsd','vcmpord_spd','vcmpord_sps','vcmpordss','vcmpord_ssd',
'vcmpord_sss','vcmppd','vcmpps','vcmpsd','vcmpss','vcmptruepd','vcmptrueps',
'vcmptruesd','vcmptruess','vcmptrue_uspd','vcmptrue_usps','vcmptrue_ussd',
'vcmptrue_usss','vcmpunordpd','vcmpunordps','vcmpunordsd','vcmpunord_spd',
'vcmpunord_sps','vcmpunordss','vcmpunord_ssd','vcmpunord_sss','vcomisd',
'vcomiss','vcvtdq2pd','vcvtdq2ps','vcvtpd2dq','vcvtpd2dqx','vcvtpd2dqy','vcvtpd2ps',
'vcvtpd2psx','vcvtpd2psy','vcvtps2dq','vcvtps2pd','vcvtsd2si','vcvtsd2ss',
'vcvtsi2sd','vcvtsi2ss','vcvtss2sd','vcvtss2si','vcvttpd2dq','vcvttpd2dqx',
'vcvttpd2dqy','vcvttps2dq','vcvttsd2si','vcvttss2si','vdivpd','vdivps','vdivsd',
'vminsd','vminss','vmovapd','vmovaps','vmovd','vmovddup','vmovdqa','vmovdqu',
'vmovhlps','vmovhpd','vmovhps','vmovlhps','vmovlpd','vmovlps','vmovmskpd','vmovmskps',
'vmovntdq','vmovntdqa','vmovntpd','vmovntps','vmovq','vmovsd','vmovshdup',
'vmulss','vorpd','vorps','vpabsb','vpabsd','vpabsw','vpackssdw','vpacksswb',
'vpblendw','vpcmpeqb','vpcmpeqd','vpcmpeqq','vpcmpeqw','vpcmpestri','vpcmpestrm',
'vpcmpgtb','vpcmpgtd','vpcmpgtq','vpcmpgtw','vpcmpistri','vpcmpistrm','vperm2f128',
'vpmaxuw','vpminsb','vpminsd','vpminsw','vpminub','vpminud','vpminuw','vpmovmskb',
'vpmovsxbd','vpmovsxbq','vpmovsxbw','vpmovsxdq','vpmovsxwd','vpmovsxwq','vpmovzxbd',
'vpmovzxbq','vpmovzxbw','vpmovzxdq','vpmovzxwd','vpmovzxwq','vpmuldq','vpmulhrsw',
'vpshufd','vpshufhw','vpshuflw','vpsignb','vpsignd','vpsignw','vpslld','vpslldq',
'vpsubd','vpsubq','vpsubsb','vpsubsw','vpsubusb','vpsubusw','vpsubw','vptest',
'vpunpckhbw','vpunpckhdq','vpunpckhqdq','vpunpckhwd','vpunpcklbw','vpunpckldq',
'vpunpcklqdq','vpunpcklwd','vpxor','vrcpps','vrcpss','vroundpd','vroundps',
'vroundsd','vroundss','vrsqrtps','vrsqrtss','vshufpd','vshufps','vsqrtpd','vsqrtps',
'vsqrtsd','vsqrtss','vstmxcsr','vsubpd','vsubps','vsubsd','vsubss','vtestpd',
'vtestps','vucomisd','vucomiss','vunpckhpd','vunpckhps','vunpcklpd','vunpcklps',
'vpsllvq','vpsravd','vpsrlvd','vpsrlvq','vgatherdpd','vgatherdps','vgatherqpd',
'vgatherqps','vpgatherdd','vpgatherdq','vpgatherqd','vpgatherqq','vaesdec','vaesdeclast',
'vaesenc','vaesenclast','vaesimc','vaeskeygenassist','vpclmulqdq','vpclmullqlqdq',
'vpclmulhqlqdq','vpclmullqhqdq','vpclmulhqhqdq','rdfsbase','rdgsbase','rdrand',
'vfmsub132pd','vfmsub132ps','vfmsub213pd','vfmsub213ps','vfmsub231pd','vfmsub231ps',
'vfmsub132sd','vfmsub132ss','vfmsub213sd','vfmsub213ss','vfmsub231sd','vfmsub231ss',
'vfnmsub213ps','vfnmsub231pd','vfnmsub231ps','vfnmsub132sd','vfnmsub132ss',
'vfnmsub213sd','vfnmsub213ss','vfnmsub231sd','vfnmsub231ss','xacquire','xrelease',
'xabort','xbegin','xend','xtest','bzhi','mulx','pdep','pext','rorx','sarx','shlx',
'vfrczpd','vfrczps','vfrczsd','vfrczss','vpcmov','vpcomb','vpcomd','vpcomq',
'vpcomub','vpcomud','vpcomuq','vpcomuw','vpcomw','vpermil2pd','vpermil2ps',
'vpcomltb','vpcomltd','vpcomltq','vpcomltub','vpcomltud','vpcomltuq','vpcomltuw',
'vpcomltw','vpcomleb','vpcomled','vpcomleq','vpcomleub','vpcomleud','vpcomleuq',
'vpcomleuw','vpcomlew','vpcomgtb','vpcomgtd','vpcomgtq','vpcomgtub','vpcomgtud',
'vpcomgtuq','vpcomgtuw','vpcomgtw','vpcomgeb','vpcomged','vpcomgeq','vpcomgeub',
'vpcomgeud','vpcomgeuq','vpcomgeuw','vpcomgew','vpcomeqb','vpcomeqd','vpcomeqq',
'vpcomequb','vpcomequd','vpcomequq','vpcomequw','vpcomeqw','vpcomneqb','vpcomneqd',
'vpcomneqq','vpcomnequb','vpcomnequd','vpcomnequq','vpcomnequw','vpcomneqw',
'vpcomfalseb','vpcomfalsed','vpcomfalseq','vpcomfalseub','vpcomfalseud',
'vpcomfalseuq','vpcomfalseuw','vpcomfalsew','vpcomtrueb','vpcomtrued','vpcomtrueq',
'vpmacsdd','vpmacsdqh','vpmacsdql','vpmacssdd','vpmacssdqh','vpmacssdql',
'vpshld','vpshlq','vpshlw','llwpcb','slwpcb','lwpval','lwpins','andn','bextr','blsi',
'blsmsk','blsr','tzcnt','blcfill','blci','blcic','blcmsk','blcs','blsfill','blsic',
't1mskc','tzmsk','prefetch','prefetchw','femms','pavgusb','pf2id','pf2iw','pfacc',
'pfrcp','pfrcpit1','pfrcpit2','pfrsqit1','pfrsqrt','pfsub','pfsubr','pi2fd','pi2fw',
'pmulhrw','pswapd','syscall','sysret','swapgs','rdtscp','clgi','invlpga','skinit',
'popcnt','lzcnt','xstore','xcrypt','montmul','xsha1','xsha256','xstorerng',
'rdseed','clac','stac','bnd','bndmk','bndmov','bndcl','bndcu','bndcn','bndstx',
'bndldx','sha1rnds4','sha1nexte','sha1msg1','sha1msg2','sha256rnds2','sha256msg1','sha256msg2',
'kandnw','kandw','korw','kxnorw','kxorw','kmovw','knotw','kortestw','kshiftlw',
'kshiftrw','kunpckbw','valignd','vpternlogd','valignq','vpternlogq','vblendmpd',
'vpblendmq','vpermi2pd','vpermi2q','vpermt2pd','vpermt2q','vpmaxsq','vpmaxuq',
'vpminsq','vpminuq','vprolvq','vprorvq','vpsravq','vblendmps','vpblendmd',
'vcmpge_ospd','vcmpgt_ospd','vcmple_ospd','vcmplt_ospd','vcmpneq_uqpd','vcmpnge_uspd',
'vcmpngt_uspd','vcmpnle_uspd','vcmpnlt_uspd','vcmpord_qpd','vcmptrue_uqpd',
'vcmpunord_qpd','vcmpeq_oqps','vcmpfalse_oqps','vcmpge_osps','vcmpgt_osps',
'vcmple_osps','vcmplt_osps','vcmpneq_uqps','vcmpnge_usps','vcmpngt_usps',
'vcmpnle_usps','vcmpnlt_usps','vcmpord_qps','vcmptrue_uqps','vcmpunord_qps',
'vcmpeq_oqsd','vcmpfalse_oqsd','vcmpge_ossd','vcmpgt_ossd','vcmple_ossd','vcmplt_ossd',
'vcmpneq_uqsd','vcmpnge_ussd','vcmpngt_ussd','vcmpnle_ussd','vcmpnlt_ussd',
'vcmpord_qsd','vcmptrue_uqsd','vcmpunord_qsd','vcmpeq_oqss','vcmpfalse_oqss',
'vcmpge_osss','vcmpgt_osss','vcmple_osss','vcmplt_osss','vcmpneq_uqss',
'vcmpnge_usss','vcmpngt_usss','vcmpnle_usss','vcmpnlt_usss','vcmpord_qss',
'vcmptrue_uqss','vcmpunord_qss','vcompresspd','vpcompressq','vpscatterdq',
'vpscatterqq','vscatterdpd','vscatterqpd','vcompressps','vpcompressd','vpscatterdd',
'vscatterdps','vcvtudq2pd','vcvtps2udq','vcvtpd2udq','vcvtsd2usi','vcvtusi2sd',
'vcvtusi2ss','vcvtss2usi','vcvttpd2udq','vcvttps2udq','vcvttsd2usi','vcvttss2usi',
'vcvtudq2ps','vexpandpd','vpexpandq','vexpandps','vpexpandd','vextractf32x4',
'vextracti32x4','vextractf64x4','vextracti64x4','vfixupimmpd','vfixupimmps',
'vfixupimmsd','vgetmantsd','vrndscalesd','vfixupimmss','vgetmantss','vrndscaless',
'vscalefpd','vscalefps','vscalefsd','vscalefss','vgetexppd','vgetexpps',
'vgetexpsd','vgetexpss','vgetmantpd','vrndscalepd','vgetmantps','vrndscaleps',
'vinsertf32x4','vinserti32x4','vinsertf64x4','vinserti64x4','vmovdqa64',
'vmovdqa32','vmovdqu32','vmovdqu64','vrcp14ps','vrsqrt14ps','vpabsq',
'vrcp14pd','vrsqrt14pd','vpandd','vpandnd','vpord','vpxord','vpandnq',
'vpandq','vporq','vpxorq','vpcmpd','vpcmpled','vpcmpltd','vpcmpneqd',
'vpcmpnled','vpcmpnltd','vpcmpud','vpcmpequd','vpcmpleud','vpcmpltud',
'vpcmpnequd','vpcmpnleud','vpcmpnltud','vpcmpq','vpcmpleq','vpcmpltq',
'vpcmpneqq','vpcmpnleq','vpcmpnltq','vpcmpuq','vpcmpequq','vpcmpleuq',
'vpcmpltuq','vpcmpnequq','vpcmpnleuq','vpcmpnltuq','vptestmq','vpmovdb',
'vpmovsdb','vpmovusdb','vpmovdw','vpmovsdw','vpmovusdw','vpmovqb','vpmovsqb',
'vpmovusqb','vpmovqd','vpmovsqd','vpmovusqd','vpmovqw','vpmovsqw','vpmovusqw',
'vprold','vprord','vprolq','vprorq','vpscatterqd','vscatterqps','vpsraq',
'vptestmd','vrcp14sd','vrsqrt14sd','vrcp14ss','vrsqrt14ss','vshuff32x4',
'vexp2ps','vrcp28pd','vrsqrt28pd','vrcp28ps','vrsqrt28ps','vrcp28sd',
'vrsqrt28sd','vrcp28ss','vrsqrt28ss','vgatherpf0dpd','vgatherpf0qpd',
'vgatherpf1dpd','vgatherpf1qpd','vscatterpf0dpd','vscatterpf0qpd',
'vscatterpf1dpd','vscatterpf1qpd','vgatherpf0dps','vgatherpf0qps',
'vgatherpf1dps','vgatherpf1qps','vscatterpf0dps','vscatterpf0qps',
'vscatterpf1dps','vscatterpf1qps','prefetchwt1','clflushopt','xrstors',
'xrstors64','xsaves','xsaves64','xsavec','xsavec64','encls','enclu',
'kandnq','kandq','kmovq','knotq','korq','kortestq','ktestq','kunpckdq',
'kunpckwd','kxnorq','kxorq','kshiftld','kshiftlq','kshiftrd','kshiftrq',
'vpermt2w','vpermw','vpsllvw','vpsravw','vpsrlvw','vpcmpb','vpcmpub',
'vpcmpuw','vpcmpw','vpmovb2m','vpmovm2b','vpmovm2w','vpmovswb','vpmovuswb',
'kandb','kandnb','kmovb','knotb','korb','kortestb','ktestb','kxnorb','kxorb',
'vcvtpd2qq','vcvtpd2uqq','vcvtps2qq','vcvtps2uqq','vcvtqq2pd','vcvtuqq2pd',
'vcvtqq2ps','vcvtqq2psx','vcvtqq2psy','vcvttpd2qq','vcvttpd2uqq','vcvttps2qq',
'vcvttps2uqq','vcvtuqq2ps','vcvtuqq2psx','vcvtuqq2psy','vextractf32x8',
'vextracti32x8','vinsertf32x8','vinserti32x8','vfpclassss','vextractf64x2',
'vextracti64x2','vfpclasssd','vinsertf64x2','vinserti64x2','vfpclasspd',
'vfpclasspdz','vfpclasspdx','vfpclasspdy','vfpclassps','vfpclasspsz','vfpclasspsx',
'vfpclasspsy','vpmovd2m','vpmovm2d','vpmovm2q','vpmovq2m','vpmullq','vrangepd',
'vreducepd','vrangeps','vreduceps','vrangesd','vreducesd','vrangess','vreducess',
'vpermt2b','clzero','monitorx','mwaitx','rdpkru','wrpkru','rdpid']

def count_asm_symbols(asm_code):
symbols = [0]*7
for row in asm_code:
if '*' in row:
symbols[0] += 1
if '-' in row:
symbols[1] += 1
if '+' in row:
symbols[2] += 1
if '[' in row:
symbols[3] += 1
if ']' in row:
symbols[4] += 1
if '@' in row:
symbols[5] += 1
if '?' in row:
symbols[6] += 1

return symbols

def count_asm_APIs(asm_code, apis):
apis_values = [0]*len(apis)
for row in asm_code:
for i in range(len(apis)):
if apis[i] in row:
apis_values[i] += 1
break
return apis_values

def count_asm_misc(asm_code):
keywords_values = [0]*len(keywords)
for row in asm_code:
for i in range(len(keywords)):
if keywords[i] in row:
keywords_values[i] += 1
break
return keywords_values

def count_asm_registers(asm_code):
registers_values = [0]*len(x86_registers) # Need to optimise this init stuff to global vars.

for row in asm_code:
parts = row.replace(',',' ').replace('+',' ').replace('*',' ').replace('[',' ').replace(']',' ') \
.replace('-',' ').split()

for idx, register in enumerate(x86_registers):
registers_values[idx] += parts.count(register)

return registers_values

def count_asm_opcodes(asm_code):
opcodes_values = [0]*len(x86_opcodes)

for row in asm_code:
parts = row.split()

for idx, opcode in enumerate(x86_opcodes):
if opcode in parts:
opcodes_values[idx] += 1
break

return opcodes_values

def extract_asm_features(multi_param):

pid = os.getpid()
feature_file = 'data/' + str(pid) + "-" + multi_param.temp_file
ext_drive = multi_para.ext_drive

lmsg = 'Process id: {:d} feature file: {:s}'.format(pid, feature_file)
print(lmsg)

# Do this in call graph feature extraction.
#fapi = open("data/APIs.txt")
#defined_apis = defined_apis[0].split(',')

flog = open("data/" + str(pid) + "-pe-asm-log.txt", "w")
flog.write(lmsg + "\n")

asm_files = [i for i in tfiles if '.pe.asm' in i]
ftot = len(asm_files)

feature_counts = []
with open(feature_file, 'w') as f:

fw = writer(f)

for idx, fname in enumerate(asm_files):

fasm = open(ext_drive + fname, 'r')
fasm.close()

fname = fname[fname.find("_")+1:] # Remove VirusShare_ from the start of the file name.

reg_vals = count_asm_registers(content)
opc_vals = count_asm_opcodes(content)
#api_vals = count_asm_APIs(content, defined_apis) put in Call Graph features.
#mis_vals = count_asm_misc(content) mostly already in PE header and call graph features.
count_vals = reg_vals + opc_vals # + api_vals + mis_vals

feature_counts.append([fname[:fname.find('.asm')]] + count_vals)

# Writing rows after every 10 files processed
if (idx+1) % 10 == 0:
lmsg = "{:d} {:d} of {:d} files processed.".format(pid, idx + 1, ftot)
print(lmsg)
flog.write(lmsg + "\n")
fw.writerows(feature_counts)
feature_counts = []

# Writing remaining files
if len(feature_counts) > 0:
fw.writerows(feature_counts)
feature_counts = []

lmsg = "{:d} Completed processing {:d} PE ASM files.".format(pid, ftot)
print(lmsg)
flog.write(lmsg + "\n")
flog.close()

return

def combine_asm_files(out_file, temp_file):
# Function to combine the newly generated asm feature files into one file:
# 1. list data directory
# 2. For each file in file list that matches (\d\d\d\d-asm-features.csv)
# 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
# 4. Concatenate the unsorted asm feature files.
# 5. Sort and write to data/sorted-asm-features.csv

fop = open('data/' + out_file, 'w')
colnames = "file_name," + ",".join(x86_registers) + "," + ",".join(x86_opcodes) + "\n"
fop.write(colnames)

print("Column names: {:s}".format(colnames))

p1 = re.compile('\d{3,5}-' + temp_file) # This is the PID prefix for each file.
file_list = os.listdir('data/')
counter = 0

for file_name in file_list:
if p1.match(file_name):
fip = open('data/' + file_name, 'r')
fop.writelines(in_lines)
counter += len(in_lines)
fip.close()

print('Completed combine of {:d} ASM features.'.format(counter))

fop.close()

# DataFrame.sort() is deprecated, but this is an old version of pandas, does not have sort_values().
sorted_asms = asms.sort('file_name')
sorted_asms.to_csv('data/sorted-' + out_file, index=False)

return

class Multi_Params(object):
def __init__(self, featurefile="", tempfile="", extdrive="", filelist=[]):
self.feature_file = featurefile
self.temp_file = tempfile
self.ext_drive = extdrive
self.file_list = filelist

``````
``````

In [ ]:

out_file = 'pe-asm-features-apt.csv'
temp_file = 'pe-asm-temp-apt.csv'
ext_drive = '/opt/vs/asm/'
tfiles = os.listdir(ext_drive)

mp1 = Multi_Params(out_file, temp_file, tfiles)

extract_asm_features(mp1)

combine_asm_files(out_file, temp_file)

``````
``````

In [3]:

# find out what is going on with the IDA Pro disassembly.
fip = open('/opt/vs/asm/filetypes.txt', 'r')
dir_list = os.listdir('/opt/vs/asm/')
file_list = []
counter = 0

for line in lines:
tokens = line.split(':')
file_name = tokens[0]
counter += 1
file_list.append(file_name)

print("PE32 files: {:d} directory files: {:d}".format(counter, len(dir_list)))

for fname in dir_list:
if fname not in file_list:
print("File not in PE32 list: {:s}".format(fname))

``````
``````

PE32 files: 239 directory files: 272
File not in PE32 list: VirusShare_2bd02b41817d227058522cca40acd390.asm
File not in PE32 list: VirusShare_2daa4a4574ba06aa3203ae0e0b45b3b8.asm
File not in PE32 list: VirusShare_827040a5f5ae8de281a63899224b2f3a.asm
File not in PE32 list: VirusShare_67504a0c2c2bf47efccdab5ca981ad7d.asm
File not in PE32 list: VirusShare_1328eaceb140a3863951d18661b097af.asm
File not in PE32 list: VirusShare_95f25d3afc5370f5d9fd8e65c17d3599.asm
File not in PE32 list: VirusShare_a5d4ebc0285f0213e0c29d23bc410889.asm
File not in PE32 list: VirusShare_02c65973b6018f5d473d701b3e7508b2.asm
File not in PE32 list: VirusShare_fc1937c1aa536b3744ebdfb1716fd54d.asm
File not in PE32 list: VirusShare_6e442c5ef460bee4c9457c6bf7a132d6.asm
File not in PE32 list: VirusShare_31e5e58dbdfad05175613e795298ebb5.asm
File not in PE32 list: VirusShare_933b11bc4799f8d9f65466fb2e3ea659.asm
File not in PE32 list: VirusShare_7cb055ac3acbf53e07e20b65ec9126a1.asm
File not in PE32 list: VirusShare_c91eacab7655870764d13ba741aa9a73.asm
File not in PE32 list: VirusShare_4e551abcd14506092a0f8d54a45f3569.asm
File not in PE32 list: VirusShare_6f9992c486195edcf0bf2f6ee6c3ec74.asm
File not in PE32 list: VirusShare_4a54d7878d4170c3d4e3c3606365c42c.asm
File not in PE32 list: VirusShare_7712d05c8b499fc7a1f4a6a6b6dee825.asm
File not in PE32 list: VirusShare_123505024f9e5ff74cb6aa67d7fcc392.asm
File not in PE32 list: VirusShare_00dbb9e1c09dbdafb360f3163ba5a3de.asm
File not in PE32 list: VirusShare_9675827a495f4ba6a4efd4dd70932b7c.asm
File not in PE32 list: VirusShare_ea1b44094ae4d8e2b63a1771a3e61fd5.asm
File not in PE32 list: filetypes.txt
File not in PE32 list: VirusShare_ca327bc83fbe38b3689cd1a5505dfc33.asm
File not in PE32 list: VirusShare_6808ec6dbb23f0fa7637c108f44c5c80.asm
File not in PE32 list: VirusShare_e476e4a24f8b4ff4c8a0b260aa35fc9f.asm
File not in PE32 list: VirusShare_4f763b07a7b8a80f1f9408e590f79532.asm
File not in PE32 list: VirusShare_0b506c6dde8d07f9eeb82fd01a6f97d4.asm
File not in PE32 list: VirusShare_8934aeed5d213fe29e858eee616a6ec7.asm
File not in PE32 list: VirusShare_3de1bd0f2107198931177b2b23877df4.asm
File not in PE32 list: VirusShare_c99fa835350aa9e2427ce69323b061a9.asm
File not in PE32 list: VirusShare_3107de21e480ab1f2d67725f419b28d0.asm
File not in PE32 list: VirusShare_0908d8b3e459551039bade50930e4c1b.asm

``````
``````

In [6]:

def get_unpacked_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
# Load the malware packer id features and file id features from the sample set.

# Get a list of unpacked PE files that are not .NET CIL format.
# IDA Pro cannot disassemble .NET files, have to use Ildisasm.exe in Visual Studio.
unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
not_dot_net = []
counter = 0
dot_net_counter = 0

# Get the trid and file rows that are for unpacked PE files.
trids = trid_id_features[trid_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
fids = file_id_features[file_id_features['file_name'].isin(unpacked_pe_files['file_name'])]

# Iterate over the unpacked PE file list and check if each is a .NET file.
# If not a .NET file then add to file list.
pe_names_list = unpacked_pe_files['file_name']

for idx, file_name in enumerate(pe_names_list):
trid_name = trids.iloc[idx, 1]
fid_name = fids.iloc[idx, 1]
trid_name = trid_name.lower()
fid_name = fid_name.lower()

if trid_name.find('.net') > -1 or fid_name.find('.net') > -1:
#print('Found: {:s} - {:s}'.format(trid_name, fid_name))
dot_net_counter += 1
continue

#print('Found: {:s} - {:s}'.format(trid_name, fid_name))
not_dot_net.append(file_name)
counter += 1

file_list = []
write_list = []
counter = 0

# Iterate over the file list and prepend the full file name.
for file_name in not_dot_net:
full_name = "VirusShare_" + file_name
file_list.append(full_name)
write_list.append(full_name + "\n")
counter += 1

if (len(file_list) > 0):
fop = open('data/temp-unpacked-pe-non-dot-net.txt','w')
fop.writelines(write_list)
fop.close()

print("Got {:d} unpacked PE filenames and {:d} .NET filenames.".format(counter, dot_net_counter))

return file_list

``````
``````

In [8]:

packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'
ext_drive = '/opt/vs/train1/'

unflist = get_unpacked_file_list(packer_id_file, file_id_file, trid_id_file)

file_list = []
completed_list = os.listdir('/opt/vs/train1asm/')
print("Got {:d} completed ASM files.".format(len(completed_list)))
for idx, fname in enumerate(completed_list):
completed_list[idx] = fname[0:fname.find(".asm")]

for idx, fname in enumerate(unflist):
if fname not in completed_list:
file_list.append(ext_drive + fname)

print("Processing {:d} files out of {:d} total unpacked PE files.".format(len(file_list), len(unflist)))

``````
``````

Got 55128 unpacked PE filenames and 348 .NET filenames.
Got 21315 completed ASM files.
Processing 33814 files out of 55128 total unpacked PE files.

``````
``````

In [9]:

33814 + 21315

``````
``````

Out[9]:

55129

``````
``````

In [8]:

def get_unpacked_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
# Load the malware packer id features and file id features from the sample set.

# Get a list of unpacked PE files that are not .NET CIL format.
# IDA Pro cannot disassemble .NET files, have to use Ildisasm.exe in Visual Studio.
unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
not_dot_net = []
counter = 0
dot_net_counter = 0
amd64_bit_counter = 0

# Get the trid and file rows that are for unpacked PE files.
trids = trid_id_features[trid_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
fids = file_id_features[file_id_features['file_name'].isin(unpacked_pe_files['file_name'])]

# Iterate over the unpacked PE file list and check if each is a .NET file.
# If not a .NET file then add to file list.
pe_names_list = unpacked_pe_files['file_name']

for idx, file_name in enumerate(pe_names_list):
trid_name = trids.iloc[idx, 1]
fid_name = fids.iloc[idx, 1]
trid_name = trid_name.lower()
fid_name = fid_name.lower()

#print("Trid: {:s}".format(trid_name))
#print("Fid: {:s}".format(fid_name))

if trid_name.find('.net') > -1 or fid_name.find('.net') > -1:
print('Found: {:s} - {:s}'.format(trid_name, fid_name))
dot_net_counter += 1
continue

if trid_name.find('win64') > -1 or fid_name.startswith('pe32+'):
print('Found: {:s} - {:s}'.format(trid_name, fid_name))
amd64_bit_counter += 1
continue

#print('Found: {:s} - {:s}'.format(trid_name, fid_name))
not_dot_net.append(file_name)
counter += 1

file_list = []
write_list = []
counter = 0

# Iterate over the file list and prepend the full file name.
for file_name in not_dot_net:
full_name = "VirusShare_" + file_name
file_list.append(full_name)
write_list.append(full_name + "\n")
counter += 1

if (len(file_list) > 0):
fop = open('data/temp-unpacked-pe-non-dot-net.txt','w')
fop.writelines(write_list)
fop.close()

print("Got {:d} unpacked PE files.".format(counter))
print("Got {:d} .NET file and {:d} 64 Bit files.".format(dot_net_counter, amd64_bit_counter))

return file_list

``````
``````

In [ ]:

packer_id_file = 'data/sorted-packer-id-features-apt.csv'
file_id_file = 'data/sorted-file-id-features-apt.csv'
trid_id_file = 'data/sorted-trid-id-features-apt.csv'
ext_drive = '/opt/vs/apt/'

unflist = get_unpacked_file_list(packer_id_file, file_id_file, trid_id_file)

``````
``````

In [ ]:

``````

## Test ELF Disassembly.

``````

In [13]:

# Test ELF disassembly.

def get_elf_file_list(ext_drive, packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
# Load the malware packer id features and file id features from the sample set.

counter = 0

file_names_list = file_id_features['file_name']
file_list = []
write_list = []
fid_list = []

for idx, file_name in enumerate(file_names_list):
trid_name = trid_id_features.iloc[idx, 1]
fid_name = file_id_features.iloc[idx, 1]

if trid_name.find('ELF') > -1 or fid_name.find('ELF') > -1:
print('Found: {:s} - {:s}'.format(trid_name, fid_name))
counter += 1
full_name = ext_drive + "VirusShare_" + file_name
write_list =  full_name + "\n"
file_list.append(full_name)
fid_list.append(fid_name)

fop = open('data/elf-file-list.txt','w')
fop.writelines(write_list)
fop.close()

print("Got {:d} ELF filenames.".format(counter))

return file_list, fid_list

def disassemble_elf_binaries(file_list, fid_list):
# Use the command "objdump -d -M intel file_name" to dump out all
# the code sections of the ELF binary and generate assembly code in Intel
# format as this is easier to read and better for machine learning
# feature extraction.
# Use the command "objdump -g -x file_name -o file_name.txt to dump out

counter = 0
disassed = 0
error_count = 0

print("Disassembling {:d} binary ELF files.".format(len(file_list)))

for idx, file_name in enumerate(file_list):
file_path = file_name.rstrip() # remove the newlines or else !!!
asm_file_name = file_path + ".elf.asm"
hdr_file_name = file_path + ".elf.txt"
fid_name = fid_list[idx]

if (os.path.isfile(file_path)):
fopasm = open(asm_file_name, "w")
# Dump the assembly code listing.
if "Intel" in fid_name:
sub.call(["objdump", "-d", "-M intel", file_path], stdout=fopasm)
#sub.call(["ndisasm", "-d", "-M intel", file_path], stdout=fopasm)
elif "x86" in fid_name:
sub.call(["objdump", "-d", "-M intel", file_path], stdout=fopasm)
elif "ARM" in fid_name:
sub.call(["objdump", "-d", "-marm", file_path], stdout=fopasm)
elif "PowerPC" in fid_name:
sub.call(["objdump", "-d", "-mpowerpc", file_path], stdout=fopasm)
elif "Motorola" in fid_name:
sub.call(["objdump", "-d", "-mm68k", file_path], stdout=fopasm)
elif "SPARC" in fid_name:
sub.call(["objdump", "-d", "-msparc", file_path], stdout=fopasm)
elif "MIPS" in fid_name:
sub.call(["objdump", "-d", "-mmips", file_path], stdout=fopasm)
elif "Renesas" in fid_name: # SuperH
sub.call(["objdump", "-d", "-msh", file_path], stdout=fopasm)

# Dump the ELF section headers.
fophdr = open(hdr_file_name, "w")
fophdr.close()

fopasm.close()

# now delete the binary, we do not need it anymore.
# sub.call(["rm", file_path1])

disassed += 1

else:
#print("Error: file does not exist - {:s}".format(file_path))
error_count += 1

counter += 1
if (counter % 1000) == 0: # print progress
print('Disassembled: {:d} - {:s}'.format(counter, file_path))

print("Disassembled {:d} ELF binaries with {:d} file path errors.".format(disassed, error_count))

#sub.call(["mv", "*.asm", "/opt/vs/asm"])

return

``````
``````

In [7]:

ext_drive = '/opt/vs/train1/'
packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'

unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)

disassemble_elf_binaries(unflist, fidlist)

``````
``````

Found: unknown - ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped
Found: unknown - ELF 32-bit LSB  executable MIPS MIPS-I version 1 (SYSV) statically linked not stripped
Got 2 ELF filenames.
Disassembling 2 binary ELF files.
Disassembled 2 ELF binaries with 0 file path errors.

``````
``````

In [8]:

ext_drive = '/opt/vs/train2/'
packer_id_file = 'data/sorted-packer-id-features-vs252.csv'
file_id_file = 'data/sorted-file-id-features-vs252.csv'
trid_id_file = 'data/sorted-trid-id-features-vs252.csv'

unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)

disassemble_elf_binaries(unflist, fidlist)

``````
``````

Found: unknown - ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped
Found: unknown - ELF 32-bit LSB  executable ARM version 1 statically linked not stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) dynamically linked (uses shared libs) for GNU/Linux 2.0.0 stripped
Got 3 ELF filenames.
Disassembling 3 binary ELF files.
Disassembled 3 ELF binaries with 0 file path errors.

``````
``````

In [14]:

ext_drive = '/opt/vs/train3/'
packer_id_file = 'data/sorted-packer-id-features-vs263.csv'
file_id_file = 'data/sorted-file-id-features-vs263.csv'
trid_id_file = 'data/sorted-trid-id-features-vs263.csv'

unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)

disassemble_elf_binaries(unflist, fidlist)

``````
``````

Found: unknown - ELF 32-bit LSB  executable ARM EABI4 version 1 (SYSV) statically linked for GNU/Linux 2.6.14 stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.2.5 not stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (GNU/Linux) statically linked for GNU/Linux 2.6.18 not stripped
Found: unknown - ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.6.9 not stripped
Found: unknown - ELF 32-bit LSB  executable ARM EABI5 version 1 (SYSV) statically linked for GNU/Linux 2.6.16 not stripped
Got 5 ELF filenames.
Disassembling 5 binary ELF files.
Disassembled 5 ELF binaries with 0 file path errors.

``````
``````

In [ ]:

``````
``````

In [ ]:

``````

## Generate Instruction Sets for Various Computer Architectures/Processors/JVM.

``````

In [ ]:

ext_drive = '/opt/vs/train4/'
packer_id_file = 'data/sorted-packer-id-features-vs264.csv'
file_id_file = 'data/sorted-file-id-features-vs264.csv'
trid_id_file = 'data/sorted-trid-id-features-vs264.csv'

unflist, fidlist = get_elf_file_list(ext_drive, packer_id_file, file_id_file, trid_id_file)

disassemble_elf_binaries(unflist, fidlist)

``````
``````

In [ ]:

fip = open('data/amd64-instruction-set.txt')
inlines

``````
``````

In [ ]:

opcode_list = []
for line in inlines:
tokens = line.rstrip()
opcode_list.append(tokens.lower())

opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str

``````
``````

In [ ]:

fip = open('data/arm-instruction-set.txt')
inlines

``````
``````

In [4]:

opcode_list = []
for line in inlines:
tokens = line.split()
opcode_list.append(tokens[0].lower())

opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str

``````
``````

Out[4]:

``````
``````

In [ ]:

fip = open('data/sparc-instruction-set.txt')
inlines

``````
``````

In [ ]:

opcode_list = []
for line in inlines:
tokens = line.split()
opcode_list.append(tokens[0].lower())

opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str

``````
``````

In [ ]:

fip = open('data/powerpc-instruction-set.txt')
inlines

``````
``````

In [ ]:

opcode_list = []
for line in inlines:
tokens = line.replace('[', ' ').split()
opcode_list.append(tokens[0].lower())

opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str

``````
``````

In [6]:

len(opcode_list)

``````
``````

Out[6]:

223

``````
``````

In [ ]:

fip = open('data/powerpc-version-202-instruction-set.txt')
inlines

``````
``````

In [ ]:

opcode_list = []
for line in inlines:
tokens = line.replace('[', ' ').rstrip().split()
opcode_list.append(tokens[0].lower())

opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str

``````
``````

In [9]:

len(opcode_list)

``````
``````

Out[9]:

233

``````
``````

In [ ]:

``````
``````

In [ ]:

fip = open('data/motorola-instruction-set.txt')
inlines

``````
``````

In [ ]:

``````
``````

In [ ]:

fip = open('data/mips-instruction-set.txt')
inlines

``````
``````

In [ ]:

``````
``````

In [ ]:

``````
``````

In [ ]:

``````
``````

In [2]:

# Check file id strings for ELF executables
fip = open('data/sorted-file-id-features-vs251.csv')
for line in inlines:
if "ELF" in line:
print("-> {:s}".format(line))

``````
``````

-> abbde81d7f4733c16046cbd8ee7409d3,ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped,56

-> f04f278048fc082dd5d0f34efa3c05f8,ELF 32-bit LSB  executable MIPS MIPS-I version 1 (SYSV) statically linked not stripped,475

``````
``````

In [3]:

fip = open('data/sorted-file-id-features-vs252.csv')
for line in inlines:
if "ELF" in line:
print("-> {:s}".format(line))

``````
``````

-> c6813bcaf9a2801973e9c44fe75ef75b,ELF 32-bit MSB  executable PowerPC or cisco 4500 version 1 (SYSV) statically linked not stripped,56

-> cbb492024bdd2484f39893ab77da0cae,ELF 32-bit LSB  executable ARM version 1 statically linked not stripped,216

-> fa390c69553d757c3a10737a0a8604dc,ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) dynamically linked (uses shared libs) for GNU/Linux 2.0.0 stripped,463

``````
``````

In [4]:

fip = open('data/sorted-file-id-features-vs263.csv')
for line in inlines:
if "ELF" in line:
print("-> {:s}".format(line))

``````
``````

-> 480813ec6548a4e55245a0e446e63c36,ELF 32-bit LSB  executable ARM EABI4 version 1 (SYSV) statically linked for GNU/Linux 2.6.14 stripped,188

-> 5b88e0490dd764e66e13c8a543099c9d,ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.2.5 not stripped,192

-> 62d33be03ef3bc9c81d703898fc0e18c,ELF 32-bit LSB  executable Intel 80386 version 1 (GNU/Linux) statically linked for GNU/Linux 2.6.18 not stripped,349

-> 7a891a96d6af45865e5fe6142b40eb77,ELF 32-bit LSB  executable Intel 80386 version 1 (SYSV) statically linked for GNU/Linux 2.6.9 not stripped,447

-> af8970eb045a77ad1c427eb6333c9efd,ELF 32-bit LSB  executable ARM EABI5 version 1 (SYSV) statically linked for GNU/Linux 2.6.16 not stripped,176

``````
``````

In [ ]:

fip = open('data/sorted-file-id-features-vs264.csv')
for line in inlines:
if "ELF" in line:
print("-> {:s}".format(line))

``````
``````

In [ ]:

``````

## Generate Assembly Instruction Sets for Various Computer Architectures.

``````

In [ ]:

fip = open("/home/derek/binutils.txt")
inlines

``````
``````

In [9]:

package_list = []
for line in inlines:
tokens = line.replace(":", " ").split()
package_list.append(tokens[1])

command = "apt install " + " ".join(package_list)
command

``````
``````

Out[9]:

'apt install binutils binutils binutils-aarch64-linux-gnu binutils-aarch64-linux-gnu binutils-arm-linux-gnueabi binutils-arm-linux-gnueabi binutils-arm-linux-gnueabihf binutils-arm-linux-gnueabihf binutils-arm-none-eabi binutils-arm-none-eabi binutils-avr binutils-avr binutils-dev binutils-dev binutils-doc binutils-gold binutils-gold binutils-h8300-hms binutils-h8300-hms binutils-m68hc1x binutils-m68hc1x binutils-mingw-w64 binutils-mingw-w64-i686 binutils-mingw-w64-i686 binutils-mingw-w64-x86-64 binutils-mingw-w64-x86-64 binutils-msp430 binutils-msp430 binutils-multiarch binutils-multiarch binutils-multiarch-dev binutils-multiarch-dev binutils-powerpc-linux-gnu binutils-powerpc-linux-gnu binutils-powerpc64le-linux-gnu binutils-powerpc64le-linux-gnu binutils-source binutils-static binutils-static binutils-z80 binutils-z80 elf-binutils elf-binutils mingw32-binutils mingw32-binutils'

``````
``````

In [ ]:

fip = open("/home/derek/binutils.txt")
inlines

``````
``````

In [5]:

package_list = []
for line in inlines:
if ":i386" not in line:
tokens = line.split()
package_list.append(tokens[1])

command = "apt install " + " ".join(package_list)
command

``````
``````

Out[5]:

'apt install binutils binutils-aarch64-linux-gnu binutils-alpha-linux-gnu binutils-arm-linux-gnueabi binutils-arm-linux-gnueabihf binutils-arm-linux-gnueabihf:i3 binutils-arm-none-eabi binutils-avr binutils-dev binutils-doc binutils-gold binutils-h8300-hms binutils-hppa-linux-gnu binutils-hppa64 binutils-hppa64-linux-gnu binutils-m68hc1x binutils-m68k-linux-gnu binutils-mingw-w64 binutils-mingw-w64-i686 binutils-mingw-w64-x86-64 binutils-mips-linux-gnu binutils-mips64-linux-gnuabi64 binutils-mips64-linux-gnuabi64: binutils-mips64el-linux-gnuabi6 binutils-mips64el-linux-gnuabi6 binutils-mipsel-linux-gnu binutils-msp430 binutils-multiarch binutils-multiarch-dev binutils-powerpc-linux-gnu binutils-powerpc-linux-gnuspe binutils-powerpc-linux-gnuspe:i binutils-powerpc64-linux-gnu binutils-powerpc64-linux-gnu:i3 binutils-powerpc64le-linux-gnu binutils-powerpc64le-linux-gnu: binutils-s390x-linux-gnu binutils-sh4-linux-gnu binutils-source binutils-sparc64-linux-gnu binutils-z80 elf-binutils'

``````
``````

In [6]:

x86_registers = ['edx','esi','es','fs','ds','ss','gs','cs','ah','al',
'ax','bh','bl','bx','ch','cl','cx','dh','dl','dx',
'eax','ebp','ebx','ecx','edi','esp']

,'fistp','fld','fstcw','fstcwimul','fstp','fword','fxch','imul','in','inc'
,'ins','int','jb','je','jg','jge','jl','jmp','jnb','jno','jnz','jo','jz'
,'lea','loope','mov','movzx','mul','near','neg','not','or','out','outs'
,'pop','popf','proc','push','pushf','rcl','rcr','rdtsc','rep','ret','retn'
,'rol','ror','sal','sar','sbb','scas','setb','setle','setnle','setnz'
,'setz','shl','shld','shr','sidt','stc','std','sti','stos','sub','test'
,'wait','xchg','xor']

amd64_registers = ['rax','rbx','rcx','rdx','rsi','rdi','rbp','rsp','r8','r9','r10','r11','r12','r13','r14','r15']

'blcmsk','blcs','blsfill','blsi','blsic','blsmsk','blsr','bound','bsf','bsr','bswap','bt',
'btc','btr','bts','bzhi','call','cbw','cwde','cdqe','cwd','cdq','cqo','clc','cld','clflush','cmc','cmov',
'cmp','cmps','cmpsb','cmpsw','cmpsd','cmpsq','cmpxchg','cmpxchg8b','cmpxchg16b','cpuid',
'crc32','daa','das','dec','div','enter','idiv','imul','in','inc','ins','insb','insw','insd',
'int','into','jcxz','jecxz','jrcxz','jmp','lahf','lds','les','lfs','lgs','lss','lea','leave','lfence',
'llwpcb','lods','lodsb','lodsw','lodsd','lodsq','loop','loope','loopne','loopnz','loopz','lwpins',
'lwpval','lzcnt','mfence','mov','movbe','movd','movmskpd','movmskps','movnti','movs','movsb',
'movsw','movsd','movsq','movsx','movsxd','movzx','mul','mulx','neg','nop','not','or','out',
'rcl','rcr','rdfsbase','rdgsbase','rdrand','ret','rol','ror','rorx','sahf','sal','shl','sar','sarx',
'sbb','scas','scasb','scasw','scasd','scasq','set','sfence','shl','shld','shlx',
'shr','shrd','shrx','slwpcb','stc','std','stos','stosb','stosw','stosd','stosq','sub','t1mskc',
'clts','hlt','int','invd','invlpg','invlpga','iret','iretd','iretq','lar','lgdt','lidt','lldt',
'lmsw','lsl','ltr','monitor','monitorx','mwait','mwaitx','rdmsr','rdpmc','rdtsc','rdtscp',
'rsm','sgdt','sidt','skinit','sldt','smsw','sti','stgi','str','swapgs',
'syscall','sysenter','sysexit','sysret','ud2','verr','verw',

MIPS_registers = []

MIPS_opcodes = []

SPARC_registers = []

SPARC_opcodes = []

ARM_registers = ['r0','r1','r2','r3','r4','r5','r6','r7','r8','r9','r10','r11','r12','r13','r14','r15','cpsr']

'cmp','stc','eor','stm','ldc','str','ldm','strb','ldr','strbt','ldrb','strh','ldrbt','strt','ldrh','sub','ldrsb','swi',
'ldrsh','swp','ldrt','swpb','mcr','teq','mla','tst','mov','umlal','mrc','umull','mrs']

Motorola_registers = ['d0','d1','d2','d3','d4','d5','d6','d7','a0','a1','a2','a3','a4','a5','a6','a7','usp','ssp']

Motorola_opcodes = []

PowerPC_registers = ['r0','r1','r2','r3','r4','r5','r6','r7','r8','r9','r10','r11','r12','r13','r14','r15',
'r16','r17','r18','r19','r20','r21','r22','r23','r24','r25','r26','r27','r28','r29','r30','r31']

'b','bc','bcctr','bclr','cmp','cmpi','cmpl','cmpli','cntlzd','cntlzw','crand','crandc','creqv',
'crnand','crnor','cror','crorc','crxor','dcbf','dcbst','dcbt','dcbtst','dcbz','divd','divdu',
'fre','fres','frsp','frsqrte','frsqrtes','fsel','fsqrt','fsqrts','fsub','fsubs','hrfid','icbi',
'isync','lbz','lbzu','lbzux','lbzx','ld','ldarx','ldu','ldux','ldx','lfd','lfdu','lfdux','lfdx',
'lfs','lfsu','lfsux','lfsx','lha','lhau','lhaux','lhax','lhbrx','lhz','lhzu','lhzux','lhzx','lmw',
'lswi','lswx','lwa','lwarx','lwaux','lwax','lwbrx','lwz','lwzu','lwzux','lwzx','mcrf','mcrfs',
'mcrxr','mfcr','mfocrf','mffs','mfmsr','mfspr','mfsr','mfsrin','mftb','mtcrf','mtocrf','mtfsb0',
'mtfsb1','mtfsf','mtfsfi','mtmsr','mtmsrd','mtspr','mtsr','mtsrin','mulhd','mulhdu','mulhw','mulhwu',
'mulld','mulli','mullw','nand','neg','nor','or','orc','ori','oris','popcntb','rfid','rldcl','rldcr',
'rldic','rldicl','rldicr','rldimi','rlwimi','rlwinm','rlwnm','sc','slbia','slbie','slbmfee',
'stbx','std','stdcx.','stdu','stdux','stdx','stfd','stfdu','stfdux','stfdx','stfiwx','stfs',
'stfsu','stfsux','stfsx','sth','sthbrx','sthu','sthux','sthx','stmw','stswi','stswx','stw',
'stwbrx','stwcx.','stwu','stwux','stwx','subf','subfc','subfe','subfic','subfme','subfze',
'sync','td','tdi','tlbia','tlbie','tlbsync','tw','twi','xor','xori','xoris']

``````
``````

In [7]:

fip = open("data/arm-listing.txt")
inlines[:20]

``````
``````

Out[7]:

['  /* XScale instructions.  */\n',
'  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
'    0x0e200010, 0x0fff0ff0,\n',
'    "mia%c\\tacc0, %0-3r, %12-15r"},\n',
'  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
'    0x0e280010, 0x0fff0ff0,\n',
'    "miaph%c\\tacc0, %0-3r, %12-15r"},\n',
'  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
'    0x0e2c0010, 0x0ffc0ff0, "mia%17\'T%17`B%16\'T%16`B%c\\tacc0, %0-3r, %12-15r"},\n',
'  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
'    0x0c400000, 0x0ff00fff, "mar%c\\tacc0, %12-15r, %16-19r"},\n',
'  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
'    0x0c500000, 0x0ff00fff, "mra%c\\t%12-15r, %16-19r, acc0"},\n',
'\n',
'  /* Intel Wireless MMX technology instructions.  */\n',
'  {ARM_FEATURE_CORE_LOW (0), SENTINEL_IWMMXT_START, 0, "" },\n',
'  {ARM_FEATURE_COPROC (ARM_CEXT_IWMMXT),\n',
'    0x0e130130, 0x0f3f0fff, "tandc%22-23w%c\\t%12-15r"},\n',
'  {ARM_FEATURE_COPROC (ARM_CEXT_XSCALE),\n',
'    0x0e400010, 0x0ff00f3f, "tbcst%6-7w%c\\t%16-19g, %12-15r"},\n']

``````
``````

In [ ]:

# Parse the contents of arm-dis.c in binutils and extract all the ARM opcodes.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
line = line.lstrip()
line = line.replace('\\t',' ')
if len(line) < 10:
continue
if line.startswith('{'):
continue
if line.startswith('"'):
idx = line.find('%')
if idx > 0:
opcode = line[1:idx]
else:
continue
else:
tokens = line.split()
if len(tokens) < 3:
continue
opcodestr = tokens[2]
idx = opcodestr.find('%')
if idx > 0:
opcode = opcodestr[1:idx]
else:
continue

if opcode not in opcode_list:
opcode_list.append(opcode)

opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str

``````
``````

In [ ]:

fip = open("data/m68k-opc.c")
inlines[:20]

``````
``````

In [ ]:

# Parse the contents of m68k-opc.c in binutils and extract all the Motorola opcodes.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
line = line.lstrip()
if len(line) < 10:
continue
if line.startswith("{"):
line = line[2:]
idx = line.find("\"")
if idx > 0:
opcode = line[:idx]
else:
continue
else:
continue

if opcode not in opcode_list:
opcode_list.append(opcode)

opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str

``````
``````

In [ ]:

fip = open("data/mips-instruction-set.txt")
inlines[:20]

``````
``````

In [ ]:

# Parse the contents of mips-instruction-set.txt and extract all the things.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
line = line.lstrip()
if len(line) < 10:
continue
if line.startswith("{"):
line = line[2:]
idx = line.find("\"")
if idx > 0:
opcode = line[:idx]
else:
continue
else:
continue

idx = opcode.find(".")
if idx > 0:
opcode = opcode[:idx]

if opcode not in opcode_list:
opcode_list.append(opcode)

opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str

``````
``````

In [ ]:

fip = open("data/i386-opc.tbl")
inlines[:20]

``````
``````

In [ ]:

# Parse the contents of i386-opc.tbl from binutils and extract all the things.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
if len(line) < 10:
continue
if line.startswith("//"):
continue

line = line.replace(',', ' ')
tokens = line.split()
if len(tokens) > 0:
opcode = tokens[0]
else:
continue

idx = opcode.find(".")
if idx > 0:
opcode = opcode[:idx]

if opcode not in opcode_list:
opcode_list.append(opcode)

opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str

``````
``````

In [2]:

fip = open("data/Java-bytecode-instruction-listing.txt")
inlines[:20]

``````
``````

Out[2]:

['aaload \t32 \t0011 0010 \t\tarrayref, index \xe2\x86\x92 value \tload onto the stack a reference from an array\n',
'aastore \t53 \t0101 0011 \t\tarrayref, index, value \xe2\x86\x92 \tstore into a reference in an array\n',
'aconst_null \t01 \t0000 0001 \t\t\xe2\x86\x92 null \tpush a /null/ reference onto the stack\n',
'aload \t19 \t0001 1001 \t1: index \t\xe2\x86\x92 objectref \tload a reference onto the stack from a local variable /#index/\n',
'aload_0 \t2a \t0010 1010 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 0\n',
'aload_1 \t2b \t0010 1011 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 1\n',
'aload_2 \t2c \t0010 1100 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 2\n',
'aload_3 \t2d \t0010 1101 \t\t\xe2\x86\x92 objectref \tload a reference onto the stack from local variable 3\n',
'anewarray \tbd \t1011 1101 \t2: indexbyte1, indexbyte2 \tcount \xe2\x86\x92 arrayref create a new array of references of length /count/ and component type\n',
'areturn \tb0 \t1011 0000 \t\tobjectref \xe2\x86\x92 [empty] \treturn a reference from a method\n',
'arraylength \tbe \t1011 1110 \t\tarrayref \xe2\x86\x92 length \tget the length of an array\n',
'astore \t3a \t0011 1010 \t1: index \tobjectref \xe2\x86\x92 \tstore a reference into a local variable /#index/\n',
'astore_0 \t4b \t0100 1011 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 0\n',
'astore_1 \t4c \t0100 1100 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 1\n',
'astore_2 \t4d \t0100 1101 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 2\n',
'astore_3 \t4e \t0100 1110 \t\tobjectref \xe2\x86\x92 \tstore a reference into local variable 3\n',
'athrow \tbf \t1011 1111 \t\tobjectref \xe2\x86\x92 [empty], objectref \tthrows an error\n',
'baload \t33 \t0011 0011 \t\tarrayref, index \xe2\x86\x92 value \tload a byte or Boolean value from an array\n',
'bastore \t54 \t0101 0100 \t\tarrayref, index, value \xe2\x86\x92 \tstore a byte or Boolean value into an array\n',
'bipush \t10 \t0001 0000 \t1: byte \t\xe2\x86\x92 value \tpush a /byte/ onto the stack as an integer /value/\n']

``````
``````

In [3]:

# Parse the contents of Java-bytecode-instruction-listing.txt and extract all the things.
opcode_list = []
counter = 0
opcode = 'none'
for line in inlines:
if len(line) < 10:
continue

line = line.replace('\t', ' ')
tokens = line.split()
if len(tokens) > 0:
opcode = tokens[0]
else:
continue

if opcode not in opcode_list:
opcode_list.append(opcode)

opcode_str = "[\'" + "','".join(opcode_list) + "\']"
opcode_str

``````
``````

Out[3]:

``````
``````

In [4]:

len(opcode_list)

``````
``````

Out[4]:

205

``````
``````

In [ ]:

``````
``````

In [ ]:

``````

## Test PDF Keywords/Names Feature Extraction.

``````

In [ ]:

import os
from csv import writer
import numpy as np
import pandas as pd

# Start of Script

target_dir = "/opt/vs/legitware/"
out_file = "data/pdf-features-legit.csv"
pdf_token_file = "data/2716-pdf-token-counts-non-malicious-set.csv"
#out_file = "data/pdf-features-vs251.csv"

file_list = os.listdir(target_dir)
pdflist = []

for fname in enumerate(file_list):
if fname.endswith('.pdf'):
pdflist.append(target_dir + fname)

print("Got {:d} PDF files.".format(len(pdflist)))

``````
``````

In [ ]:

``````
``````

In [ ]:

``````
``````

In [ ]:

``````