extracting assembly instruction sequences

Aaron Gonzales

Uses helper functions from the `asm_utils` module in the `feat_extraction` folder, together with the instruction list in `asm_instructions.csv`.


In [1]:
%load_ext autoreload
import re
%aimport asm_utils

In [2]:
# Handle to the mongo collection of malware samples; every later cell
# reads from or writes to the database through this object.
samples = asm_utils.get_collection()

In [3]:
# Pull the ids of all class-5 samples out of mongo and keep the first one
# around as a convenient test case.
names = [
    record['id']
    for record in samples.find({'class': '5'})
]
test_file = names[0]
test_file


Out[3]:
'1IpWLz6eyhVxDAfQMKEd'

In [4]:
# Look up one known sample by its id to use as a fixture, then echo the id
# back as a sanity check that the lookup succeeded.
test_id = '0qjuDC7Rhx9rHkLlItAp'
post_test = samples.find_one(dict(id=test_id))
print(post_test['id'])


0qjuDC7Rhx9rHkLlItAp

In [5]:
# Read the instruction list: one entry per line of the CSV, with the
# surrounding whitespace (including the newline) stripped off.
with open('asm_instructions.csv', 'r') as f:
    asm_instr = list(map(str.strip, f))

In [104]:
# Run the extraction pipeline by hand on the single test document.
# `post_test` is the fixture fetched with find_one(); the original cell used
# `post`, which is never bound at module scope (the comprehension variable
# does not leak in Python 3), and called `asm_seq_info` without the
# `asm_utils.` prefix used everywhere else in this notebook.
asm = asm_utils.read_assembly(post_test['id'])
instr_seq = asm_utils.get_inst(asm, asm_instr)
test_asm_info = asm_utils.asm_seq_info(instr_seq)

In [51]:
# test_a = [to_utf(a) for a in asm if len(a) > 1]
# words = [line.split() for line in test_a if line != None]

In [6]:
def make_asm_info(post):
    """Attach assembly information to a mongodb post and save it.

    Reads the assembly for ``post['id']``, extracts its instruction
    sequence against the module-level ``asm_instr`` list, stores the result
    of ``asm_utils.asm_seq_info`` under ``post['asm_info']`` and writes the
    modified post back through the module-level ``samples`` collection.
    """
    post['asm_info'] = asm_utils.asm_seq_info(
        asm_utils.get_inst(asm_utils.read_assembly(post['id']), asm_instr)
    )
    samples.save(post)

In [7]:


In [8]:
# Try the function on the single test document before running it over the
# whole collection; this writes 'asm_info' for that document to the database.
make_asm_info(post_test)

In [13]:
# find() with no filter: iterate over every document in the collection.
documents = samples.find()

In [15]:
# Compute and store assembly info for every document. Each iteration reads
# that sample's assembly and saves the updated document back to mongo, so
# this cell modifies the database.
for doc in documents:
    print('extracting assembly info for %s' % doc['id'])
    make_asm_info(doc)


Out[15]:
10868

In [10]:


In [21]:


In [66]:


In [34]:


In [117]:


In [118]:
# Keep only the conditional-jump mnemonics found in `a`.
# The original list contained 'ne', which is not an x86 mnemonic; the
# counterpart of 'je' is 'jne', so jump-if-not-equal was never matched.
# NOTE(review): `a` is not assigned by any earlier cell visible here; it is
# presumably an iterable of instruction strings left over from a removed
# cell -- confirm before relying on this cell.
jumps = [word for word in a if word in ('je', 'jne', 'jz', 'jg', 'jge', 'jl', 'jle')]

In [113]:


In [114]:
# lcss


Out[114]:
34

In [117]:



Out[117]:
ObjectId('5513a805127d27664d47bd44')

In [130]:
def update_collection(collection, _id, key, value):
    """Set one field on the document whose ``'id'`` field equals ``_id``.

    Parameters
    ----------
    collection
        Object exposing ``update(filter, update_doc)``; the notebook passes
        the mongo ``samples`` collection.
    _id
        Value matched against the document's ``'id'`` field. If it is
        ``None`` a message is printed and nothing is written.
    key
        Name of the field to set.
    value
        New value for that field.

    Returns
    -------
    None
    """
    # Guard on the parameter. The original tested the builtin ``id``
    # function, which is never None, so a missing id slipped through and the
    # update was issued with the filter {'id': None}.
    if _id is None:
        print('you must pass an id')
        return
    print(collection)
    collection.update({'id': _id}, {"$set": {key: value}})

In [12]:


In [8]:


In [131]:
# Smoke test of update_collection: sets 'asm_count' to the placeholder 'x'
# on the document whose id is `test_file`. This writes to the database.
update_collection(samples, test_file, 'asm_count', 'x')


Collection(Database(MongoClient('afruizc-office.cs.unm.edu', 27017), 'malware'), 'samples')

In [60]:



<class 'str'>

In [118]:


In [9]:
# Query the database again for the test document so the stored result can be
# inspected.
a = samples.find({'id': post_test['id']})

In [12]:
# Read back the stored 'asm_info' of the first match.
# 'num_instr' is presumably the instruction count -- confirm against
# asm_utils.asm_seq_info.
a[0]['asm_info']['num_instr']


Out[12]:
2348

In [ ]:


In [ ]: