1. Various Code Testing.


In [1]:
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import re
import subprocess as sub

In [ ]:

2. PE Header Feature Extraction Tests.


In [5]:
# Testing PE header feature extraction.

field_list = [ "Characteristics","Time/Date","Magic","MajorLinkerVersion","MinorLinkerVersion",
"SizeOfCode","SizeOfInitializedData","SizeOfUninitializedData","AddressOfEntryPoint",
"BaseOfCode","BaseOfData","ImageBase","SectionAlignment","FileAlignment",
"MajorOSystemVersion","MinorOSystemVersion","MajorImageVersion","MinorImageVersion",
"MajorSubsystemVersion","MinorSubsystemVersion","Win32Version",
"SizeOfImage","SizeOfHeaders","CheckSum","Subsystem","DllCharacteristics","SizeOfStackReserve",
"SizeOfStackCommit","SizeOfHeapReserve","SizeOfHeapCommit","LoaderFlags","NumberOfRvaAndSizes" ]

field_list_len = len(field_list)

ptime = re.compile("Time/Date\s+(.+)") # Time/Date pattern for PE Header field.






def combine_feature_files(feature_file_name, token_file):
    # Function to combine the newly generated PE header feature files into one file:
    # 1. list data directory
    # 2. For each file in file list that matches (\d\d\d\d-pe-header-features.csv)
    # 3. Trim the filenames if necessary (should remove VirusShare_  prefix).
    # 4. Concatenate the unsorted pe header feature files.
    # 5. Sort and write to data/sorted-pe-header-features.csv
    
    hdr_pd = pd.read_csv('data/' + token_file)
    tokens = list(hdr_pd['token_name'])
    for idx, token in enumerate(tokens): # Clamp the token name length and demangle C++ names, they are annoying.
        token = token.replace('@','').replace('$','').replace('?','')
        if len(token) > 32:
            tokens[idx] = token[:32]
        else:
            tokens[idx] = token
        


    fop = open('data/' + feature_file_name,'w')
    colnames = "file_name," + ",".join(field_list) + "," + ",".join(tokens) + "\n"
    #print("Column names: {:s}".format(colnames))
    fop.write(colnames)                    

    p1 = re.compile('\d{3,5}-' + feature_file_name) # This is the PID prefix for each file.
    file_list = os.listdir('data/')
    counter = 0
    
    for file_name in file_list:
        if p1.match(file_name):
            fip = open('data/' + file_name, 'r')
            in_lines = fip.readlines()
            fop.writelines(in_lines)
            counter += len(in_lines)
            fip.close()
            
    
    fop.close()
    
    features = pd.read_csv('data/' + feature_file_name)
    # DataFrame.sort() is deprecated, but this is an old version of pandas, does not have sort_values().
    sorted_features = features.sort('file_name')
    sorted_features.to_csv('data/sorted-' + feature_file_name, index=False)
    
    print('Completed combine of {:d} PE header file features.'.format(counter))  
    
    return

In [6]:
out_file = 'pe-header-features-vs251.csv'
token_file = 'pe-header-tokens-vs251.csv'
combine_feature_files(out_file, token_file)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-6-1d3a7a74f42e> in <module>()
      1 out_file = 'pe-header-features-vs251.csv'
      2 token_file = 'pe-header-tokens-vs251.csv'
----> 3 combine_feature_files(out_file, token_file)

<ipython-input-5-876d0d79f536> in combine_feature_files(feature_file_name, token_file)
     57     fop.close()
     58 
---> 59     features = pd.read_csv('data/' + feature_file_name)
     60     # DataFrame.sort() is deprecated, but this is an old version of pandas, does not have sort_values().
     61     sorted_features = features.sort('file_name')

/usr/lib/python2.7/dist-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, na_fvalues, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format)
    418                     infer_datetime_format=infer_datetime_format)
    419 
--> 420         return _read(filepath_or_buffer, kwds)
    421 
    422     parser_f.__name__ = name

/usr/lib/python2.7/dist-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    223         return parser
    224 
--> 225     return parser.read()
    226 
    227 _parser_defaults = {

/usr/lib/python2.7/dist-packages/pandas/io/parsers.pyc in read(self, nrows)
    632         index, columns, col_dict = self._create_index(ret)
    633 
--> 634         df = DataFrame(col_dict, columns=columns, index=index)
    635 
    636         if self.squeeze and len(df.columns) == 1:

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    199                                  dtype=dtype, copy=copy)
    200         elif isinstance(data, dict):
--> 201             mgr = self._init_dict(data, index, columns, dtype=dtype)
    202         elif isinstance(data, ma.MaskedArray):
    203             import numpy.ma.mrecords as mrecords

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
    321 
    322         return _arrays_to_mgr(arrays, data_names, index, columns,
--> 323                               dtype=dtype)
    324 
    325     def _init_ndarray(self, values, index, columns, dtype=None,

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   4471     axes = [_ensure_index(columns), _ensure_index(index)]
   4472 
-> 4473     return create_block_manager_from_arrays(arrays, arr_names, axes)
   4474 
   4475 

/usr/lib/python2.7/dist-packages/pandas/core/internals.pyc in create_block_manager_from_arrays(arrays, names, axes)
   3753 def create_block_manager_from_arrays(arrays, names, axes):
   3754     try:
-> 3755         blocks = form_blocks(arrays, names, axes)
   3756         mgr = BlockManager(blocks, axes)
   3757         mgr._consolidate_inplace()

/usr/lib/python2.7/dist-packages/pandas/core/internals.pyc in form_blocks(arrays, names, axes)
   3830 
   3831     if len(int_items):
-> 3832         int_blocks = _multi_blockify(int_items, items, is_unique=is_unique)
   3833         blocks.extend(int_blocks)
   3834 

/usr/lib/python2.7/dist-packages/pandas/core/internals.pyc in _multi_blockify(tuples, ref_items, dtype, is_unique)
   3893 
   3894         block_items, values, placement = _stack_arrays(
-> 3895             list(tup_block), ref_items, dtype)
   3896         if is_unique:
   3897             placement = None

/usr/lib/python2.7/dist-packages/pandas/core/internals.pyc in _stack_arrays(tuples, ref_items, dtype)
   3942     shape = (len(arrays),) + _shape_compat(first)
   3943 
-> 3944     stacked = np.empty(shape, dtype=dtype)
   3945     for i, arr in enumerate(arrays):
   3946         stacked[i] = _asarray_compat(arr)

MemoryError: