In [1]:
# Using python 2.7 as Anaconda does not have winappdbg module official or community.
# Winappdbg is only run on Windows due to the ctypes module having Windows specific declarations.
from multiprocessing import Pool
import os
import peutils
import pefile
import sys
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import subprocess as sub
#from csv import writer
#import unpack
In [ ]:
# Paths to the two VirusShare sample directories used throughout this notebook.
# NOTE(review): hardcoded absolute paths — adjust to the local machine's layout.
ext_drive = '/opt/vs/train/'
ext_drive2 = '/opt/vs/train2/'
file_list = os.listdir(ext_drive)
# Last expression: the cell displays the number of samples found.
len(file_list)
In [13]:
def load_packer_id_map(csv_path='data/packer-id.csv'):
    """Load the packer-name -> scalar-ID map from the packer ID CSV.

    The signature db contains many duplicate packer names, so the first
    occurrence of a name wins and the resulting map is smaller than the
    number of rows in the CSV.

    :param csv_path: path to the packer ID CSV (header row is skipped).
        Defaults to the original hard-coded location for compatibility.
    :return: dict mapping packer name (str) -> packer id (int).
    """
    packer_id_map = {}
    counter = 0
    # Context manager guarantees the file handle is closed even on error
    # (the original left it open if int() raised).
    with open(csv_path, 'r') as fip:
        in_lines = fip.readlines()
    # Start at 1 to skip the 'packer_name,packer_id' header row.
    for idx in range(1, len(in_lines)):
        tokens = in_lines[idx].split(',')
        packer_name = tokens[0]
        if packer_name not in packer_id_map:
            packer_id_map[packer_name] = int(tokens[1])
            counter += 1
    print('Completed {:d} packer IDs.'.format(counter))
    return packer_id_map
def sort_and_save_packer_id_feature_file():
    """Sort data/packer-id-features.csv by file_name and persist the result
    to data/sorted-packer-id-features.csv (no index column)."""
    packers = pd.read_csv('data/packer-id-features.csv')
    # DataFrame.sort() was removed in modern pandas. Prefer sort_values()
    # and fall back to the legacy API for the old pandas on this box.
    try:
        sorted_packers = packers.sort_values('file_name')
    except AttributeError:
        sorted_packers = packers.sort('file_name')
    sorted_packers.to_csv('data/sorted-packer-id-features.csv', index=False)
    return
def combine_packer_id_files():
    """Concatenate the per-worker packer ID feature files into one CSV.

    Steps:
      1. List the data directory.
      2. For each file matching <pid>-sorted-packer-id-features.csv
         (the worker files carry no header row) append its rows verbatim.
      3. Write the combined file with a single header row, then sort it
         via sort_and_save_packer_id_feature_file().
    """
    # Escape the dots so the pattern cannot match unrelated file names
    # (the original '.' matched any character).
    p1 = re.compile(r'\d{3,5}-sorted-packer-id-features\.csv')
    counter = 0
    # Context managers guarantee both handles are closed on any error.
    with open('data/packer-id-features.csv', 'w') as fop:
        fop.write('file_name,packer_name,packer_id,valid_pe,is_packed\n')
        for file_name in os.listdir('data/'):
            if p1.match(file_name):
                with open('data/' + file_name, 'r') as fip:
                    in_lines = fip.readlines()
                # Worker files have no header row, so write them verbatim.
                fop.writelines(in_lines)
                counter += len(in_lines)
    print('Completed combine of {:d} packer ID features.'.format(counter))
    sort_and_save_packer_id_feature_file()
    return
#NOTE: move to feature-extraction-packer-id.py
def generate_sample_packer_id(file_list):
    # Generate scalar packer IDs for each sample.
    # Writes one CSV row per sample (file_name,packer_name,packer_id,valid_pe,is_packed)
    # to data/<pid>-sorted-packer-id-features.csv; the os.getpid() prefix lets
    # the four Pool workers write concurrently without clobbering each other.
    # NOTE(review): reads the module-level `ext_drive` — set it before calling.
    pid = os.getpid()
    file_name = "data/" + str(pid) + "-sorted-packer-id-features.csv"
    fop = open(file_name,'w')
    #fop.write('file_name,packer_type,label,is_valid,is_packed\n') put column headers in during the combine stage.
    out_lines = []
    packer_id_map = load_packer_id_map()
    signatures = peutils.SignatureDatabase('data/userdb-sans.txt')
    non_pe_counter = 0
    pe_file_counter = 0
    exception_counter = 0  # NOTE(review): never incremented anywhere in this block.
    signat = 'unknown'
    error_str = 'none'
    for idx, file_name in enumerate(file_list):
        tokens = file_name.split('_')
        truncated_file_name = tokens[1] # remove the VirusShare_ prefix from the filename.
        matches = None
        packer_id = 0
        is_valid = 0
        is_packed = 0
        try:
            pe = pefile.PE(ext_drive + file_name, fast_load=True)
            pe_file_counter += 1
            #matches = signatures.match_all(pe, ep_only = True)
            is_valid = 1
            try:
                if peutils.is_probably_packed(pe): # NOTE: peutils.is_valid() has not been implemented yet.
                    #is_valid = 1
                    is_packed = 1
                    matches = signatures.match(pe, ep_only = True)
                    signat = matches[0]
                    if (signat in packer_id_map.keys()):
                        packer_id = packer_id_map[signat]
                    else:
                        packer_id = 0
                    #signat = signat.replace(',','') # remove commas or they will cause an error when loading dataframes.
                    # NOTE: If the signature database has commas in the packer name then remove them or they will
                    # cause problems later on when loading the dataframes.
                # NOTE(review): indentation reconstructed — `row` is assumed to sit at
                # this level so unpacked-but-valid PEs also emit a row; beware that
                # `signat` can then be stale from a previous iteration. Confirm.
                row = truncated_file_name + "," + signat + "," + str(packer_id) + "," + str(is_valid) + "," + str(is_packed) + "\n"
            except:
                # Bare except: any signature-match failure degrades to an 'unknown' row.
                signat = ",unknown,0," + str(is_valid) + "," + str(is_packed) + "\n"
                row = truncated_file_name + signat
            pe.close()
        except Exception as e:
            # pefile could not parse the sample: record the error text as the
            # packer name, with commas stripped so the CSV stays loadable.
            error_str = str(e)
            non_pe_counter += 1
            error_str = error_str.replace(',','') # remove commas or they will cause an error when loading dataframes.
            signat = "," + error_str + ",0,0,0\n"
            row = truncated_file_name + signat
        out_lines.append(row)
        if (idx % 1000) == 0: # print progress
            fop.writelines(out_lines)
            out_lines = []
            print('{:s} - {:s} - {:d} - {:s}'.format(str(pid),truncated_file_name,idx,signat))
    if len(out_lines) > 0:
        fop.writelines(out_lines)
        out_lines = []
    fop.close()
    print('{:s} - Completed {:d} non PE files and {:d} PE files.'.format(str(pid), non_pe_counter, pe_file_counter))
    return
In [ ]:
# Smoke test: run the packer ID extraction on the first 50 samples only.
ext_drive = '/opt/vs/train/'
tfiles = os.listdir(ext_drive)
generate_sample_packer_id(tfiles[0:50])
In [14]:
#NOTE: move to feature-extraction-packer-id.py
# Split the sample list into four quarters and extract packer IDs in parallel.
ext_drive = '/opt/vs/train/'
tfiles = os.listdir(ext_drive)
# Floor division so the slice bounds are ints under both Python 2 and 3.
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
# BUG FIX: the original print call was missing its closing parenthesis,
# which made this cell a SyntaxError.
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(generate_sample_packer_id, trains)
print('Completed processing {:d} files.'.format(len(tfiles)))
combine_packer_id_files()
In [19]:
# Save the malware packer id features sets from VirusShare 251 and 252.
packer_id_features = pd.read_csv('data/sorted-packer-id-features.csv')
# DataFrame.sort() was removed in modern pandas; prefer sort_values() and
# fall back to the legacy API for the old pandas on this box.
try:
    sorted_packer_id_features = packer_id_features.sort_values('file_name')
except AttributeError:
    sorted_packer_id_features = packer_id_features.sort('file_name')
sorted_packer_id_features.to_csv('data/sorted-packer-id-features-vs251-252.csv', index=False)
sorted_packer_id_features.head()
Out[19]:
In [ ]:
# Same parallel extraction for the second sample set.
ext_drive = '/opt/vs/train2/'
tfiles = os.listdir(ext_drive)
# Floor division so the slice bounds are ints under both Python 2 and 3.
quart = len(tfiles) // 4
train1 = tfiles[:quart]
train2 = tfiles[quart:(2*quart)]
train3 = tfiles[(2*quart):(3*quart)]
train4 = tfiles[(3*quart):]
# BUG FIX: the original print call was missing its closing parenthesis,
# which made this cell a SyntaxError.
print("Files: {:d} - {:d} - {:d}".format(len(tfiles), quart, (len(train1)+len(train2)+len(train3)+len(train4))))
trains = [train1, train2, train3, train4]
p = Pool(4)
p.map(generate_sample_packer_id, trains)
print('Completed processing {:d} files.'.format(len(tfiles)))
combine_packer_id_files()
In [17]:
# Now sort the packer id features by filename
#NOTE: move to feature-extraction-packer-id.py
# Load the combined (still unsorted) feature file for inspection; later
# cells mutate and sort this `packers` frame.
packers = pd.read_csv('data/packer-id-features.csv')
packers.head(20)
Out[17]:
In [ ]:
# trim the file_names and sort
#NOTE: move to feature-extraction-packer-id.py
# Strip the "VirusShare_" prefix from every file name in one vectorized
# pass instead of a Python loop writing cell-by-cell through .iloc
# (the loop was O(n) iloc assignments and very slow for ~100k rows).
# NOTE(review): like the original (which wrote column 0), this assumes the
# prefix is separated by the first '_' — confirm against the raw file names.
packers['file_name'] = packers['file_name'].str.split('_').str[1]
packers.head(20)
In [25]:
# DataFrame.sort() was removed in modern pandas; prefer sort_values() and
# fall back to the legacy API for the old pandas on this box.
try:
    sorted_packers = packers.sort_values('file_name')
except AttributeError:
    sorted_packers = packers.sort('file_name')
sorted_packers.head(20)
Out[25]:
In [26]:
#NOTE: move to feature-extraction-packer-id.py
# Persist the sorted feature set; index=False keeps the CSV schema stable.
sorted_packers.to_csv('data/sorted-packer-id-features.csv', index=False)
In [27]:
# Sanity check: (rows, columns) of the sorted feature set.
sorted_packers.shape
Out[27]:
In [ ]:
# This is one of the packer ID databases used by PEid.exe
# Count the signature db lines to gauge its size.
# NOTE(review): the handle is left open here; the next cell re-opens and closes it.
fip = open('data/userdb-sans.txt', 'r')
in_lines = fip.readlines()
# Last expression: the cell displays the line count.
len(in_lines)
In [ ]:
# This is one of the packer ID databases used by PEid.exe
# Build data/packer-id.csv from the PEiD userdb: one row per '[name]'
# signature header, assigning a monotonically increasing scalar ID
# (0 is reserved for 'unknown').
p1 = re.compile(r'\[(.*)\]')
counter = 0
out_lines = []
row = ' '
# Context managers guarantee both handles are closed even on error.
with open('data/userdb-sans.txt', 'r') as fip:
    in_lines = fip.readlines()
with open('data/packer-id.csv', 'w') as fop:
    fop.write('packer_name,packer_id\n')
    fop.write('unknown,0\n')
    for idx, line in enumerate(in_lines):
        if line.startswith('['):
            counter += 1
            m = p1.match(line)
            # 'm is not None' instead of 'm != None' (PEP 8); the
            # continue-ladder of the original is gone — non-matching
            # lines simply fall through.
            if m is not None:
                row = m.group(1) + ',' + str(counter) + '\n'
                out_lines.append(row)
        if (idx % 100) == 0: # flush buffered rows and print progress
            fop.writelines(out_lines)
            out_lines = []
            print('Filename: {:s} - {:d}'.format(row.rstrip(), idx))
    if len(out_lines) > 0:
        fop.writelines(out_lines)
print('Completed {:d} packer IDs.'.format(counter))
In [10]:
# Load the malware packer id features sets from VirusShare 251 and 252.
# Provenance: written earlier by the save cell above from the combined worker files.
packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs251-252.csv')
packer_id_features.head(20)
Out[10]:
In [12]:
# Lets have a look at all the valid PE binaries that are packed or suspected of being packed.
# is_packed was set by peutils.is_probably_packed() during extraction, so this
# is a heuristic superset of truly packed samples.
packed_files = packer_id_features[packer_id_features['is_packed'] == 1]
packed_files.head(20)
Out[12]:
In [14]:
# 16321 packed files to run through unpack.py
# Some of these are likely to have been wrongly classified as packed, but
# the unpacker should be able to handle them correctly.
import sys
import traceback
import winappdbg
import time
import struct
import ctypes
# Log file which we log info to
# NOTE(review): assigned by code outside this chunk (presumably the driver
# that also defines the log() helper used throughout) — confirm.
logfile = None
class MyEventHandler(winappdbg.EventHandler):
###
# A. Declaring variables
###
# NOTE(review): these are *class* attributes of MyEventHandler — the mutable
# dict/list values ({} and []) are shared across all instances of the class.
# A.1 used to keep track of allocated executable memory
allocedmem = {}
# A.2 used to indicate that we've found the entry point
entrypt = 0x00000000
#
# variables used to find and disassemble unpacking loop
#
# A.3 used to indicate that we're single stepping
tracing = -1
# A.4 remember the last two eip values
lasteip = [0x00000000,0x00000000]
# A.5 lowest eip address we see
lowesteip = 0xffffffff
# A.6 highest eip address we see
highesteip = 0x00000000
# A.7 list of addresses which we've disassembled
disasmd = []
# A.8 keeps track of addresses and instructions
# that write to the allocated memory block(s)
writeaddrs = {}
#
# variables used to keep track of created processes
#
# A.9 keeps track of created processes to map
# hProcess from WriteProcessMemory() back to
# process name
createdprocesses = {}
# A.10 keeps track of processes that were created
# with the CREATE_SUSPENDED flag set
createsuspended = {}
#
# variables used for logging
#
# A.11 used to keep a log of events
eventlog = []
###
# B. Class methods (functions)
###
### B.1
# get_funcargs(event)
#
# Helper used by every API hook below to pull the hooked call's
# return address and parameter list out of a winappdbg event.
###
def get_funcargs(self,event):
    """Return (return_address, (arg0, arg1, ...)) for the hooked API call."""
    hook = event.hook
    thread = event.get_thread()
    return (thread.get_pc(), hook.get_params(event.get_tid()))
### B.2
# guarded_read(d,t,addr,size)
# read memory after checking for, and if necessary,
# disabling memory breakpoints
#
# returns a string of data
###
def guarded_read(self,d,t,addr,size):
    # d: winappdbg Debug instance; t: Thread; addr/size: region to read.
    # Returns "" when size <= 0.
    # keep track of breakpoints that we disabled
    # so that we can enable them again after we've
    # finished
    reenablebps = []
    # initialise the variable to hold the read
    # memory data
    data = ""
    # check that the requested size is sane
    if (size > 0):
        p = t.get_process()
        # check to see if the requested address falls within
        # any of the existing memory breakpoints by checking
        # if either the requested start address or end address
        # is covered by any breakpoint
        mem_bps = d.get_all_page_breakpoints()
        for (pid,pgbp) in mem_bps:
            (startaddr,endaddr) = pgbp.get_span()
            if (pid == p.get_pid()) and (pgbp.is_here(addr) or pgbp.is_here(addr + size - 1)):
                log("[D] Memory read in guarded memory. Disabling breakpoint: {:s}".format(pgbp))
                pgbp.disable(p,t)
                reenablebps.append(pgbp)
        # read the memory
        data = p.read(addr,size)
        # enable all of the breakpoints that we disabled
        if (len(reenablebps) > 0):
            for pgbp in reenablebps:
                log("[D] Re-enabling breakpoint: {:s}".format(pgbp))
                pgbp.enable(p,t)
    # return the read memory as a string
    return data
###
# C. API Hooks
###
### C.1
# apiHooks: winappdbg defined hash of API calls to hook
#
# Each entry is indexed by library name and is an array of
# tuples consisting of API call name and number of args.
# WinAppDbg dispatches these to the pre_<name>/post_<name>
# methods defined below.
###
apiHooks = {
    "kernel32.dll":[
        ("VirtualAlloc",4),
        ("VirtualAllocEx",5),
        ("IsDebuggerPresent",0),
        ("CreateProcessA",10),
        ("CreateProcessW",10),
        ("WriteProcessMemory",5)
    ],
    "advapi32.dll":[
        ("CryptDecrypt",6)
    ],
    "wininet.dll":[
        ("InternetOpenA",5),
        ("InternetOpenW",5)
    ],
    "ntdll.dll":[
        ("RtlDecompressBuffer",6)
    ],
    "secur32.dll":[
        ("EncryptMessage",4),
        ("DecryptMessage",4)
    ]
}
###
# API hook callback functions
#
# These are defined by winappdbg and consist of functions
# named pre_<apifuncname> and post_<apifuncname> which are
# called on entry to, and on exit from, the given API
# function (<apifuncname>), respectively.
###
# C.2
# VirtualAlloc() hook(s)
#
def post_VirtualAllocEx(self,event,retval):
    # Post-hook: log VirtualAllocEx() calls and, for EXECUTE-able allocations
    # in the debuggee's own process, set a page guard so we catch the
    # unpacker writing to (and later jumping into) that memory.
    try:
        # C.2.1 Get the return address and arguments
        (ra,(hProcess,lpAddress,dwSize,flAllocationType,flProtect)) = self.get_funcargs(event)
        # Get an instance to the debugger which triggered the event
        # and also the process id and thread id of the process to which
        # the event pertains
        d = event.debug
        pid = event.get_pid()
        tid = event.get_tid()
        # Log the fact that we've seen a VirtualAllocEx() call
        log("[*] <%d:%d> 0x%x: VirtualAllocEx(0x%x,0x%x,0x%x (%d),0x%x,0x%03x) = 0x%x" % (pid,tid,ra,hProcess,lpAddress,dwSize,dwSize,flAllocationType,flProtect,retval))
        # C.2.2 All the memory protection bits which include EXECUTE
        # permission use bits 4 - 7, which is nicely matched
        # by masking (ANDing) it with 0xf0 and checking for a
        # non-zero result
        if (flProtect & 0x0f0):
            log("[-] Request for EXECUTEable memory")
            # We can only set page guards on our own process
            # otherwise page guard exception will occur in
            # system code when this process attempts to write
            # to the allocated memory.
            # This causes ZwWriteVirtualMemory() to fail
            # We can, however, set a page guard on it when
            # this process creates the remote thread, as it
            # will have presumably stopped writing to the
            # other process' memory at that point.
            # C.2.2.1 Check that this VirtualAllocEx() call is for
            # the current process (hProcess == -1), and if
            # so, ask the winappdbg debugger instance to
            # create a page guard on the memory region.
            # Also add information about the allocated region
            # to our allocedmem hash, indexed by pid and
            # base address.
            if (hProcess == 0xffffffff):
                d.watch_buffer(pid,retval,dwSize - 1,self.guard_page_exemem)
                self.allocedmem[(pid,retval)] = dwSize
        # C.2.3 Create a JSON event log entry
        self.eventlog.append({
            "time": time.time(),
            "name": "VirtualAllocEx",
            "type": "Win32 API",
            "pid": pid,
            "tid": tid,
            "addr": ra,
            "args": {
                "hProcess": hProcess,
                "lpAddress": lpAddress,
                "dwSize": dwSize,
                "flAllocationType": flAllocationType,
                "flProtect": flProtect
            },
            "ret": retval
        })
    except:
        traceback.print_exc()
        raise
    return
def post_VirtualAlloc(self,event,retval):
    # Post-hook: log VirtualAlloc() calls. Page-guarding is left to the
    # VirtualAllocEx() hook, which VirtualAlloc() calls internally.
    try:
        # C.2.4 Get the return address and arguments
        (ra,(lpAddress,dwSize,flAllocationType,flProtect)) = self.get_funcargs(event)
        # Get an instance to the debugger which triggered the event
        # and also the process id and thread id of the process to which
        # the event pertains
        d = event.debug
        pid = event.get_pid()
        tid = event.get_tid()
        # Log the fact that we've seen a VirtualAlloc() call
        # This is so that we get the address in the debuggee code from which it was called
        # where as if we just let the VirtualAllocEx() hook log it, the address from
        # which it was called is inside the VirtualAlloc() code in kernel32.dll
        log("[*] <%d:%d> 0x%x: VirtualAlloc(0x%x,0x%x (%d),0x%x,0x%03x) = 0x%x" % (pid, tid, ra, lpAddress, dwSize, dwSize, flAllocationType, flProtect, retval))
        # C.2.5 Create a JSON event log entry
        self.eventlog.append({
            "time": time.time(),
            "name": "VirtualAlloc",
            "type": "Win32 API",
            "pid": pid,
            "tid": tid,
            "addr": ra,
            "args": {
                "lpAddress": lpAddress,
                "dwSize": dwSize,
                "flAllocationType": flAllocationType,
                "flProtect": flProtect
            },
            "ret": retval
        })
    except:
        traceback.print_exc()
        raise
    return
# C.3
# CryptDecrypt() hook(s)
#
def pre_CryptDecrypt(self,event,*args):
    # Pre-hook: dump the ciphertext buffer to disk *before* decryption;
    # the matching post-hook dumps the plaintext after.
    # C.3.1 Get the return address and arguments
    # args[0] is the return address, args[1:] are the API parameters.
    (ra,hKey,hHash,Final,dwFlags,pbData,pdwDataLen) = (args[0],args[1],args[2],args[3],args[4],args[5],args[6])
    # C.3.2 Get a Process object and dereference the pdwDataLen argument to read the buffer size
    p = event.get_process()
    buffsize = p.read_uint(pdwDataLen)
    # C.3.3 Save a copy of the encrypted data
    # sys.argv[1] is the sample path, used as a prefix for all dump files.
    filename = "%s.memblk0x%x.enc" % (sys.argv[1],pbData)
    log("[-] Dumping %d bytes of encrypted memory at 0x%x to %s" % (buffsize,pbData,filename))
    databuff = open(filename,"wb")
    databuff.write(p.read(pbData,buffsize));
    databuff.close()
    return
def post_CryptDecrypt(self,event,retval):
    """Post-hook for CryptDecrypt(): dump the decrypted buffer to disk and
    record a JSON event log entry.

    The ciphertext was already dumped by pre_CryptDecrypt(); filename_enc
    reconstructs that dump's name so both appear in the log entry.
    """
    # C.3.4 Get the return address and arguments
    (ra,(hKey,hHash,Final,dwFlags,pbData,pdwDataLen)) = self.get_funcargs(event)
    # Get a Process object, and dereference the pdwDataLen argument
    p = event.get_process()
    buffsize = p.read_uint(pdwDataLen)
    pid = event.get_pid()
    tid = event.get_tid()
    log("[*] <%d:%d> 0x%x: CryptDecrypt(0x%x,0x%x,0x%x,0x%x,0x%x,0x%x (%d)) = %d" % (pid, tid, ra, hKey, hHash, Final, dwFlags, pbData, buffsize, buffsize, retval))
    # C.3.5 Save a copy of the decrypted data
    filename_enc = "%s.memblk0x%x.enc" % (sys.argv[1],pbData)
    filename = "%s.memblk0x%x.dec" % (sys.argv[1],pbData)
    log("[-] Dumping %d bytes of decrypted memory at 0x%x to %s" % (buffsize,pbData,filename))
    databuff = open(filename,"wb")
    databuff.write(p.read(pbData,buffsize))
    databuff.close()
    # C.3.6 Create a JSON event log entry
    # (redundant second pid/tid lookup removed — values fetched above)
    self.eventlog.append({
        "time": time.time(),
        "name": "CryptDecrypt",
        "type": "Win32 API",
        "pid": pid,
        "tid": tid,
        "addr": ra,
        "args": {
            "hKey": hKey,
            "hHash": hHash,
            "Final": Final,
            "dwFlags": dwFlags,
            # BUG FIX: the original logged pdwDataLen's value under the
            # "pbData" key; log both pointers under their own keys.
            "pbData": pbData,
            "pdwDataLen": pdwDataLen
        },
        "ret": retval,
        "info": {
            "filename_enc": filename_enc,
            "filename_dec": filename
        }
    })
    return
# C.4
# RtlDecompressBuffer() hook(s)
#
def pre_RtlDecompressBuffer(self,event,*args):
    # Pre-hook: dump the compressed buffer to disk *before* decompression;
    # the matching post-hook dumps the decompressed result after.
    try:
        # C.4.1 Get the return address and arguments
        # args[0] is the return address, args[1:] are the API parameters.
        (ra,CompressionFormat,UncompressedBuffer,UncompressedBufferSize,CompressedBuffer,CompressedBufferSize,FinalUncompressedSize) = (args[0],args[1],args[2],args[3],args[4],args[5],args[6])
        p = event.get_process()
        # C.4.2 Save a copy of the compressed data
        # sys.argv[1] is the sample path, used as a prefix for all dump files.
        filename = "%s.memblk0x%x.comp" % (sys.argv[1],CompressedBuffer)
        log("[-] Dumping %d bytes of compressed memory at 0x%x to %s" % (CompressedBufferSize,CompressedBuffer,filename))
        databuff = open(filename,"wb")
        databuff.write(p.read(CompressedBuffer,CompressedBufferSize));
        databuff.close()
    except:
        traceback.print_exc()
        raise
    return
def post_RtlDecompressBuffer(self,event,retval):
    # Post-hook: dump the decompressed buffer to disk and record a JSON
    # event log entry (paired with pre_RtlDecompressBuffer's .comp dump).
    try:
        # C.4.3 Get the return address and arguments
        (ra,(CompressionFormat,UncompressedBuffer,UncompressedBufferSize,CompressedBuffer,CompressedBufferSize,FinalUncompressedSize)) = self.get_funcargs(event)
        pid = event.get_pid()
        tid = event.get_tid()
        log("[*] <%d:%d> 0x%x: RtlDecompressBuffer(0x%x,0x%x,0x%x,0x%x,0x%x,0x%x): %d" % (pid,tid,ra,CompressionFormat,UncompressedBuffer,UncompressedBufferSize,CompressedBuffer,CompressedBufferSize,FinalUncompressedSize,retval))
        # Get a Process object, and dereference the FinalUncompressedSize argument
        p = event.get_process()
        buffsize = p.read_uint(FinalUncompressedSize)
        # C.4.4 save a copy of the decompressed data
        filename_comp = "%s.memblk0x%x.comp" % (sys.argv[1],CompressedBuffer)
        filename = "%s.memblk0x%x.decomp" % (sys.argv[1],UncompressedBuffer)
        log("[-] Dumping %d bytes of decompressed memory at 0x%x to %s" % (buffsize,UncompressedBuffer,filename))
        databuff = open(filename,"wb")
        databuff.write(p.read(UncompressedBuffer,buffsize))
        databuff.close()
        # C.4.5 Create a JSON event log entry
        self.eventlog.append({
            "time": time.time(),
            "name": "RtlDecompressBuffer",
            "type": "Win32 API",
            "pid": pid,
            "tid": tid,
            "addr": ra,
            "args": {
                "CompressionFormat": CompressionFormat,
                "UncompressedBuffer": UncompressedBuffer,
                "UncompressedBufferSize": UncompressedBufferSize,
                "CompressedBuffer": CompressedBuffer,
                "CompressedBufferSize": CompressedBufferSize,
                "FinalUncompressedSize": FinalUncompressedSize
            },
            "ret": retval,
            "info": {
                "filename_comp": filename_comp,
                "filename_decomp": filename
            }
        })
    except:
        traceback.print_exc()
        raise
    return
# C.5
# CreateProcess() hook(s)
#
def post_CreateProcess(self,event,retval,fUnicode):
    """Common post-hook for CreateProcessA/W.

    Logs the call, tracks the new process (for later WriteProcessMemory()
    correlation), and hooks ResumeThread() when the child was created
    CREATE_SUSPENDED. fUnicode selects ASCII vs WCHAR dereferencing.
    """
    try:
        # C.5.1 Get the return address and arguments
        (ra,(lpApplicationName,lpCommandLine,lpProcessAttributes,lpThreadAttributes,bInheritHandles,dwCreationFlags,lpEnvironment,lpCurrentDirectory,lpStartupInfo,lpProcessInformation)) = self.get_funcargs(event)
        p = event.get_process()
        t = event.get_thread()
        pid = event.get_pid()
        tid = event.get_tid()
        # C.5.2 Dereference arguments
        # Use the Process object to dereference the lpApplicationName and lpCommandLine arguments
        # as either ASCII or WCHAR depending on the fUnicode argument
        # (and hence whether we were called from post_CreateProcessA() or post_CreateProcessW() respectively
        szApplicationName = p.peek_string(lpApplicationName,fUnicode)
        szCommandLine = p.peek_string(lpCommandLine,fUnicode)
        # BUG FIX: these four were previously only assigned inside the
        # "if lpProcessInformation" branch, so a null pointer caused a
        # NameError at the log() call below. Default them to 0.
        hProcess = 0
        hThread = 0
        dwProcessId = 0
        dwThreadId = 0
        # If the lpProcessInformation argument is a valid pointer...
        if (lpProcessInformation):
            # ... dereference it to get the ProcessInformation structure
            d = event.debug
            ProcessInformation = self.guarded_read(d,t,lpProcessInformation,16)
            # Extract the various fields from the ProcessInformation structure
            hProcess = struct.unpack("<L",ProcessInformation[0:4])[0]
            hThread = struct.unpack("<L",ProcessInformation[4:8])[0]
            dwProcessId = struct.unpack("<L",ProcessInformation[8:12])[0]
            dwThreadId = struct.unpack("<L",ProcessInformation[12:16])[0]
        else:
            log("[E] lpProcessInformation is null")
        log("[*] <%d:%d> 0x%x: CreateProcess(\"%s\",\"%s\",0x%x): %d (0x%x, 0x%x, <%d:%d>)" % (pid,tid,ra,szApplicationName,szCommandLine,dwCreationFlags,retval,hProcess,hThread,dwProcessId,dwThreadId))
        # C.5.3 Check if the process is being created in a suspended state (CREATE_SUSPENDED flag)...
        if (dwCreationFlags & 0x4):
            # ... hook the ResumeThread() API call
            # so that we are notified when it is resumed
            d = event.debug
            stat = d.hook_function(pid,"ResumeThread",preCB = self.hook_createsuspendedresume,paramCount = 1)
            self.createsuspended[(pid,hThread)] = dwProcessId
            log("[-] CREATE_SUSPENDED. Hooking ResumeThread() (%d)" % stat)
        # C.5.4 Keep track of processes that were created, so we know which
        # process any WriteProcessMemory() calls are writing to
        self.createdprocesses[hProcess] = {
            "time": time.time(),
            "ppid": pid,
            "ptid": tid,
            "paddr": ra,
            "ApplicationName":szApplicationName,
            "CommandLine": szCommandLine,
            "CreationFlags": dwCreationFlags,
            "hProcess": hProcess,
            "hThread": hThread,
            "ProcessId": dwProcessId,
            "ThreadId": dwThreadId
        }
        # C.5.5 Create a JSON event log entry
        self.eventlog.append({
            "time": time.time(),
            "name": "CreateProcess",
            "type": "Win32 API",
            "pid": pid,
            "tid": tid,
            "addr": ra,
            "args": {
                "ApplicationName":szApplicationName,
                "CommandLine": szCommandLine,
                "CreationFlags": dwCreationFlags,
                "hProcess": hProcess,
                "hThread": hThread,
                "ProcessId": dwProcessId,
                "ThreadId": dwThreadId
            },
            "info": {
                "fUnicode":fUnicode
            },
            "ret": retval
        })
    except:
        traceback.print_exc()
        raise
    return
# C.5.6 post_CreateProcessA() and post_CreateProcessW()
# Actual hook call-back function called by WinAppDbg
# To save duplicating code between this and post_CreateProcessW()
# both of them call post_CreateProcess() with a parameter, fUnicode,
# which specifies whether the strings are ASCII (CreateProcessA())
# or WCHAR (CreateProcessW())
def post_CreateProcessA(self,event,retval):
    # ASCII variant: fUnicode=False.
    self.post_CreateProcess(event,retval,False)
    return
def post_CreateProcessW(self,event,retval):
    # Wide-char variant: fUnicode=True.
    self.post_CreateProcess(event,retval,True)
    return
# hook_createsuspendedresume() is a call-back function called when
# ResumeThread() is call by a process which has created a suspended
# process
def hook_createsuspendedresume(self,event,*args):
    # Pre-hook on ResumeThread(), installed by post_CreateProcess() only
    # when a child was created with CREATE_SUSPENDED.
    # C.5.7 Get the return address and arguments
    (ra,(hThread,)) = self.get_funcargs(event)
    pid = event.get_pid()
    tid = event.get_tid()
    log("[*] <%d:%d> 0x%x: ResumeThread(0x%x)" % (pid,tid,ra,hThread))
    # C.5.8 Find the process id of the resumed process
    # (keyed by (parent pid, thread handle) in createsuspended).
    if ((pid,hThread) in self.createsuspended):
        pidresumed = self.createsuspended[(pid,hThread)]
        log("[-] New suspended process (pid %d) resumed" % pidresumed)
    return
# C.6
# WriteProcessMemory() hook(s)
#
def post_WriteProcessMemory(self,event,retval):
    """Post-hook for WriteProcessMemory(): dump the written buffer to disk,
    correlate the target handle with a process tracked by the
    CreateProcess() hook, and record a JSON event log entry."""
    # C.6.1 Get the return address and arguments
    try:
        (ra,(hProcess,lpBaseAddress,lpBuffer,nSize,lpNumberOfBytesWritten)) = self.get_funcargs(event)
        pid = event.get_pid()
        tid = event.get_tid()
        log("[*] <%d:%d> 0x%x: WriteProcessMemory(0x%x,0x%x,0x%x,0x%x,0x%x): %d" % (pid,tid,ra,hProcess,lpBaseAddress,lpBuffer,nSize,lpNumberOfBytesWritten,retval))
        d = event.debug
        t = event.get_thread()
        # C.6.2 Dereference lpNumberOfBytesWritten to get the number of bytes written to the target process'
        # address space
        if (lpNumberOfBytesWritten):
            NumberOfBytesWritten = struct.unpack("<L",self.guarded_read(d,t,lpNumberOfBytesWritten,4))[0]
        else:
            NumberOfBytesWritten = None
        # C.6.3 Get process information that was saved by CreateProcess() hook
        if (hProcess in self.createdprocesses):
            ProcessId = self.createdprocesses[hProcess]["ProcessId"]
            ApplicationName = self.createdprocesses[hProcess]["ApplicationName"]
            CommandLine = self.createdprocesses[hProcess]["CommandLine"]
        else:
            log("[W] hProcess not in createdprocesses[]")
            ProcessId = None
            ApplicationName = None
            CommandLine = None
        # C.6.4 Save a copy of the written memory
        # BUG FIX: ProcessId is None when hProcess was not tracked, which
        # previously crashed the "%d" formatting below; display -1 instead.
        # (The eventlog entry still records the untouched None.)
        target_pid = ProcessId if ProcessId is not None else -1
        filename = "%s.memblk0x%x-%d.wpm" % (sys.argv[1],lpBaseAddress,target_pid)
        log("[-] Dumping %d bytes of memory at %d:0x%x written to %d:0x%x to %s" % (nSize,pid,lpBuffer,target_pid,lpBaseAddress,filename))
        databuff = open(filename,"wb")
        databuff.write(self.guarded_read(d,t,lpBuffer,nSize))
        databuff.close()
        # C.6.5 Create a JSON event log entry
        # (redundant second d/t/pid/tid lookups removed — fetched above)
        self.eventlog.append({
            "time": time.time(),
            "name": "WriteProcessMemory",
            "type": "Win32 API",
            "pid": pid,
            "tid": tid,
            "addr": ra,
            "args": {
                "hProcess": hProcess,
                "lpBaseAddress": lpBaseAddress,
                "lpBuffer": lpBuffer,
                "nSize": nSize,
                "lpNumberOfBytesWritten": lpNumberOfBytesWritten,
                "NumberOfBytesWritten": NumberOfBytesWritten
            },
            "ret": retval,
            "info": {
                "filename": filename,
                "targetprocesspid": ProcessId,
                "targetprocessname": ApplicationName,
                "targetprocesscmdline": CommandLine
            }
        })
    except:
        traceback.print_exc()
        raise
    return
# C.7
# IsDebuggerPresent() hook(s)
# (mainly added so that AutoIt compiled scripts would run, but also useful
# as an anti-anti-malware technique)
#
def post_IsDebuggerPresent(self,event,retval):
    # Post-hook: lie to the debuggee by forcing the return value to 0.
    # C.7.1 Get the return address and arguments
    (ra,noargs) = self.get_funcargs(event)
    pid = event.get_pid()
    tid = event.get_tid()
    log("[*] <%d:%d> 0x%x: IsDebuggerPresent(): 0x%x" % (pid,tid,ra,retval))
    log("[-] Returning 0")
    # C.7.2 Changed the 'eax' register (return value) to '0' (no debugger present)
    # just before we continue running the calling thread
    # (x86-only: the Win32 return value lives in Eax).
    t = event.get_thread()
    t.set_register("Eax",0x0)
    # C.7.3 Create a JSON event log entry
    self.eventlog.append({
        "time": time.time(),
        "name": "IsDebuggerPresent",
        "type": "Win32 API",
        "pid": pid,
        "tid": tid,
        "addr": ra,
        "args": {},
        "ret": retval,
        "info": {}
    })
    return
# C.8
# InternetOpen() hook(s)
#
def post_InternetOpen(self,event,retval,fUnicode):
    """Common post-hook for InternetOpenA/W: log the call and record a JSON
    event log entry. fUnicode selects ASCII vs WCHAR dereferencing."""
    # C.8.1 Get the return address and arguments
    (ra,(lpszAgent,dwAccessType,lpszProxyName,lpszProxyBypass,dwFlags)) = self.get_funcargs(event)
    pid = event.get_pid()
    tid = event.get_tid()
    # C.8.2 Dereference arguments
    p = event.get_process()
    szAgent = p.peek_string(lpszAgent,fUnicode)
    szProxyName = p.peek_string(lpszProxyName,fUnicode)
    szProxyBypass = p.peek_string(lpszProxyBypass,fUnicode)
    log("[*] <%d:%d> 0x%x: InternetOpen(\"%s\",0x%x,\"%s\",\"%s\",0x%x) = 0x%x" % (pid,tid,ra,szAgent,dwAccessType,szProxyName,szProxyBypass,dwFlags,retval))
    # C.8.3 Create a JSON event log entry
    # CONSISTENCY FIX: the original left "args" empty here although every
    # other hook records its arguments; populate the dereferenced values.
    self.eventlog.append({
        "time": time.time(),
        "name": "InternetOpen",
        "type": "Win32 API",
        "pid": pid,
        "tid": tid,
        "addr": ra,
        "args": {
            "szAgent": szAgent,
            "dwAccessType": dwAccessType,
            "szProxyName": szProxyName,
            "szProxyBypass": szProxyBypass,
            "dwFlags": dwFlags
        },
        "ret": retval,
        "info": {}
    })
    return
def post_InternetOpenA(self,event,retval):
    # ASCII variant: fUnicode=False.
    self.post_InternetOpen(event,retval,False)
    return
def post_InternetOpenW(self,event,retval):
    # Wide-char variant: fUnicode=True.
    self.post_InternetOpen(event,retval,True)
    return
def pre_EncryptMessage(self,event,*args):
    # Pre-hook: walk the SecBufferDesc passed to EncryptMessage() and append
    # every SECBUFFER_DATA buffer (plaintext, pre-encryption) to a dump file.
    # C.?.1 Get the return address and arguments
    try:
        (ra,phContext,fQOP,pMessage,MessageSeqNo) = (args[0],args[1],args[2],args[3],args[4])
        pid = event.get_pid()
        tid = event.get_tid()
        # Right -- this is going to get annoying
        # pMessage is a pointer to a SecBufferDesc structure
        # which describes an array of SecBuffer structures
        p = event.get_process()
        l = p.get_label_at_address(ra)
        # really ought to use a ctypes struct for this!
        # SecBufferDesc layout (32-bit): ulVersion, cBuffers, pBuffers.
        ulVersion = p.peek_uint(pMessage)
        cBuffers = p.peek_uint(pMessage + 4)
        pBuffers = p.peek_uint(pMessage + 8)
        log("[*] <%d:%d> %s 0x%x: EncryptMessage(...)" % (pid,tid,l,ra))
        log("[D] ulVersion: %d" % ulVersion)
        log("[D] cBuffers: %d" % cBuffers)
        log("[D] pBuffers: 0x%x" % pBuffers)
        # dump buffer list
        # Each 32-bit SecBuffer is 12 bytes: cbBuffer, BufferType, pvBuffer.
        for i in range(0,cBuffers):
            cbBuffer = p.peek_uint(pBuffers + (i * 12) + 0)
            BufferType = p.peek_uint(pBuffers + (i * 12) + 4)
            pvBuffer = p.peek_uint(pBuffers + (i * 12) + 8)
            if (BufferType == 1): # SECBUFFER_DATA
                # we have data to save; append so successive calls accumulate.
                filename = sys.argv[1] + ".encmsg0x%08x-%d" % (pvBuffer,pid)
                f = open(filename,"ab")
                f.write(p.peek(pvBuffer,cbBuffer))
                f.close()
            log("[D]")
            log("[D] cbBuffer: 0x%x (%d)" % (cbBuffer,cbBuffer))
            log("[D] BufferType: 0x%x" % BufferType)
            log("[D] pvBuffer: 0x%x" % pvBuffer)
    except:
        traceback.print_exc()
        raise
    return
def post_DecryptMessage(self,event,retval):
    # Post-hook: walk the SecBufferDesc passed to DecryptMessage() and append
    # every SECBUFFER_DATA buffer (plaintext, post-decryption) to a dump file.
    # C.?.1 Get the return address and arguments
    try:
        (ra,(phContext,pMessage,MessageSeqNo,pfQOP)) = self.get_funcargs(event)
        pid = event.get_pid()
        tid = event.get_tid()
        # Right -- this is going to get annoying
        # pMessage is a pointer to a SecBufferDesc structure
        # which describes an array of SecBuffer structures
        p = event.get_process()
        # really ought to use a ctypes struct for this!
        # SecBufferDesc layout (32-bit): ulVersion, cBuffers, pBuffers.
        ulVersion = p.peek_uint(pMessage)
        cBuffers = p.peek_uint(pMessage + 4)
        pBuffers = p.peek_uint(pMessage + 8)
        log("[*] <%d:%d> 0x%x: DecryptMessage(...)" % (pid,tid,ra))
        log("[D] ulVersion: %d" % ulVersion)
        log("[D] cBuffers: %d" % cBuffers)
        log("[D] pBuffers: 0x%x" % pBuffers)
        # dump buffer list
        # Each 32-bit SecBuffer is 12 bytes: cbBuffer, BufferType, pvBuffer.
        for i in range(0,cBuffers):
            cbBuffer = p.peek_uint(pBuffers + (i * 12) + 0)
            BufferType = p.peek_uint(pBuffers + (i * 12) + 4)
            pvBuffer = p.peek_uint(pBuffers + (i * 12) + 8)
            if (BufferType == 1): # SECBUFFER_DATA
                # we have data to save; append so successive calls accumulate.
                filename = sys.argv[1] + ".decmsg0x%08x-%d" % (pvBuffer,pid)
                f = open(filename,"ab")
                f.write(p.peek(pvBuffer,cbBuffer))
                f.close()
            log("[D]")
            log("[D] cbBuffer: 0x%x (%d)" % (cbBuffer,cbBuffer))
            log("[D] BufferType: 0x%x" % BufferType)
            log("[D] pvBuffer: 0x%x" % pvBuffer)
    except:
        traceback.print_exc()
        raise
    return
###
# D. winappdbg debug event handlers
###
### D.1
# create_process
#
# winappdbg defined callback function to handle process creation events
###
def create_process(self,event):
    # Log the new process' image name and command line, then record a
    # JSON event log entry.
    p = event.get_process()
    pid = event.get_pid()
    tid = event.get_tid()
    log("[*] <%d:%d> Create process event for pid %d (%s)" % (pid,tid,p.get_pid(),p.get_image_name()))
    log("[-] command line: %s" % p.get_command_line())
    #log("[D] Create process event for pid %d (%d)" % (pid,tid))
    self.eventlog.append({
        "time": time.time(),
        "name": event.get_event_name(),
        "type": "WinAppDbg Event",
        "pid": pid,
        "tid": tid,
        "info": {
            "pid": p.get_pid(),
            "module_base": event.get_module_base(),
            "filename": event.get_filename(),
            "cmdline": p.get_command_line()
        },
    })
    return
### D.2
# exit_process
#
# winappdbg defined callback function to handle process exit events
###
def exit_process(self,event):
    # Log the exiting process' filename and exit code, then record a
    # JSON event log entry.
    pid = event.get_pid()
    tid = event.get_tid()
    log("[*] <%d:%d> Exit process event for %s: 0x%x" % (pid,tid,event.get_filename(),event.get_exit_code()))
    self.eventlog.append({
        "time": time.time(),
        "name": event.get_event_name(),
        "type": "WinAppDbg Event",
        "pid": pid,
        "tid": tid,
        "info": {
            "module_base": event.get_module_base(),
            "filename": event.get_filename(),
            "exitcode": event.get_exit_code()
        },
    })
    return
### D.3
# create_thread
#
# winappdbg defined callback function to handle thread creation events
###
def create_thread(self,event):
    """winappdbg callback: log a thread-creation event (name and start
    address) and append it to self.eventlog.
    """
    event_pid = event.get_pid()
    event_tid = event.get_tid()
    thread_name = event.get_thread().get_name()
    log("[*] <%d:%d> Create thread event \"%s\" @ 0x%x" % (event_pid,event_tid,thread_name,event.get_start_address()))
    entry = {
        "time": time.time(),
        "name": event.get_event_name(),
        "type": "WinAppDbg Event",
        "pid": event_pid,
        "tid": event_tid,
        "info": {
            "startaddress": event.get_start_address(),
            "threadname": thread_name
        },
    }
    self.eventlog.append(entry)
    return
### D.4
# exit_thread
#
# winappdbg defined callback function to handle thread exit events
###
def exit_thread(self,event):
    """winappdbg callback: log a thread-exit event and append it to
    self.eventlog.
    """
    event_pid = event.get_pid()
    event_tid = event.get_tid()
    thread_name = event.get_thread().get_name()
    log("[*] <%d:%d> Exit thread event \"%s\"" % (event_pid,event_tid,thread_name))
    entry = {
        "time": time.time(),
        "name": event.get_event_name(),
        "type": "WinAppDbg Event",
        "pid": event_pid,
        "tid": event_tid,
        "info": {
            "threadname": thread_name
        },
    }
    self.eventlog.append(entry)
    return
### D.5
# load_dll
#
# winappdbg defined callback function to handle DLL load events
###
def load_dll(self,event):
    """winappdbg callback: log a DLL-load event (module path and base
    address) and append it to self.eventlog.
    """
    event_pid = event.get_pid()
    event_tid = event.get_tid()
    log("[*] <%d:%d> Load DLL event: %s" % (event_pid,event_tid,event.get_filename()))
    entry = {
        "time": time.time(),
        "name": event.get_event_name(),
        "type": "WinAppDbg Event",
        "pid": event_pid,
        "tid": event_tid,
        "info": {
            "module_base": event.get_module_base(),
            "filename": event.get_filename(),
        },
    }
    self.eventlog.append(entry)
    return
### D.6
# event
#
# winappdbg defined callback function to handle any remaining events
###
def event(self,event):
    """winappdbg catch-all callback: log any debug event that has no
    dedicated handler. Not appended to self.eventlog.
    """
    event_pid = event.get_pid()
    event_tid = event.get_tid()
    log("[*] <%d:%d> Unhandled event: %s" % (event_pid,event_tid,event.get_event_name()))
    return
###
# E. winappdbg debug exception handlers
###
### E.1
# guard_page
#
# winappdbg defined callback function to handle guard page exceptions
###
def guard_page_exemem(self,exception):
    """winappdbg callback for guard-page exceptions on watched memory.

    Write faults start single-step tracing (the unpacking loop is writing
    its output); the first execute fault inside a watched allocation is
    taken as the unpacked entry point, at which point the memory block is
    dumped to disk and all tracing state is reset for the next stage.

    FIX: the reset at the end previously assigned self.highest instead of
    self.highesteip, leaving the loop's upper bound stale for the next
    unpacking stage (cf. the bounds logged above and used in single_step).
    """
    try:
        f_type = exception.get_fault_type()
        e_addr = exception.get_exception_address()
        f_addr = exception.get_fault_address()
        # get the process and thread ids
        pid = exception.get_pid()
        tid = exception.get_tid()
        # Per-access logging is deliberately disabled: it is interesting but
        # generates a lot of output and slows the whole process down.
        #log("[!] <%d:%d> 0x%x: GUARD_PAGE(%d) exception for address 0x%x" % (pid,tid,e_addr,f_type,f_addr))
        # E.1.2 Was it a memory write operation?
        if (f_type == winappdbg.win32.EXCEPTION_WRITE_FAULT):
            # E.1.2.1 Only log the first write from each instruction address:
            # unpacking is generally done in a loop and we don't want to log
            # the same instructions for each iteration.
            if not e_addr in self.writeaddrs:
                p = exception.get_process()
                t = exception.get_thread()
                label = p.get_label_at_address(e_addr)
                instr = t.disassemble_instruction(e_addr)[2].lower()
                log("[*] VirtualAlloc()d memory address 0x%x written from 0x%x (%s): %s" % (f_addr,e_addr,label,instr))
                self.writeaddrs[e_addr] = instr
            # E.1.2.2 Start single-step tracing if we haven't already
            # (self.tracing == -1 means "not tracing").
            if (self.tracing == -1):
                self.tracing = 0
                d = exception.debug
                log("[-] Enabling tracing")
                d.start_tracing(exception.get_tid())
        # E.1.3 Was it an instruction fetch (execute) fault, and are we still
        # looking for the unpacked entry point?
        if (f_type == winappdbg.win32.EXCEPTION_EXECUTE_FAULT) and (self.entrypt == 0):
            self.entrypt = e_addr
            t = exception.get_thread()
            # Disassemble the instruction that jumped into the unpacked code.
            jmpinstr = t.disassemble_instruction(self.lasteip[0])[2].lower()
            # E.1.3.1 Log what we've found
            log("[*] Found unpacked entry point at 0x%x called from 0x%x (%s) (after executing %d instructions)" % (self.entrypt,self.lasteip[0],jmpinstr,self.tracing))
            log("[-] Unpacking loop at 0x%x - 0x%x" % (self.lowesteip,self.highesteip))
            pid = exception.get_pid()
            tid = exception.get_tid()
            elog = ({
                "time": time.time(),
                "name": "unpacking loop found",
                "type": "unpack event",
                "pid": pid,
                "tid": tid,
                "info": {
                    "unpacked_entry_point": self.entrypt,
                    "callingaddr": self.lasteip[0],
                    "callinginstr": jmpinstr
                },
            })
            # E.1.3.2 Find the watched allocation (keyed by (pid, base addr))
            # that contains the entry point.
            for (mem_pid,memblk) in self.allocedmem:
                if (mem_pid == pid):
                    size = self.allocedmem[(mem_pid,memblk)]
                    endaddr = memblk + size - 1
                    if (e_addr >= memblk) and (e_addr <= endaddr):
                        # E.1.3.3 Log what we're doing and delete the memory breakpoint
                        log("[-] Dumping %d bytes of memory range 0x%x - 0x%x" % (size,memblk,endaddr))
                        d = exception.debug
                        d.dont_watch_buffer(exception.get_pid(),memblk,size - 1)
                        # E.1.3.4 Disable single-step debugging
                        self.tracing = -1
                        d.stop_tracing(exception.get_tid())
                        # E.1.3.5 Reset unpacking loop variables for the next stage.
                        self.entrypt = 0x00000000
                        self.lasteip = [0x00000000,0x00000000]
                        self.lowesteip = 0xffffffff
                        self.highesteip = 0x00000000  # BUG FIX: was self.highest
                        # E.1.3.6 Dump the memory block to a file; 'with'
                        # guarantees the handle is closed even if read() fails.
                        p = exception.get_process()
                        filename = sys.argv[1] + ".memblk0x%08x" % memblk
                        with open(filename,"wb") as dumpfile:
                            dumpfile.write(p.read(memblk,size))
                        elog["info"]["filename"] = filename
                        self.eventlog.append(elog)
    except Exception:
        traceback.print_exc()
        raise
    return
### E.2
# single_step
#
# winappdbg defined callback function to handle single step exceptions
###
def single_step(self,exception):
    """winappdbg callback for single-step exceptions while tracing.

    Tracks eip to detect the unpacking loop (a backward jump fixes its
    bounds), disassembles each in-loop instruction once, and aborts tracing
    after 250,000 instructions by planting a breakpoint handled by
    bp_stoptracing (tracing cannot be disabled from inside this callback).
    """
    try:
        # E.2.1 Current instruction address.
        e_addr = exception.get_exception_address()
        # E.2.2 A backward jump marks the loop: latch the bounds exactly once.
        if e_addr < self.lasteip[1]:
            if self.lowesteip == 0xffffffff:
                self.lowesteip = e_addr          # lowest loop address
            if self.highesteip == 0x00000000:
                self.highesteip = self.lasteip[1]  # address we jumped from
        # E.2.3 Disassemble each address inside the loop bounds only once.
        if (self.lowesteip <= e_addr <= self.highesteip) and e_addr not in self.disasmd:
            tracer_thread = exception.get_thread()
            mnemonic = tracer_thread.disassemble_instruction(e_addr)[2].lower()
            log("    0x%x: %s" % (e_addr,mnemonic))
            self.disasmd.append(e_addr)
        # E.2.4 Keep the last two eip values: we need two so the instruction
        # that jumps to the unpacked entry point can be disassembled later.
        self.lasteip[0] = self.lasteip[1]
        self.lasteip[1] = e_addr
        # E.2.5 Give up after 250,000 traced instructions: assume there is
        # no unpacking loop and stop tracing to speed up execution.
        self.tracing += 1
        if self.tracing >= 250000:
            log("[E] Reached tracing limit of 250000 instructions")
            dbg = exception.debug
            target_pid = exception.get_pid()
            dbg.break_at(target_pid,e_addr,self.bp_stoptracing)
            self.tracing = -1
    except Exception:
        traceback.print_exc()
        raise
    return
# E.2.6 bp_stoptracing()
# Set as a breakpoint handler when we want to stop tracing, as we can't
# disable single-step tracing from within the single-step call-back function.
def bp_stoptracing(self,exception):
    """One-shot breakpoint handler used to switch off single-step tracing:
    tracing cannot be disabled from within the single-step callback itself,
    so single_step plants this breakpoint instead.
    """
    log("[D] Single-step instruction limit reached -- stopping tracing")
    dbg = exception.debug
    dbg.stop_tracing(exception.get_tid())
    # Remove this breakpoint: it has served its purpose.
    dbg.dont_break_at(exception.get_pid(),exception.get_exception_address())
    return
### E.3
# exception
#
# winappdbg defined callback function to handle remaining exceptions
###
def exception(self,exception):
    """winappdbg catch-all callback for exceptions with no dedicated handler;
    logs the address and exception name only.
    """
    log("[*] Unhandled exception at 0x%x: %s" % (exception.get_exception_address(),exception.get_exception_name()))
    return
#
#### end of MyEventHandler class
#
###
# F. Miscellaneous functions
###
### F.1
# log(msg):
###
def log(msg):
    """Print msg and append it to the global log file, opening the file
    lazily on first use (named after the target binary in sys.argv[1]).

    NOTE(review): assumes the module initialises `logfile` (e.g. to None)
    before the first call -- confirm against the script preamble.
    """
    global logfile
    print(msg)
    if not logfile:
        logfile = open(sys.argv[1] + ".log","w")
    if logfile:
        logfile.write("%s\n" % msg)
        logfile.flush()  # flush so a crash doesn't lose trailing output
    return
### F.2
# simple_debugger(argv):
###
def simple_debugger(filename):
    """Run `filename` under the winappdbg debugger with MyEventHandler
    attached, then dump the collected event log.

    FIX: the original swallowed a MyEventHandler() construction failure
    (printing the traceback but falling through), which then crashed with a
    misleading NameError on the undefined `handler`; now re-raises.
    """
    global logfile
    try:
        handler = MyEventHandler()
        #logfile = winappdbg.textio.Logger(filename + ".log",verbose = True)
    except:
        traceback.print_exc()
        raise  # don't continue with an undefined handler
    # bKillOnExit: terminate the debuggee when the debugger exits.
    with winappdbg.Debug(handler, bKillOnExit = True, bHostileCode = False) as debug:
        log("[*] Starting {:s}".format(filename))
        debug.execl(filename, bFollow = False)
        log("[*] Starting debug loop")
        debug.loop()
        log("[*] Terminating")
    log("[D] Number of created processes: {:d}".format(len(handler.createdprocesses)))
    for i in range(0, len(handler.eventlog)):
        log("{:s}".format(handler.eventlog[i]))
    return
###
# G. Start of script execution
###
#log("[*] Started at %s" % time.strftime("%Y-%m-%d %H:%M:%S"))
#simple_debugger(sys.argv[1])
#log("[*] Completed at %s" % time.strftime("%Y-%m-%d %H:%M:%S"))
# End of original unpack.py code
# Now write out the file list so we can run the list through unpack.py
# on our Windows XP sandbox VM.
def write_packed_file_list(packer_id_feature_file, packed_list_file_name):
    """Write the "VirusShare_"-prefixed names of all packed samples to a
    text file, one per line.

    packer_id_feature_file -- CSV with at least 'file_name' and 'is_packed'.
    packed_list_file_name  -- output path for the file list.
    """
    packer_id_features = pd.read_csv(packer_id_feature_file)
    packed_files = packer_id_features[packer_id_features['is_packed'] == 1]
    counter = 0
    # 'with' guarantees the list file is closed/flushed on all paths.
    with open(packed_list_file_name, 'w') as fop:
        for file_name in packed_files['file_name']:
            fop.write("VirusShare_" + file_name + "\n")
            counter += 1
    print("Write {:d} filenames.".format(counter))
    return
def unpack_pe_binaries(file_list):
    """Count which names in file_list exist as samples under the global
    ext_drive directory (the actual unpack.py invocation is still a stub).

    Prints progress every 1000 files and a final summary; returns None.
    """
    counter = 0
    unpacked = 0
    error_count = 0
    for raw_name in file_list:
        sample = raw_name.rstrip()  # strip trailing newlines from list files
        sample_path = ext_drive + sample
        if os.path.isfile(sample_path):
            # call unpack.py
            unpacked += 1
        else:
            #print("Error: file does not exist - {:s}".format(sample_path))
            error_count += 1
        counter += 1
        if counter % 1000 == 0: # print progress
            print('Disassembled: {:d} - {:s}'.format(counter,sample))
    print("Disassembled {:d} binaries with {:d} file path errors.".format(unpacked, error_count))
    return
In [ ]:
In [ ]:
# For this we will use our old friend objdump.
# Shell escape: confirm objdump is on the PATH (prints its usage banner).
!objdump
In [2]:
# Load the malware packer id features sets from VirusShare 251 and 252.
packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs251-252.csv')
# Keep only samples not flagged as packed...
unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
# ...and that parsed as valid PE binaries.
unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
# Preview the first 20 rows (cell output).
unpacked_pe_files.head(20)
Out[2]:
In [4]:
def write_unpacked_file_list(packer_id_feature_file, unpacked_list_file_name):
    """Write the "VirusShare_"-prefixed names of all unpacked, valid-PE
    samples to a text file, one per line.

    packer_id_feature_file  -- CSV with 'file_name', 'is_packed', 'valid_pe'.
    unpacked_list_file_name -- output path for the file list.

    FIX: the original never closed the output handle (no fop.close(), unlike
    the sibling write_packed_file_list); 'with' closes and flushes it.
    """
    packer_id_features = pd.read_csv(packer_id_feature_file)
    unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
    unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
    counter = 0
    with open(unpacked_list_file_name, 'w') as fop:
        for file_name in unpacked_pe_files['file_name']:
            fop.write("VirusShare_" + file_name + "\n")
            counter += 1
    print("Write {:d} filenames.".format(counter))
In [8]:
# Row/column counts of the unpacked valid-PE subset (cell output).
unpacked_pe_files.shape
Out[8]:
In [9]:
# Looks like we might have some false negatives on the "is_packed" feature, so lets have a look at the groups.
# The majority seem to be ok, but there are a few thousand that may be misclassified as having been packed
# but the entropy values indicate otherwise. So we will go ahead and disassemble the whole lot anyway and see what
# the result is.
# Frequency table of packer names among the "unpacked" samples (cell output).
packer_counts = unpacked_pe_files['packer_name'].value_counts()
packer_counts
In [2]:
def disassemble_pe_binaries(file_list):
    """Disassemble each PE sample found under ext_drive1 or ext_drive2.

    For each file, runs IDA Pro Free in batch mode under wine (produces the
    .asm output) and dumps section headers plus import tables with
    "objdump -g -x" into /opt/vs/asm/<name>.txt.

    We have to use objdump/IDA Free although the output is not optimal for
    machine learning objectives (call operand target addresses should be
    translated to function names), because the alternatives (rasm2,
    vivisect, IDA Pro Demo) do not work in batch mode or at all.
    NOTE: IDA Pro Free normally shows a popup on startup that blocks batch
    mode; a call instruction at 0x00471d16 in idag.exe invokes that popup.

    FIX: removed unused locals (command_line, asm_file_name, file_path3/4 --
    the ext_drive3/ext_drive4 paths were computed but never consulted) and
    factored the duplicated per-file work into _disassemble_one().
    """
    counter = 0
    disassed = 0
    error_count = 0
    for file_name in file_list:
        file_name = file_name.rstrip() # remove the newlines or else !!!
        file_path1 = ext_drive1 + file_name
        file_path2 = ext_drive2 + file_name
        hdr_file_name = "/opt/vs/asm/" + file_name + ".txt"
        if os.path.isfile(file_path1):
            _disassemble_one(file_path1, hdr_file_name)
            disassed += 1
        elif os.path.isfile(file_path2):
            _disassemble_one(file_path2, hdr_file_name)
            disassed += 1
        else:
            #print("Error: file does not exist - {:s}".format(file_name))
            error_count += 1
        counter += 1
        if (counter % 1) == 0: # print progress
            print('Disassembled: {:d} - {:s}'.format(counter, file_name))
    print("Disassembled {:d} binaries with {:d} file path errors.".format(disassed, error_count))
    return

def _disassemble_one(file_path, hdr_file_name):
    """Run IDA Free (batch mode) on file_path, then dump its section headers
    and import tables to hdr_file_name with objdump."""
    sub.call(["wine", "/opt/vs/ida/idag.exe", "-B", file_path])
    with open(hdr_file_name, "w") as fop:
        sub.call(["objdump", "-g", "-x", file_path], stdout=fop)
    # now delete the binary, we do not need it anymore.
    #sub.call(["rm", file_path])
In [6]:
# Configure the candidate sample directories searched by disassemble_pe_binaries().
#ext_drive1 = '/opt/vs/train1/'
ext_drive1 = '/opt/vs/train/'
ext_drive2 = '/opt/vs/train2/'
ext_drive3 = '/opt/vs/train3/'
ext_drive4 = '/opt/vs/train4/'
#write_unpacked_file_list('data/sorted-packer-id-features-vs251-252.csv')
#fip = open("data/unpacked_file_list.txt")
#file_list = fip.readlines()
# Smoke test: disassemble only the first 10 samples from the train directory.
file_list = os.listdir('/opt/vs/train')
disassemble_pe_binaries(file_list[:10])
In [24]:
# IPython magic: list the working sample tree.
ls /opt/vs/
In [ ]:
help(os)
In [ ]:
# Load the PEiD-style packer signature database used for signature matching.
signatures = peutils.SignatureDatabase('data/userdb-sans.txt')
In [ ]:
# Count of samples in the train directory (cell output).
ext_drive = '/opt/vs/train/'
ext_drive2 = '/opt/vs/train2/'
file_list = os.listdir(ext_drive)
len(file_list)
In [ ]:
# Scan every sample with pefile + the packer signature DB and build CSV rows.
# NOTE(review): out_lines is reset every 1000 files without being written
# anywhere, so the accumulated rows are discarded -- this cell appears to be
# a progress-only dry run; verify the real output is produced elsewhere.
out_lines = []
for idx, file_name in enumerate(file_list):
try:
# fast_load skips parsing sections we do not need for signature matching.
pe = pefile.PE(ext_drive + file_name, fast_load=True)
#pe_file_counter += 1
#matches = signatures.match_all(pe, ep_only = True)
matches = signatures.match(pe, ep_only = True)
# NOTE(review): PEP 8 prefers "matches is None" over "== None".
if matches == None:
row = file_name + ",unknown,0\n"
else:
# NOTE(review): there is no comma between file_name and the first match --
# the ",".join only separates multiple matches; verify downstream parsing.
row = file_name + ",".join(str(e) for e in matches) + "\n"
out_lines.append(row)
pe.close()
if (idx % 1000) == 0: # print progress
out_lines = []
print('Filename: {:s} - {:d}'.format(row,idx))
except:
# Bare except: any pefile parse failure is recorded as "not a PE file".
#non_pe_counter += 1
out_lines.append(file_name + ",nonpe,0\n")
In [ ]:
help(peutils)
In [2]:
# Reload the combined, sorted packer-id feature file and preview it.
testdf = pd.read_csv('data/sorted-packer-id-features.csv')
testdf.head(20)
Out[2]:
In [3]:
# Frequency of each detected packer name (cell output).
packer_counts = testdf['packer_name'].value_counts()
packer_counts
Out[3]:
In [4]:
# Horizontal bar chart of the ten most common packers.
packer_counts[:10].plot(kind='barh', rot=0)
plt.show()
In [5]:
plt.show()
In [ ]:
In [12]:
# How many samples parsed as valid PE files vs not (cell output).
pe_counts = testdf['valid_pe'].value_counts()
pe_counts
Out[12]:
In [13]:
# How many samples were flagged as packed vs not (cell output).
is_packed = testdf['is_packed'].value_counts()
is_packed
Out[13]:
In [16]:
# Inspect the samples matched to one specific packer signature.
pecrows = testdf[testdf['packer_name'] == 'PECompact v2.0']
pecrows
Out[16]:
In [17]:
# Samples with no packer signature match.
unkrows = testdf[testdf['packer_name'] == 'unknown']
unkrows
Out[17]:
In [19]:
# Samples that pefile could not parse as PE binaries.
nonpe = testdf[testdf['valid_pe'] == 0]
nonpe
Out[19]:
In [2]:
# Re-run IDA (batch mode, under wine) on the samples that previously failed,
# as listed in error-list.txt.
fip = open("/opt/vs/error-list.txt","r")
file_list = fip.readlines()
fip.close()
for file_name in file_list:
file_name = file_name.rstrip() # remove the newlines or else !!!
file_path1 = "/opt/vs/train1/" + file_name
if (os.path.isfile(file_path1)):
sub.call(["wine", '/opt/vs/ida/idag.exe', "-B", file_path1])
print("Disassembling PE file: {:s}".format(file_path1))
In [ ]:
# NOTE(review): the next three cells are near-identical copy-paste probes;
# a parameterized helper function would be preferable.
ext_drive = "/media/derek/TOSHIBA EXT/train3"
file_list = os.listdir(ext_drive)
for file_name in file_list[:10]:
file_name = file_name.rstrip() # remove the newlines or else !!!
# NOTE(review): the listing comes from the external drive but the lookup is
# under train1 -- confirm this cross-check is intentional.
file_path1 = "/opt/vs/train1/" + file_name
if (os.path.isfile(file_path1)):
sub.call(["wine", '/opt/vs/ida/idag.exe', "-B", file_path1])
print("Disassembling PE file: {:s}".format(file_path1))
In [ ]:
ext_drive = "/media/derek/TOSHIBA EXT/train3/"
file_list = os.listdir(ext_drive)
for file_name in file_list[:10]:
file_name = file_name.rstrip() # remove the newlines or else !!!
file_path1 = ext_drive + file_name
if (os.path.isfile(file_path1)):
# Identify each sample's type with file(1).
sub.call(["file", file_path1])
print("File: {:s}".format(file_path1))
In [2]:
ext_drive = "/media/derek/TOSHIBA EXT/train3/"
file_list = os.listdir(ext_drive)
for file_name in file_list[:10]:
file_name = file_name.rstrip() # remove the newlines or else !!!
file_path1 = ext_drive + file_name
if (os.path.isfile(file_path1)):
sub.call(["file", file_path1])
print("File: {:s}".format(file_path1))
In [33]:
# file type/magic signature generation tests
def process_files(file_list):
    """Run file(1) on every sample in file_list and stream the magic strings
    to a per-process CSV (data/<pid>-file-id.csv).

    Intended as a multiprocessing.Pool worker: the output file is named
    after the worker's pid so parallel workers never collide. Output is
    flushed every 10 files. Reads the global ext_drive for the sample dir.

    FIX: removed unused file_counter; the output path was stored in
    'file_name', which the loop variable then shadowed -- renamed to
    out_path; output handle now closed via 'with'.
    """
    out_lines = []
    pid = os.getpid()
    out_path = "data/" + str(pid) + "-file-id.csv"
    with open(out_path,'w') as fop:
        for idx, file_name in enumerate(file_list):
            file_name = file_name.rstrip() # remove the newlines or else !!!
            file_path = ext_drive + file_name
            if (os.path.isfile(file_path)):
                #print("File: {:s}".format(file_path))
                signat = sub.check_output(["file","-b", file_path]) # Use the brief option, we do not need the file name.
                out_lines.append(signat)
                if (idx % 10) == 0: # print progress
                    fop.writelines(out_lines)
                    out_lines = []
                    print('{:s} - {:s} - {:d} - {:s}'.format(str(pid), file_name, idx, signat))
        if len(out_lines) > 0:
            fop.writelines(out_lines)
            out_lines = []
    return
def process_trids(file_list):
    """Identify every sample in file_list with TrID and stream the
    top-scoring signature line to a per-process CSV (data/<pid>-trid-id.csv).

    Intended as a multiprocessing.Pool worker (pid-named output avoids
    collisions). Reads the global ext_drive for the sample directory.
    """
    pending = []
    best_line = ""
    worker_pid = os.getpid()
    report_path = "data/" + str(worker_pid) + "-trid-id.csv"
    fop = open(report_path,'w')
    for idx, file_name in enumerate(file_list):
        file_name = file_name.rstrip() # remove the newlines or else !!!
        sample_path = ext_drive + file_name
        if os.path.isfile(sample_path):
            #print("File: {:s}".format(sample_path))
            report = sub.check_output(["/opt/vs/trid", sample_path])
            report_lines = report.split('\n')
            for offset, line in enumerate(report_lines):
                # The line after "Collecting data..." holds the highest
                # probability file type.
                if line.startswith("Collect"):
                    best_line = report_lines[offset + 1]
                    pending.append(best_line)
                    break
            if (idx % 10) == 0: # flush and print progress
                fop.writelines(pending)
                pending = []
                print('{:s} - {:s} - {:d} - {:s}'.format(str(worker_pid), file_name, idx, best_line))
    if len(pending) > 0:
        fop.writelines(pending)
        pending = []
    fop.close()
    return
def combine_magic_reports():
    """Concatenate the per-worker magic/TrID report files
    (data/<pid>-*-id.csv) into the single file data/magic-reports.csv.

    Prints the number of report lines combined; returns None.

    FIX: regex is now a raw string (\\d / \\w are invalid escapes in py3);
    both file handles are closed via 'with' on all paths.
    """
    worker_report = re.compile(r'\d{3,5}-\w+-id.csv') # per-worker report name pattern (pid prefix)
    counter = 0
    with open('data/magic-reports.csv','w') as fop:
        for file_name in os.listdir('data/'):
            if worker_report.match(file_name):
                with open('data/' + file_name, 'r') as fip:
                    in_lines = fip.readlines()
                fop.writelines(in_lines)
                counter += len(in_lines)
        print('Completed combine of {:d} magic reports.'.format(counter))
    return
In [ ]:
In [ ]:
# Entropy feature exploration for the VirusShare 263 sample set.
daf = pd.read_csv('data/sorted-entropy-features-vs263.csv')
daf.head()
In [ ]:
# Any samples larger than ~50 MB?
fsizes = daf[daf['file_size'] > 50000000]
fsizes.head()
In [26]:
fsizes.shape
Out[26]:
In [ ]:
# Raw (unsorted) per-worker entropy output; supply column names explicitly.
daf1 = pd.read_csv('data/5299-entropy-features-bin.csv', names=['file_name','entropy','file_size'])
daf1.head()
In [17]:
daf1.shape
Out[17]:
In [ ]:
fsizes1 = daf1[daf1['file_size'] > 50000000]
fsizes1.head()
In [22]:
fsizes1.shape
Out[22]:
In [ ]:
fsizes1.head(17)
In [ ]:
In [ ]:
# Generate file(1) magic signatures for the APT sample set.
ext_drive = "/opt/vs/apt/"
tfiles = os.listdir(ext_drive)
process_files(tfiles)
In [ ]:
# Generate TrID signatures for the APT sample set.
ext_drive = "/opt/vs/apt/"
tfiles = os.listdir(ext_drive)
process_trids(tfiles)
In [31]:
def get_unpacked_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
    """Return "VirusShare_"-prefixed names (newline-terminated) of unpacked,
    valid-PE samples that are not .NET CIL binaries.

    IDA Pro cannot disassemble .NET files, so any sample whose TrID or
    file(1) magic string contains ".net" is excluded (and printed).

    NOTE(review): trid/file rows are matched to the packer rows positionally
    (iloc by enumeration index) -- this assumes all three CSVs are sorted
    identically; verify the upstream sort step.
    """
    packer_df = pd.read_csv(packer_id_feature_file)
    fid_df = pd.read_csv(file_id_feature_file)
    trid_df = pd.read_csv(trid_id_feature_file)
    # Unpacked AND valid-PE rows only.
    unpacked = packer_df[packer_df['is_packed'] == 0]
    unpacked_pe = unpacked[unpacked['valid_pe'] == 1]
    # Restrict the trid/magic rows to those same samples.
    trid_rows = trid_df[trid_df['file_name'].isin(unpacked_pe['file_name'])]
    fid_rows = fid_df[fid_df['file_name'].isin(unpacked_pe['file_name'])]
    not_dot_net = []
    counter = 0
    dot_net_counter = 0
    # Drop any sample either identifier recognises as .NET.
    for idx, file_name in enumerate(unpacked_pe['file_name']):
        trid_name = trid_rows.iloc[idx, 1].lower()
        fid_name = fid_rows.iloc[idx, 1].lower()
        if trid_name.find('.net') > -1 or fid_name.find('.net') > -1:
            print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            dot_net_counter += 1
            continue
        #print('Found: {:s} - {:s}'.format(trid_name, fid_name))
        not_dot_net.append(file_name)
        counter += 1
    # Prepend the full VirusShare_ prefix for the download list.
    file_list = []
    counter = 0
    for file_name in not_dot_net:
        file_list.append("VirusShare_" + file_name + "\n")
        counter += 1
    print("Got {:d} unpacked PE filenames and {:d} .NET filenames.".format(counter, dot_net_counter))
    return file_list
In [32]:
# Build the unpacked-PE list for VirusShare 251 (cell output: its length).
packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'
unflist = get_unpacked_file_list(packer_id_file, file_id_file, trid_id_file)
len(unflist)
Out[32]:
In [ ]:
In [15]:
# Inline replay of get_unpacked_file_list() internals on the APT set so the
# intermediate trid/file-id frames can be inspected cell by cell.
packer_id_file = 'data/sorted-packer-id-features-apt.csv'
file_id_file = 'data/sorted-file-id-features-apt.csv'
trid_id_file = 'data/sorted-trid-id-features-apt.csv'
packer_id_features = pd.read_csv(packer_id_file)
file_id_features = pd.read_csv(file_id_file)
trid_id_features = pd.read_csv(trid_id_file)
# Get a list of unpacked PE files that are not .NET CIL format.
# IDA Pro cannot disassemble .NET files.
unpacked_files = packer_id_features[packer_id_features['is_packed'] == 0]
unpacked_pe_files = unpacked_files[unpacked_files['valid_pe'] == 1]
not_dot_net = []
counter = 0
# Get the trid and file rows that are for unpacked PE files.
trids = trid_id_features[trid_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
fids = file_id_features[file_id_features['file_name'].isin(unpacked_pe_files['file_name'])]
trids.head()
Out[15]:
In [18]:
fids.head()
Out[18]:
In [10]:
def get_elf_file_list(packer_id_feature_file, file_id_feature_file, trid_id_feature_file):
    """Return (and write to data/elf-file-list.txt) the "VirusShare_"-prefixed
    names of samples whose TrID or file(1) magic string contains 'ELF'.

    packer_id_feature_file is kept for interface compatibility but is not
    consulted (the original loaded it and never used it).

    NOTE(review): trid rows are matched to file-id rows positionally (iloc by
    enumeration index) -- assumes both CSVs are sorted identically.
    """
    file_id_features = pd.read_csv(file_id_feature_file)
    trid_id_features = pd.read_csv(trid_id_feature_file)
    counter = 0
    file_list = []
    for idx, file_name in enumerate(file_id_features['file_name']):
        trid_name = trid_id_features.iloc[idx, 1]
        fid_name = file_id_features.iloc[idx, 1]
        if trid_name.find('ELF') > -1 or fid_name.find('ELF') > -1:
            print('Found: {:s} - {:s}'.format(trid_name, fid_name))
            counter += 1
            file_list.append("VirusShare_" + file_name + "\n")
    # 'with' guarantees the list file is closed/flushed on all paths.
    with open('data/elf-file-list.txt','w') as fop:
        fop.writelines(file_list)
    print("Got {:d} ELF filenames.".format(counter))
    return file_list
In [11]:
# Extract the ELF sample list for each VirusShare archive in turn; each cell
# overwrites data/elf-file-list.txt and displays the returned list.
packer_id_file = 'data/sorted-packer-id-features-vs251.csv'
file_id_file = 'data/sorted-file-id-features-vs251.csv'
trid_id_file = 'data/sorted-trid-id-features-vs251.csv'
unflist = get_elf_file_list(packer_id_file, file_id_file, trid_id_file)
unflist
Out[11]:
In [12]:
packer_id_file = 'data/sorted-packer-id-features-vs252.csv'
file_id_file = 'data/sorted-file-id-features-vs252.csv'
trid_id_file = 'data/sorted-trid-id-features-vs252.csv'
unflist = get_elf_file_list(packer_id_file, file_id_file, trid_id_file)
unflist
Out[12]:
In [13]:
packer_id_file = 'data/sorted-packer-id-features-vs263.csv'
file_id_file = 'data/sorted-file-id-features-vs263.csv'
trid_id_file = 'data/sorted-trid-id-features-vs263.csv'
unflist = get_elf_file_list(packer_id_file, file_id_file, trid_id_file)
unflist
Out[13]:
In [14]:
packer_id_file = 'data/sorted-packer-id-features-vs264.csv'
file_id_file = 'data/sorted-file-id-features-vs264.csv'
trid_id_file = 'data/sorted-trid-id-features-vs264.csv'
unflist = get_elf_file_list(packer_id_file, file_id_file, trid_id_file)
unflist
Out[14]: