In [1]:
# TODO: move all the merge code from model-selection-pe-coff.ipynb
import warnings
import numpy as np
import scipy as sp
import pandas as pd

1. VirusShare 251 PE/COFF Feature Merge.


In [ ]:
# NOTE: move this merging stuff to merge-feature-sets-pe-coff.ipynb
# it is making the notebook too long.

# Merge all the vs251 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
#
# NOTE: keep function count features and ASM/Binary images separate, train on different models.
# 8. TODO: function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all PE/COFF features then use chi2 tests to 
# reduce to 10% best feature set ->
#     all-combined-pe-features-10perc-vs251.csv

sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs251.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs251.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs251-fixed.csv')

sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs251.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs251.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs251.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs251.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs251.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs251.csv')
sorted_packer_id_features.head()
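
Every merge below joins on file_name, which assumes each feature set has exactly one row per sample. A quick sanity-check sketch over the frames just loaded:

In [ ]:
# Sanity check: the merge key should be unique in every feature set,
# otherwise the inner joins below will duplicate rows.
feature_sets = [('asm', sorted_asm_features_fixed),
                ('entropy', sorted_entropy_features),
                ('file_id', sorted_file_id_features),
                ('packer_id', sorted_packer_id_features),
                ('call_graph', sorted_call_graph_features),
                ('trid_id', sorted_trid_id_features),
                ('header', sorted_header_features)]
for name, df in feature_sets:
    dups = df['file_name'].duplicated().sum()
    print("{:s}: {:d} rows, {:d} duplicate file_name values".format(name, df.shape[0], dups))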

In [ ]:
sorted_asm_features_fixed.head()

In [ ]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()

In [ ]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()

In [ ]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()
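
The iloc calls above pick columns by position, which breaks silently if the CSV column order ever changes. An equivalent name-based selection (a sketch, assuming the vs251 files use the same column names shown in the vs252 Out[] tables further down: file_id, packer_id, percentage, trid_id):

In [ ]:
# Name-based equivalents of the positional selections above.
fileidfeatures = sorted_file_id_features[['file_name', 'file_id']]
packeridfeatures = sorted_packer_id_features[['file_name', 'packer_id']]
trididfeatures = sorted_trid_id_features[['file_name', 'percentage', 'trid_id']]
trididfeatures.head()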

In [ ]:
# NOTE: the file-name problem has been fixed now, so get_training_data()
# is no longer needed; it is kept for reference and label validation.

def get_training_data(sorted_train_features_df, sorted_train_labels_df, train_labels_file_out):
    
    X = sorted_train_features_df.iloc[:,1:]
    # No offset adjustment is needed: the family class labels already start at zero.
    sorted_sample_names = sorted_train_features_df.loc[:,'file_name']
    
    # Now get the labels of the PE malware samples from the label set.
    counter = 0
    y = []
    for fname in sorted_sample_names:
        counter += 1
        
        # The file names in the ASM feature sets still have a stray
        # '.pe' suffix on the end, so strip it before matching.
        idx = fname.find('.pe')
        if idx > 0:
            fname = fname[:idx]
            
        if counter % 100 == 1:
            print("Appending {:d} -> {:s}".format(counter, fname))
        # Use the label DataFrame passed in, not the global.
        for label_idx, fname2 in enumerate(sorted_train_labels_df['file_name']):
            if fname2 == fname:
                y.append(sorted_train_labels_df.iloc[label_idx, 4]) # Append the family class label.
                break
    
    # Write out the PE/COFF sample train labels for later use and validation.
    with open(train_labels_file_out, 'w') as fop:
        fop.write("\n".join(str(x) for x in y))
    
    return X, y
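
The nested loop above is O(n^2) in the number of samples. A single pandas merge does the same lookup; a minimal sketch under the same assumptions (the family class label sits in column 4 of the label set, and ASM file names may carry a stray '.pe' suffix):

In [ ]:
def get_training_data_merged(features_df, labels_df, train_labels_file_out):
    # Strip the stray '.pe' suffix (the pandas equivalent of the sed
    # one-liner noted in the next cell).
    keys = features_df['file_name'].str.replace(r'\.pe$', '', regex=True)
    label_col = labels_df.columns[4]
    # how='inner' preserves the order of the left keys; like the loop
    # version, samples missing from the label set are simply skipped.
    merged = keys.to_frame().merge(labels_df[['file_name', label_col]],
                                   on='file_name', how='inner')
    y = merged[label_col].tolist()
    with open(train_labels_file_out, 'w') as fop:
        fop.write("\n".join(str(x) for x in y))
    return features_df.iloc[:, 1:], y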

In [ ]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//

combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))

combined_train_features.head()


combined_train_features.to_csv('data/combined-pe-features-vs251.csv', index=False)
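
The "Final Combination" chi-squared step described in the comments above can be sketched with scikit-learn's SelectPercentile. A minimal sketch, assuming scikit-learn is available, that class labels y aligned with the combined rows have been produced (e.g. via get_training_data() above), and that all feature values are non-negative, which chi2 requires:

In [ ]:
from sklearn.feature_selection import SelectPercentile, chi2

X = combined_train_features.iloc[:, 1:]            # drop file_name
selector = SelectPercentile(chi2, percentile=10)   # keep the best 10% of features
X_10perc = selector.fit_transform(X, y)

best_columns = combined_train_features.columns[1:][selector.get_support()]
reduced = combined_train_features[['file_name']].join(
    pd.DataFrame(X_10perc, columns=best_columns))
reduced.to_csv('data/all-combined-pe-features-10perc-vs251.csv', index=False)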

2. VirusShare 252 PE/COFF Feature Merge.


In [2]:
# Merge all the vs252 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
# 8. TODO: function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all PE/COFF features then use chi2 tests to 
# reduce to 10% best feature set ->
#     all-combined-pe-features-10perc-vs252.csv

sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs252.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs252.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs252-fixed.csv')

sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs252.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs252.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs252.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs252.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs252.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs252.csv')

# Use a separate model for function counts, otherwise the csv files are too big for GitHub.
#sorted_function_count_features = pd.read_csv('data/sorted-pe-function-counts-10percent-vs252.csv')

sorted_asm_features_fixed.head()


Out[2]:
file_name edx esi es fs ds ss gs cs ah ... sidt stc std sti stos sub test wait xchg xor
0 000118d12cbf9ad6103e8b914a6e1ac3 2151 2782 2 2 2 2 2 2 37 ... 0 0 9 0 0 395 836 0 1 688
1 00093d5fa5cb7ce77f6eaf39962daa12 4207 11425 4 2 6 6 2 3 53 ... 0 0 9 0 0 893 2560 8 2 2530
2 00099926d51b44c6f8c93a48c2567891 270 1033 0 0 0 0 0 0 0 ... 0 0 0 0 0 111 225 0 0 129
3 000a2db4762dc06628a086c9e117f884 43 1281 0 0 0 0 0 0 0 ... 0 0 0 0 0 100 184 0 0 180
4 000ae2c63ba69fc93dfc395b40bfe03a 4334 12557 11 3 8 7 3 11 41 ... 0 3 16 0 0 964 2884 1 9 2104

5 rows × 120 columns



In [3]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()


Out[3]:
file_name file_id
0 00002e640cafb741bea9a48eaee27d6f 133
1 000118d12cbf9ad6103e8b914a6e1ac3 1
2 0001776237ac37a69fcef93c1bac0988 1
3 000403e4e488356b7535cc613fbeb80b 1
4 0004c8b2a0f4680a5694d74199b40ea2 1

In [4]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()


Out[4]:
file_name packer_id
0 00002e640cafb741bea9a48eaee27d6f 0
1 000118d12cbf9ad6103e8b914a6e1ac3 1101
2 0001776237ac37a69fcef93c1bac0988 0
3 000403e4e488356b7535cc613fbeb80b 1101
4 0004c8b2a0f4680a5694d74199b40ea2 1111

In [5]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()


Out[5]:
file_name percentage trid_id
0 00002e640cafb741bea9a48eaee27d6f 2.3 2
1 000118d12cbf9ad6103e8b914a6e1ac3 2.2 1
2 0001776237ac37a69fcef93c1bac0988 2.7 14
3 000403e4e488356b7535cc613fbeb80b 4.6 21
4 0004c8b2a0f4680a5694d74199b40ea2 2.2 1

In [6]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//

combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))

combined_train_features.head()


Out[6]:
file_name edx esi es fs ds ss gs cs ah ... vertex_count edge_count delta_max density entropy file_size file_id percentage trid_id packer_id
0 000118d12cbf9ad6103e8b914a6e1ac3 2151 2782 2 2 2 2 2 2 37 ... 2213 3300 101 0.002277 0.834382 201600 1 2.2 1 1101
1 00093d5fa5cb7ce77f6eaf39962daa12 4207 11425 4 2 6 6 2 3 53 ... 4640 12158 204 0.005596 0.803481 742064 1 8.1 17 1101
2 00099926d51b44c6f8c93a48c2567891 270 1033 0 0 0 0 0 0 0 ... 522 868 284 0.267901 0.997032 725288 4 4.8 3 0
3 000a2db4762dc06628a086c9e117f884 43 1281 0 0 0 0 0 0 0 ... 871 959 70 0.035789 0.535436 61551 5 3.5 13 153
4 000ae2c63ba69fc93dfc395b40bfe03a 4334 12557 11 3 8 7 3 11 41 ... 3939 7218 82 0.004668 0.899481 487386 1 4.0 17 0

5 rows × 272 columns


In [7]:
combined_train_features.to_csv('data/combined-pe-features-vs252.csv', index=False)
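
Because every merge above is an inner join, the combined set can be no larger than the smallest input; a quick check sketch before relying on the saved file:

In [ ]:
# Inner joins only keep samples present in every feature set.
inputs = [sorted_asm_features_fixed, sorted_header_features,
          sorted_call_graph_features, sorted_entropy_features,
          fileidfeatures, trididfeatures, packeridfeatures]
print("smallest input: {:d} rows".format(min(df.shape[0] for df in inputs)))
print("combined:       {:d} rows".format(combined_train_features.shape[0]))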

In [ ]:
# DEPRECATED: do not use get_training_data() now, the file-name problem has been fixed.
X,y = get_training_data(sorted_asm_features_fixed, sorted_train_labels, 'data/sorted-pe-coff-train-labels-vs252.csv')
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))

In [ ]:
X,y = get_training_data(combined_train_features, sorted_train_labels, 'data/sorted-pe-coff-train-labels-vs252.csv')
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))

3. VirusShare 263 PE/COFF Feature Merge.


In [3]:
# vs263 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
# 8. function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all PE/COFF features then use chi2 tests to 
# reduce to 10% best feature set ->
#     all-combined-pe-features-10perc-vs263.csv

sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs263.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs263.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs263-fixed.csv')

sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs263.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs263.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs263.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs263.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs263.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs263.csv')
# Adding function counts makes the csv file too big for GitHub and the models too slow,
# so model them separately.
#sorted_function_count_features = pd.read_csv('data/sorted-pe-function-counts-10percent-vs263.csv')

sorted_asm_features_fixed.head()


Out[3]:
file_name edx esi es fs ds ss gs cs ah ... sidt stc std sti stos sub test wait xchg xor
0 0002b2f621ea5786be03bf4153532dce 52 490 0 0 0 0 0 0 4 ... 0 0 0 0 0 47 118 0 0 129
1 000401419eccde59975c713cfadc974c 4378 3246 1 0 0 0 0 0 34 ... 0 0 1 0 0 343 649 88 42 1031
2 0004824a60ff9fe1fb30d669a5baa627 4374 3295 1 0 0 0 0 0 34 ... 0 0 1 0 0 343 653 88 42 1032
3 0006d2cd674c8501ffe59dae330ffcb5 2100 959 0 0 0 0 0 0 0 ... 0 0 1 0 0 154 307 2 6 472
4 0007893715059c51a92d3ce2b10d9cf5 163 180 0 0 0 0 0 0 31 ... 0 0 0 0 0 73 86 0 29 34

5 rows × 120 columns


In [ ]:
X,y = get_training_data(sorted_asm_features_fixed, sorted_train_labels, 'data/sorted-pe-coff-train-labels-vs263.csv')
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))

In [4]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()


Out[4]:
file_name file_id
0 0002b2f621ea5786be03bf4153532dce 5
1 000401419eccde59975c713cfadc974c 1
2 00042f23bc15b89d9c6a7bde0e316f8b 1
3 0004824a60ff9fe1fb30d669a5baa627 1
4 0004c49071481789f1c8c80656638497 13

In [5]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()


Out[5]:
file_name packer_id
0 0002b2f621ea5786be03bf4153532dce 0
1 000401419eccde59975c713cfadc974c 317
2 00042f23bc15b89d9c6a7bde0e316f8b 0
3 0004824a60ff9fe1fb30d669a5baa627 317
4 0004c49071481789f1c8c80656638497 0

In [6]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()


Out[6]:
file_name percentage trid_id
0 0002b2f621ea5786be03bf4153532dce 7.4 1
1 000401419eccde59975c713cfadc974c 5.4 4
2 00042f23bc15b89d9c6a7bde0e316f8b 7.4 1
3 0004824a60ff9fe1fb30d669a5baa627 5.4 4
4 0004c49071481789f1c8c80656638497 0.0 0

In [8]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//

combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))

combined_train_features.head()


Out[8]:
file_name edx esi es fs ds ss gs cs ah ... vertex_count edge_count delta_max density entropy file_size file_id percentage trid_id packer_id
0 0002b2f621ea5786be03bf4153532dce 52 490 0 0 0 0 0 0 4 ... 365 518 74 0.234283 0.684706 81812 5 7.4 1 0
1 000401419eccde59975c713cfadc974c 4378 3246 1 0 0 0 0 0 34 ... 1593 3310 56 0.006680 0.800788 137131 1 5.4 4 317
2 0004824a60ff9fe1fb30d669a5baa627 4374 3295 1 0 0 0 0 0 34 ... 1593 3325 56 0.006710 0.802050 137630 1 5.4 4 317
3 0006d2cd674c8501ffe59dae330ffcb5 2100 959 0 0 0 0 0 0 0 ... 717 2167 285 0.049634 0.762935 83819 1 8.3 13 0
4 0007893715059c51a92d3ce2b10d9cf5 163 180 0 0 0 0 0 0 31 ... 795 1193 355 0.052839 0.812079 120832 1 3.5 13 0

5 rows × 204 columns


In [9]:
combined_train_features.to_csv('data/combined-pe-features-vs263.csv', index=False)

4. VirusShare 264 PE/COFF Feature Merge.


In [4]:
# vs264 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
# 8. ASM function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all PE/COFF features then use chi2 tests to 
# reduce to 10% best feature set ->
#     all-combined-pe-features-10perc-vs264.csv

sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs264.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs264.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs264-fixed.csv')

sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs264.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs264.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs264.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs264.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs264.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs264.csv')
sorted_function_count_features = pd.read_csv('data/sorted-pe-function-count-features-10percent-vs264.csv') # Modeled separately; not merged below.

sorted_asm_features_fixed.head()


Out[4]:
file_name edx esi es fs ds ss gs cs ah ... sidt stc std sti stos sub test wait xchg xor
0 0006bac4260d377e3bf2b9cfea530eb9 8714 16348 39 8 9 12 8 8 73 ... 0 1 4 3 0 1736 2990 0 7 2176
1 0006e6b536f6622bf308e80f837ceaeb 4374 3295 1 0 0 0 0 0 34 ... 0 0 1 0 0 343 653 88 42 1032
2 0007106c237e8689cc68b5111db1a174 3549 13709 8 0 0 0 0 0 76 ... 0 0 1 0 0 1179 2575 1 0 3059
3 00094c34bd1a0f622b49c1b1b9274ae6 1167 4780 0 0 0 0 0 0 0 ... 0 0 0 0 0 377 996 0 0 869
4 000b01ac8e485e5a143ff577be88b853 1703 1181 0 0 0 0 0 0 32 ... 0 0 2 0 0 174 247 5 11 395

5 rows × 120 columns


In [5]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()


Out[5]:
file_name file_id
0 000070db76b6dc1ee3497a3f9319848c 16
1 00009cbc0a90337e4c30950a51ae3d67 1
2 0003c05a1320e64fe72438ab48da7ecf 15
3 0003e52a9267b657d9b08b2cbc0a2593 16
4 0005743596135fe65f61da7a0eba0bb6 15

In [6]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()


Out[6]:
file_name packer_id
0 000070db76b6dc1ee3497a3f9319848c 0
1 00009cbc0a90337e4c30950a51ae3d67 317
2 0003c05a1320e64fe72438ab48da7ecf 0
3 0003e52a9267b657d9b08b2cbc0a2593 0
4 0005743596135fe65f61da7a0eba0bb6 0

In [7]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()


Out[7]:
file_name percentage trid_id
0 000070db76b6dc1ee3497a3f9319848c 0.0 0
1 00009cbc0a90337e4c30950a51ae3d67 5.5 4
2 0003c05a1320e64fe72438ab48da7ecf 0.0 0
3 0003e52a9267b657d9b08b2cbc0a2593 0.0 0
4 0005743596135fe65f61da7a0eba0bb6 0.6 8

In [8]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//

combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))

combined_train_features.head()


Out[8]:
file_name edx esi es fs ds ss gs cs ah ... vertex_count edge_count delta_max density entropy file_size file_id percentage trid_id packer_id
0 0006bac4260d377e3bf2b9cfea530eb9 8714 16348 39 8 9 12 8 8 73 ... 4689 8093 377 0.005650 0.820715 2015064.0 1 2.6 7 1101
1 0006e6b536f6622bf308e80f837ceaeb 4374 3295 1 0 0 0 0 0 34 ... 1593 3325 56 0.006710 0.804215 138620.0 1 5.4 4 317
2 0007106c237e8689cc68b5111db1a174 3549 13709 8 0 0 0 0 0 76 ... 5753 11775 168 0.013797 0.826531 524877.0 1 8.1 17 1101
3 00094c34bd1a0f622b49c1b1b9274ae6 1167 4780 0 0 0 0 0 0 0 ... 2737 3884 125 0.011075 0.833980 325328.0 1 7.4 1 1101
4 000b01ac8e485e5a143ff577be88b853 1703 1181 0 0 0 0 0 0 32 ... 575 1052 59 0.018804 0.983293 304458.0 1 7.7 12 0

5 rows × 244 columns


In [9]:
combined_train_features.to_csv('data/combined-pe-features-vs264.csv', index=False)
