The original feature set of function calls extracted from the call graphs had
over 130,000 features. Truncating the function names reduces this to a feature
set of 71,319 distinct names, and chi-squared tests selecting the best 10% of
those features yield a final set of 1,561 call graph features.
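The truncation step itself happens upstream of this notebook; a minimal sketch of the idea, with the prefix length and helper name assumed rather than taken from the actual pipeline:
# Hypothetical sketch of the function-name truncation described above; the
# prefix length (8 here) and the helper name are assumptions, not the
# notebook's actual parameters.
def truncate_name(func_name, max_len=8):
    # Collapsing long, often-mangled names to a fixed-length prefix merges
    # many near-duplicate functions into a single feature column.
    return func_name[:max_len]

raw_names = ["CreateFileA", "CreateFileW", "sub_401000", "sub_401AB0"]
print(sorted(set(truncate_name(n) for n in raw_names)))
# -> ['CreateFi', 'sub_4010', 'sub_401A']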
In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# cross_validation is the pre-0.18 sklearn module (later renamed model_selection),
# hence the deprecation warning filters below.
from sklearn.cross_validation import cross_val_score, KFold
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
In [11]:
# Preliminary column name setup: load the truncated function names used as column headers.
all_column_names = []
with open('data/all-reduced-function-column-names.csv', 'r') as colf:
    for line in colf:
        # Strip the trailing newline so the last name on each line stays clean.
        all_column_names += line.strip().split(',')
print("Column Names: {:d}".format(len(all_column_names)))
In [3]:
# Load the call graph training features and the training labels.
# call_graph_features_train = pd.read_csv('data/call-graph-features-train.csv')
sorted_call_graph_features_train = pd.read_csv('data/sorted-call-graph-features-train.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
In [3]:
#sorted_call_graph_features_train = call_graph_features_train.sort_values(by='filename')
#sorted_call_graph_features_train.to_csv('data/sorted-call-graph-features-train.csv', index=False)
sorted_call_graph_features_train.head()
Out[3]:
In [4]:
sorted_train_labels.head()
Out[4]:
In [2]:
# Load the sorted function counts (1/4)
#call_graph_function_train_1 = pd.read_csv('/opt/kaggle/5833-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
#sorted_call_graph_function_train_1 = call_graph_function_train_1.sort_values(by='filename')
sorted_call_graph_function_train_1 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-1.csv')
sorted_call_graph_function_train_1.head()
Out[2]:
In [9]:
# Preliminary sorted-file generation only; call_graph_function_train_1 exists
# only if the commented-out load above has been run.
del(call_graph_function_train_1)
sorted_call_graph_function_train_1.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-1.csv', index=False)
sorted_call_graph_function_train_1.head()
Out[9]:
In [3]:
# Ok, so we still have 71,000+ features even after severely truncating the function names.
# This is a problem: processing such a huge sparse matrix requires a lot of memory.
# Solution 1: rent an AWS server with plenty-o-ram. (costs money and requires high bandwidth for file transfer)
# Solution 2: buy more RAM for my linux box. (costs money)
# Solution 3: break the sparse matrix into smaller chunks and process each individually. (ok)
# Solution 4: try the pandas sparse matrix data structure. (too slow)
sorted_call_graph_function_train_1.shape
Out[3]:
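Before walking through the full loop below, the chunking scheme in isolation: a small helper (the function name and signature are ours, not the notebook's) that produces the same column ranges the loop iterates over.
# Sketch of the column-chunking scheme used below (helper name is ours).
def column_chunks(n_cols, n_chunks=10, start=1):
    # 'start' skips the filename column; the final chunk absorbs the remainder.
    step = n_cols // n_chunks
    ranges = [(start, step)]
    ranges += [(i * step, (i + 1) * step) for i in range(1, n_chunks - 1)]
    ranges.append(((n_chunks - 1) * step, n_cols))
    return ranges

# Usage: for s, e in column_chunks(df.shape[1]): X = df.iloc[:, s:e]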
In [40]:
# Solution 3: slice the matrix into small chunks for processing.
# The pandas sparse matrix is still too slow, so split the columns into ten
# feature sets and reduce each one separately.
onetenth = int(sorted_call_graph_function_train_1.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
# Build the label vector once; it is identical for every column chunk.
y = []
for fname in sorted_call_graph_function_train_1['filename']:
    for idx2,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx2,1])
            break
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_1.iloc[:,startidx:endidx]
    print(X.shape)
    print(len(y))
    # Select the top 10 percent of features in this chunk by chi-squared score.
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    # get_support() returns indices relative to this chunk, so offset by
    # startidx (not by 1) to map back into the full dataframe.
    selected_names = fsp.get_support(indices=True) + startidx
    data_trimmed = sorted_call_graph_function_train_1.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_1['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write each reduced chunk to file as we do not have enough memory to keep them all.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
# Finish off the remaining columns.
print("Processing final column set {:d} -> {:d}".format(startidx, sorted_call_graph_function_train_1.shape[1]))
X = sorted_call_graph_function_train_1.iloc[:,startidx:]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True) + startidx
data_trimmed = sorted_call_graph_function_train_1.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_1['filename'])
data_reduced = data_fnames.join(data_trimmed)
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)
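The nested filename-matching loop above is quadratic in the number of samples; an equivalent vectorized lookup, assuming the label column of sorted_train_labels is named 'Class', would be:
# Faster label alignment via a pandas map (a sketch; assumes the label column
# is named 'Class' and every filename has a matching Id).
label_map = sorted_train_labels.set_index('Id')['Class']
y = sorted_call_graph_function_train_1['filename'].map(label_map).tolist()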
In [7]:
# Recombine the reduced chunks into a single frame by merging on filename.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
print("Processing file: {:s}".format(fname))
nextfc = pd.read_csv(fname)
reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
reduced_function_counts.head(20)
Out[7]:
In [8]:
reduced_function_counts.shape
Out[8]:
In [9]:
reduced_function_counts.to_csv('data/reduced-fcounts-1.csv', index=False)
In [12]:
# call_graph_function_train_2 = pd.read_csv('/opt/kaggle/27137-call-graph-function_counts.csv', header=None, names=all_column_names)
# Load the sorted function counts (2/4)
call_graph_function_train_2 = pd.read_csv('/opt/kaggle/5834-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
sorted_call_graph_function_train_2 = call_graph_function_train_2.sort_values(by='filename')
sorted_call_graph_function_train_2.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-2.csv', index=False)
sorted_call_graph_function_train_2.head()
del(call_graph_function_train_2)
In [ ]:
sorted_call_graph_function_train_2 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-2.csv')
sorted_call_graph_function_train_2.head()
In [14]:
# Solution 3: slice the matrix into small chunks for processing.
# The pandas sparse matrix is still too slow, so split the columns into ten
# feature sets and reduce each one separately.
onetenth = int(sorted_call_graph_function_train_2.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
# Build the label vector once; it is identical for every column chunk.
y = []
for fname in sorted_call_graph_function_train_2['filename']:
    for idx2,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx2,1])
            break
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_2.iloc[:,startidx:endidx]
    print(X.shape)
    print(len(y))
    # Select the top 10 percent of features in this chunk by chi-squared score.
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    # get_support() returns indices relative to this chunk, so offset by
    # startidx (not by 1) to map back into the full dataframe.
    selected_names = fsp.get_support(indices=True) + startidx
    data_trimmed = sorted_call_graph_function_train_2.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_2['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write each reduced chunk to file as we do not have enough memory to keep them all.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
# Finish off the remaining columns.
print("Processing final column set {:d} -> {:d}".format(startidx, sorted_call_graph_function_train_2.shape[1]))
X = sorted_call_graph_function_train_2.iloc[:,startidx:]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True) + startidx
data_trimmed = sorted_call_graph_function_train_2.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_2['filename'])
data_reduced = data_fnames.join(data_trimmed)
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)
In [15]:
# Recombine the reduced chunks into a single frame by merging on filename.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
print("Processing file: {:s}".format(fname))
nextfc = pd.read_csv(fname)
reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
reduced_function_counts.head(20)
Out[15]:
In [16]:
reduced_function_counts.shape
Out[16]:
In [17]:
reduced_function_counts.to_csv('data/reduced-fcounts-2.csv', index=False)
In [18]:
#call_graph_function_train_3 = pd.read_csv('/opt/kaggle/27138-call-graph-function_counts.csv', header=None, names=all_column_names)
# Load the sorted function counts (3/4)
call_graph_function_train_3 = pd.read_csv('/opt/kaggle/5835-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
sorted_call_graph_function_train_3 = call_graph_function_train_3.sort_values(by='filename')
sorted_call_graph_function_train_3.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-3.csv', index=False)
sorted_call_graph_function_train_3.head()
del(call_graph_function_train_3)
In [19]:
sorted_call_graph_function_train_3 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-3.csv')
sorted_call_graph_function_train_3.head()
Out[19]:
In [20]:
# Solution 3: slice the matrix into small chunks for processing.
# The pandas sparse matrix is still too slow, so split the columns into ten
# feature sets and reduce each one separately.
onetenth = int(sorted_call_graph_function_train_3.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
# Build the label vector once; it is identical for every column chunk.
y = []
for fname in sorted_call_graph_function_train_3['filename']:
    for idx2,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx2,1])
            break
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_3.iloc[:,startidx:endidx]
    print(X.shape)
    print(len(y))
    # Select the top 10 percent of features in this chunk by chi-squared score.
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    # get_support() returns indices relative to this chunk, so offset by
    # startidx (not by 1) to map back into the full dataframe.
    selected_names = fsp.get_support(indices=True) + startidx
    data_trimmed = sorted_call_graph_function_train_3.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_3['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write each reduced chunk to file as we do not have enough memory to keep them all.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
# Finish off the remaining columns.
print("Processing final column set {:d} -> {:d}".format(startidx, sorted_call_graph_function_train_3.shape[1]))
X = sorted_call_graph_function_train_3.iloc[:,startidx:]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True) + startidx
data_trimmed = sorted_call_graph_function_train_3.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_3['filename'])
data_reduced = data_fnames.join(data_trimmed)
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)
In [21]:
# Recombine the reduced chunks into a single frame by merging on filename.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
print("Processing file: {:s}".format(fname))
nextfc = pd.read_csv(fname)
reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
reduced_function_counts.head(20)
Out[21]:
In [22]:
reduced_function_counts.shape
Out[22]:
In [23]:
reduced_function_counts.to_csv('data/reduced-fcounts-3.csv', index=False)
In [26]:
#call_graph_function_train_4 = pd.read_csv('/opt/kaggle/27139-call-graph-function_counts.csv', header=None, names=all_column_names)
# Load the sorted function counts (4/4)
call_graph_function_train_4 = pd.read_csv('/opt/kaggle/5836-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
sorted_call_graph_function_train_4 = call_graph_function_train_4.sort_values(by='filename')
sorted_call_graph_function_train_4.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-4.csv', index=False)
sorted_call_graph_function_train_4.head()
del(call_graph_function_train_4)
In [27]:
sorted_call_graph_function_train_4 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-4.csv')
sorted_call_graph_function_train_4.head()
Out[27]:
In [28]:
# Solution 3: slice the matrix into small chunks for processing.
# The pandas sparse matrix is still too slow, so split the columns into ten
# feature sets and reduce each one separately.
onetenth = int(sorted_call_graph_function_train_4.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
# Build the label vector once; it is identical for every column chunk.
y = []
for fname in sorted_call_graph_function_train_4['filename']:
    for idx2,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx2,1])
            break
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_4.iloc[:,startidx:endidx]
    print(X.shape)
    print(len(y))
    # Select the top 10 percent of features in this chunk by chi-squared score.
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    # get_support() returns indices relative to this chunk, so offset by
    # startidx (not by 1) to map back into the full dataframe.
    selected_names = fsp.get_support(indices=True) + startidx
    data_trimmed = sorted_call_graph_function_train_4.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_4['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write each reduced chunk to file as we do not have enough memory to keep them all.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
# Finish off the remaining columns.
print("Processing final column set {:d} -> {:d}".format(startidx, sorted_call_graph_function_train_4.shape[1]))
X = sorted_call_graph_function_train_4.iloc[:,startidx:]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True) + startidx
data_trimmed = sorted_call_graph_function_train_4.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_4['filename'])
data_reduced = data_fnames.join(data_trimmed)
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)
In [29]:
# Recombine the reduced chunks into a single frame by merging on filename.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
print("Processing file: {:s}".format(fname))
nextfc = pd.read_csv(fname)
reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
reduced_function_counts.head(20)
Out[29]:
In [30]:
reduced_function_counts.shape
Out[30]:
In [31]:
reduced_function_counts.to_csv('data/reduced-fcounts-4.csv', index=False)
In [4]:
# Now recombine the four reduced sets and run the chi-squared tests again.
# Build a list of dataframes and call pd.concat once; this is more efficient
# than concatenating repeatedly.
dflist = []
for idx in range(1,5):
fname = "data/reduced-fcounts-" + str(idx) + ".csv"
print("Processing file: {:s}".format(fname))
dflist.append(pd.read_csv(fname))
reduced_function_counts = pd.concat(dflist, ignore_index=True)
# Replace all the NaN values with 0: a NaN simply means that function name
# never appeared in that sample's call graph.
reduced_function_counts.fillna(0, inplace=True)
sorted_reduced_function_counts = reduced_function_counts.sort_values(by='filename')
final_reduced_function_counts = sorted_call_graph_features_train.merge(sorted_reduced_function_counts, on='filename')
final_reduced_function_counts.head(20)
Out[4]:
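One caveat before the selection step that follows: sklearn's chi2 scorer requires non-negative feature values. Function call counts satisfy this by construction, but an explicit check (our addition, not in the original notebook) documents the assumption:
# chi2 accepts only non-negative features; counts satisfy this, but verify
# before fitting (our addition).
assert (final_reduced_function_counts.iloc[:, 1:].values >= 0).all()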
In [6]:
# Select the top 10 percent of features by chi-squared score.
X = final_reduced_function_counts.iloc[:,1:]
y = sorted_train_labels.iloc[:,1]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
# X starts at column 1, so offset the support indices by 1 to index the full frame.
selected_names = fsp.get_support(indices=True) + 1
data_trimmed = final_reduced_function_counts.iloc[:,selected_names]
data_fnames = pd.DataFrame(final_reduced_function_counts['filename'])
data_reduced = data_fnames.join(data_trimmed)
# Write to file as we do not have enough memory.
filename = "data/final-call-graph-features-10percent.csv"
data_reduced.to_csv(filename, index=False)
data_reduced.head(20)
Out[6]:
In [10]:
def run_cv(X, y, clf):
    # Construct a 10-fold cross-validation object (pre-0.18 sklearn KFold API).
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))  # probabilities for the 9 malware classes
    y_pred = np.zeros(len(y))
    # Iterate through the folds, fitting on 9/10 of the data and
    # predicting the held-out fold.
    for train_index, test_index in kf:
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]
        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    return y_prob, y_pred
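run_cv relies on the pre-0.18 cross_validation API imported at the top. For readers on a current sklearn, an equivalent would look like this (a sketch, not what this notebook ran):
# Equivalent cross-validation on sklearn >= 0.18 (not used in this notebook).
from sklearn.model_selection import KFold as ModernKFold

def run_cv_modern(X, y, clf):
    # n_splits replaces n_folds, and the fold indices come from kf.split(X).
    kf = ModernKFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))
    for train_index, test_index in kf.split(X):
        clf.fit(X.iloc[train_index, :], y[train_index])
        y_prob[test_index] = clf.predict_proba(X.iloc[test_index, :])
        y_pred[test_index] = clf.predict(X.iloc[test_index, :])
    return y_prob, y_pred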
In [7]:
# Convert the label series to a numpy array for positional indexing in run_cv.
ytrain = np.array(y)
X = data_reduced.iloc[:,1:]
X.shape
Out[7]:
In [8]:
y.shape
Out[8]:
In [11]:
# Now we can test a hypothesis: train an ExtraTrees classifier and evaluate it with 10-fold CV.
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,ytrain,clf1)
print("logloss = ", log_loss(y, p1))
print("score = ", accuracy_score(ytrain, pred1))
cm = confusion_matrix(y, pred1)
print(cm)
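To make the raw confusion matrix easier to read, row-normalizing it gives per-class recall; a small follow-up sketch (our addition):
# Row-normalize the confusion matrix so each row shows per-class recall.
cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
np.set_printoptions(precision=2, suppress=True)
print(cm_norm)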