1. Reduce Call Graph Feature Sets

   The original feature set of function calls extracted from the call graphs had
   over 130,000 features. Reducing the feature set by truncating the function names
   produces a feature set of 71,319 function names. Using chi-squared tests to select
   the best 10% of features produces a set of 1561 call graph features.

In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score, KFold
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

2. Load The Call Graph Training Data Features


In [ ]:


In [ ]:


In [11]:
# Preliminary column name setup.
colf = open('data/all-reduced-function-column-names.csv', 'r')
all_column_names = []
column_lines = colf.readlines()
for line in column_lines:
    all_column_names += line.split(',')
    
col_names_len = len(all_column_names)
colf.close()
print("Column Names: {:d}".format(col_names_len))


Column Names: 71319

In [3]:
# First load the .asm training data and training labels
# call_graph_features_train = pd.read_csv('data/call-graph-features-train.csv')
sorted_call_graph_features_train = pd.read_csv('data/sorted-call-graph-features-train.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')

In [3]:
#sorted_call_graph_features_train = call_graph_features_train.sort_values(by='filename')
#sorted_call_graph_features_train.to_csv('data/sorted-call-graph-features-train.csv', index=False)
sorted_call_graph_features_train.head()


Out[3]:
filename vertex_count edge_count delta_max density
0 01IsoiSMh5gxyDYTl4CB 274 333 137 0.081319
1 01SuzwMJEIXsK7A8dQbl 187 196 82 0.181314
2 01azqd4InC7m9JpocGv5 158 1533 95 0.140927
3 01jsnpXSAlgw6aPeDxrU 26 126 35 0.600000
4 01kcPWA9K2BOxQeS5Rju 61 53 24 0.504762

In [4]:
sorted_train_labels.head()


Out[4]:
Id Class
0 01IsoiSMh5gxyDYTl4CB 2
1 01SuzwMJEIXsK7A8dQbl 8
2 01azqd4InC7m9JpocGv5 9
3 01jsnpXSAlgw6aPeDxrU 9
4 01kcPWA9K2BOxQeS5Rju 1

In [ ]:

2.2 Process 1 of 4 Function Count Files.


In [2]:
# Load the sorted function counts (1/4)
#call_graph_function_train_1 = pd.read_csv('/opt/kaggle/5833-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
#sorted_call_graph_function_train_1 = call_graph_function_train_1.sort_values(by='filename')
sorted_call_graph_function_train_1 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-1.csv')
sorted_call_graph_function_train_1.head()


Out[2]:
filename DialogFu sub_4 edx+1 fn eax+7 start $+5 esi ebp+var_ ... _4570C4 _4570C0 _4570C8 _457A58 _457A60 _457A5C _40F0D8 1E387BAE "3AC83716 "
0 01kcPWA9K2BOxQeS5Rju 0 0 0 0 0 0 0 0 3 ... 0 0 0 0 0 0 0 0 0 0
1 04EjIdbPV5e1XroFOpiN 0 0 22 5 0 0 0 0 117 ... 0 0 0 0 0 0 0 0 0 0
2 04QzZ3DVdPsEp9elLR65 0 0 104 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 04sJnMaORYc1SV5pKjrP 0 0 0 0 0 0 1 0 1 ... 0 0 0 0 0 0 0 0 0 0
4 05Kps4iFw8mOLJZQrb1H 0 0 155 0 0 0 0 0 9 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 71319 columns


In [9]:
# Preliminary sorted file generation only.
del(call_graph_function_train_1)
sorted_call_graph_function_train_1.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-1.csv', index=False)
sorted_call_graph_function_train_1.head()


Out[9]:
filename DialogFu sub_4 edx+1 fn eax+7 start $+5 esi ebp+var_ ... _4570C4 _4570C0 _4570C8 _457A58 _457A60 _457A5C _40F0D8 1E387BAE "3AC83716 "
1478 01kcPWA9K2BOxQeS5Rju 0 0 0 0 0 0 0 0 3 ... 0 0 0 0 0 0 0 0 0 0
775 04EjIdbPV5e1XroFOpiN 0 0 22 5 0 0 0 0 117 ... 0 0 0 0 0 0 0 0 0 0
1536 04QzZ3DVdPsEp9elLR65 0 0 104 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1180 04sJnMaORYc1SV5pKjrP 0 0 0 0 0 0 1 0 1 ... 0 0 0 0 0 0 0 0 0 0
941 05Kps4iFw8mOLJZQrb1H 0 0 155 0 0 0 0 0 9 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 71319 columns


In [3]:
# Ok, so we still have 71000+ features even after severely reducing the function name lengths.
# This is a problem. Having to process such a huge sparse matrix requires a lot of memory.
# Solution 1: rent an AWS server with plenty-o-ram. (costs money and requires high bandwidth for file transfer)
# Solution 2: buy more RAM for my linux box. (costs money)
# Solution 3: break the sparse matrix into smaller chunks and process individually. (Ok)
# Solution 4: try the pandas sparse matrix data structure. (too slow)
sorted_call_graph_function_train_1.shape


Out[3]:
(2666, 71319)

In [40]:
# Solution 3: slice the matrix into small chunks for processing.
# the pandas spare matrix still takes too long, break up into 10 different feature sets and try again.
onetenth = int(sorted_call_graph_function_train_1.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_1.iloc[:,startidx:endidx]
    y = []
    train_names = sorted_train_labels['Id']
    for fname in sorted_call_graph_function_train_1['filename']:
        # print("Appending {:s}".format(fname))
        for idx2,fname2 in enumerate(sorted_train_labels['Id']):
            if (fname2 == fname):
                y.append(sorted_train_labels.iloc[idx2,1])
                break

    # Find the top 10 percent variance features.
    print(X.shape)
    print(len(y))
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1
    data_trimmed = sorted_call_graph_function_train_1.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_1['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write to file as we do not have enough memory.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
    
    
# finish of the remaining columns
print("Processing final column set {:d} -> {:d}".format(startidx, endidx))
X = sorted_call_graph_function_train_1.iloc[:,startidx:]
y = []
train_names = sorted_train_labels['Id']
for fname in sorted_call_graph_function_train_1['filename']:
    for idx1,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx1,1])
            break

# Find the top 10 percent variance features.
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
data_trimmed = sorted_call_graph_function_train_1.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_1['filename'])
data_reduced = data_fnames.join(data_trimmed)
# Write to file as we do not have enough memory.
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)


Processing column set 1 -> 7131
(2666, 7130)
2666
Processing column set 7131 -> 14262
(2666, 7131)
2666
Processing column set 14262 -> 21393
(2666, 7131)
2666
Processing column set 21393 -> 28524
(2666, 7131)
2666
Processing column set 28524 -> 35655
(2666, 7131)
2666
Processing column set 35655 -> 42786
(2666, 7131)
2666
Processing column set 42786 -> 49917
(2666, 7131)
2666
Processing column set 49917 -> 57048
(2666, 7131)
2666
Processing column set 57048 -> 64179
(2666, 7131)
2666
Processing final column set 64179 -> 71310

In [7]:
# Now recombine the reduced sets and perform chi-squared tests again.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
    fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
    print("Processing file: {:s}".format(fname))
    nextfc = pd.read_csv(fname)
    reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
    

reduced_function_counts.head(20)


Processing file: data/sorted-function-counts-2-10perc.csv
Processing file: data/sorted-function-counts-3-10perc.csv
Processing file: data/sorted-function-counts-4-10perc.csv
Processing file: data/sorted-function-counts-5-10perc.csv
Processing file: data/sorted-function-counts-6-10perc.csv
Processing file: data/sorted-function-counts-7-10perc.csv
Processing file: data/sorted-function-counts-8-10perc.csv
Processing file: data/sorted-function-counts-9-10perc.csv
Processing file: data/sorted-function-counts-10-10perc.csv
Out[7]:
filename edx+1_x fn_x start_x $+5_x ebp+var__x __securi_x eax_x ebx_x edi_x ... MoveFile __itoa PulseEve_y _432D1C_y About ExtAbout_y GetProdu glEvalPo_y glIndexf_y glIndexi
0 01kcPWA9K2BOxQeS5Rju 0 0 0 0 3 0 2 3 2 ... 0 0 0 0 0 0 0 0 0 0
1 04EjIdbPV5e1XroFOpiN 22 5 0 0 117 0 31 53 8 ... 0 0 0 0 0 0 0 0 0 0
2 04QzZ3DVdPsEp9elLR65 104 0 0 0 0 0 0 1 0 ... 0 0 0 5 0 0 0 0 0 0
3 04sJnMaORYc1SV5pKjrP 0 0 0 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 05Kps4iFw8mOLJZQrb1H 155 0 0 0 9 0 0 7 1 ... 0 0 0 0 0 0 0 0 0 0
5 05rJTUWYAKNegBk2wE8X 0 12 5 0 563 2 0 75 149 ... 0 0 0 0 0 0 0 0 0 0
6 06aLOj8EUXMByS423sum 343 0 0 0 9 0 0 8 2 ... 0 0 0 0 0 0 0 0 0 0
7 06osXqPUVM1HbvBGNncT 326 0 0 1 1 0 0 110 0 ... 0 0 0 0 0 0 0 0 0 0
8 09sXMJUHwQWVanrhzAoT 29 0 0 1 9 0 0 9 10 ... 0 0 0 0 0 0 0 0 0 0
9 0AguvpOCcaf2myVDYFGb 20 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10 0BIdbVDEgmPwjYF4xzir 148 0 0 0 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
11 0BKcmNv4iGY2hsVSaXJ6 0 0 0 1 1 0 0 0 0 ... 0 0 0 3 0 0 0 0 0 0
12 0BY2iPso3bEmudlUzpfq 31 0 0 1 0 1 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
13 0BZQIJak6Pu2tyAXfrzR 35 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
14 0DM3hS6Gg2QVKb1fZydv 14 0 0 0 6 0 0 6 0 ... 0 0 0 0 0 0 0 0 1 0
15 0DTs2PhZfCwEv7q8349K 0 0 0 1 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
16 0DbLeKSoxu47wjqVHsi9 255 0 0 0 6 25 26 7 0 ... 2 1 9 0 0 0 0 0 0 0
17 0DqUX5rkg3IbMY6BLGCE 0 21 1 0 13 0 0 19 21 ... 0 0 0 0 0 0 0 0 0 0
18 0EAdHtLDypMcwjTFJziC 169 0 0 0 8 0 0 7 0 ... 0 0 0 0 0 0 0 0 0 0
19 0G2RV1chBlIbkt6JqA5Q 174 0 0 0 7 0 0 7 0 ... 0 0 0 0 0 0 0 0 0 0

20 rows × 7132 columns


In [8]:
reduced_function_counts.shape


Out[8]:
(2666, 7132)

In [9]:
reduced_function_counts.to_csv('data/reduced-fcounts-1.csv', index=False)

In [ ]:
help(pd.read_csv)

2.3 Process 2 of 4 Function Count Files.


In [12]:
# call_graph_function_train_2 = pd.read_csv('/opt/kaggle/27137-call-graph-function_counts.csv', header=None, names=all_column_names)
# Load the sorted function counts (2/4)
call_graph_function_train_2 = pd.read_csv('/opt/kaggle/5834-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
sorted_call_graph_function_train_2 = call_graph_function_train_2.sort_values(by='filename')
sorted_call_graph_function_train_2.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-2.csv', index=False)
sorted_call_graph_function_train_2.head()
del(call_graph_function_train_2)

In [ ]:
sorted_call_graph_function_train_2 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-2.csv')
sorted_call_graph_function_train_2.head()

In [14]:
# Solution 3: slice the matrix into small chunks for processing.
# the pandas spare matrix still takes too long, break up into 10 different feature sets and try again.
onetenth = int(sorted_call_graph_function_train_2.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_2.iloc[:,startidx:endidx]
    y = []
    train_names = sorted_train_labels['Id']
    for fname in sorted_call_graph_function_train_2['filename']:
        # print("Appending {:s}".format(fname))
        for idx2,fname2 in enumerate(sorted_train_labels['Id']):
            if (fname2 == fname):
                y.append(sorted_train_labels.iloc[idx2,1])
                break

    # Find the top 10 percent variance features.
    print(X.shape)
    print(len(y))
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1
    data_trimmed = sorted_call_graph_function_train_2.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_2['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write to file as we do not have enough memory.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
    
    
# finish of the remaining columns
print("Processing final column set {:d} -> {:d}".format(startidx, endidx))
X = sorted_call_graph_function_train_2.iloc[:,startidx:]
y = []
train_names = sorted_train_labels['Id']
for fname in sorted_call_graph_function_train_2['filename']:
    for idx1,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx1,1])
            break

# Find the top 10 percent variance features.
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
data_trimmed = sorted_call_graph_function_train_2.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_2['filename'])
data_reduced = data_fnames.join(data_trimmed)
# Write to file as we do not have enough memory.
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)


Processing column set 1 -> 7131
(2724, 7130)
2724
Writing file: data/sorted-function-counts-1-10perc.csv
Processing column set 7131 -> 14262
(2724, 7131)
2724
Writing file: data/sorted-function-counts-2-10perc.csv
Processing column set 14262 -> 21393
(2724, 7131)
2724
Writing file: data/sorted-function-counts-3-10perc.csv
Processing column set 21393 -> 28524
(2724, 7131)
2724
Writing file: data/sorted-function-counts-4-10perc.csv
Processing column set 28524 -> 35655
(2724, 7131)
2724
Writing file: data/sorted-function-counts-5-10perc.csv
Processing column set 35655 -> 42786
(2724, 7131)
2724
Writing file: data/sorted-function-counts-6-10perc.csv
Processing column set 42786 -> 49917
(2724, 7131)
2724
Writing file: data/sorted-function-counts-7-10perc.csv
Processing column set 49917 -> 57048
(2724, 7131)
2724
Writing file: data/sorted-function-counts-8-10perc.csv
Processing column set 57048 -> 64179
(2724, 7131)
2724
Writing file: data/sorted-function-counts-9-10perc.csv
Processing final column set 64179 -> 71310

In [15]:
# Now recombine the reduced sets and perform chi-squared tests again.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
    fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
    print("Processing file: {:s}".format(fname))
    nextfc = pd.read_csv(fname)
    reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
    

reduced_function_counts.head(20)


Processing file: data/sorted-function-counts-2-10perc.csv
Processing file: data/sorted-function-counts-3-10perc.csv
Processing file: data/sorted-function-counts-4-10perc.csv
Processing file: data/sorted-function-counts-5-10perc.csv
Processing file: data/sorted-function-counts-6-10perc.csv
Processing file: data/sorted-function-counts-7-10perc.csv
Processing file: data/sorted-function-counts-8-10perc.csv
Processing file: data/sorted-function-counts-9-10perc.csv
Processing file: data/sorted-function-counts-10-10perc.csv
Out[15]:
filename edx+1_x fn_x start_x $+5_x ebp+var__x __securi_x eax_x ebx_x edi_x ... MoveFile_y __itoa PulseEve_y _432D1C_y About ExtAbout GetProdu glEvalPo_y glIndexf glIndexi
0 02K5GMYITj7bBoAisEmD 255 0 0 0 6 25 26 7 0 ... 2 1 9 0 0 0 0 0 0 0
1 02MRILoE6rNhmt7FUi45 113 0 0 0 7 0 0 8 0 ... 0 0 0 0 0 0 0 0 0 1
2 02mlBLHZTDFXGa7Nt6cr 44 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
3 04BfoQRA6XEshiNuI7pF 0 0 0 1 0 0 0 1 0 ... 0 0 0 10 0 0 0 0 0 0
4 04mcPSei852tgIKUwTJr 148 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 05EeG39MTRrI6VY21DPd 0 0 0 0 7 0 19 38 6 ... 0 0 0 0 0 0 0 0 0 0
6 06KfrF7ltESna2ZHPVp5 0 0 0 1 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
7 06QinlpeFIWj8qHc7Vys 53 0 0 1 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
8 06arUi9q3wHS2C8RZxeB 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9 07ECKjDTyQLnabNoxrIH 1000 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10 08BX5Slp2I1FraZWbc6j 2231 0 0 1 191 1 0 5 58 ... 0 0 0 0 0 0 0 0 0 0
11 09LXtWxm1EbK5uVqcQS3 150 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0
12 0A32eTdBKayjCWhZqDOQ 141 0 0 0 8 0 0 9 0 ... 0 0 0 0 0 0 0 0 0 0
13 0ASH2csN7k8jZyoRaqtn 55 0 0 0 6 0 0 6 0 ... 0 0 0 0 0 0 0 0 0 0
14 0AwWs42SUQ19mI7eDcTC 0 2 8 0 40 1 0 31 14 ... 0 0 0 0 0 0 0 0 0 0
15 0BFIPv1rO83whtpMYyAs 55 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
16 0Cq4wfhLrKBJiut1lYAZ 84 0 0 0 0 8 0 10 0 ... 0 0 0 0 0 0 0 0 0 0
17 0D9IedmC1viTPugLRWX6 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
18 0DNVFKwYlcjO7bTfJ5p1 0 0 0 0 51 4 84 55 10 ... 0 0 0 0 0 0 0 0 0 0
19 0FdOaDWrfBU6TqwCRYxA 329 0 0 0 0 0 0 110 0 ... 0 0 0 0 0 0 0 0 0 0

20 rows × 7132 columns


In [16]:
reduced_function_counts.shape


Out[16]:
(2724, 7132)

In [17]:
reduced_function_counts.to_csv('data/reduced-fcounts-2.csv', index=False)

2.4 Process 3 of 4 Function Count Files.


In [18]:
#call_graph_function_train_3 = pd.read_csv('/opt/kaggle/27138-call-graph-function_counts.csv', header=None, names=all_column_names)
# Load the sorted function counts (3/4)
call_graph_function_train_3 = pd.read_csv('/opt/kaggle/5835-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
sorted_call_graph_function_train_3 = call_graph_function_train_3.sort_values(by='filename')
sorted_call_graph_function_train_3.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-3.csv', index=False)
sorted_call_graph_function_train_3.head()
del(call_graph_function_train_3)

In [19]:
sorted_call_graph_function_train_3 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-3.csv')
sorted_call_graph_function_train_3.head()


Out[19]:
filename DialogFu sub_4 edx+1 fn eax+7 start $+5 esi ebp+var_ ... _4570C4 _4570C0 _4570C8 _457A58 _457A60 _457A5C _40F0D8 1E387BAE "3AC83716 "
94 01jsnpXSAlgw6aPeDxrU 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2345 02zcUmKV16Lya5xqnPGB 0 0 214 0 0 0 0 0 7 ... 0 0 0 0 0 0 0 0 0 0
450 04cvLCVPqBMs6yn5xGlE 0 0 328 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1451 04hSzLv5s2TDYPlcgpHB 0 0 81 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1402 05IXcWGxvnkto4sq17zZ 0 0 169 10 0 0 0 0 46 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 71319 columns


In [20]:
# Solution 3: slice the matrix into small chunks for processing.
# the pandas spare matrix still takes too long, break up into 10 different feature sets and try again.
onetenth = int(sorted_call_graph_function_train_3.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_3.iloc[:,startidx:endidx]
    y = []
    train_names = sorted_train_labels['Id']
    for fname in sorted_call_graph_function_train_3['filename']:
        # print("Appending {:s}".format(fname))
        for idx2,fname2 in enumerate(sorted_train_labels['Id']):
            if (fname2 == fname):
                y.append(sorted_train_labels.iloc[idx2,1])
                break

    # Find the top 10 percent variance features.
    print(X.shape)
    print(len(y))
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1
    data_trimmed = sorted_call_graph_function_train_3.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_3['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write to file as we do not have enough memory.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
    
    
# finish of the remaining columns
print("Processing final column set {:d} -> {:d}".format(startidx, endidx))
X = sorted_call_graph_function_train_3.iloc[:,startidx:]
y = []
train_names = sorted_train_labels['Id']
for fname in sorted_call_graph_function_train_3['filename']:
    for idx1,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx1,1])
            break

# Find the top 10 percent variance features.
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
data_trimmed = sorted_call_graph_function_train_3.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_3['filename'])
data_reduced = data_fnames.join(data_trimmed)
# Write to file as we do not have enough memory.
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)


Processing column set 1 -> 7131
(2742, 7130)
2742
Writing file: data/sorted-function-counts-1-10perc.csv
Processing column set 7131 -> 14262
(2742, 7131)
2742
Writing file: data/sorted-function-counts-2-10perc.csv
Processing column set 14262 -> 21393
(2742, 7131)
2742
Writing file: data/sorted-function-counts-3-10perc.csv
Processing column set 21393 -> 28524
(2742, 7131)
2742
Writing file: data/sorted-function-counts-4-10perc.csv
Processing column set 28524 -> 35655
(2742, 7131)
2742
Writing file: data/sorted-function-counts-5-10perc.csv
Processing column set 35655 -> 42786
(2742, 7131)
2742
Writing file: data/sorted-function-counts-6-10perc.csv
Processing column set 42786 -> 49917
(2742, 7131)
2742
Writing file: data/sorted-function-counts-7-10perc.csv
Processing column set 49917 -> 57048
(2742, 7131)
2742
Writing file: data/sorted-function-counts-8-10perc.csv
Processing column set 57048 -> 64179
(2742, 7131)
2742
Writing file: data/sorted-function-counts-9-10perc.csv
Processing final column set 64179 -> 71310

In [21]:
# Now recombine the reduced sets and perform chi-squared tests again.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
    fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
    print("Processing file: {:s}".format(fname))
    nextfc = pd.read_csv(fname)
    reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
    

reduced_function_counts.head(20)


Processing file: data/sorted-function-counts-2-10perc.csv
Processing file: data/sorted-function-counts-3-10perc.csv
Processing file: data/sorted-function-counts-4-10perc.csv
Processing file: data/sorted-function-counts-5-10perc.csv
Processing file: data/sorted-function-counts-6-10perc.csv
Processing file: data/sorted-function-counts-7-10perc.csv
Processing file: data/sorted-function-counts-8-10perc.csv
Processing file: data/sorted-function-counts-9-10perc.csv
Processing file: data/sorted-function-counts-10-10perc.csv
Out[21]:
filename edx+1_x fn_x start_x $+5_x ebp+var__x __securi_x eax_x ebx_x edi_x ... MoveFile __itoa_y PulseEve _432D1C_y About ExtAbout GetProdu glEvalPo_y glIndexf_y glIndexi_y
0 01jsnpXSAlgw6aPeDxrU 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 02zcUmKV16Lya5xqnPGB 214 0 0 0 7 0 0 7 0 ... 0 0 0 0 0 0 0 0 0 0
2 04cvLCVPqBMs6yn5xGlE 328 0 0 0 0 0 0 110 0 ... 0 0 0 0 0 0 0 0 0 0
3 04hSzLv5s2TDYPlcgpHB 81 0 0 0 0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 05IXcWGxvnkto4sq17zZ 169 10 0 0 46 6 24 9 4 ... 2 1 0 0 0 0 0 0 0 0
5 05LHG8fR3iPn6agIo9z7 43 0 0 0 1 0 0 4 1 ... 0 0 0 0 0 0 0 0 0 0
6 065EZhxgbLRSHsB87uIF 157 0 0 0 10 0 0 7 0 ... 0 0 0 0 0 0 0 0 1 1
7 07iSOIG2urUvsMl9E5Rn 0 0 0 0 7 0 0 7 1 ... 0 0 0 0 0 0 0 0 0 1
8 09CPNMYyQjSguFrE8UOf 557 0 0 1 0 0 0 694 0 ... 0 0 0 0 0 0 0 0 0 0
9 09bfacpUzuBN5W3S8KTo 0 0 0 1 0 0 0 40 1 ... 0 0 0 0 0 0 0 0 0 0
10 0ACDbR5M3ZhBJajygTuf 2 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11 0AV6MPlrTWG4fYI7NBtQ 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12 0B2RwKm6dq9fjUWDNIOa 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
13 0BLbmzJRkjNynCgQIdtV 326 0 0 1 1 0 0 110 0 ... 0 0 0 0 0 0 0 0 0 0
14 0CPaAXtyswrBq83D6VEg 1531 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15 0CzL6rfwaTqGOu9eghBt 326 0 0 1 0 0 0 110 0 ... 0 0 0 0 0 0 0 0 0 0
16 0Dk7Wd8MERu3b5rmQzCK 0 0 0 1 0 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
17 0Eo9qT6idXHDMebwmvPA 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
18 0F4qIHaR7xOrm19Set3o 178 0 0 0 7 0 0 7 0 ... 0 0 0 0 0 0 0 0 0 0
19 0FOXjzmnD9CUMVcSlEqh 494 0 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 0 0

20 rows × 7132 columns


In [22]:
reduced_function_counts.shape


Out[22]:
(2742, 7132)

In [23]:
reduced_function_counts.to_csv('data/reduced-fcounts-3.csv', index=False)

In [ ]:


In [ ]:

2.5 Process 4 of 4 Function Count Files.


In [26]:
#call_graph_function_train_4 = pd.read_csv('/opt/kaggle/27139-call-graph-function_counts.csv', header=None, names=all_column_names)
# Load the sorted function counts (2/4)
call_graph_function_train_4 = pd.read_csv('/opt/kaggle/5836-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
sorted_call_graph_function_train_4 = call_graph_function_train_4.sort_values(by='filename')
sorted_call_graph_function_train_4.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-4.csv', index=False)
sorted_call_graph_function_train_4.head()
del(call_graph_function_train_4)

In [27]:
sorted_call_graph_function_train_4 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-4.csv')
sorted_call_graph_function_train_4.head()


Out[27]:
filename DialogFu sub_4 edx+1 fn eax+7 start $+5 esi ebp+var_ ... _4570C4 _4570C0 _4570C8 _457A58 _457A60 _457A5C _40F0D8 1E387BAE "3AC83716 "
1749 01IsoiSMh5gxyDYTl4CB 0 0 216 0 0 0 0 0 10 ... 0 0 0 0 0 0 0 0 0 0
663 01SuzwMJEIXsK7A8dQbl 0 0 80 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1106 01azqd4InC7m9JpocGv5 0 0 1632 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
508 02IOCvYEy8mjiuAQHax3 0 0 0 0 0 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
1539 02JqQ7H3yEoD8viYWlmS 0 0 14 0 0 0 0 0 6 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 71319 columns


In [28]:
# Solution 3: slice the matrix into small chunks for processing.
# the pandas spare matrix still takes too long, break up into 10 different feature sets and try again.
onetenth = int(sorted_call_graph_function_train_4.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_4.iloc[:,startidx:endidx]
    y = []
    train_names = sorted_train_labels['Id']
    for fname in sorted_call_graph_function_train_4['filename']:
        # print("Appending {:s}".format(fname))
        for idx2,fname2 in enumerate(sorted_train_labels['Id']):
            if (fname2 == fname):
                y.append(sorted_train_labels.iloc[idx2,1])
                break

    # Find the top 10 percent variance features.
    print(X.shape)
    print(len(y))
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    selected_names = fsp.get_support(indices=True)
    selected_names = selected_names + 1
    data_trimmed = sorted_call_graph_function_train_4.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_4['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write to file as we do not have enough memory.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
    
    
# finish of the remaining columns
print("Processing final column set {:d} -> {:d}".format(startidx, endidx))
X = sorted_call_graph_function_train_4.iloc[:,startidx:]
y = []
train_names = sorted_train_labels['Id']
for fname in sorted_call_graph_function_train_4['filename']:
    for idx1,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx1,1])
            break

# Find the top 10 percent variance features.
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
data_trimmed = sorted_call_graph_function_train_4.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_4['filename'])
data_reduced = data_fnames.join(data_trimmed)
# Write to file as we do not have enough memory.
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)


Processing column set 1 -> 7131
(2736, 7130)
2736
Writing file: data/sorted-function-counts-1-10perc.csv
Processing column set 7131 -> 14262
(2736, 7131)
2736
Writing file: data/sorted-function-counts-2-10perc.csv
Processing column set 14262 -> 21393
(2736, 7131)
2736
Writing file: data/sorted-function-counts-3-10perc.csv
Processing column set 21393 -> 28524
(2736, 7131)
2736
Writing file: data/sorted-function-counts-4-10perc.csv
Processing column set 28524 -> 35655
(2736, 7131)
2736
Writing file: data/sorted-function-counts-5-10perc.csv
Processing column set 35655 -> 42786
(2736, 7131)
2736
Writing file: data/sorted-function-counts-6-10perc.csv
Processing column set 42786 -> 49917
(2736, 7131)
2736
Writing file: data/sorted-function-counts-7-10perc.csv
Processing column set 49917 -> 57048
(2736, 7131)
2736
Writing file: data/sorted-function-counts-8-10perc.csv
Processing column set 57048 -> 64179
(2736, 7131)
2736
Writing file: data/sorted-function-counts-9-10perc.csv
Processing final column set 64179 -> 71310

In [29]:
# Now recombine the reduced sets and perform chi-squared tests again.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
    fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
    print("Processing file: {:s}".format(fname))
    nextfc = pd.read_csv(fname)
    reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
    

reduced_function_counts.head(20)


Processing file: data/sorted-function-counts-2-10perc.csv
Processing file: data/sorted-function-counts-3-10perc.csv
Processing file: data/sorted-function-counts-4-10perc.csv
Processing file: data/sorted-function-counts-5-10perc.csv
Processing file: data/sorted-function-counts-6-10perc.csv
Processing file: data/sorted-function-counts-7-10perc.csv
Processing file: data/sorted-function-counts-8-10perc.csv
Processing file: data/sorted-function-counts-9-10perc.csv
Processing file: data/sorted-function-counts-10-10perc.csv
Out[29]:
filename edx+1_x fn_x start_x $+5_x ebp+var__x __securi_x eax_x ebx_x edi_x ... removeAt aout_For 0CFliteD GetEcoMo_y IsValidS SetSetti off_407C __malloc __flush _IndexIn_y
0 01IsoiSMh5gxyDYTl4CB 216 0 0 0 10 0 0 7 0 ... 0 0 0 0 0 0 0 0 0 0
1 01SuzwMJEIXsK7A8dQbl 80 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 01azqd4InC7m9JpocGv5 1632 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 02IOCvYEy8mjiuAQHax3 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 02JqQ7H3yEoD8viYWlmS 14 0 0 0 6 0 0 6 0 ... 0 0 0 0 0 0 0 0 0 0
5 03nJaQV6K2ObICUmyWoR 62 0 0 0 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
6 05aiMRw13bYWqZ8OHvjl 195 0 0 0 9 0 0 9 0 ... 0 0 0 0 0 0 0 0 0 0
7 07nrG1cLKUPxjOlWMFiV 0 0 0 1 1 0 0 89 0 ... 0 0 0 0 0 0 0 0 0 0
8 0AnoOZDNbPXIr2MRBSCJ 0 0 0 0 15 0 0 6 10 ... 0 0 0 0 0 0 0 0 0 0
9 0BEsCP7NAUy8XmkenHWG 148 0 0 0 8 0 0 7 2 ... 0 0 0 0 0 0 0 0 0 0
10 0C4aVbN58O1nAigFJt9z 482 1 0 0 9 1 0 11 4 ... 0 0 0 0 0 0 0 0 0 0
11 0DTp59Av1RLifoKlUdm7 139 0 0 0 10 0 0 9 0 ... 0 0 0 0 0 0 0 0 0 0
12 0EL7OGZKozbiNCVP61gk 33 0 0 0 3 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
13 0FKerJl18xOc3jdoyg4A 113 0 0 0 7 0 0 8 0 ... 0 0 0 0 0 0 0 0 0 0
14 0GKp9ZJclxTABMunIOD2 148 0 0 0 8 0 0 9 0 ... 0 0 0 0 0 0 0 0 0 0
15 0GKzFQ81IYXqUWkmfv26 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
16 0GUIi7xAlODwZ4YBenNM 47 0 0 0 2 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
17 0GvtWEPUBfDAcMbiYVSR 0 0 0 1 0 0 0 89 0 ... 0 0 0 0 0 0 0 0 0 0
18 0HKFs3AXTt1IrOl52eVu 84 0 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
19 0HVAnMrp1LjKDmuoOJFY 63 0 0 0 0 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

20 rows × 7132 columns


In [30]:
reduced_function_counts.shape


Out[30]:
(2736, 7132)

In [31]:
reduced_function_counts.to_csv('data/reduced-fcounts-4.csv', index=False)

3. Concatenate the 10% Best Features of the Function Count Sets.


In [4]:
# Now recombine the reduced sets and perform chi-squared tests again. Create a list of dataframes then
# call the pd.concat funtion once as this is more efficient than calling the pd.concat function
# multiple times.

dflist = []
for idx in range(1,5):
    fname = "data/reduced-fcounts-" + str(idx) + ".csv"
    print("Processing file: {:s}".format(fname))
    dflist.append(pd.read_csv(fname))
    

reduced_function_counts = pd.concat(dflist, ignore_index=True)
# Replace all the NaN values with 0
reduced_function_counts.fillna(0, inplace=True)
sorted_reduced_function_counts = reduced_function_counts.sort_values(by='filename')
final_reduced_function_counts = sorted_call_graph_features_train.merge(sorted_reduced_function_counts, on='filename')
final_reduced_function_counts.head(20)


Processing file: data/reduced-fcounts-1.csv
Processing file: data/reduced-fcounts-2.csv
Processing file: data/reduced-fcounts-3.csv
Processing file: data/reduced-fcounts-4.csv
Out[4]:
filename vertex_count edge_count delta_max density """""""""" """""""ignored""" $+5 $+5_x $+5_x.1 ... xallocio xsgetn$b xsputn$b ycpDestr ycpDestr_x ycpDestr_y ycpInitY ycpInitY_x ycpInitY_y yearQDat
0 01IsoiSMh5gxyDYTl4CB 274 333 137 0.081319 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 01SuzwMJEIXsK7A8dQbl 187 196 82 0.181314 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 01azqd4InC7m9JpocGv5 158 1533 95 0.140927 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 01jsnpXSAlgw6aPeDxrU 26 126 35 0.600000 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 01kcPWA9K2BOxQeS5Rju 61 53 24 0.504762 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 02IOCvYEy8mjiuAQHax3 8 5 5 1.666667 0.0 0.0 1.0 1 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
6 02JqQ7H3yEoD8viYWlmS 77 65 22 0.477941 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
7 02K5GMYITj7bBoAisEmD 779 827 100 0.112044 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8 02MRILoE6rNhmt7FUi45 182 201 66 0.140461 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
9 02mlBLHZTDFXGa7Nt6cr 28 55 28 0.604396 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
10 02zcUmKV16Lya5xqnPGB 229 303 74 0.095886 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
11 03nJaQV6K2ObICUmyWoR 32 75 16 1.136364 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
12 04BfoQRA6XEshiNuI7pF 171 168 9 0.126697 0.0 0.0 1.0 1 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
13 04EjIdbPV5e1XroFOpiN 1570 2900 248 0.010810 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14 04QzZ3DVdPsEp9elLR65 173 168 10 0.109091 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
15 04cvLCVPqBMs6yn5xGlE 282 278 6 0.020056 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
16 04hSzLv5s2TDYPlcgpHB 174 185 87 0.157313 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
17 04mcPSei852tgIKUwTJr 34 134 28 0.354497 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
18 04sJnMaORYc1SV5pKjrP 140 137 6 0.144820 0.0 0.0 0.0 1 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
19 05EeG39MTRrI6VY21DPd 585 461 31 0.016625 0.0 0.0 0.0 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

20 rows × 15615 columns


In [6]:
# Find the top 10 percent variance features.
X = final_reduced_function_counts.iloc[:,1:]
y = sorted_train_labels.iloc[:,1]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
data_trimmed = final_reduced_function_counts.iloc[:,selected_names]
data_fnames = pd.DataFrame(final_reduced_function_counts['filename'])
data_reduced = data_fnames.join(data_trimmed)
# Write to file as we do not have enough memory.
filename = "data/final-call-graph-features-10percent.csv"
data_reduced.to_csv(filename, index=False)
data_reduced.head(20)


Out[6]:
filename vertex_count edge_count delta_max density $+5 $+5_x $+5_y $5MGU$ch $6MDU$ch ... subst_x subst_y unkno unkno_x unkno_x.1 unkno_x.2 unkno_y unkno_y.1 unkno_y.2 wpa_hexd
0 01IsoiSMh5gxyDYTl4CB 274 333 137 0.081319 0.0 0 0 0 0.0 ... 0.0 0.0 1.0 1 0.0 0.0 1 0.0 0.0 0.0
1 01SuzwMJEIXsK7A8dQbl 187 196 82 0.181314 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
2 01azqd4InC7m9JpocGv5 158 1533 95 0.140927 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
3 01jsnpXSAlgw6aPeDxrU 26 126 35 0.600000 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
4 01kcPWA9K2BOxQeS5Rju 61 53 24 0.504762 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
5 02IOCvYEy8mjiuAQHax3 8 5 5 1.666667 1.0 1 1 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
6 02JqQ7H3yEoD8viYWlmS 77 65 22 0.477941 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
7 02K5GMYITj7bBoAisEmD 779 827 100 0.112044 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 1 1.0 1.0 1 1.0 1.0 0.0
8 02MRILoE6rNhmt7FUi45 182 201 66 0.140461 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 1 1.0 1.0 1 1.0 1.0 0.0
9 02mlBLHZTDFXGa7Nt6cr 28 55 28 0.604396 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
10 02zcUmKV16Lya5xqnPGB 229 303 74 0.095886 0.0 0 0 0 0.0 ... 0.0 0.0 1.0 1 1.0 0.0 1 1.0 0.0 0.0
11 03nJaQV6K2ObICUmyWoR 32 75 16 1.136364 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
12 04BfoQRA6XEshiNuI7pF 171 168 9 0.126697 1.0 1 1 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
13 04EjIdbPV5e1XroFOpiN 1570 2900 248 0.010810 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
14 04QzZ3DVdPsEp9elLR65 173 168 10 0.109091 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
15 04cvLCVPqBMs6yn5xGlE 282 278 6 0.020056 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
16 04hSzLv5s2TDYPlcgpHB 174 185 87 0.157313 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
17 04mcPSei852tgIKUwTJr 34 134 28 0.354497 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
18 04sJnMaORYc1SV5pKjrP 140 137 6 0.144820 0.0 1 1 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0
19 05EeG39MTRrI6VY21DPd 585 461 31 0.016625 0.0 0 0 0 0.0 ... 0.0 0.0 0.0 0 0.0 0.0 0 0.0 0.0 0.0

20 rows × 1562 columns

4. Perform Some Classification Tests


In [10]:
def run_cv(X,y, clf):

    # Construct a kfolds object
    kf = KFold(len(y),n_folds=10,shuffle=True)
    y_prob = np.zeros((len(y),9))
    y_pred = np.zeros(len(y))
    
    # Iterate through folds
    for train_index, test_index in kf:
        print(test_index, train_index)
        X_train = X.loc[train_index,:]
        X_test = X.loc[test_index,:]
        y_train = y[train_index]

        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    
    return y_prob, y_pred

In [7]:
ytrain = np.array(y)
X = data_reduced.iloc[:,1:]
X.shape


Out[7]:
(10868, 1561)

In [8]:
y.shape


Out[8]:
(10868,)

In [11]:
# Now we can build a hypothesis
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,ytrain,clf1)
print("logloss = ", log_loss(y, p1))
print("score = ", accuracy_score(ytrain, pred1))
cm = confusion_matrix(y, pred1)
print(cm)


[   57    61    71 ..., 10818 10833 10856] [    0     1     2 ..., 10865 10866 10867]
[    5    16    17 ..., 10844 10847 10850] [    0     1     2 ..., 10865 10866 10867]
[   18    19    25 ..., 10857 10860 10862] [    0     1     2 ..., 10865 10866 10867]
[   10    12    14 ..., 10823 10832 10855] [    0     1     2 ..., 10865 10866 10867]
[    7    11    36 ..., 10822 10831 10839] [    0     1     2 ..., 10865 10866 10867]
[    0    23    35 ..., 10849 10852 10858] [    1     2     3 ..., 10865 10866 10867]
[    1    48    51 ..., 10827 10851 10861] [    0     2     3 ..., 10865 10866 10867]
[    8    20    21 ..., 10846 10853 10859] [    0     1     2 ..., 10865 10866 10867]
[    2     4     6 ..., 10845 10866 10867] [    0     1     3 ..., 10863 10864 10865]
[    3     9    13 ..., 10863 10864 10865] [    0     1     2 ..., 10862 10866 10867]
logloss =  0.0707704951195
score =  0.985645933014
[[1507    1    0   12    0    4    0   16    1]
 [   3 2467    0    2    0    1    0    3    2]
 [   1    0 2936    4    0    1    0    0    0]
 [   3    0    0  470    0    0    1    1    0]
 [   0    1    0    0   38    2    0    1    0]
 [   6    1    1    1    0  738    2    1    1]
 [   0    0    0   11    0    0  387    0    0]
 [  40    1    0   14    1    2    2 1164    4]
 [   1    1    1    1    0    1    0    3 1005]]

5. Test Code Only


In [ ]:
for idx in range(1,10):
    print("Index: {:d}".format(idx))
    filename = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
    print(filename)

In [ ]:
y = []
train_names = sorted_train_labels['Id']
for fname in sorted_call_graph_function_train_1['filename']:
    # print("Appending {:s}".format(fname))
    for idx,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx,1])
            break
    
print(y)