The original feature set of function calls extracted from the call graphs had
over 130,000 features. Truncating the function names reduces this to a feature
set of 71,319 distinct names, and chi-squared tests selecting the best 10% of
those features yield a final set of 1,561 call graph features.
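The truncation step itself happens upstream of this notebook; a minimal sketch of the idea, with the prefix length and helper name assumed rather than taken from the actual pipeline:
# Hypothetical sketch of the function-name truncation described above; the
# prefix length (8 here) and the helper name are assumptions, not the
# notebook's actual parameters.
def truncate_name(func_name, max_len=8):
    # Collapsing long, often-mangled names to a fixed-length prefix merges
    # many near-duplicate functions into a single feature column.
    return func_name[:max_len]

raw_names = ["CreateFileA", "CreateFileW", "sub_401000", "sub_401AB0"]
print(sorted(set(truncate_name(n) for n in raw_names)))
# -> ['CreateFi', 'sub_4010', 'sub_401A']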
In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
# cross_validation is the pre-0.18 sklearn module (later renamed model_selection),
# hence the deprecation warning filters below.
from sklearn.cross_validation import cross_val_score, KFold
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
In [11]:
# Preliminary column name setup: load the truncated function names used as column headers.
all_column_names = []
with open('data/all-reduced-function-column-names.csv', 'r') as colf:
    for line in colf:
        # Strip the trailing newline so the last name on each line stays clean.
        all_column_names += line.strip().split(',')
print("Column Names: {:d}".format(len(all_column_names)))
In [3]:
# Load the call graph training features and the training labels.
# call_graph_features_train = pd.read_csv('data/call-graph-features-train.csv')
sorted_call_graph_features_train = pd.read_csv('data/sorted-call-graph-features-train.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
In [3]:
#sorted_call_graph_features_train = call_graph_features_train.sort_values(by='filename')
#sorted_call_graph_features_train.to_csv('data/sorted-call-graph-features-train.csv', index=False)
sorted_call_graph_features_train.head()
Out[3]:
In [4]:
sorted_train_labels.head()
Out[4]:
In [2]:
# Load the sorted function counts (1/4)
#call_graph_function_train_1 = pd.read_csv('/opt/kaggle/5833-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
#sorted_call_graph_function_train_1 = call_graph_function_train_1.sort_values(by='filename')
sorted_call_graph_function_train_1 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-1.csv')
sorted_call_graph_function_train_1.head()
Out[2]:
In [9]:
# Preliminary sorted-file generation only; call_graph_function_train_1 exists
# only if the commented-out load above has been run.
del(call_graph_function_train_1)
sorted_call_graph_function_train_1.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-1.csv', index=False)
sorted_call_graph_function_train_1.head()
Out[9]:
In [3]:
# Ok, so we still have 71,000+ features even after severely truncating the function names.
# This is a problem: processing such a huge sparse matrix requires a lot of memory.
# Solution 1: rent an AWS server with plenty-o-ram. (costs money and requires high bandwidth for file transfer)
# Solution 2: buy more RAM for my linux box. (costs money)
# Solution 3: break the sparse matrix into smaller chunks and process each individually. (ok)
# Solution 4: try the pandas sparse matrix data structure. (too slow)
sorted_call_graph_function_train_1.shape
Out[3]:
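Before walking through the full loop below, the chunking scheme in isolation: a small helper (the function name and signature are ours, not the notebook's) that produces the same column ranges the loop iterates over.
# Sketch of the column-chunking scheme used below (helper name is ours).
def column_chunks(n_cols, n_chunks=10, start=1):
    # 'start' skips the filename column; the final chunk absorbs the remainder.
    step = n_cols // n_chunks
    ranges = [(start, step)]
    ranges += [(i * step, (i + 1) * step) for i in range(1, n_chunks - 1)]
    ranges.append(((n_chunks - 1) * step, n_cols))
    return ranges

# Usage: for s, e in column_chunks(df.shape[1]): X = df.iloc[:, s:e]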
In [40]:
# Solution 3: slice the matrix into small chunks for processing.
# The pandas sparse matrix is still too slow, so split the columns into ten
# feature sets and reduce each one separately.
onetenth = int(sorted_call_graph_function_train_1.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
# Build the label vector once; it is identical for every column chunk.
y = []
for fname in sorted_call_graph_function_train_1['filename']:
    for idx2,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx2,1])
            break
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_1.iloc[:,startidx:endidx]
    print(X.shape)
    print(len(y))
    # Select the top 10 percent of features in this chunk by chi-squared score.
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    # get_support() returns indices relative to this chunk, so offset by
    # startidx (not by 1) to map back into the full dataframe.
    selected_names = fsp.get_support(indices=True) + startidx
    data_trimmed = sorted_call_graph_function_train_1.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_1['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write each reduced chunk to file as we do not have enough memory to keep them all.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
# Finish off the remaining columns.
print("Processing final column set {:d} -> {:d}".format(startidx, sorted_call_graph_function_train_1.shape[1]))
X = sorted_call_graph_function_train_1.iloc[:,startidx:]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True) + startidx
data_trimmed = sorted_call_graph_function_train_1.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_1['filename'])
data_reduced = data_fnames.join(data_trimmed)
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)
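The nested filename-matching loop above is quadratic in the number of samples; an equivalent vectorized lookup, assuming the label column of sorted_train_labels is named 'Class', would be:
# Faster label alignment via a pandas map (a sketch; assumes the label column
# is named 'Class' and every filename has a matching Id).
label_map = sorted_train_labels.set_index('Id')['Class']
y = sorted_call_graph_function_train_1['filename'].map(label_map).tolist()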
In [7]:
# Recombine the reduced chunks into a single frame by merging on filename.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
print("Processing file: {:s}".format(fname))
nextfc = pd.read_csv(fname)
reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
reduced_function_counts.head(20)
Out[7]:
In [8]:
reduced_function_counts.shape
Out[8]:
In [9]:
reduced_function_counts.to_csv('data/reduced-fcounts-1.csv', index=False)
In [12]:
# call_graph_function_train_2 = pd.read_csv('/opt/kaggle/27137-call-graph-function_counts.csv', header=None, names=all_column_names)
# Load the sorted function counts (2/4)
call_graph_function_train_2 = pd.read_csv('/opt/kaggle/5834-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
sorted_call_graph_function_train_2 = call_graph_function_train_2.sort_values(by='filename')
sorted_call_graph_function_train_2.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-2.csv', index=False)
sorted_call_graph_function_train_2.head()
del(call_graph_function_train_2)
In [ ]:
sorted_call_graph_function_train_2 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-2.csv')
sorted_call_graph_function_train_2.head()
In [14]:
# Solution 3: slice the matrix into small chunks for processing.
# The pandas sparse matrix is still too slow, so split the columns into ten
# feature sets and reduce each one separately.
onetenth = int(sorted_call_graph_function_train_2.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
# Build the label vector once; it is identical for every column chunk.
y = []
for fname in sorted_call_graph_function_train_2['filename']:
    for idx2,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx2,1])
            break
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_2.iloc[:,startidx:endidx]
    print(X.shape)
    print(len(y))
    # Select the top 10 percent of features in this chunk by chi-squared score.
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    # get_support() returns indices relative to this chunk, so offset by
    # startidx (not by 1) to map back into the full dataframe.
    selected_names = fsp.get_support(indices=True) + startidx
    data_trimmed = sorted_call_graph_function_train_2.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_2['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write each reduced chunk to file as we do not have enough memory to keep them all.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
# Finish off the remaining columns.
print("Processing final column set {:d} -> {:d}".format(startidx, sorted_call_graph_function_train_2.shape[1]))
X = sorted_call_graph_function_train_2.iloc[:,startidx:]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True) + startidx
data_trimmed = sorted_call_graph_function_train_2.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_2['filename'])
data_reduced = data_fnames.join(data_trimmed)
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)
In [15]:
# Recombine the reduced chunks into a single frame by merging on filename.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
print("Processing file: {:s}".format(fname))
nextfc = pd.read_csv(fname)
reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
reduced_function_counts.head(20)
Out[15]:
In [16]:
reduced_function_counts.shape
Out[16]:
In [17]:
reduced_function_counts.to_csv('data/reduced-fcounts-2.csv', index=False)
In [18]:
#call_graph_function_train_3 = pd.read_csv('/opt/kaggle/27138-call-graph-function_counts.csv', header=None, names=all_column_names)
# Load the sorted function counts (3/4)
call_graph_function_train_3 = pd.read_csv('/opt/kaggle/5835-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
sorted_call_graph_function_train_3 = call_graph_function_train_3.sort_values(by='filename')
sorted_call_graph_function_train_3.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-3.csv', index=False)
sorted_call_graph_function_train_3.head()
del(call_graph_function_train_3)
In [19]:
sorted_call_graph_function_train_3 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-3.csv')
sorted_call_graph_function_train_3.head()
Out[19]:
In [20]:
# Solution 3: slice the matrix into small chunks for processing.
# The pandas sparse matrix is still too slow, so split the columns into ten
# feature sets and reduce each one separately.
onetenth = int(sorted_call_graph_function_train_3.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
# Build the label vector once; it is identical for every column chunk.
y = []
for fname in sorted_call_graph_function_train_3['filename']:
    for idx2,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx2,1])
            break
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_3.iloc[:,startidx:endidx]
    print(X.shape)
    print(len(y))
    # Select the top 10 percent of features in this chunk by chi-squared score.
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    # get_support() returns indices relative to this chunk, so offset by
    # startidx (not by 1) to map back into the full dataframe.
    selected_names = fsp.get_support(indices=True) + startidx
    data_trimmed = sorted_call_graph_function_train_3.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_3['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write each reduced chunk to file as we do not have enough memory to keep them all.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
# Finish off the remaining columns.
print("Processing final column set {:d} -> {:d}".format(startidx, sorted_call_graph_function_train_3.shape[1]))
X = sorted_call_graph_function_train_3.iloc[:,startidx:]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True) + startidx
data_trimmed = sorted_call_graph_function_train_3.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_3['filename'])
data_reduced = data_fnames.join(data_trimmed)
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)
In [21]:
# Recombine the reduced chunks into a single frame by merging on filename.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
print("Processing file: {:s}".format(fname))
nextfc = pd.read_csv(fname)
reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
reduced_function_counts.head(20)
Out[21]:
In [22]:
reduced_function_counts.shape
Out[22]:
In [23]:
reduced_function_counts.to_csv('data/reduced-fcounts-3.csv', index=False)
In [26]:
#call_graph_function_train_4 = pd.read_csv('/opt/kaggle/27139-call-graph-function_counts.csv', header=None, names=all_column_names)
# Load the sorted function counts (4/4)
call_graph_function_train_4 = pd.read_csv('/opt/kaggle/5836-call-graph-reduced-function_counts.csv', index_col=False, header=None, names=all_column_names)
sorted_call_graph_function_train_4 = call_graph_function_train_4.sort_values(by='filename')
sorted_call_graph_function_train_4.to_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-4.csv', index=False)
sorted_call_graph_function_train_4.head()
del(call_graph_function_train_4)
In [27]:
sorted_call_graph_function_train_4 = pd.read_csv('/opt/kaggle/sorted-call-graph-reduced-fcounts-4.csv')
sorted_call_graph_function_train_4.head()
Out[27]:
In [28]:
# Solution 3: slice the matrix into small chunks for processing.
# The pandas sparse matrix is still too slow, so split the columns into ten
# feature sets and reduce each one separately.
onetenth = int(sorted_call_graph_function_train_4.shape[1]/10)
startidx = 1 # skip the filename column
endidx = onetenth
# Build the label vector once; it is identical for every column chunk.
y = []
for fname in sorted_call_graph_function_train_4['filename']:
    for idx2,fname2 in enumerate(sorted_train_labels['Id']):
        if (fname2 == fname):
            y.append(sorted_train_labels.iloc[idx2,1])
            break
for idx1 in range(1,10):
    print("Processing column set {:d} -> {:d}".format(startidx, endidx))
    X = sorted_call_graph_function_train_4.iloc[:,startidx:endidx]
    print(X.shape)
    print(len(y))
    # Select the top 10 percent of features in this chunk by chi-squared score.
    fsp = SelectPercentile(chi2, 10)
    X_new_10 = fsp.fit_transform(X,y)
    # get_support() returns indices relative to this chunk, so offset by
    # startidx (not by 1) to map back into the full dataframe.
    selected_names = fsp.get_support(indices=True) + startidx
    data_trimmed = sorted_call_graph_function_train_4.iloc[:,selected_names]
    data_fnames = pd.DataFrame(sorted_call_graph_function_train_4['filename'])
    data_reduced = data_fnames.join(data_trimmed)
    # Write each reduced chunk to file as we do not have enough memory to keep them all.
    filename = "data/sorted-function-counts-" + str(idx1) + "-10perc.csv"
    print("Writing file: {:s}".format(filename))
    data_reduced.to_csv(filename, index=False)
    startidx = endidx
    endidx += onetenth
# Finish off the remaining columns.
print("Processing final column set {:d} -> {:d}".format(startidx, sorted_call_graph_function_train_4.shape[1]))
X = sorted_call_graph_function_train_4.iloc[:,startidx:]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
selected_names = fsp.get_support(indices=True) + startidx
data_trimmed = sorted_call_graph_function_train_4.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_call_graph_function_train_4['filename'])
data_reduced = data_fnames.join(data_trimmed)
filename = "data/sorted-function-counts-10-10perc.csv"
data_reduced.to_csv(filename, index=False)
In [29]:
# Recombine the reduced chunks into a single frame by merging on filename.
fname = "data/sorted-function-counts-1-10perc.csv"
reduced_function_counts = pd.read_csv(fname)
for idx in range(2,11):
fname = "data/sorted-function-counts-" + str(idx) + "-10perc.csv"
print("Processing file: {:s}".format(fname))
nextfc = pd.read_csv(fname)
reduced_function_counts = pd.merge(reduced_function_counts, nextfc, on='filename')
reduced_function_counts.head(20)
Out[29]:
In [30]:
reduced_function_counts.shape
Out[30]:
In [31]:
reduced_function_counts.to_csv('data/reduced-fcounts-4.csv', index=False)
In [4]:
# Now recombine the four reduced sets and run the chi-squared tests again.
# Build a list of dataframes and call pd.concat once; this is more efficient
# than concatenating repeatedly.
dflist = []
for idx in range(1,5):
fname = "data/reduced-fcounts-" + str(idx) + ".csv"
print("Processing file: {:s}".format(fname))
dflist.append(pd.read_csv(fname))
reduced_function_counts = pd.concat(dflist, ignore_index=True)
# Replace all the NaN values with 0: a NaN simply means that function name
# never appeared in that sample's call graph.
reduced_function_counts.fillna(0, inplace=True)
sorted_reduced_function_counts = reduced_function_counts.sort_values(by='filename')
final_reduced_function_counts = sorted_call_graph_features_train.merge(sorted_reduced_function_counts, on='filename')
final_reduced_function_counts.head(20)
Out[4]:
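One caveat before the selection step that follows: sklearn's chi2 scorer requires non-negative feature values. Function call counts satisfy this by construction, but an explicit check (our addition, not in the original notebook) documents the assumption:
# chi2 accepts only non-negative features; counts satisfy this, but verify
# before fitting (our addition).
assert (final_reduced_function_counts.iloc[:, 1:].values >= 0).all()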
In [6]:
# Select the top 10 percent of features by chi-squared score.
X = final_reduced_function_counts.iloc[:,1:]
y = sorted_train_labels.iloc[:,1]
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
# X starts at column 1, so offset the support indices by 1 to index the full frame.
selected_names = fsp.get_support(indices=True) + 1
data_trimmed = final_reduced_function_counts.iloc[:,selected_names]
data_fnames = pd.DataFrame(final_reduced_function_counts['filename'])
data_reduced = data_fnames.join(data_trimmed)
# Write to file as we do not have enough memory.
filename = "data/final-call-graph-features-10percent.csv"
data_reduced.to_csv(filename, index=False)
data_reduced.head(20)
Out[6]:
In [10]:
def run_cv(X, y, clf):
    # Construct a 10-fold cross-validation object (pre-0.18 sklearn KFold API).
    kf = KFold(len(y), n_folds=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))  # probabilities for the 9 malware classes
    y_pred = np.zeros(len(y))
    # Iterate through the folds, fitting on 9/10 of the data and
    # predicting the held-out fold.
    for train_index, test_index in kf:
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]
        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    return y_prob, y_pred
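run_cv relies on the pre-0.18 cross_validation API imported at the top. For readers on a current sklearn, an equivalent would look like this (a sketch, not what this notebook ran):
# Equivalent cross-validation on sklearn >= 0.18 (not used in this notebook).
from sklearn.model_selection import KFold as ModernKFold

def run_cv_modern(X, y, clf):
    # n_splits replaces n_folds, and the fold indices come from kf.split(X).
    kf = ModernKFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))
    for train_index, test_index in kf.split(X):
        clf.fit(X.iloc[train_index, :], y[train_index])
        y_prob[test_index] = clf.predict_proba(X.iloc[test_index, :])
        y_pred[test_index] = clf.predict(X.iloc[test_index, :])
    return y_prob, y_pred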
In [7]:
# Convert the label series to a numpy array for positional indexing in run_cv.
ytrain = np.array(y)
X = data_reduced.iloc[:,1:]
X.shape
Out[7]:
In [8]:
y.shape
Out[8]:
In [11]:
# Now we can test a hypothesis: train an ExtraTrees classifier and evaluate it with 10-fold CV.
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,ytrain,clf1)
print("logloss = ", log_loss(y, p1))
print("score = ", accuracy_score(ytrain, pred1))
cm = confusion_matrix(y, pred1)
print(cm)
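To make the raw confusion matrix easier to read, row-normalizing it gives per-class recall; a small follow-up sketch (our addition):
# Row-normalize the confusion matrix so each row shows per-class recall.
cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
np.set_printoptions(precision=2, suppress=True)
print(cm_norm)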