In [1]:
# Import everything we might need up front; it saves time later.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
In [2]:
# Load the training labels, the combined ASM feature set, and the call graph features.
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
combined_train_data = pd.read_csv('data/final-combined-train-data-30percent.csv')
call_graph_features_train = pd.read_csv('data/final-call-graph-features-10percent.csv')
In [3]:
sorted_train_labels.head()
Out[3]:
In [4]:
combined_train_data.head()
Out[4]:
In [5]:
call_graph_features_train.head()
Out[5]:
In [3]:
# Merge the original ASM feature set with the call graph feature set.
all_combined_train_data = combined_train_data.merge(call_graph_features_train, on='filename', suffixes=('_asm','_cg'))
all_combined_train_data.head(20)
Out[3]:
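Since the merge is an inner join on filename, it is worth confirming that no samples were dropped. A minimal sketch, assuming the call graph file covers the same set of samples as the ASM file (the expected column count is the two inputs combined, minus the shared filename key):
In [ ]:
# Verify the merge kept every sample and the column counts add up.
print(combined_train_data.shape, call_graph_features_train.shape, all_combined_train_data.shape)
assert len(all_combined_train_data) == len(combined_train_data)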
In [5]:
all_combined_train_data.to_csv('data/all-combined-train-data-final.csv', index=False)
In [ ]:
# TODO: load and combine the best 30% of ASM features and the best 10% of call graph features for the test data.
combined_test_data = pd.read_csv('data/final-combined-test-data-30percent.csv')
In [7]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multiclass version of the logarithmic loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True class labels, integers in [0, n_classes).
    y_pred : array, shape = [n_samples, n_classes]
        Predicted class probabilities.

    Returns
    -------
    loss : float
    """
    # Clip probabilities away from 0 and 1 so the log is always finite.
    predictions = np.clip(y_pred, eps, 1 - eps)
    # Normalize row sums to 1.
    predictions /= predictions.sum(axis=1)[:, np.newaxis]
    # One-hot encode the true labels.
    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss
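As a quick sanity check, the custom metric should agree with sklearn's log_loss when the labels are already zero-based. A minimal sketch on toy data:
In [ ]:
# Sanity check: custom metric vs. sklearn's log_loss on a tiny example.
y_true_toy = np.array([0, 1, 2])
y_pred_toy = np.array([[0.7, 0.2, 0.1],
                       [0.2, 0.6, 0.2],
                       [0.1, 0.1, 0.8]])
print(multiclass_log_loss(y_true_toy, y_pred_toy))  # custom implementation
print(log_loss(y_true_toy, y_pred_toy))             # sklearn reference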
In [8]:
def run_cv(X, y, clf):
    # 10-fold cross-validation with shuffling.
    kf = KFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))   # 9 malware classes
    y_pred = np.zeros(len(y))
    # Iterate through the folds, predicting each held-out fold in turn.
    for train_index, test_index in kf.split(X):
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]
        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    return y_prob, y_pred
In [9]:
# Set up X, y for the classifiers.
#all_combined_train_data = pd.read_csv('data/all-combined-train-data-final.csv')
#sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = all_combined_train_data.iloc[:,1:]   # drop the filename column
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1   # zero-based labels for multiclass_log_loss
In [10]:
all_combined_train_data.head()
Out[10]:
In [11]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss, p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)
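For a per-class breakdown to go with the confusion matrix, the classification_report imported above can be applied to the same out-of-fold predictions; a short sketch using the variables from the previous cell:
In [ ]:
# Per-class precision, recall, and F1 for the out-of-fold predictions.
print(classification_report(y, pred1))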
In [ ]:
# Attach the class labels so the feature matrix can be sliced by malware class,
# e.g. class1 = X[combined_train_data['class'] == 1].
combined_train_data['class'] = sorted_train_labels.iloc[:,1]
# Summarize each of the nine classes by the sum of its per-feature means and
# standard deviations, then plot the classes in that 2-D summary space.
columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
    classx = X[combined_train_data['class'] == i]
    class_stats.loc[i,'mean'] = classx.mean().sum()
    class_stats.loc[i,'std'] = classx.std().sum()
    #class_stats.loc[i,'corr'] = classx.corr().sum().sum()
    #class_stats.loc[i,'cov'] = classx.cov().sum().sum()
plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[ 1,2,3,4,5,6,7,8,9 ], cmap='brg')
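The same per-class summary can also be computed without the explicit loop. A minimal sketch using a pandas groupby, assuming all feature columns in X are numeric and the 'class' column was set in the previous cell:
In [ ]:
# Equivalent per-class summary via groupby instead of the explicit loop.
grouped = X.groupby(combined_train_data['class'])
class_stats2 = pd.DataFrame({
    'mean': grouped.mean().sum(axis=1),  # sum of per-feature means, per class
    'std': grouped.std().sum(axis=1),    # sum of per-feature standard deviations
})
class_stats2.head(9)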
In [4]:
# Check whether the merge produced any duplicate feature names.
name_map = {}
column_names = all_combined_train_data.columns
for cname in column_names:
    if cname not in name_map:
        name_map[cname] = 1
    else:
        name_map[cname] += 1
        print("Feature Name: {:s} -> {:d}".format(cname, name_map[cname]))